summaryrefslogtreecommitdiffstats
path: root/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'kernel')
-rw-r--r--kernel/Kconfig.locks13
-rw-r--r--kernel/Makefile102
-rw-r--r--kernel/acct.c2
-rw-r--r--kernel/audit.c49
-rw-r--r--kernel/audit.h3
-rw-r--r--kernel/audit_tree.c92
-rw-r--r--kernel/audit_watch.c14
-rw-r--r--kernel/auditsc.c18
-rw-r--r--kernel/bpf/Makefile3
-rw-r--r--kernel/bpf/arraymap.c194
-rw-r--r--kernel/bpf/core.c130
-rw-r--r--kernel/bpf/hashtab.c6
-rw-r--r--kernel/bpf/helpers.c135
-rw-r--r--kernel/bpf/syscall.c60
-rw-r--r--kernel/bpf/test_stub.c78
-rw-r--r--kernel/bpf/verifier.c255
-rw-r--r--kernel/capability.c35
-rw-r--r--kernel/cgroup.c426
-rw-r--r--kernel/cgroup_freezer.c2
-rw-r--r--kernel/cgroup_pids.c355
-rw-r--r--kernel/compat.c6
-rw-r--r--kernel/configs/xen.config48
-rw-r--r--kernel/context_tracking.c126
-rw-r--r--kernel/cpu.c110
-rw-r--r--kernel/cpuset.c33
-rw-r--r--kernel/cred.c3
-rw-r--r--kernel/events/core.c1244
-rw-r--r--kernel/events/hw_breakpoint.c8
-rw-r--r--kernel/events/internal.h50
-rw-r--r--kernel/events/ring_buffer.c371
-rw-r--r--kernel/events/uprobes.c228
-rw-r--r--kernel/exec_domain.c137
-rw-r--r--kernel/exit.c8
-rw-r--r--kernel/fork.c275
-rw-r--r--kernel/futex.c174
-rw-r--r--kernel/gcov/base.c11
-rw-r--r--kernel/gcov/gcc_4_7.c4
-rw-r--r--kernel/groups.c3
-rw-r--r--kernel/hung_task.c4
-rw-r--r--kernel/irq/chip.c142
-rw-r--r--kernel/irq/devres.c4
-rw-r--r--kernel/irq/dummychip.c2
-rw-r--r--kernel/irq/generic-chip.c11
-rw-r--r--kernel/irq/handle.c4
-rw-r--r--kernel/irq/internals.h32
-rw-r--r--kernel/irq/irqdesc.c15
-rw-r--r--kernel/irq/irqdomain.c43
-rw-r--r--kernel/irq/manage.c224
-rw-r--r--kernel/irq/migration.c15
-rw-r--r--kernel/irq/msi.c30
-rw-r--r--kernel/irq/pm.c16
-rw-r--r--kernel/irq/proc.c2
-rw-r--r--kernel/irq/resend.c22
-rw-r--r--kernel/irq/spurious.c26
-rw-r--r--kernel/jump_label.c168
-rw-r--r--kernel/kexec.c13
-rw-r--r--kernel/kprobes.c2
-rw-r--r--kernel/kthread.c31
-rw-r--r--kernel/livepatch/core.c161
-rw-r--r--kernel/locking/Makefile7
-rw-r--r--kernel/locking/lglock.c22
-rw-r--r--kernel/locking/lockdep.c199
-rw-r--r--kernel/locking/lockdep_proc.c22
-rw-r--r--kernel/locking/locktorture.c14
-rw-r--r--kernel/locking/mcs_spinlock.h7
-rw-r--r--kernel/locking/mutex.c51
-rw-r--r--kernel/locking/osq_lock.c14
-rw-r--r--kernel/locking/percpu-rwsem.c13
-rw-r--r--kernel/locking/qrwlock.c75
-rw-r--r--kernel/locking/qspinlock.c473
-rw-r--r--kernel/locking/qspinlock_paravirt.h378
-rw-r--r--kernel/locking/rtmutex-tester.c420
-rw-r--r--kernel/locking/rtmutex.c121
-rw-r--r--kernel/locking/rtmutex_common.h25
-rw-r--r--kernel/locking/rwsem-spinlock.c7
-rw-r--r--kernel/locking/rwsem-xadd.c142
-rw-r--r--kernel/locking/rwsem.c22
-rw-r--r--kernel/locking/rwsem.h20
-rw-r--r--kernel/module.c351
-rw-r--r--kernel/module_signing.c213
-rw-r--r--kernel/notifier.c2
-rw-r--r--kernel/panic.c5
-rw-r--r--kernel/params.c131
-rw-r--r--kernel/pid.c20
-rw-r--r--kernel/power/Kconfig12
-rw-r--r--kernel/power/Makefile3
-rw-r--r--kernel/power/block_io.c103
-rw-r--r--kernel/power/hibernate.c4
-rw-r--r--kernel/power/main.c4
-rw-r--r--kernel/power/power.h9
-rw-r--r--kernel/power/suspend.c23
-rw-r--r--kernel/power/swap.c155
-rw-r--r--kernel/power/wakelock.c18
-rw-r--r--kernel/printk/printk.c298
-rw-r--r--kernel/ptrace.c52
-rw-r--r--kernel/rcu/rcutorture.c168
-rw-r--r--kernel/rcu/srcu.c44
-rw-r--r--kernel/rcu/tiny.c62
-rw-r--r--kernel/rcu/tiny_plugin.h12
-rw-r--r--kernel/rcu/tree.c1427
-rw-r--r--kernel/rcu/tree.h140
-rw-r--r--kernel/rcu/tree_plugin.h591
-rw-r--r--kernel/rcu/tree_trace.c29
-rw-r--r--kernel/rcu/update.c192
-rw-r--r--kernel/reboot.c53
-rw-r--r--kernel/relay.c9
-rw-r--r--kernel/resource.c38
-rw-r--r--kernel/sched/Makefile2
-rw-r--r--kernel/sched/auto_group.c6
-rw-r--r--kernel/sched/auto_group.h2
-rw-r--r--kernel/sched/core.c975
-rw-r--r--kernel/sched/cputime.c101
-rw-r--r--kernel/sched/deadline.c392
-rw-r--r--kernel/sched/debug.c95
-rw-r--r--kernel/sched/fair.c1569
-rw-r--r--kernel/sched/features.h31
-rw-r--r--kernel/sched/idle.c146
-rw-r--r--kernel/sched/idle_task.c1
-rw-r--r--kernel/sched/loadavg.c (renamed from kernel/sched/proc.c)236
-rw-r--r--kernel/sched/rt.c329
-rw-r--r--kernel/sched/sched.h123
-rw-r--r--kernel/sched/stats.h19
-rw-r--r--kernel/sched/stop_task.c1
-rw-r--r--kernel/sched/wait.c15
-rw-r--r--kernel/seccomp.c87
-rw-r--r--kernel/signal.c46
-rw-r--r--kernel/smp.c80
-rw-r--r--kernel/smpboot.c223
-rw-r--r--kernel/stop_machine.c86
-rw-r--r--kernel/sys.c218
-rw-r--r--kernel/sys_ni.c16
-rw-r--r--kernel/sysctl.c94
-rw-r--r--kernel/system_certificates.S20
-rw-r--r--kernel/system_keyring.c106
-rw-r--r--kernel/task_work.c12
-rw-r--r--kernel/time/Kconfig8
-rw-r--r--kernel/time/Makefile23
-rw-r--r--kernel/time/alarmtimer.c17
-rw-r--r--kernel/time/clockevents.c242
-rw-r--r--kernel/time/clocksource.c187
-rw-r--r--kernel/time/hrtimer.c740
-rw-r--r--kernel/time/jiffies.c7
-rw-r--r--kernel/time/ntp.c80
-rw-r--r--kernel/time/ntp_internal.h1
-rw-r--r--kernel/time/posix-cpu-timers.c87
-rw-r--r--kernel/time/posix-timers.c17
-rw-r--r--kernel/time/sched_clock.c236
-rw-r--r--kernel/time/tick-broadcast-hrtimer.c55
-rw-r--r--kernel/time/tick-broadcast.c392
-rw-r--r--kernel/time/tick-common.c124
-rw-r--r--kernel/time/tick-internal.h230
-rw-r--r--kernel/time/tick-oneshot.c22
-rw-r--r--kernel/time/tick-sched.c397
-rw-r--r--kernel/time/tick-sched.h84
-rw-r--r--kernel/time/time.c131
-rw-r--r--kernel/time/timeconst.bc3
-rw-r--r--kernel/time/timekeeping.c621
-rw-r--r--kernel/time/timekeeping.h18
-rw-r--r--kernel/time/timer.c436
-rw-r--r--kernel/time/timer_list.c87
-rw-r--r--kernel/time/timer_stats.c10
-rw-r--r--kernel/torture.c26
-rw-r--r--kernel/trace/Kconfig36
-rw-r--r--kernel/trace/Makefile1
-rw-r--r--kernel/trace/blktrace.c20
-rw-r--r--kernel/trace/bpf_trace.c283
-rw-r--r--kernel/trace/ftrace.c96
-rw-r--r--kernel/trace/ring_buffer.c225
-rw-r--r--kernel/trace/ring_buffer_benchmark.c25
-rw-r--r--kernel/trace/trace.c516
-rw-r--r--kernel/trace/trace.h47
-rw-r--r--kernel/trace/trace_branch.c21
-rw-r--r--kernel/trace/trace_clock.c3
-rw-r--r--kernel/trace/trace_entries.h6
-rw-r--r--kernel/trace/trace_event_perf.c20
-rw-r--r--kernel/trace/trace_events.c472
-rw-r--r--kernel/trace/trace_events_filter.c110
-rw-r--r--kernel/trace/trace_events_trigger.c70
-rw-r--r--kernel/trace/trace_export.c12
-rw-r--r--kernel/trace/trace_functions_graph.c23
-rw-r--r--kernel/trace/trace_kprobe.c115
-rw-r--r--kernel/trace/trace_mmiotrace.c4
-rw-r--r--kernel/trace/trace_output.c79
-rw-r--r--kernel/trace/trace_output.h2
-rw-r--r--kernel/trace/trace_probe.c19
-rw-r--r--kernel/trace/trace_probe.h20
-rw-r--r--kernel/trace/trace_sched_switch.c2
-rw-r--r--kernel/trace/trace_sched_wakeup.c6
-rw-r--r--kernel/trace/trace_stack.c4
-rw-r--r--kernel/trace/trace_stat.c10
-rw-r--r--kernel/trace/trace_syscalls.c72
-rw-r--r--kernel/trace/trace_uprobe.c85
-rw-r--r--kernel/user_namespace.c5
-rw-r--r--kernel/watchdog.c475
-rw-r--r--kernel/workqueue.c1332
195 files changed, 15865 insertions, 9641 deletions
diff --git a/kernel/Kconfig.locks b/kernel/Kconfig.locks
index 08561f1acd13..ebdb0043203a 100644
--- a/kernel/Kconfig.locks
+++ b/kernel/Kconfig.locks
@@ -235,9 +235,16 @@ config LOCK_SPIN_ON_OWNER
def_bool y
depends on MUTEX_SPIN_ON_OWNER || RWSEM_SPIN_ON_OWNER
-config ARCH_USE_QUEUE_RWLOCK
+config ARCH_USE_QUEUED_SPINLOCKS
bool
-config QUEUE_RWLOCK
- def_bool y if ARCH_USE_QUEUE_RWLOCK
+config QUEUED_SPINLOCKS
+ def_bool y if ARCH_USE_QUEUED_SPINLOCKS
+ depends on SMP
+
+config ARCH_USE_QUEUED_RWLOCKS
+ bool
+
+config QUEUED_RWLOCKS
+ def_bool y if ARCH_USE_QUEUED_RWLOCKS
depends on SMP
diff --git a/kernel/Makefile b/kernel/Makefile
index d7657f5535c9..d25ebea0453a 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -9,7 +9,9 @@ obj-y = fork.o exec_domain.o panic.o \
extable.o params.o \
kthread.o sys_ni.o nsproxy.o \
notifier.o ksysfs.o cred.o reboot.o \
- async.o range.o groups.o smpboot.o
+ async.o range.o smpboot.o
+
+obj-$(CONFIG_MULTIUSER) += groups.o
ifdef CONFIG_FUNCTION_TRACER
# Do not trace debug files and internal ftrace files
@@ -43,7 +45,6 @@ ifneq ($(CONFIG_SMP),y)
obj-y += up.o
endif
obj-$(CONFIG_UID16) += uid16.o
-obj-$(CONFIG_SYSTEM_TRUSTED_KEYRING) += system_keyring.o system_certificates.o
obj-$(CONFIG_MODULES) += module.o
obj-$(CONFIG_MODULE_SIG) += module_signing.o
obj-$(CONFIG_KALLSYMS) += kallsyms.o
@@ -53,6 +54,7 @@ obj-$(CONFIG_BACKTRACE_SELF_TEST) += backtracetest.o
obj-$(CONFIG_COMPAT) += compat.o
obj-$(CONFIG_CGROUPS) += cgroup.o
obj-$(CONFIG_CGROUP_FREEZER) += cgroup_freezer.o
+obj-$(CONFIG_CGROUP_PIDS) += cgroup_pids.o
obj-$(CONFIG_CPUSETS) += cpuset.o
obj-$(CONFIG_UTS_NS) += utsname.o
obj-$(CONFIG_USER_NS) += user_namespace.o
@@ -109,99 +111,3 @@ $(obj)/config_data.gz: $(KCONFIG_CONFIG) FORCE
targets += config_data.h
$(obj)/config_data.h: $(obj)/config_data.gz FORCE
$(call filechk,ikconfiggz)
-
-###############################################################################
-#
-# Roll all the X.509 certificates that we can find together and pull them into
-# the kernel so that they get loaded into the system trusted keyring during
-# boot.
-#
-# We look in the source root and the build root for all files whose name ends
-# in ".x509". Unfortunately, this will generate duplicate filenames, so we
-# have make canonicalise the pathnames and then sort them to discard the
-# duplicates.
-#
-###############################################################################
-ifeq ($(CONFIG_SYSTEM_TRUSTED_KEYRING),y)
-X509_CERTIFICATES-y := $(wildcard *.x509) $(wildcard $(srctree)/*.x509)
-X509_CERTIFICATES-$(CONFIG_MODULE_SIG) += $(objtree)/signing_key.x509
-X509_CERTIFICATES-raw := $(sort $(foreach CERT,$(X509_CERTIFICATES-y), \
- $(or $(realpath $(CERT)),$(CERT))))
-X509_CERTIFICATES := $(subst $(realpath $(objtree))/,,$(X509_CERTIFICATES-raw))
-
-ifeq ($(X509_CERTIFICATES),)
-$(warning *** No X.509 certificates found ***)
-endif
-
-ifneq ($(wildcard $(obj)/.x509.list),)
-ifneq ($(shell cat $(obj)/.x509.list),$(X509_CERTIFICATES))
-$(info X.509 certificate list changed)
-$(shell rm $(obj)/.x509.list)
-endif
-endif
-
-kernel/system_certificates.o: $(obj)/x509_certificate_list
-
-quiet_cmd_x509certs = CERTS $@
- cmd_x509certs = cat $(X509_CERTIFICATES) /dev/null >$@ $(foreach X509,$(X509_CERTIFICATES),; $(kecho) " - Including cert $(X509)")
-
-targets += $(obj)/x509_certificate_list
-$(obj)/x509_certificate_list: $(X509_CERTIFICATES) $(obj)/.x509.list
- $(call if_changed,x509certs)
-
-targets += $(obj)/.x509.list
-$(obj)/.x509.list:
- @echo $(X509_CERTIFICATES) >$@
-endif
-
-clean-files := x509_certificate_list .x509.list
-
-ifeq ($(CONFIG_MODULE_SIG),y)
-###############################################################################
-#
-# If module signing is requested, say by allyesconfig, but a key has not been
-# supplied, then one will need to be generated to make sure the build does not
-# fail and that the kernel may be used afterwards.
-#
-###############################################################################
-ifndef CONFIG_MODULE_SIG_HASH
-$(error Could not determine digest type to use from kernel config)
-endif
-
-signing_key.priv signing_key.x509: x509.genkey
- @echo "###"
- @echo "### Now generating an X.509 key pair to be used for signing modules."
- @echo "###"
- @echo "### If this takes a long time, you might wish to run rngd in the"
- @echo "### background to keep the supply of entropy topped up. It"
- @echo "### needs to be run as root, and uses a hardware random"
- @echo "### number generator if one is available."
- @echo "###"
- openssl req -new -nodes -utf8 -$(CONFIG_MODULE_SIG_HASH) -days 36500 \
- -batch -x509 -config x509.genkey \
- -outform DER -out signing_key.x509 \
- -keyout signing_key.priv 2>&1
- @echo "###"
- @echo "### Key pair generated."
- @echo "###"
-
-x509.genkey:
- @echo Generating X.509 key generation config
- @echo >x509.genkey "[ req ]"
- @echo >>x509.genkey "default_bits = 4096"
- @echo >>x509.genkey "distinguished_name = req_distinguished_name"
- @echo >>x509.genkey "prompt = no"
- @echo >>x509.genkey "string_mask = utf8only"
- @echo >>x509.genkey "x509_extensions = myexts"
- @echo >>x509.genkey
- @echo >>x509.genkey "[ req_distinguished_name ]"
- @echo >>x509.genkey "O = Magrathea"
- @echo >>x509.genkey "CN = Glacier signing key"
- @echo >>x509.genkey "emailAddress = slartibartfast@magrathea.h2g2"
- @echo >>x509.genkey
- @echo >>x509.genkey "[ myexts ]"
- @echo >>x509.genkey "basicConstraints=critical,CA:FALSE"
- @echo >>x509.genkey "keyUsage=digitalSignature"
- @echo >>x509.genkey "subjectKeyIdentifier=hash"
- @echo >>x509.genkey "authorityKeyIdentifier=keyid"
-endif
diff --git a/kernel/acct.c b/kernel/acct.c
index e6c10d1a4058..74963d192c5d 100644
--- a/kernel/acct.c
+++ b/kernel/acct.c
@@ -213,7 +213,7 @@ static int acct_on(struct filename *pathname)
return -EACCES;
}
- if (!file->f_op->write) {
+ if (!(file->f_mode & FMODE_CAN_WRITE)) {
kfree(acct);
filp_close(file, NULL);
return -EIO;
diff --git a/kernel/audit.c b/kernel/audit.c
index 060153dc47d4..662c007635fb 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -43,6 +43,7 @@
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+#include <linux/file.h>
#include <linux/init.h>
#include <linux/types.h>
#include <linux/atomic.h>
@@ -107,6 +108,7 @@ static u32 audit_rate_limit;
* When set to zero, this means unlimited. */
static u32 audit_backlog_limit = 64;
#define AUDIT_BACKLOG_WAIT_TIME (60 * HZ)
+static u32 audit_backlog_wait_time_master = AUDIT_BACKLOG_WAIT_TIME;
static u32 audit_backlog_wait_time = AUDIT_BACKLOG_WAIT_TIME;
static u32 audit_backlog_wait_overflow = 0;
@@ -338,13 +340,13 @@ static int audit_set_backlog_limit(u32 limit)
static int audit_set_backlog_wait_time(u32 timeout)
{
return audit_do_config_change("audit_backlog_wait_time",
- &audit_backlog_wait_time, timeout);
+ &audit_backlog_wait_time_master, timeout);
}
static int audit_set_enabled(u32 state)
{
int rc;
- if (state < AUDIT_OFF || state > AUDIT_LOCKED)
+ if (state > AUDIT_LOCKED)
return -EINVAL;
rc = audit_do_config_change("audit_enabled", &audit_enabled, state);
@@ -663,7 +665,7 @@ static int audit_netlink_ok(struct sk_buff *skb, u16 msg_type)
case AUDIT_MAKE_EQUIV:
/* Only support auditd and auditctl in initial pid namespace
* for now. */
- if ((task_active_pid_ns(current) != &init_pid_ns))
+ if (task_active_pid_ns(current) != &init_pid_ns)
return -EPERM;
if (!netlink_capable(skb, CAP_AUDIT_CONTROL))
@@ -834,7 +836,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
s.lost = atomic_read(&audit_lost);
s.backlog = skb_queue_len(&audit_skb_queue);
s.feature_bitmap = AUDIT_FEATURE_BITMAP_ALL;
- s.backlog_wait_time = audit_backlog_wait_time;
+ s.backlog_wait_time = audit_backlog_wait_time_master;
audit_send_reply(skb, seq, AUDIT_GET, 0, 0, &s, sizeof(s));
break;
}
@@ -877,8 +879,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
if (s.mask & AUDIT_STATUS_BACKLOG_WAIT_TIME) {
if (sizeof(s) > (size_t)nlh->nlmsg_len)
return -EINVAL;
- if (s.backlog_wait_time < 0 ||
- s.backlog_wait_time > 10*AUDIT_BACKLOG_WAIT_TIME)
+ if (s.backlog_wait_time > 10*AUDIT_BACKLOG_WAIT_TIME)
return -EINVAL;
err = audit_set_backlog_wait_time(s.backlog_wait_time);
if (err < 0)
@@ -1385,7 +1386,8 @@ struct audit_buffer *audit_log_start(struct audit_context *ctx, gfp_t gfp_mask,
return NULL;
}
- audit_backlog_wait_time = AUDIT_BACKLOG_WAIT_TIME;
+ if (!reserve)
+ audit_backlog_wait_time = audit_backlog_wait_time_master;
ab = audit_buffer_alloc(ctx, gfp_mask, type);
if (!ab) {
@@ -1759,7 +1761,7 @@ void audit_log_name(struct audit_context *context, struct audit_names *n,
} else
audit_log_format(ab, " name=(null)");
- if (n->ino != AUDIT_INO_UNSET) {
+ if (n->ino != AUDIT_INO_UNSET)
audit_log_format(ab, " inode=%lu"
" dev=%02x:%02x mode=%#ho"
" ouid=%u ogid=%u rdev=%02x:%02x",
@@ -1771,7 +1773,6 @@ void audit_log_name(struct audit_context *context, struct audit_names *n,
from_kgid(&init_user_ns, n->gid),
MAJOR(n->rdev),
MINOR(n->rdev));
- }
if (n->osid != 0) {
char *ctx = NULL;
u32 len;
@@ -1838,11 +1839,29 @@ error_path:
}
EXPORT_SYMBOL(audit_log_task_context);
+void audit_log_d_path_exe(struct audit_buffer *ab,
+ struct mm_struct *mm)
+{
+ struct file *exe_file;
+
+ if (!mm)
+ goto out_null;
+
+ exe_file = get_mm_exe_file(mm);
+ if (!exe_file)
+ goto out_null;
+
+ audit_log_d_path(ab, " exe=", &exe_file->f_path);
+ fput(exe_file);
+ return;
+out_null:
+ audit_log_format(ab, " exe=(null)");
+}
+
void audit_log_task_info(struct audit_buffer *ab, struct task_struct *tsk)
{
const struct cred *cred;
char comm[sizeof(tsk->comm)];
- struct mm_struct *mm = tsk->mm;
char *tty;
if (!ab)
@@ -1878,13 +1897,7 @@ void audit_log_task_info(struct audit_buffer *ab, struct task_struct *tsk)
audit_log_format(ab, " comm=");
audit_log_untrustedstring(ab, get_task_comm(comm, tsk));
- if (mm) {
- down_read(&mm->mmap_sem);
- if (mm->exe_file)
- audit_log_d_path(ab, " exe=", &mm->exe_file->f_path);
- up_read(&mm->mmap_sem);
- } else
- audit_log_format(ab, " exe=(null)");
+ audit_log_d_path_exe(ab, tsk->mm);
audit_log_task_context(ab);
}
EXPORT_SYMBOL(audit_log_task_info);
@@ -1915,7 +1928,7 @@ void audit_log_link_denied(const char *operation, struct path *link)
/* Generate AUDIT_PATH record with object. */
name->type = AUDIT_TYPE_NORMAL;
- audit_copy_inode(name, link->dentry, link->dentry->d_inode);
+ audit_copy_inode(name, link->dentry, d_backing_inode(link->dentry));
audit_log_name(current->audit_context, name, link, 0, NULL);
out:
kfree(name);
diff --git a/kernel/audit.h b/kernel/audit.h
index 24ec86145667..dadf86a0e59e 100644
--- a/kernel/audit.h
+++ b/kernel/audit.h
@@ -259,6 +259,9 @@ extern struct list_head audit_filter_list[];
extern struct audit_entry *audit_dupe_rule(struct audit_krule *old);
+extern void audit_log_d_path_exe(struct audit_buffer *ab,
+ struct mm_struct *mm);
+
/* audit watch functions */
#ifdef CONFIG_AUDIT_WATCH
extern void audit_put_watch(struct audit_watch *watch);
diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c
index f41722506808..94ecdabda8e6 100644
--- a/kernel/audit_tree.c
+++ b/kernel/audit_tree.c
@@ -37,6 +37,7 @@ struct audit_chunk {
static LIST_HEAD(tree_list);
static LIST_HEAD(prune_list);
+static struct task_struct *prune_thread;
/*
* One struct chunk is attached to each inode of interest.
@@ -578,7 +579,7 @@ int audit_remove_tree_rule(struct audit_krule *rule)
static int compare_root(struct vfsmount *mnt, void *arg)
{
- return mnt->mnt_root->d_inode == arg;
+ return d_backing_inode(mnt->mnt_root) == arg;
}
void audit_trim_trees(void)
@@ -650,7 +651,58 @@ void audit_put_tree(struct audit_tree *tree)
static int tag_mount(struct vfsmount *mnt, void *arg)
{
- return tag_chunk(mnt->mnt_root->d_inode, arg);
+ return tag_chunk(d_backing_inode(mnt->mnt_root), arg);
+}
+
+/*
+ * That gets run when evict_chunk() ends up needing to kill audit_tree.
+ * Runs from a separate thread.
+ */
+static int prune_tree_thread(void *unused)
+{
+ for (;;) {
+ set_current_state(TASK_INTERRUPTIBLE);
+ if (list_empty(&prune_list))
+ schedule();
+ __set_current_state(TASK_RUNNING);
+
+ mutex_lock(&audit_cmd_mutex);
+ mutex_lock(&audit_filter_mutex);
+
+ while (!list_empty(&prune_list)) {
+ struct audit_tree *victim;
+
+ victim = list_entry(prune_list.next,
+ struct audit_tree, list);
+ list_del_init(&victim->list);
+
+ mutex_unlock(&audit_filter_mutex);
+
+ prune_one(victim);
+
+ mutex_lock(&audit_filter_mutex);
+ }
+
+ mutex_unlock(&audit_filter_mutex);
+ mutex_unlock(&audit_cmd_mutex);
+ }
+ return 0;
+}
+
+static int audit_launch_prune(void)
+{
+ if (prune_thread)
+ return 0;
+ prune_thread = kthread_create(prune_tree_thread, NULL,
+ "audit_prune_tree");
+ if (IS_ERR(prune_thread)) {
+ pr_err("cannot start thread audit_prune_tree");
+ prune_thread = NULL;
+ return -ENOMEM;
+ } else {
+ wake_up_process(prune_thread);
+ return 0;
+ }
}
/* called with audit_filter_mutex */
@@ -676,6 +728,12 @@ int audit_add_tree_rule(struct audit_krule *rule)
/* do not set rule->tree yet */
mutex_unlock(&audit_filter_mutex);
+ if (unlikely(!prune_thread)) {
+ err = audit_launch_prune();
+ if (err)
+ goto Err;
+ }
+
err = kern_path(tree->pathname, 0, &path);
if (err)
goto Err;
@@ -813,36 +871,10 @@ int audit_tag_tree(char *old, char *new)
return failed;
}
-/*
- * That gets run when evict_chunk() ends up needing to kill audit_tree.
- * Runs from a separate thread.
- */
-static int prune_tree_thread(void *unused)
-{
- mutex_lock(&audit_cmd_mutex);
- mutex_lock(&audit_filter_mutex);
-
- while (!list_empty(&prune_list)) {
- struct audit_tree *victim;
-
- victim = list_entry(prune_list.next, struct audit_tree, list);
- list_del_init(&victim->list);
-
- mutex_unlock(&audit_filter_mutex);
-
- prune_one(victim);
-
- mutex_lock(&audit_filter_mutex);
- }
-
- mutex_unlock(&audit_filter_mutex);
- mutex_unlock(&audit_cmd_mutex);
- return 0;
-}
static void audit_schedule_prune(void)
{
- kthread_run(prune_tree_thread, NULL, "audit_prune_tree");
+ wake_up_process(prune_thread);
}
/*
@@ -909,9 +941,9 @@ static void evict_chunk(struct audit_chunk *chunk)
for (n = 0; n < chunk->count; n++)
list_del_init(&chunk->owners[n].list);
spin_unlock(&hash_lock);
+ mutex_unlock(&audit_filter_mutex);
if (need_prune)
audit_schedule_prune();
- mutex_unlock(&audit_filter_mutex);
}
static int audit_tree_handle_event(struct fsnotify_group *group,
diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c
index 359035caac88..656c7e93ac0d 100644
--- a/kernel/audit_watch.c
+++ b/kernel/audit_watch.c
@@ -146,7 +146,7 @@ int audit_watch_compare(struct audit_watch *watch, unsigned long ino, dev_t dev)
/* Initialize a parent watch entry. */
static struct audit_parent *audit_init_parent(struct path *path)
{
- struct inode *inode = path->dentry->d_inode;
+ struct inode *inode = d_backing_inode(path->dentry);
struct audit_parent *parent;
int ret;
@@ -364,11 +364,11 @@ static int audit_get_nd(struct audit_watch *watch, struct path *parent)
struct dentry *d = kern_path_locked(watch->path, parent);
if (IS_ERR(d))
return PTR_ERR(d);
- mutex_unlock(&parent->dentry->d_inode->i_mutex);
- if (d->d_inode) {
+ mutex_unlock(&d_backing_inode(parent->dentry)->i_mutex);
+ if (d_is_positive(d)) {
/* update watch filter fields */
- watch->dev = d->d_inode->i_sb->s_dev;
- watch->ino = d->d_inode->i_ino;
+ watch->dev = d_backing_inode(d)->i_sb->s_dev;
+ watch->ino = d_backing_inode(d)->i_ino;
}
dput(d);
return 0;
@@ -430,7 +430,7 @@ int audit_add_watch(struct audit_krule *krule, struct list_head **list)
return ret;
/* either find an old parent or attach a new one */
- parent = audit_find_parent(parent_path.dentry->d_inode);
+ parent = audit_find_parent(d_backing_inode(parent_path.dentry));
if (!parent) {
parent = audit_init_parent(&parent_path);
if (IS_ERR(parent)) {
@@ -483,7 +483,7 @@ static int audit_watch_handle_event(struct fsnotify_group *group,
switch (data_type) {
case (FSNOTIFY_EVENT_PATH):
- inode = ((struct path *)data)->dentry->d_inode;
+ inode = d_backing_inode(((struct path *)data)->dentry);
break;
case (FSNOTIFY_EVENT_INODE):
inode = (struct inode *)data;
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 9b56b7ae053f..b86cc04959de 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -1024,8 +1024,7 @@ static int audit_log_single_execve_arg(struct audit_context *context,
* for strings that are too long, we should not have created
* any.
*/
- if (unlikely((len == 0) || len > MAX_ARG_STRLEN - 1)) {
- WARN_ON(1);
+ if (WARN_ON_ONCE(len < 0 || len > MAX_ARG_STRLEN - 1)) {
send_sig(SIGKILL, current, 0);
return -1;
}
@@ -1630,7 +1629,7 @@ retry:
rcu_read_lock();
seq = read_seqbegin(&rename_lock);
for(;;) {
- struct inode *inode = d->d_inode;
+ struct inode *inode = d_backing_inode(d);
if (inode && unlikely(!hlist_empty(&inode->i_fsnotify_marks))) {
struct audit_chunk *chunk;
chunk = audit_tree_lookup(inode);
@@ -1755,7 +1754,7 @@ void __audit_inode(struct filename *name, const struct dentry *dentry,
unsigned int flags)
{
struct audit_context *context = current->audit_context;
- const struct inode *inode = dentry->d_inode;
+ const struct inode *inode = d_backing_inode(dentry);
struct audit_names *n;
bool parent = flags & AUDIT_INODE_PARENT;
@@ -1854,7 +1853,7 @@ void __audit_inode_child(const struct inode *parent,
const unsigned char type)
{
struct audit_context *context = current->audit_context;
- const struct inode *inode = dentry->d_inode;
+ const struct inode *inode = d_backing_inode(dentry);
const char *dname = dentry->d_name.name;
struct audit_names *n, *found_parent = NULL, *found_child = NULL;
@@ -2362,7 +2361,6 @@ static void audit_log_task(struct audit_buffer *ab)
kuid_t auid, uid;
kgid_t gid;
unsigned int sessionid;
- struct mm_struct *mm = current->mm;
char comm[sizeof(current->comm)];
auid = audit_get_loginuid(current);
@@ -2377,13 +2375,7 @@ static void audit_log_task(struct audit_buffer *ab)
audit_log_task_context(ab);
audit_log_format(ab, " pid=%d comm=", task_pid_nr(current));
audit_log_untrustedstring(ab, get_task_comm(comm, current));
- if (mm) {
- down_read(&mm->mmap_sem);
- if (mm->exe_file)
- audit_log_d_path(ab, " exe=", &mm->exe_file->f_path);
- up_read(&mm->mmap_sem);
- } else
- audit_log_format(ab, " exe=(null)");
+ audit_log_d_path_exe(ab, current->mm);
}
/**
diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile
index a5ae60f0b0a2..e6983be12bd3 100644
--- a/kernel/bpf/Makefile
+++ b/kernel/bpf/Makefile
@@ -1,5 +1,2 @@
obj-y := core.o
obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o hashtab.o arraymap.o helpers.o
-ifdef CONFIG_TEST_BPF
-obj-$(CONFIG_BPF_SYSCALL) += test_stub.o
-endif
diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
index 9eb4d8a7cd87..29ace107f236 100644
--- a/kernel/bpf/arraymap.c
+++ b/kernel/bpf/arraymap.c
@@ -14,12 +14,7 @@
#include <linux/vmalloc.h>
#include <linux/slab.h>
#include <linux/mm.h>
-
-struct bpf_array {
- struct bpf_map map;
- u32 elem_size;
- char value[0] __aligned(8);
-};
+#include <linux/filter.h>
/* Called from syscall */
static struct bpf_map *array_map_alloc(union bpf_attr *attr)
@@ -134,7 +129,7 @@ static void array_map_free(struct bpf_map *map)
kvfree(array);
}
-static struct bpf_map_ops array_ops = {
+static const struct bpf_map_ops array_ops = {
.map_alloc = array_map_alloc,
.map_free = array_map_free,
.map_get_next_key = array_map_get_next_key,
@@ -143,14 +138,195 @@ static struct bpf_map_ops array_ops = {
.map_delete_elem = array_map_delete_elem,
};
-static struct bpf_map_type_list tl = {
+static struct bpf_map_type_list array_type __read_mostly = {
.ops = &array_ops,
.type = BPF_MAP_TYPE_ARRAY,
};
static int __init register_array_map(void)
{
- bpf_register_map_type(&tl);
+ bpf_register_map_type(&array_type);
return 0;
}
late_initcall(register_array_map);
+
+static struct bpf_map *fd_array_map_alloc(union bpf_attr *attr)
+{
+ /* only file descriptors can be stored in this type of map */
+ if (attr->value_size != sizeof(u32))
+ return ERR_PTR(-EINVAL);
+ return array_map_alloc(attr);
+}
+
+static void fd_array_map_free(struct bpf_map *map)
+{
+ struct bpf_array *array = container_of(map, struct bpf_array, map);
+ int i;
+
+ synchronize_rcu();
+
+ /* make sure it's empty */
+ for (i = 0; i < array->map.max_entries; i++)
+ BUG_ON(array->ptrs[i] != NULL);
+ kvfree(array);
+}
+
+static void *fd_array_map_lookup_elem(struct bpf_map *map, void *key)
+{
+ return NULL;
+}
+
+/* only called from syscall */
+static int fd_array_map_update_elem(struct bpf_map *map, void *key,
+ void *value, u64 map_flags)
+{
+ struct bpf_array *array = container_of(map, struct bpf_array, map);
+ void *new_ptr, *old_ptr;
+ u32 index = *(u32 *)key, ufd;
+
+ if (map_flags != BPF_ANY)
+ return -EINVAL;
+
+ if (index >= array->map.max_entries)
+ return -E2BIG;
+
+ ufd = *(u32 *)value;
+ new_ptr = map->ops->map_fd_get_ptr(map, ufd);
+ if (IS_ERR(new_ptr))
+ return PTR_ERR(new_ptr);
+
+ old_ptr = xchg(array->ptrs + index, new_ptr);
+ if (old_ptr)
+ map->ops->map_fd_put_ptr(old_ptr);
+
+ return 0;
+}
+
+static int fd_array_map_delete_elem(struct bpf_map *map, void *key)
+{
+ struct bpf_array *array = container_of(map, struct bpf_array, map);
+ void *old_ptr;
+ u32 index = *(u32 *)key;
+
+ if (index >= array->map.max_entries)
+ return -E2BIG;
+
+ old_ptr = xchg(array->ptrs + index, NULL);
+ if (old_ptr) {
+ map->ops->map_fd_put_ptr(old_ptr);
+ return 0;
+ } else {
+ return -ENOENT;
+ }
+}
+
+static void *prog_fd_array_get_ptr(struct bpf_map *map, int fd)
+{
+ struct bpf_array *array = container_of(map, struct bpf_array, map);
+ struct bpf_prog *prog = bpf_prog_get(fd);
+ if (IS_ERR(prog))
+ return prog;
+
+ if (!bpf_prog_array_compatible(array, prog)) {
+ bpf_prog_put(prog);
+ return ERR_PTR(-EINVAL);
+ }
+ return prog;
+}
+
+static void prog_fd_array_put_ptr(void *ptr)
+{
+ struct bpf_prog *prog = ptr;
+
+ bpf_prog_put_rcu(prog);
+}
+
+/* decrement refcnt of all bpf_progs that are stored in this map */
+void bpf_fd_array_map_clear(struct bpf_map *map)
+{
+ struct bpf_array *array = container_of(map, struct bpf_array, map);
+ int i;
+
+ for (i = 0; i < array->map.max_entries; i++)
+ fd_array_map_delete_elem(map, &i);
+}
+
+static const struct bpf_map_ops prog_array_ops = {
+ .map_alloc = fd_array_map_alloc,
+ .map_free = fd_array_map_free,
+ .map_get_next_key = array_map_get_next_key,
+ .map_lookup_elem = fd_array_map_lookup_elem,
+ .map_update_elem = fd_array_map_update_elem,
+ .map_delete_elem = fd_array_map_delete_elem,
+ .map_fd_get_ptr = prog_fd_array_get_ptr,
+ .map_fd_put_ptr = prog_fd_array_put_ptr,
+};
+
+static struct bpf_map_type_list prog_array_type __read_mostly = {
+ .ops = &prog_array_ops,
+ .type = BPF_MAP_TYPE_PROG_ARRAY,
+};
+
+static int __init register_prog_array_map(void)
+{
+ bpf_register_map_type(&prog_array_type);
+ return 0;
+}
+late_initcall(register_prog_array_map);
+
+static void perf_event_array_map_free(struct bpf_map *map)
+{
+ bpf_fd_array_map_clear(map);
+ fd_array_map_free(map);
+}
+
+static void *perf_event_fd_array_get_ptr(struct bpf_map *map, int fd)
+{
+ struct perf_event *event;
+ const struct perf_event_attr *attr;
+
+ event = perf_event_get(fd);
+ if (IS_ERR(event))
+ return event;
+
+ attr = perf_event_attrs(event);
+ if (IS_ERR(attr))
+ return (void *)attr;
+
+ if (attr->type != PERF_TYPE_RAW &&
+ attr->type != PERF_TYPE_HARDWARE) {
+ perf_event_release_kernel(event);
+ return ERR_PTR(-EINVAL);
+ }
+ return event;
+}
+
+static void perf_event_fd_array_put_ptr(void *ptr)
+{
+ struct perf_event *event = ptr;
+
+ perf_event_release_kernel(event);
+}
+
+static const struct bpf_map_ops perf_event_array_ops = {
+ .map_alloc = fd_array_map_alloc,
+ .map_free = perf_event_array_map_free,
+ .map_get_next_key = array_map_get_next_key,
+ .map_lookup_elem = fd_array_map_lookup_elem,
+ .map_update_elem = fd_array_map_update_elem,
+ .map_delete_elem = fd_array_map_delete_elem,
+ .map_fd_get_ptr = perf_event_fd_array_get_ptr,
+ .map_fd_put_ptr = perf_event_fd_array_put_ptr,
+};
+
+static struct bpf_map_type_list perf_event_array_type __read_mostly = {
+ .ops = &perf_event_array_ops,
+ .type = BPF_MAP_TYPE_PERF_EVENT_ARRAY,
+};
+
+static int __init register_perf_event_array_map(void)
+{
+ bpf_register_map_type(&perf_event_array_type);
+ return 0;
+}
+late_initcall(register_perf_event_array_map);
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index a64e7a207d2b..67c380cfa9ca 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -26,9 +26,10 @@
#include <linux/vmalloc.h>
#include <linux/random.h>
#include <linux/moduleloader.h>
-#include <asm/unaligned.h>
#include <linux/bpf.h>
+#include <asm/unaligned.h>
+
/* Registers */
#define BPF_R0 regs[BPF_REG_0]
#define BPF_R1 regs[BPF_REG_1]
@@ -62,6 +63,7 @@ void *bpf_internal_load_pointer_neg_helper(const struct sk_buff *skb, int k, uns
ptr = skb_network_header(skb) + k - SKF_NET_OFF;
else if (k >= SKF_LL_OFF)
ptr = skb_mac_header(skb) + k - SKF_LL_OFF;
+
if (ptr >= skb->head && ptr + size <= skb_tail_pointer(skb))
return ptr;
@@ -175,6 +177,7 @@ noinline u64 __bpf_call_base(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
{
return 0;
}
+EXPORT_SYMBOL_GPL(__bpf_call_base);
/**
* __bpf_prog_run - run eBPF program on a given context
@@ -244,6 +247,7 @@ static unsigned int __bpf_prog_run(void *ctx, const struct bpf_insn *insn)
[BPF_ALU64 | BPF_NEG] = &&ALU64_NEG,
/* Call instruction */
[BPF_JMP | BPF_CALL] = &&JMP_CALL,
+ [BPF_JMP | BPF_CALL | BPF_X] = &&JMP_TAIL_CALL,
/* Jumps */
[BPF_JMP | BPF_JA] = &&JMP_JA,
[BPF_JMP | BPF_JEQ | BPF_X] = &&JMP_JEQ_X,
@@ -286,6 +290,7 @@ static unsigned int __bpf_prog_run(void *ctx, const struct bpf_insn *insn)
[BPF_LD | BPF_IND | BPF_B] = &&LD_IND_B,
[BPF_LD | BPF_IMM | BPF_DW] = &&LD_IMM_DW,
};
+ u32 tail_call_cnt = 0;
void *ptr;
int off;
@@ -357,8 +362,8 @@ select_insn:
ALU64_MOD_X:
if (unlikely(SRC == 0))
return 0;
- tmp = DST;
- DST = do_div(tmp, SRC);
+ div64_u64_rem(DST, SRC, &tmp);
+ DST = tmp;
CONT;
ALU_MOD_X:
if (unlikely(SRC == 0))
@@ -367,8 +372,8 @@ select_insn:
DST = do_div(tmp, (u32) SRC);
CONT;
ALU64_MOD_K:
- tmp = DST;
- DST = do_div(tmp, IMM);
+ div64_u64_rem(DST, IMM, &tmp);
+ DST = tmp;
CONT;
ALU_MOD_K:
tmp = (u32) DST;
@@ -377,7 +382,7 @@ select_insn:
ALU64_DIV_X:
if (unlikely(SRC == 0))
return 0;
- do_div(DST, SRC);
+ DST = div64_u64(DST, SRC);
CONT;
ALU_DIV_X:
if (unlikely(SRC == 0))
@@ -387,7 +392,7 @@ select_insn:
DST = (u32) tmp;
CONT;
ALU64_DIV_K:
- do_div(DST, IMM);
+ DST = div64_u64(DST, IMM);
CONT;
ALU_DIV_K:
tmp = (u32) DST;
@@ -431,6 +436,34 @@ select_insn:
BPF_R4, BPF_R5);
CONT;
+ JMP_TAIL_CALL: {
+ struct bpf_map *map = (struct bpf_map *) (unsigned long) BPF_R2;
+ struct bpf_array *array = container_of(map, struct bpf_array, map);
+ struct bpf_prog *prog;
+ u64 index = BPF_R3;
+
+ if (unlikely(index >= array->map.max_entries))
+ goto out;
+
+ if (unlikely(tail_call_cnt > MAX_TAIL_CALL_CNT))
+ goto out;
+
+ tail_call_cnt++;
+
+ prog = READ_ONCE(array->ptrs[index]);
+ if (unlikely(!prog))
+ goto out;
+
+ /* ARG1 at this point is guaranteed to point to CTX from
+ * the verifier side due to the fact that the tail call is
+ * handeled like a helper, that is, bpf_tail_call_proto,
+ * where arg1_type is ARG_PTR_TO_CTX.
+ */
+ insn = prog->insnsi;
+ goto select_insn;
+out:
+ CONT;
+ }
/* JMP */
JMP_JA:
insn += insn->off;
@@ -615,25 +648,63 @@ load_byte:
return 0;
}
-void __weak bpf_int_jit_compile(struct bpf_prog *prog)
+bool bpf_prog_array_compatible(struct bpf_array *array,
+ const struct bpf_prog *fp)
{
+ if (!array->owner_prog_type) {
+ /* There's no owner yet where we could check for
+ * compatibility.
+ */
+ array->owner_prog_type = fp->type;
+ array->owner_jited = fp->jited;
+
+ return true;
+ }
+
+ return array->owner_prog_type == fp->type &&
+ array->owner_jited == fp->jited;
+}
+
+static int bpf_check_tail_call(const struct bpf_prog *fp)
+{
+ struct bpf_prog_aux *aux = fp->aux;
+ int i;
+
+ for (i = 0; i < aux->used_map_cnt; i++) {
+ struct bpf_map *map = aux->used_maps[i];
+ struct bpf_array *array;
+
+ if (map->map_type != BPF_MAP_TYPE_PROG_ARRAY)
+ continue;
+
+ array = container_of(map, struct bpf_array, map);
+ if (!bpf_prog_array_compatible(array, fp))
+ return -EINVAL;
+ }
+
+ return 0;
}
/**
- * bpf_prog_select_runtime - select execution runtime for BPF program
+ * bpf_prog_select_runtime - select exec runtime for BPF program
* @fp: bpf_prog populated with internal BPF program
*
- * try to JIT internal BPF program, if JIT is not available select interpreter
- * BPF program will be executed via BPF_PROG_RUN() macro
+ * Try to JIT eBPF program, if JIT is not available, use interpreter.
+ * The BPF program will be executed via BPF_PROG_RUN() macro.
*/
-void bpf_prog_select_runtime(struct bpf_prog *fp)
+int bpf_prog_select_runtime(struct bpf_prog *fp)
{
fp->bpf_func = (void *) __bpf_prog_run;
- /* Probe if internal BPF can be JITed */
bpf_int_jit_compile(fp);
- /* Lock whole bpf_prog as read-only */
bpf_prog_lock_ro(fp);
+
+ /* The tail call compatibility check can only be done at
+ * this late stage as we need to determine, if we deal
+ * with JITed or non JITed program concatenations and not
+ * all eBPF JITs might immediately support all features.
+ */
+ return bpf_check_tail_call(fp);
}
EXPORT_SYMBOL_GPL(bpf_prog_select_runtime);
@@ -656,6 +727,37 @@ void bpf_prog_free(struct bpf_prog *fp)
}
EXPORT_SYMBOL_GPL(bpf_prog_free);
+/* Weak definitions of helper functions in case we don't have bpf syscall. */
+const struct bpf_func_proto bpf_map_lookup_elem_proto __weak;
+const struct bpf_func_proto bpf_map_update_elem_proto __weak;
+const struct bpf_func_proto bpf_map_delete_elem_proto __weak;
+
+const struct bpf_func_proto bpf_get_prandom_u32_proto __weak;
+const struct bpf_func_proto bpf_get_smp_processor_id_proto __weak;
+const struct bpf_func_proto bpf_ktime_get_ns_proto __weak;
+const struct bpf_func_proto bpf_get_current_pid_tgid_proto __weak;
+const struct bpf_func_proto bpf_get_current_uid_gid_proto __weak;
+const struct bpf_func_proto bpf_get_current_comm_proto __weak;
+const struct bpf_func_proto * __weak bpf_get_trace_printk_proto(void)
+{
+ return NULL;
+}
+
+/* Always built-in helper functions. */
+const struct bpf_func_proto bpf_tail_call_proto = {
+ .func = NULL,
+ .gpl_only = false,
+ .ret_type = RET_VOID,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_CONST_MAP_PTR,
+ .arg3_type = ARG_ANYTHING,
+};
+
+/* For classic BPF JITs that don't implement bpf_int_jit_compile(). */
+void __weak bpf_int_jit_compile(struct bpf_prog *prog)
+{
+}
+
/* To execute LD_ABS/LD_IND instructions __bpf_prog_run() may call
* skb_copy_bits(), so provide a weak definition of it for NET-less config.
*/
diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
index b3ba43674310..83c209d9b17a 100644
--- a/kernel/bpf/hashtab.c
+++ b/kernel/bpf/hashtab.c
@@ -345,7 +345,7 @@ static void htab_map_free(struct bpf_map *map)
kfree(htab);
}
-static struct bpf_map_ops htab_ops = {
+static const struct bpf_map_ops htab_ops = {
.map_alloc = htab_map_alloc,
.map_free = htab_map_free,
.map_get_next_key = htab_map_get_next_key,
@@ -354,14 +354,14 @@ static struct bpf_map_ops htab_ops = {
.map_delete_elem = htab_map_delete_elem,
};
-static struct bpf_map_type_list tl = {
+static struct bpf_map_type_list htab_type __read_mostly = {
.ops = &htab_ops,
.type = BPF_MAP_TYPE_HASH,
};
static int __init register_htab_map(void)
{
- bpf_register_map_type(&tl);
+ bpf_register_map_type(&htab_type);
return 0;
}
late_initcall(register_htab_map);
diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index 9e3414d85459..1447ec09421e 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -11,6 +11,11 @@
*/
#include <linux/bpf.h>
#include <linux/rcupdate.h>
+#include <linux/random.h>
+#include <linux/smp.h>
+#include <linux/ktime.h>
+#include <linux/sched.h>
+#include <linux/uidgid.h>
/* If kernel subsystem is allowing eBPF programs to call this function,
* inside its own verifier_ops->get_func_proto() callback it should return
@@ -41,12 +46,12 @@ static u64 bpf_map_lookup_elem(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
return (unsigned long) value;
}
-struct bpf_func_proto bpf_map_lookup_elem_proto = {
- .func = bpf_map_lookup_elem,
- .gpl_only = false,
- .ret_type = RET_PTR_TO_MAP_VALUE_OR_NULL,
- .arg1_type = ARG_CONST_MAP_PTR,
- .arg2_type = ARG_PTR_TO_MAP_KEY,
+const struct bpf_func_proto bpf_map_lookup_elem_proto = {
+ .func = bpf_map_lookup_elem,
+ .gpl_only = false,
+ .ret_type = RET_PTR_TO_MAP_VALUE_OR_NULL,
+ .arg1_type = ARG_CONST_MAP_PTR,
+ .arg2_type = ARG_PTR_TO_MAP_KEY,
};
static u64 bpf_map_update_elem(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
@@ -60,14 +65,14 @@ static u64 bpf_map_update_elem(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
return map->ops->map_update_elem(map, key, value, r4);
}
-struct bpf_func_proto bpf_map_update_elem_proto = {
- .func = bpf_map_update_elem,
- .gpl_only = false,
- .ret_type = RET_INTEGER,
- .arg1_type = ARG_CONST_MAP_PTR,
- .arg2_type = ARG_PTR_TO_MAP_KEY,
- .arg3_type = ARG_PTR_TO_MAP_VALUE,
- .arg4_type = ARG_ANYTHING,
+const struct bpf_func_proto bpf_map_update_elem_proto = {
+ .func = bpf_map_update_elem,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_CONST_MAP_PTR,
+ .arg2_type = ARG_PTR_TO_MAP_KEY,
+ .arg3_type = ARG_PTR_TO_MAP_VALUE,
+ .arg4_type = ARG_ANYTHING,
};
static u64 bpf_map_delete_elem(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
@@ -80,10 +85,100 @@ static u64 bpf_map_delete_elem(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
return map->ops->map_delete_elem(map, key);
}
-struct bpf_func_proto bpf_map_delete_elem_proto = {
- .func = bpf_map_delete_elem,
- .gpl_only = false,
- .ret_type = RET_INTEGER,
- .arg1_type = ARG_CONST_MAP_PTR,
- .arg2_type = ARG_PTR_TO_MAP_KEY,
+const struct bpf_func_proto bpf_map_delete_elem_proto = {
+ .func = bpf_map_delete_elem,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_CONST_MAP_PTR,
+ .arg2_type = ARG_PTR_TO_MAP_KEY,
+};
+
+static u64 bpf_get_prandom_u32(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
+{
+ return prandom_u32();
+}
+
+const struct bpf_func_proto bpf_get_prandom_u32_proto = {
+ .func = bpf_get_prandom_u32,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+};
+
+static u64 bpf_get_smp_processor_id(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
+{
+ return raw_smp_processor_id();
+}
+
+const struct bpf_func_proto bpf_get_smp_processor_id_proto = {
+ .func = bpf_get_smp_processor_id,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+};
+
+static u64 bpf_ktime_get_ns(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
+{
+ /* NMI safe access to clock monotonic */
+ return ktime_get_mono_fast_ns();
+}
+
+const struct bpf_func_proto bpf_ktime_get_ns_proto = {
+ .func = bpf_ktime_get_ns,
+ .gpl_only = true,
+ .ret_type = RET_INTEGER,
+};
+
+static u64 bpf_get_current_pid_tgid(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
+{
+ struct task_struct *task = current;
+
+ if (!task)
+ return -EINVAL;
+
+ return (u64) task->tgid << 32 | task->pid;
+}
+
+const struct bpf_func_proto bpf_get_current_pid_tgid_proto = {
+ .func = bpf_get_current_pid_tgid,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+};
+
+static u64 bpf_get_current_uid_gid(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
+{
+ struct task_struct *task = current;
+ kuid_t uid;
+ kgid_t gid;
+
+ if (!task)
+ return -EINVAL;
+
+ current_uid_gid(&uid, &gid);
+ return (u64) from_kgid(&init_user_ns, gid) << 32 |
+ from_kuid(&init_user_ns, uid);
+}
+
+const struct bpf_func_proto bpf_get_current_uid_gid_proto = {
+ .func = bpf_get_current_uid_gid,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+};
+
+static u64 bpf_get_current_comm(u64 r1, u64 size, u64 r3, u64 r4, u64 r5)
+{
+ struct task_struct *task = current;
+ char *buf = (char *) (long) r1;
+
+ if (!task)
+ return -EINVAL;
+
+ memcpy(buf, task->comm, min_t(size_t, size, sizeof(task->comm)));
+ return 0;
+}
+
+const struct bpf_func_proto bpf_get_current_comm_proto = {
+ .func = bpf_get_current_comm,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_STACK,
+ .arg2_type = ARG_CONST_STACK_SIZE,
};
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 536edc2be307..dc9b464fefa9 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -16,6 +16,7 @@
#include <linux/file.h>
#include <linux/license.h>
#include <linux/filter.h>
+#include <linux/version.h>
static LIST_HEAD(bpf_map_types);
@@ -67,6 +68,12 @@ static int bpf_map_release(struct inode *inode, struct file *filp)
{
struct bpf_map *map = filp->private_data;
+ if (map->map_type == BPF_MAP_TYPE_PROG_ARRAY)
+ /* prog_array stores refcnt-ed bpf_prog pointers
+ * release them all when user space closes prog_array_fd
+ */
+ bpf_fd_array_map_clear(map);
+
bpf_map_put(map);
return 0;
}
@@ -354,10 +361,11 @@ static int find_prog_type(enum bpf_prog_type type, struct bpf_prog *prog)
list_for_each_entry(tl, &bpf_prog_types, list_node) {
if (tl->type == type) {
prog->aux->ops = tl->ops;
- prog->aux->prog_type = type;
+ prog->type = type;
return 0;
}
}
+
return -EINVAL;
}
@@ -390,6 +398,19 @@ static void fixup_bpf_calls(struct bpf_prog *prog)
*/
BUG_ON(!prog->aux->ops->get_func_proto);
+ if (insn->imm == BPF_FUNC_tail_call) {
+ /* mark bpf_tail_call as different opcode
+ * to avoid conditional branch in
+ * interpeter for every normal call
+ * and to prevent accidental JITing by
+ * JIT compiler that doesn't support
+ * bpf_tail_call yet
+ */
+ insn->imm = 0;
+ insn->code |= BPF_X;
+ continue;
+ }
+
fn = prog->aux->ops->get_func_proto(insn->imm);
/* all functions that have prototype and verifier allowed
* programs to call them, must be real in-kernel functions
@@ -411,6 +432,23 @@ static void free_used_maps(struct bpf_prog_aux *aux)
kfree(aux->used_maps);
}
+static void __prog_put_rcu(struct rcu_head *rcu)
+{
+ struct bpf_prog_aux *aux = container_of(rcu, struct bpf_prog_aux, rcu);
+
+ free_used_maps(aux);
+ bpf_prog_free(aux->prog);
+}
+
+/* version of bpf_prog_put() that is called after a grace period */
+void bpf_prog_put_rcu(struct bpf_prog *prog)
+{
+ if (atomic_dec_and_test(&prog->aux->refcnt)) {
+ prog->aux->prog = prog;
+ call_rcu(&prog->aux->rcu, __prog_put_rcu);
+ }
+}
+
void bpf_prog_put(struct bpf_prog *prog)
{
if (atomic_dec_and_test(&prog->aux->refcnt)) {
@@ -418,12 +456,13 @@ void bpf_prog_put(struct bpf_prog *prog)
bpf_prog_free(prog);
}
}
+EXPORT_SYMBOL_GPL(bpf_prog_put);
static int bpf_prog_release(struct inode *inode, struct file *filp)
{
struct bpf_prog *prog = filp->private_data;
- bpf_prog_put(prog);
+ bpf_prog_put_rcu(prog);
return 0;
}
@@ -465,9 +504,10 @@ struct bpf_prog *bpf_prog_get(u32 ufd)
fdput(f);
return prog;
}
+EXPORT_SYMBOL_GPL(bpf_prog_get);
/* last field in 'union bpf_attr' used by this command */
-#define BPF_PROG_LOAD_LAST_FIELD log_buf
+#define BPF_PROG_LOAD_LAST_FIELD kern_version
static int bpf_prog_load(union bpf_attr *attr)
{
@@ -492,6 +532,10 @@ static int bpf_prog_load(union bpf_attr *attr)
if (attr->insn_cnt >= BPF_MAXINSNS)
return -EINVAL;
+ if (type == BPF_PROG_TYPE_KPROBE &&
+ attr->kern_version != LINUX_VERSION_CODE)
+ return -EINVAL;
+
/* plain bpf_prog allocation */
prog = bpf_prog_alloc(bpf_prog_size(attr->insn_cnt), GFP_USER);
if (!prog)
@@ -508,7 +552,7 @@ static int bpf_prog_load(union bpf_attr *attr)
prog->jited = false;
atomic_set(&prog->aux->refcnt, 1);
- prog->aux->is_gpl_compatible = is_gpl;
+ prog->gpl_compatible = is_gpl;
/* find program type: socket_filter vs tracing_filter */
err = find_prog_type(type, prog);
@@ -516,8 +560,7 @@ static int bpf_prog_load(union bpf_attr *attr)
goto free_prog;
/* run eBPF verifier */
- err = bpf_check(prog, attr);
-
+ err = bpf_check(&prog, attr);
if (err < 0)
goto free_used_maps;
@@ -525,10 +568,11 @@ static int bpf_prog_load(union bpf_attr *attr)
fixup_bpf_calls(prog);
/* eBPF program is ready to be JITed */
- bpf_prog_select_runtime(prog);
+ err = bpf_prog_select_runtime(prog);
+ if (err < 0)
+ goto free_used_maps;
err = anon_inode_getfd("bpf-prog", &bpf_prog_fops, prog, O_RDWR | O_CLOEXEC);
-
if (err < 0)
/* failed to allocate fd */
goto free_used_maps;
diff --git a/kernel/bpf/test_stub.c b/kernel/bpf/test_stub.c
deleted file mode 100644
index 0ceae1e6e8b5..000000000000
--- a/kernel/bpf/test_stub.c
+++ /dev/null
@@ -1,78 +0,0 @@
-/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of version 2 of the GNU General Public
- * License as published by the Free Software Foundation.
- */
-#include <linux/kernel.h>
-#include <linux/types.h>
-#include <linux/slab.h>
-#include <linux/err.h>
-#include <linux/bpf.h>
-
-/* test stubs for BPF_MAP_TYPE_UNSPEC and for BPF_PROG_TYPE_UNSPEC
- * to be used by user space verifier testsuite
- */
-struct bpf_context {
- u64 arg1;
- u64 arg2;
-};
-
-static const struct bpf_func_proto *test_func_proto(enum bpf_func_id func_id)
-{
- switch (func_id) {
- case BPF_FUNC_map_lookup_elem:
- return &bpf_map_lookup_elem_proto;
- case BPF_FUNC_map_update_elem:
- return &bpf_map_update_elem_proto;
- case BPF_FUNC_map_delete_elem:
- return &bpf_map_delete_elem_proto;
- default:
- return NULL;
- }
-}
-
-static const struct bpf_context_access {
- int size;
- enum bpf_access_type type;
-} test_ctx_access[] = {
- [offsetof(struct bpf_context, arg1)] = {
- FIELD_SIZEOF(struct bpf_context, arg1),
- BPF_READ
- },
- [offsetof(struct bpf_context, arg2)] = {
- FIELD_SIZEOF(struct bpf_context, arg2),
- BPF_READ
- },
-};
-
-static bool test_is_valid_access(int off, int size, enum bpf_access_type type)
-{
- const struct bpf_context_access *access;
-
- if (off < 0 || off >= ARRAY_SIZE(test_ctx_access))
- return false;
-
- access = &test_ctx_access[off];
- if (access->size == size && (access->type & type))
- return true;
-
- return false;
-}
-
-static struct bpf_verifier_ops test_ops = {
- .get_func_proto = test_func_proto,
- .is_valid_access = test_is_valid_access,
-};
-
-static struct bpf_prog_type_list tl_prog = {
- .ops = &test_ops,
- .type = BPF_PROG_TYPE_UNSPEC,
-};
-
-static int __init register_test_ops(void)
-{
- bpf_register_prog_type(&tl_prog);
- return 0;
-}
-late_initcall(register_test_ops);
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index a28e09c7825d..ed12e385fb75 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -238,6 +238,14 @@ static const char * const reg_type_str[] = {
[CONST_IMM] = "imm",
};
+static const struct {
+ int map_type;
+ int func_id;
+} func_limit[] = {
+ {BPF_MAP_TYPE_PROG_ARRAY, BPF_FUNC_tail_call},
+ {BPF_MAP_TYPE_PERF_EVENT_ARRAY, BPF_FUNC_perf_event_read},
+};
+
static void print_verifier_state(struct verifier_env *env)
{
enum bpf_reg_type t;
@@ -648,6 +656,9 @@ static int check_mem_access(struct verifier_env *env, u32 regno, int off,
struct verifier_state *state = &env->cur_state;
int size, err = 0;
+ if (state->regs[regno].type == PTR_TO_STACK)
+ off += state->regs[regno].imm;
+
size = bpf_size_to_bytes(bpf_size);
if (size < 0)
return size;
@@ -667,7 +678,8 @@ static int check_mem_access(struct verifier_env *env, u32 regno, int off,
if (!err && t == BPF_READ && value_regno >= 0)
mark_reg_unknown_value(state->regs, value_regno);
- } else if (state->regs[regno].type == FRAME_PTR) {
+ } else if (state->regs[regno].type == FRAME_PTR ||
+ state->regs[regno].type == PTR_TO_STACK) {
if (off >= 0 || off < -MAX_BPF_STACK) {
verbose("invalid stack off=%d size=%d\n", off, size);
return -EACCES;
@@ -755,7 +767,7 @@ static int check_func_arg(struct verifier_env *env, u32 regno,
enum bpf_reg_type expected_type;
int err = 0;
- if (arg_type == ARG_ANYTHING)
+ if (arg_type == ARG_DONTCARE)
return 0;
if (reg->type == NOT_INIT) {
@@ -763,6 +775,9 @@ static int check_func_arg(struct verifier_env *env, u32 regno,
return -EACCES;
}
+ if (arg_type == ARG_ANYTHING)
+ return 0;
+
if (arg_type == ARG_PTR_TO_STACK || arg_type == ARG_PTR_TO_MAP_KEY ||
arg_type == ARG_PTR_TO_MAP_VALUE) {
expected_type = PTR_TO_STACK;
@@ -770,6 +785,8 @@ static int check_func_arg(struct verifier_env *env, u32 regno,
expected_type = CONST_IMM;
} else if (arg_type == ARG_CONST_MAP_PTR) {
expected_type = CONST_PTR_TO_MAP;
+ } else if (arg_type == ARG_PTR_TO_CTX) {
+ expected_type = PTR_TO_CTX;
} else {
verbose("unsupported arg_type %d\n", arg_type);
return -EFAULT;
@@ -828,6 +845,28 @@ static int check_func_arg(struct verifier_env *env, u32 regno,
return err;
}
+static int check_map_func_compatibility(struct bpf_map *map, int func_id)
+{
+ bool bool_map, bool_func;
+ int i;
+
+ if (!map)
+ return 0;
+
+ for (i = 0; i < ARRAY_SIZE(func_limit); i++) {
+ bool_map = (map->map_type == func_limit[i].map_type);
+ bool_func = (func_id == func_limit[i].func_id);
+ /* only when map & func pair match it can continue.
+ * don't allow any other map type to be passed into
+ * the special func;
+ */
+ if (bool_map != bool_func)
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
static int check_call(struct verifier_env *env, int func_id)
{
struct verifier_state *state = &env->cur_state;
@@ -852,7 +891,7 @@ static int check_call(struct verifier_env *env, int func_id)
}
/* eBPF programs must be GPL compatible to use GPL-ed functions */
- if (!env->prog->aux->is_gpl_compatible && fn->gpl_only) {
+ if (!env->prog->gpl_compatible && fn->gpl_only) {
verbose("cannot call GPL only function from proprietary program\n");
return -EINVAL;
}
@@ -902,6 +941,11 @@ static int check_call(struct verifier_env *env, int func_id)
fn->ret_type, func_id);
return -EINVAL;
}
+
+ err = check_map_func_compatibility(map, func_id);
+ if (err)
+ return err;
+
return 0;
}
@@ -1172,6 +1216,18 @@ static int check_ld_imm(struct verifier_env *env, struct bpf_insn *insn)
return 0;
}
+static bool may_access_skb(enum bpf_prog_type type)
+{
+ switch (type) {
+ case BPF_PROG_TYPE_SOCKET_FILTER:
+ case BPF_PROG_TYPE_SCHED_CLS:
+ case BPF_PROG_TYPE_SCHED_ACT:
+ return true;
+ default:
+ return false;
+ }
+}
+
/* verify safety of LD_ABS|LD_IND instructions:
* - they can only appear in the programs where ctx == skb
* - since they are wrappers of function calls, they scratch R1-R5 registers,
@@ -1194,8 +1250,8 @@ static int check_ld_abs(struct verifier_env *env, struct bpf_insn *insn)
struct reg_state *reg;
int i, err;
- if (env->prog->aux->prog_type != BPF_PROG_TYPE_SOCKET_FILTER) {
- verbose("BPF_LD_ABS|IND instructions are only allowed in socket filters\n");
+ if (!may_access_skb(env->prog->type)) {
+ verbose("BPF_LD_ABS|IND instructions not allowed for this program type\n");
return -EINVAL;
}
@@ -1380,7 +1436,8 @@ peek_stack:
/* tell verifier to check for equivalent states
* after every call and jump
*/
- env->explored_states[t + 1] = STATE_LIST_MARK;
+ if (t + 1 < insn_cnt)
+ env->explored_states[t + 1] = STATE_LIST_MARK;
} else {
/* conditional jump with two edges */
ret = push_insn(t, t + 1, FALLTHROUGH, env);
@@ -1606,11 +1663,10 @@ static int do_check(struct verifier_env *env)
return err;
} else if (class == BPF_LDX) {
- if (BPF_MODE(insn->code) != BPF_MEM ||
- insn->imm != 0) {
- verbose("BPF_LDX uses reserved fields\n");
- return -EINVAL;
- }
+ enum bpf_reg_type src_reg_type;
+
+ /* check for reserved fields is already done */
+
/* check src operand */
err = check_reg_arg(regs, insn->src_reg, SRC_OP);
if (err)
@@ -1620,6 +1676,8 @@ static int do_check(struct verifier_env *env)
if (err)
return err;
+ src_reg_type = regs[insn->src_reg].type;
+
/* check that memory (src_reg + off) is readable,
* the state of dst_reg will be updated by this func
*/
@@ -1629,7 +1687,35 @@ static int do_check(struct verifier_env *env)
if (err)
return err;
+ if (BPF_SIZE(insn->code) != BPF_W) {
+ insn_idx++;
+ continue;
+ }
+
+ if (insn->imm == 0) {
+ /* saw a valid insn
+ * dst_reg = *(u32 *)(src_reg + off)
+ * use reserved 'imm' field to mark this insn
+ */
+ insn->imm = src_reg_type;
+
+ } else if (src_reg_type != insn->imm &&
+ (src_reg_type == PTR_TO_CTX ||
+ insn->imm == PTR_TO_CTX)) {
+ /* ABuser program is trying to use the same insn
+ * dst_reg = *(u32*) (src_reg + off)
+ * with different pointer types:
+ * src_reg == ctx in one branch and
+ * src_reg == stack|map in some other branch.
+ * Reject it.
+ */
+ verbose("same insn cannot be used with different pointers\n");
+ return -EINVAL;
+ }
+
} else if (class == BPF_STX) {
+ enum bpf_reg_type dst_reg_type;
+
if (BPF_MODE(insn->code) == BPF_XADD) {
err = check_xadd(env, insn);
if (err)
@@ -1638,11 +1724,6 @@ static int do_check(struct verifier_env *env)
continue;
}
- if (BPF_MODE(insn->code) != BPF_MEM ||
- insn->imm != 0) {
- verbose("BPF_STX uses reserved fields\n");
- return -EINVAL;
- }
/* check src1 operand */
err = check_reg_arg(regs, insn->src_reg, SRC_OP);
if (err)
@@ -1652,6 +1733,8 @@ static int do_check(struct verifier_env *env)
if (err)
return err;
+ dst_reg_type = regs[insn->dst_reg].type;
+
/* check that memory (dst_reg + off) is writeable */
err = check_mem_access(env, insn->dst_reg, insn->off,
BPF_SIZE(insn->code), BPF_WRITE,
@@ -1659,6 +1742,15 @@ static int do_check(struct verifier_env *env)
if (err)
return err;
+ if (insn->imm == 0) {
+ insn->imm = dst_reg_type;
+ } else if (dst_reg_type != insn->imm &&
+ (dst_reg_type == PTR_TO_CTX ||
+ insn->imm == PTR_TO_CTX)) {
+ verbose("same insn cannot be used with different pointers\n");
+ return -EINVAL;
+ }
+
} else if (class == BPF_ST) {
if (BPF_MODE(insn->code) != BPF_MEM ||
insn->src_reg != BPF_REG_0) {
@@ -1776,6 +1868,19 @@ static int replace_map_fd_with_map_ptr(struct verifier_env *env)
int i, j;
for (i = 0; i < insn_cnt; i++, insn++) {
+ if (BPF_CLASS(insn->code) == BPF_LDX &&
+ (BPF_MODE(insn->code) != BPF_MEM || insn->imm != 0)) {
+ verbose("BPF_LDX uses reserved fields\n");
+ return -EINVAL;
+ }
+
+ if (BPF_CLASS(insn->code) == BPF_STX &&
+ ((BPF_MODE(insn->code) != BPF_MEM &&
+ BPF_MODE(insn->code) != BPF_XADD) || insn->imm != 0)) {
+ verbose("BPF_STX uses reserved fields\n");
+ return -EINVAL;
+ }
+
if (insn[0].code == (BPF_LD | BPF_IMM | BPF_DW)) {
struct bpf_map *map;
struct fd f;
@@ -1867,6 +1972,97 @@ static void convert_pseudo_ld_imm64(struct verifier_env *env)
insn->src_reg = 0;
}
+static void adjust_branches(struct bpf_prog *prog, int pos, int delta)
+{
+ struct bpf_insn *insn = prog->insnsi;
+ int insn_cnt = prog->len;
+ int i;
+
+ for (i = 0; i < insn_cnt; i++, insn++) {
+ if (BPF_CLASS(insn->code) != BPF_JMP ||
+ BPF_OP(insn->code) == BPF_CALL ||
+ BPF_OP(insn->code) == BPF_EXIT)
+ continue;
+
+ /* adjust offset of jmps if necessary */
+ if (i < pos && i + insn->off + 1 > pos)
+ insn->off += delta;
+ else if (i > pos && i + insn->off + 1 < pos)
+ insn->off -= delta;
+ }
+}
+
+/* convert load instructions that access fields of 'struct __sk_buff'
+ * into sequence of instructions that access fields of 'struct sk_buff'
+ */
+static int convert_ctx_accesses(struct verifier_env *env)
+{
+ struct bpf_insn *insn = env->prog->insnsi;
+ int insn_cnt = env->prog->len;
+ struct bpf_insn insn_buf[16];
+ struct bpf_prog *new_prog;
+ u32 cnt;
+ int i;
+ enum bpf_access_type type;
+
+ if (!env->prog->aux->ops->convert_ctx_access)
+ return 0;
+
+ for (i = 0; i < insn_cnt; i++, insn++) {
+ if (insn->code == (BPF_LDX | BPF_MEM | BPF_W))
+ type = BPF_READ;
+ else if (insn->code == (BPF_STX | BPF_MEM | BPF_W))
+ type = BPF_WRITE;
+ else
+ continue;
+
+ if (insn->imm != PTR_TO_CTX) {
+ /* clear internal mark */
+ insn->imm = 0;
+ continue;
+ }
+
+ cnt = env->prog->aux->ops->
+ convert_ctx_access(type, insn->dst_reg, insn->src_reg,
+ insn->off, insn_buf);
+ if (cnt == 0 || cnt >= ARRAY_SIZE(insn_buf)) {
+ verbose("bpf verifier is misconfigured\n");
+ return -EINVAL;
+ }
+
+ if (cnt == 1) {
+ memcpy(insn, insn_buf, sizeof(*insn));
+ continue;
+ }
+
+ /* several new insns need to be inserted. Make room for them */
+ insn_cnt += cnt - 1;
+ new_prog = bpf_prog_realloc(env->prog,
+ bpf_prog_size(insn_cnt),
+ GFP_USER);
+ if (!new_prog)
+ return -ENOMEM;
+
+ new_prog->len = insn_cnt;
+
+ memmove(new_prog->insnsi + i + cnt, new_prog->insns + i + 1,
+ sizeof(*insn) * (insn_cnt - i - cnt));
+
+ /* copy substitute insns in place of load instruction */
+ memcpy(new_prog->insnsi + i, insn_buf, sizeof(*insn) * cnt);
+
+ /* adjust branches in the whole program */
+ adjust_branches(new_prog, i, cnt - 1);
+
+ /* keep walking new program and skip insns we just inserted */
+ env->prog = new_prog;
+ insn = new_prog->insnsi + i + cnt - 1;
+ i += cnt - 1;
+ }
+
+ return 0;
+}
+
static void free_states(struct verifier_env *env)
{
struct verifier_state_list *sl, *sln;
@@ -1889,13 +2085,13 @@ static void free_states(struct verifier_env *env)
kfree(env->explored_states);
}
-int bpf_check(struct bpf_prog *prog, union bpf_attr *attr)
+int bpf_check(struct bpf_prog **prog, union bpf_attr *attr)
{
char __user *log_ubuf = NULL;
struct verifier_env *env;
int ret = -EINVAL;
- if (prog->len <= 0 || prog->len > BPF_MAXINSNS)
+ if ((*prog)->len <= 0 || (*prog)->len > BPF_MAXINSNS)
return -E2BIG;
/* 'struct verifier_env' can be global, but since it's not small,
@@ -1905,7 +2101,7 @@ int bpf_check(struct bpf_prog *prog, union bpf_attr *attr)
if (!env)
return -ENOMEM;
- env->prog = prog;
+ env->prog = *prog;
/* grab the mutex to protect few globals used by verifier */
mutex_lock(&bpf_verifier_lock);
@@ -1937,7 +2133,7 @@ int bpf_check(struct bpf_prog *prog, union bpf_attr *attr)
if (ret < 0)
goto skip_full_check;
- env->explored_states = kcalloc(prog->len,
+ env->explored_states = kcalloc(env->prog->len,
sizeof(struct verifier_state_list *),
GFP_USER);
ret = -ENOMEM;
@@ -1954,6 +2150,10 @@ skip_full_check:
while (pop_stack(env, NULL) >= 0);
free_states(env);
+ if (ret == 0)
+ /* program is valid, convert *(u32*)(ctx + off) accesses */
+ ret = convert_ctx_accesses(env);
+
if (log_level && log_len >= log_size - 1) {
BUG_ON(log_len >= log_size);
/* verifier log exceeded user supplied buffer */
@@ -1969,18 +2169,18 @@ skip_full_check:
if (ret == 0 && env->used_map_cnt) {
/* if program passed verifier, update used_maps in bpf_prog_info */
- prog->aux->used_maps = kmalloc_array(env->used_map_cnt,
- sizeof(env->used_maps[0]),
- GFP_KERNEL);
+ env->prog->aux->used_maps = kmalloc_array(env->used_map_cnt,
+ sizeof(env->used_maps[0]),
+ GFP_KERNEL);
- if (!prog->aux->used_maps) {
+ if (!env->prog->aux->used_maps) {
ret = -ENOMEM;
goto free_log_buf;
}
- memcpy(prog->aux->used_maps, env->used_maps,
+ memcpy(env->prog->aux->used_maps, env->used_maps,
sizeof(env->used_maps[0]) * env->used_map_cnt);
- prog->aux->used_map_cnt = env->used_map_cnt;
+ env->prog->aux->used_map_cnt = env->used_map_cnt;
/* program is valid. Convert pseudo bpf_ld_imm64 into generic
* bpf_ld_imm64 instructions
@@ -1992,11 +2192,12 @@ free_log_buf:
if (log_level)
vfree(log_buf);
free_env:
- if (!prog->aux->used_maps)
+ if (!env->prog->aux->used_maps)
/* if we didn't copy map pointers into bpf_prog_info, release
* them now. Otherwise free_bpf_prog_info() will release them.
*/
release_maps(env);
+ *prog = env->prog;
kfree(env);
mutex_unlock(&bpf_verifier_lock);
return ret;
diff --git a/kernel/capability.c b/kernel/capability.c
index 989f5bfc57dc..45432b54d5c6 100644
--- a/kernel/capability.c
+++ b/kernel/capability.c
@@ -35,6 +35,7 @@ static int __init file_caps_disable(char *str)
}
__setup("no_file_caps", file_caps_disable);
+#ifdef CONFIG_MULTIUSER
/*
* More recent versions of libcap are available from:
*
@@ -386,6 +387,24 @@ bool ns_capable(struct user_namespace *ns, int cap)
}
EXPORT_SYMBOL(ns_capable);
+
+/**
+ * capable - Determine if the current task has a superior capability in effect
+ * @cap: The capability to be tested for
+ *
+ * Return true if the current task has the given superior capability currently
+ * available for use, false if not.
+ *
+ * This sets PF_SUPERPRIV on the task if the capability is available on the
+ * assumption that it's about to be used.
+ */
+bool capable(int cap)
+{
+ return ns_capable(&init_user_ns, cap);
+}
+EXPORT_SYMBOL(capable);
+#endif /* CONFIG_MULTIUSER */
+
/**
* file_ns_capable - Determine if the file's opener had a capability in effect
* @file: The file we want to check
@@ -412,22 +431,6 @@ bool file_ns_capable(const struct file *file, struct user_namespace *ns,
EXPORT_SYMBOL(file_ns_capable);
/**
- * capable - Determine if the current task has a superior capability in effect
- * @cap: The capability to be tested for
- *
- * Return true if the current task has the given superior capability currently
- * available for use, false if not.
- *
- * This sets PF_SUPERPRIV on the task if the capability is available on the
- * assumption that it's about to be used.
- */
-bool capable(int cap)
-{
- return ns_capable(&init_user_ns, cap);
-}
-EXPORT_SYMBOL(capable);
-
-/**
* capable_wrt_inode_uidgid - Check nsown_capable and uid and gid mapped
* @inode: The inode in question
* @cap: The capability in question
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 29a7b2cc593e..a8538e443784 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -46,6 +46,7 @@
#include <linux/slab.h>
#include <linux/spinlock.h>
#include <linux/rwsem.h>
+#include <linux/percpu-rwsem.h>
#include <linux/string.h>
#include <linux/sort.h>
#include <linux/kmod.h>
@@ -103,9 +104,11 @@ static DEFINE_SPINLOCK(cgroup_idr_lock);
*/
static DEFINE_SPINLOCK(release_agent_path_lock);
+struct percpu_rw_semaphore cgroup_threadgroup_rwsem;
+
#define cgroup_assert_mutex_or_rcu_locked() \
- rcu_lockdep_assert(rcu_read_lock_held() || \
- lockdep_is_held(&cgroup_mutex), \
+ RCU_LOCKDEP_WARN(!rcu_read_lock_held() && \
+ !lockdep_is_held(&cgroup_mutex), \
"cgroup_mutex or RCU read lock required");
/*
@@ -142,6 +145,7 @@ static const char *cgroup_subsys_name[] = {
* part of that cgroup.
*/
struct cgroup_root cgrp_dfl_root;
+EXPORT_SYMBOL_GPL(cgrp_dfl_root);
/*
* The default hierarchy always exists but is hidden until mounted for the
@@ -156,7 +160,7 @@ static bool cgrp_dfl_root_visible;
static bool cgroup_legacy_files_on_dfl;
/* some controllers are not supported in the default hierarchy */
-static unsigned int cgrp_dfl_root_inhibit_ss_mask;
+static unsigned long cgrp_dfl_root_inhibit_ss_mask;
/* The list of hierarchy roots */
@@ -175,18 +179,22 @@ static DEFINE_IDR(cgroup_hierarchy_idr);
*/
static u64 css_serial_nr_next = 1;
-/* This flag indicates whether tasks in the fork and exit paths should
- * check for fork/exit handlers to call. This avoids us having to do
- * extra work in the fork/exit path if none of the subsystems need to
- * be called.
+/*
+ * These bitmask flags indicate whether tasks in the fork and exit paths have
+ * fork/exit handlers to call. This avoids us having to do extra work in the
+ * fork/exit path to check which subsystems have fork/exit callbacks.
*/
-static int need_forkexit_callback __read_mostly;
+static unsigned long have_fork_callback __read_mostly;
+static unsigned long have_exit_callback __read_mostly;
+
+/* Ditto for the can_fork callback. */
+static unsigned long have_canfork_callback __read_mostly;
static struct cftype cgroup_dfl_base_files[];
static struct cftype cgroup_legacy_base_files[];
static int rebind_subsystems(struct cgroup_root *dst_root,
- unsigned int ss_mask);
+ unsigned long ss_mask);
static int cgroup_destroy_locked(struct cgroup *cgrp);
static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss,
bool visible);
@@ -203,7 +211,7 @@ static int cgroup_idr_alloc(struct idr *idr, void *ptr, int start, int end,
idr_preload(gfp_mask);
spin_lock_bh(&cgroup_idr_lock);
- ret = idr_alloc(idr, ptr, start, end, gfp_mask);
+ ret = idr_alloc(idr, ptr, start, end, gfp_mask & ~__GFP_WAIT);
spin_unlock_bh(&cgroup_idr_lock);
idr_preload_end();
return ret;
@@ -261,7 +269,7 @@ static struct cgroup_subsys_state *cgroup_css(struct cgroup *cgrp,
* @cgrp: the cgroup of interest
* @ss: the subsystem of interest (%NULL returns @cgrp->self)
*
- * Similar to cgroup_css() but returns the effctive css, which is defined
+ * Similar to cgroup_css() but returns the effective css, which is defined
* as the matching css of the nearest ancestor including self which has @ss
* enabled. If @ss is associated with the hierarchy @cgrp is on, this
* function is guaranteed to return non-NULL css.
@@ -409,6 +417,24 @@ static int notify_on_release(const struct cgroup *cgrp)
for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT && \
(((ss) = cgroup_subsys[ssid]) || true); (ssid)++)
+/**
+ * for_each_subsys_which - filter for_each_subsys with a bitmask
+ * @ss: the iteration cursor
+ * @ssid: the index of @ss, CGROUP_SUBSYS_COUNT after reaching the end
+ * @ss_maskp: a pointer to the bitmask
+ *
+ * The block will only run for cases where the ssid-th bit (1 << ssid) of
+ * mask is set to 1.
+ */
+#define for_each_subsys_which(ss, ssid, ss_maskp) \
+ if (!CGROUP_SUBSYS_COUNT) /* to avoid spurious gcc warning */ \
+ (ssid) = 0; \
+ else \
+ for_each_set_bit(ssid, ss_maskp, CGROUP_SUBSYS_COUNT) \
+ if (((ss) = cgroup_subsys[ssid]) && false) \
+ break; \
+ else
+
/* iterate across the hierarchies */
#define for_each_root(root) \
list_for_each_entry((root), &cgroup_roots, root_list)
@@ -882,7 +908,7 @@ static void cgroup_exit_root_id(struct cgroup_root *root)
static void cgroup_free_root(struct cgroup_root *root)
{
if (root) {
- /* hierarhcy ID shoulid already have been released */
+ /* hierarchy ID should already have been released */
WARN_ON_ONCE(root->hierarchy_id);
idr_destroy(&root->cgroup_idr);
@@ -998,17 +1024,20 @@ static struct cgroup *task_cgroup_from_root(struct task_struct *task,
* update of a tasks cgroup pointer by cgroup_attach_task()
*/
-static int cgroup_populate_dir(struct cgroup *cgrp, unsigned int subsys_mask);
+static int cgroup_populate_dir(struct cgroup *cgrp, unsigned long subsys_mask);
static struct kernfs_syscall_ops cgroup_kf_syscall_ops;
static const struct file_operations proc_cgroupstats_operations;
static char *cgroup_file_name(struct cgroup *cgrp, const struct cftype *cft,
char *buf)
{
+ struct cgroup_subsys *ss = cft->ss;
+
if (cft->ss && !(cft->flags & CFTYPE_NO_PREFIX) &&
!(cgrp->root->flags & CGRP_ROOT_NOPREFIX))
snprintf(buf, CGROUP_FILE_NAME_MAX, "%s.%s",
- cft->ss->name, cft->name);
+ cgroup_on_dfl(cgrp) ? ss->name : ss->legacy_name,
+ cft->name);
else
strncpy(buf, cft->name, CGROUP_FILE_NAME_MAX);
return buf;
@@ -1068,11 +1097,11 @@ static void cgroup_put(struct cgroup *cgrp)
* @subtree_control is to be applied to @cgrp. The returned mask is always
* a superset of @subtree_control and follows the usual hierarchy rules.
*/
-static unsigned int cgroup_calc_child_subsys_mask(struct cgroup *cgrp,
- unsigned int subtree_control)
+static unsigned long cgroup_calc_child_subsys_mask(struct cgroup *cgrp,
+ unsigned long subtree_control)
{
struct cgroup *parent = cgroup_parent(cgrp);
- unsigned int cur_ss_mask = subtree_control;
+ unsigned long cur_ss_mask = subtree_control;
struct cgroup_subsys *ss;
int ssid;
@@ -1082,11 +1111,10 @@ static unsigned int cgroup_calc_child_subsys_mask(struct cgroup *cgrp,
return cur_ss_mask;
while (true) {
- unsigned int new_ss_mask = cur_ss_mask;
+ unsigned long new_ss_mask = cur_ss_mask;
- for_each_subsys(ss, ssid)
- if (cur_ss_mask & (1 << ssid))
- new_ss_mask |= ss->depends_on;
+ for_each_subsys_which(ss, ssid, &cur_ss_mask)
+ new_ss_mask |= ss->depends_on;
/*
* Mask out subsystems which aren't available. This can
@@ -1200,7 +1228,7 @@ static void cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft)
* @cgrp: target cgroup
* @subsys_mask: mask of the subsystem ids whose files should be removed
*/
-static void cgroup_clear_dir(struct cgroup *cgrp, unsigned int subsys_mask)
+static void cgroup_clear_dir(struct cgroup *cgrp, unsigned long subsys_mask)
{
struct cgroup_subsys *ss;
int i;
@@ -1215,18 +1243,16 @@ static void cgroup_clear_dir(struct cgroup *cgrp, unsigned int subsys_mask)
}
}
-static int rebind_subsystems(struct cgroup_root *dst_root, unsigned int ss_mask)
+static int rebind_subsystems(struct cgroup_root *dst_root,
+ unsigned long ss_mask)
{
struct cgroup_subsys *ss;
- unsigned int tmp_ss_mask;
+ unsigned long tmp_ss_mask;
int ssid, i, ret;
lockdep_assert_held(&cgroup_mutex);
- for_each_subsys(ss, ssid) {
- if (!(ss_mask & (1 << ssid)))
- continue;
-
+ for_each_subsys_which(ss, ssid, &ss_mask) {
/* if @ss has non-root csses attached to it, can't move */
if (css_next_child(NULL, cgroup_css(&ss->root->cgrp, ss)))
return -EBUSY;
@@ -1253,7 +1279,7 @@ static int rebind_subsystems(struct cgroup_root *dst_root, unsigned int ss_mask)
* Just warn about it and continue.
*/
if (cgrp_dfl_root_visible) {
- pr_warn("failed to create files (%d) while rebinding 0x%x to default root\n",
+ pr_warn("failed to create files (%d) while rebinding 0x%lx to default root\n",
ret, ss_mask);
pr_warn("you may retry by moving them to a different hierarchy and unbinding\n");
}
@@ -1263,18 +1289,14 @@ static int rebind_subsystems(struct cgroup_root *dst_root, unsigned int ss_mask)
* Nothing can fail from this point on. Remove files for the
* removed subsystems and rebind each subsystem.
*/
- for_each_subsys(ss, ssid)
- if (ss_mask & (1 << ssid))
- cgroup_clear_dir(&ss->root->cgrp, 1 << ssid);
+ for_each_subsys_which(ss, ssid, &ss_mask)
+ cgroup_clear_dir(&ss->root->cgrp, 1 << ssid);
- for_each_subsys(ss, ssid) {
+ for_each_subsys_which(ss, ssid, &ss_mask) {
struct cgroup_root *src_root;
struct cgroup_subsys_state *css;
struct css_set *cset;
- if (!(ss_mask & (1 << ssid)))
- continue;
-
src_root = ss->root;
css = cgroup_css(&src_root->cgrp, ss);
@@ -1317,9 +1339,10 @@ static int cgroup_show_options(struct seq_file *seq,
struct cgroup_subsys *ss;
int ssid;
- for_each_subsys(ss, ssid)
- if (root->subsys_mask & (1 << ssid))
- seq_printf(seq, ",%s", ss->name);
+ if (root != &cgrp_dfl_root)
+ for_each_subsys(ss, ssid)
+ if (root->subsys_mask & (1 << ssid))
+ seq_show_option(seq, ss->name, NULL);
if (root->flags & CGRP_ROOT_NOPREFIX)
seq_puts(seq, ",noprefix");
if (root->flags & CGRP_ROOT_XATTR)
@@ -1327,18 +1350,19 @@ static int cgroup_show_options(struct seq_file *seq,
spin_lock(&release_agent_path_lock);
if (strlen(root->release_agent_path))
- seq_printf(seq, ",release_agent=%s", root->release_agent_path);
+ seq_show_option(seq, "release_agent",
+ root->release_agent_path);
spin_unlock(&release_agent_path_lock);
if (test_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->cgrp.flags))
seq_puts(seq, ",clone_children");
if (strlen(root->name))
- seq_printf(seq, ",name=%s", root->name);
+ seq_show_option(seq, "name", root->name);
return 0;
}
struct cgroup_sb_opts {
- unsigned int subsys_mask;
+ unsigned long subsys_mask;
unsigned int flags;
char *release_agent;
bool cpuset_clone_children;
@@ -1351,7 +1375,7 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
{
char *token, *o = data;
bool all_ss = false, one_ss = false;
- unsigned int mask = -1U;
+ unsigned long mask = -1UL;
struct cgroup_subsys *ss;
int nr_opts = 0;
int i;
@@ -1432,7 +1456,7 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
}
for_each_subsys(ss, i) {
- if (strcmp(token, ss->name))
+ if (strcmp(token, ss->legacy_name))
continue;
if (ss->disabled)
continue;
@@ -1495,7 +1519,7 @@ static int cgroup_remount(struct kernfs_root *kf_root, int *flags, char *data)
int ret = 0;
struct cgroup_root *root = cgroup_root_from_kf(kf_root);
struct cgroup_sb_opts opts;
- unsigned int added_mask, removed_mask;
+ unsigned long added_mask, removed_mask;
if (root == &cgrp_dfl_root) {
pr_err("remount is not allowed\n");
@@ -1641,7 +1665,7 @@ static void init_cgroup_root(struct cgroup_root *root,
set_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->cgrp.flags);
}
-static int cgroup_setup_root(struct cgroup_root *root, unsigned int ss_mask)
+static int cgroup_setup_root(struct cgroup_root *root, unsigned long ss_mask)
{
LIST_HEAD(tmp_links);
struct cgroup *root_cgrp = &root->cgrp;
@@ -1651,7 +1675,7 @@ static int cgroup_setup_root(struct cgroup_root *root, unsigned int ss_mask)
lockdep_assert_held(&cgroup_mutex);
- ret = cgroup_idr_alloc(&root->cgroup_idr, root_cgrp, 1, 2, GFP_NOWAIT);
+ ret = cgroup_idr_alloc(&root->cgroup_idr, root_cgrp, 1, 2, GFP_KERNEL);
if (ret < 0)
goto out;
root_cgrp->id = ret;
@@ -1924,8 +1948,6 @@ static struct file_system_type cgroup_fs_type = {
.kill_sb = cgroup_kill_sb,
};
-static struct kobject *cgroup_kobj;
-
/**
* task_cgroup_path - cgroup path of a task in the first cgroup hierarchy
* @task: target task
@@ -2052,9 +2074,9 @@ static void cgroup_task_migrate(struct cgroup *old_cgrp,
lockdep_assert_held(&css_set_rwsem);
/*
- * We are synchronized through threadgroup_lock() against PF_EXITING
- * setting such that we can't race against cgroup_exit() changing the
- * css_set to init_css_set and dropping the old one.
+ * We are synchronized through cgroup_threadgroup_rwsem against
+ * PF_EXITING setting such that we can't race against cgroup_exit()
+ * changing the css_set to init_css_set and dropping the old one.
*/
WARN_ON_ONCE(tsk->flags & PF_EXITING);
old_cset = task_css_set(tsk);
@@ -2111,10 +2133,11 @@ static void cgroup_migrate_finish(struct list_head *preloaded_csets)
* @src_cset and add it to @preloaded_csets, which should later be cleaned
* up by cgroup_migrate_finish().
*
- * This function may be called without holding threadgroup_lock even if the
- * target is a process. Threads may be created and destroyed but as long
- * as cgroup_mutex is not dropped, no new css_set can be put into play and
- * the preloaded css_sets are guaranteed to cover all migrations.
+ * This function may be called without holding cgroup_threadgroup_rwsem
+ * even if the target is a process. Threads may be created and destroyed
+ * but as long as cgroup_mutex is not dropped, no new css_set can be put
+ * into play and the preloaded css_sets are guaranteed to cover all
+ * migrations.
*/
static void cgroup_migrate_add_src(struct css_set *src_cset,
struct cgroup *dst_cgrp,
@@ -2217,7 +2240,7 @@ err:
* @threadgroup: whether @leader points to the whole process or a single task
*
* Migrate a process or task denoted by @leader to @cgrp. If migrating a
- * process, the caller must be holding threadgroup_lock of @leader. The
+ * process, the caller must be holding cgroup_threadgroup_rwsem. The
* caller is also responsible for invoking cgroup_migrate_add_src() and
* cgroup_migrate_prepare_dst() on the targets before invoking this
* function and following up with cgroup_migrate_finish().
@@ -2345,7 +2368,7 @@ out_release_tset:
* @leader: the task or the leader of the threadgroup to be attached
* @threadgroup: attach the whole threadgroup?
*
- * Call holding cgroup_mutex and threadgroup_lock of @leader.
+ * Call holding cgroup_mutex and cgroup_threadgroup_rwsem.
*/
static int cgroup_attach_task(struct cgroup *dst_cgrp,
struct task_struct *leader, bool threadgroup)
@@ -2376,6 +2399,47 @@ static int cgroup_attach_task(struct cgroup *dst_cgrp,
return ret;
}
+static int cgroup_procs_write_permission(struct task_struct *task,
+ struct cgroup *dst_cgrp,
+ struct kernfs_open_file *of)
+{
+ const struct cred *cred = current_cred();
+ const struct cred *tcred = get_task_cred(task);
+ int ret = 0;
+
+ /*
+ * even if we're attaching all tasks in the thread group, we only
+ * need to check permissions on one of them.
+ */
+ if (!uid_eq(cred->euid, GLOBAL_ROOT_UID) &&
+ !uid_eq(cred->euid, tcred->uid) &&
+ !uid_eq(cred->euid, tcred->suid))
+ ret = -EACCES;
+
+ if (!ret && cgroup_on_dfl(dst_cgrp)) {
+ struct super_block *sb = of->file->f_path.dentry->d_sb;
+ struct cgroup *cgrp;
+ struct inode *inode;
+
+ down_read(&css_set_rwsem);
+ cgrp = task_cgroup_from_root(task, &cgrp_dfl_root);
+ up_read(&css_set_rwsem);
+
+ while (!cgroup_is_descendant(dst_cgrp, cgrp))
+ cgrp = cgroup_parent(cgrp);
+
+ ret = -ENOMEM;
+ inode = kernfs_get_inode(sb, cgrp->procs_kn);
+ if (inode) {
+ ret = inode_permission(inode, MAY_WRITE);
+ iput(inode);
+ }
+ }
+
+ put_cred(tcred);
+ return ret;
+}
+
/*
* Find the task_struct of the task to attach by vpid and pass it along to the
* function to attach either it or all tasks in its threadgroup. Will lock
@@ -2385,7 +2449,6 @@ static ssize_t __cgroup_procs_write(struct kernfs_open_file *of, char *buf,
size_t nbytes, loff_t off, bool threadgroup)
{
struct task_struct *tsk;
- const struct cred *cred = current_cred(), *tcred;
struct cgroup *cgrp;
pid_t pid;
int ret;
@@ -2397,29 +2460,17 @@ static ssize_t __cgroup_procs_write(struct kernfs_open_file *of, char *buf,
if (!cgrp)
return -ENODEV;
-retry_find_task:
+ percpu_down_write(&cgroup_threadgroup_rwsem);
rcu_read_lock();
if (pid) {
tsk = find_task_by_vpid(pid);
if (!tsk) {
- rcu_read_unlock();
ret = -ESRCH;
- goto out_unlock_cgroup;
+ goto out_unlock_rcu;
}
- /*
- * even if we're attaching all tasks in the thread group, we
- * only need to check permissions on one of them.
- */
- tcred = __task_cred(tsk);
- if (!uid_eq(cred->euid, GLOBAL_ROOT_UID) &&
- !uid_eq(cred->euid, tcred->uid) &&
- !uid_eq(cred->euid, tcred->suid)) {
- rcu_read_unlock();
- ret = -EACCES;
- goto out_unlock_cgroup;
- }
- } else
+ } else {
tsk = current;
+ }
if (threadgroup)
tsk = tsk->group_leader;
@@ -2431,35 +2482,23 @@ retry_find_task:
*/
if (tsk == kthreadd_task || (tsk->flags & PF_NO_SETAFFINITY)) {
ret = -EINVAL;
- rcu_read_unlock();
- goto out_unlock_cgroup;
+ goto out_unlock_rcu;
}
get_task_struct(tsk);
rcu_read_unlock();
- threadgroup_lock(tsk);
- if (threadgroup) {
- if (!thread_group_leader(tsk)) {
- /*
- * a race with de_thread from another thread's exec()
- * may strip us of our leadership, if this happens,
- * there is no choice but to throw this task away and
- * try again; this is
- * "double-double-toil-and-trouble-check locking".
- */
- threadgroup_unlock(tsk);
- put_task_struct(tsk);
- goto retry_find_task;
- }
- }
-
- ret = cgroup_attach_task(cgrp, tsk, threadgroup);
-
- threadgroup_unlock(tsk);
+ ret = cgroup_procs_write_permission(tsk, cgrp, of);
+ if (!ret)
+ ret = cgroup_attach_task(cgrp, tsk, threadgroup);
put_task_struct(tsk);
-out_unlock_cgroup:
+ goto out_unlock_threadgroup;
+
+out_unlock_rcu:
+ rcu_read_unlock();
+out_unlock_threadgroup:
+ percpu_up_write(&cgroup_threadgroup_rwsem);
cgroup_kn_unlock(of->kn);
return ret ?: nbytes;
}
@@ -2542,19 +2581,17 @@ static int cgroup_sane_behavior_show(struct seq_file *seq, void *v)
return 0;
}
-static void cgroup_print_ss_mask(struct seq_file *seq, unsigned int ss_mask)
+static void cgroup_print_ss_mask(struct seq_file *seq, unsigned long ss_mask)
{
struct cgroup_subsys *ss;
bool printed = false;
int ssid;
- for_each_subsys(ss, ssid) {
- if (ss_mask & (1 << ssid)) {
- if (printed)
- seq_putc(seq, ' ');
- seq_printf(seq, "%s", ss->name);
- printed = true;
- }
+ for_each_subsys_which(ss, ssid, &ss_mask) {
+ if (printed)
+ seq_putc(seq, ' ');
+ seq_printf(seq, "%s", ss->name);
+ printed = true;
}
if (printed)
seq_putc(seq, '\n');
@@ -2606,6 +2643,8 @@ static int cgroup_update_dfl_csses(struct cgroup *cgrp)
lockdep_assert_held(&cgroup_mutex);
+ percpu_down_write(&cgroup_threadgroup_rwsem);
+
/* look up all csses currently attached to @cgrp's subtree */
down_read(&css_set_rwsem);
css_for_each_descendant_pre(css, cgroup_css(cgrp, NULL)) {
@@ -2661,17 +2700,8 @@ static int cgroup_update_dfl_csses(struct cgroup *cgrp)
goto out_finish;
last_task = task;
- threadgroup_lock(task);
- /* raced against de_thread() from another thread? */
- if (!thread_group_leader(task)) {
- threadgroup_unlock(task);
- put_task_struct(task);
- continue;
- }
-
ret = cgroup_migrate(src_cset->dfl_cgrp, task, true);
- threadgroup_unlock(task);
put_task_struct(task);
if (WARN(ret, "cgroup: failed to update controllers for the default hierarchy (%d), further operations may crash or hang\n", ret))
@@ -2681,6 +2711,7 @@ static int cgroup_update_dfl_csses(struct cgroup *cgrp)
out_finish:
cgroup_migrate_finish(&preloaded_csets);
+ percpu_up_write(&cgroup_threadgroup_rwsem);
return ret;
}
@@ -2689,8 +2720,8 @@ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of,
char *buf, size_t nbytes,
loff_t off)
{
- unsigned int enable = 0, disable = 0;
- unsigned int css_enable, css_disable, old_sc, new_sc, old_ss, new_ss;
+ unsigned long enable = 0, disable = 0;
+ unsigned long css_enable, css_disable, old_sc, new_sc, old_ss, new_ss;
struct cgroup *cgrp, *child;
struct cgroup_subsys *ss;
char *tok;
@@ -2702,11 +2733,12 @@ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of,
*/
buf = strstrip(buf);
while ((tok = strsep(&buf, " "))) {
+ unsigned long tmp_ss_mask = ~cgrp_dfl_root_inhibit_ss_mask;
+
if (tok[0] == '\0')
continue;
- for_each_subsys(ss, ssid) {
- if (ss->disabled || strcmp(tok + 1, ss->name) ||
- ((1 << ss->id) & cgrp_dfl_root_inhibit_ss_mask))
+ for_each_subsys_which(ss, ssid, &tmp_ss_mask) {
+ if (ss->disabled || strcmp(tok + 1, ss->name))
continue;
if (*tok == '+') {
@@ -2793,10 +2825,7 @@ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of,
* still around. In such cases, wait till it's gone using
* offline_waitq.
*/
- for_each_subsys(ss, ssid) {
- if (!(css_enable & (1 << ssid)))
- continue;
-
+ for_each_subsys_which(ss, ssid, &css_enable) {
cgroup_for_each_live_child(child, cgrp) {
DEFINE_WAIT(wait);
@@ -3087,7 +3116,9 @@ static int cgroup_add_file(struct cgroup *cgrp, struct cftype *cft)
return ret;
}
- if (cft->seq_show == cgroup_populated_show)
+ if (cft->write == cgroup_procs_write)
+ cgrp->procs_kn = kn;
+ else if (cft->seq_show == cgroup_populated_show)
cgrp->populated_kn = kn;
return 0;
}
@@ -3806,10 +3837,7 @@ static void *pidlist_allocate(int count)
static void pidlist_free(void *p)
{
- if (is_vmalloc_addr(p))
- vfree(p);
- else
- kfree(p);
+ kvfree(p);
}
/*
@@ -4199,7 +4227,9 @@ static void *cgroup_pidlist_next(struct seq_file *s, void *v, loff_t *pos)
static int cgroup_pidlist_show(struct seq_file *s, void *v)
{
- return seq_printf(s, "%d\n", *(int *)v);
+ seq_printf(s, "%d\n", *(int *)v);
+
+ return 0;
}
static u64 cgroup_read_notify_on_release(struct cgroup_subsys_state *css,
@@ -4323,7 +4353,7 @@ static struct cftype cgroup_legacy_base_files[] = {
*
* On failure, no file is added.
*/
-static int cgroup_populate_dir(struct cgroup *cgrp, unsigned int subsys_mask)
+static int cgroup_populate_dir(struct cgroup *cgrp, unsigned long subsys_mask)
{
struct cgroup_subsys *ss;
int i, ret = 0;
@@ -4558,7 +4588,7 @@ static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss,
if (err)
goto err_free_css;
- err = cgroup_idr_alloc(&ss->css_idr, NULL, 2, 0, GFP_NOWAIT);
+ err = cgroup_idr_alloc(&ss->css_idr, NULL, 2, 0, GFP_KERNEL);
if (err < 0)
goto err_free_percpu_ref;
css->id = err;
@@ -4635,7 +4665,7 @@ static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name,
* Temporarily set the pointer to NULL, so idr_find() won't return
* a half-baked cgroup.
*/
- cgrp->id = cgroup_idr_alloc(&root->cgroup_idr, NULL, 2, 0, GFP_NOWAIT);
+ cgrp->id = cgroup_idr_alloc(&root->cgroup_idr, NULL, 2, 0, GFP_KERNEL);
if (cgrp->id < 0) {
ret = -ENOMEM;
goto out_cancel_ref;
@@ -4932,7 +4962,9 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss, bool early)
* init_css_set is in the subsystem's root cgroup. */
init_css_set.subsys[ss->id] = css;
- need_forkexit_callback |= ss->fork || ss->exit;
+ have_fork_callback |= (bool)ss->fork << ss->id;
+ have_exit_callback |= (bool)ss->exit << ss->id;
+ have_canfork_callback |= (bool)ss->can_fork << ss->id;
/* At system boot, before all subsystems have been
* registered, no tasks have been forked, so we don't
@@ -4971,6 +5003,8 @@ int __init cgroup_init_early(void)
ss->id = i;
ss->name = cgroup_subsys_name[i];
+ if (!ss->legacy_name)
+ ss->legacy_name = cgroup_subsys_name[i];
if (ss->early_init)
cgroup_init_subsys(ss, true);
@@ -4990,6 +5024,7 @@ int __init cgroup_init(void)
unsigned long key;
int ssid, err;
+ BUG_ON(percpu_init_rwsem(&cgroup_threadgroup_rwsem));
BUG_ON(cgroup_init_cftypes(NULL, cgroup_dfl_base_files));
BUG_ON(cgroup_init_cftypes(NULL, cgroup_legacy_base_files));
@@ -5040,15 +5075,18 @@ int __init cgroup_init(void)
WARN_ON(cgroup_add_dfl_cftypes(ss, ss->dfl_cftypes));
WARN_ON(cgroup_add_legacy_cftypes(ss, ss->legacy_cftypes));
}
+
+ if (ss->bind)
+ ss->bind(init_css_set.subsys[ssid]);
}
- cgroup_kobj = kobject_create_and_add("cgroup", fs_kobj);
- if (!cgroup_kobj)
- return -ENOMEM;
+ err = sysfs_create_mount_point(fs_kobj, "cgroup");
+ if (err)
+ return err;
err = register_filesystem(&cgroup_fs_type);
if (err < 0) {
- kobject_put(cgroup_kobj);
+ sysfs_remove_mount_point(fs_kobj, "cgroup");
return err;
}
@@ -5110,9 +5148,11 @@ int proc_cgroup_show(struct seq_file *m, struct pid_namespace *ns,
continue;
seq_printf(m, "%d:", root->hierarchy_id);
- for_each_subsys(ss, ssid)
- if (root->subsys_mask & (1 << ssid))
- seq_printf(m, "%s%s", count++ ? "," : "", ss->name);
+ if (root != &cgrp_dfl_root)
+ for_each_subsys(ss, ssid)
+ if (root->subsys_mask & (1 << ssid))
+ seq_printf(m, "%s%s", count++ ? "," : "",
+ ss->legacy_name);
if (strlen(root->name))
seq_printf(m, "%sname=%s", count ? "," : "",
root->name);
@@ -5152,7 +5192,7 @@ static int proc_cgroupstats_show(struct seq_file *m, void *v)
for_each_subsys(ss, i)
seq_printf(m, "%s\t%d\t%d\t%d\n",
- ss->name, ss->root->hierarchy_id,
+ ss->legacy_name, ss->root->hierarchy_id,
atomic_read(&ss->root->nr_cgrps), !ss->disabled);
mutex_unlock(&cgroup_mutex);
@@ -5171,6 +5211,19 @@ static const struct file_operations proc_cgroupstats_operations = {
.release = single_release,
};
+static void **subsys_canfork_priv_p(void *ss_priv[CGROUP_CANFORK_COUNT], int i)
+{
+ if (CGROUP_CANFORK_START <= i && i < CGROUP_CANFORK_END)
+ return &ss_priv[i - CGROUP_CANFORK_START];
+ return NULL;
+}
+
+static void *subsys_canfork_priv(void *ss_priv[CGROUP_CANFORK_COUNT], int i)
+{
+ void **private = subsys_canfork_priv_p(ss_priv, i);
+ return private ? *private : NULL;
+}
+
/**
* cgroup_fork - initialize cgroup related fields during copy_process()
* @child: pointer to task_struct of forking parent process.
@@ -5186,6 +5239,57 @@ void cgroup_fork(struct task_struct *child)
}
/**
+ * cgroup_can_fork - called on a new task before the process is exposed
+ * @child: the task in question.
+ *
+ * This calls the subsystem can_fork() callbacks. If the can_fork() callback
+ * returns an error, the fork aborts with that error code. This allows for
+ * a cgroup subsystem to conditionally allow or deny new forks.
+ */
+int cgroup_can_fork(struct task_struct *child,
+ void *ss_priv[CGROUP_CANFORK_COUNT])
+{
+ struct cgroup_subsys *ss;
+ int i, j, ret;
+
+ for_each_subsys_which(ss, i, &have_canfork_callback) {
+ ret = ss->can_fork(child, subsys_canfork_priv_p(ss_priv, i));
+ if (ret)
+ goto out_revert;
+ }
+
+ return 0;
+
+out_revert:
+ for_each_subsys(ss, j) {
+ if (j >= i)
+ break;
+ if (ss->cancel_fork)
+ ss->cancel_fork(child, subsys_canfork_priv(ss_priv, j));
+ }
+
+ return ret;
+}
+
+/**
+ * cgroup_cancel_fork - called if a fork failed after cgroup_can_fork()
+ * @child: the task in question
+ *
+ * This calls the cancel_fork() callbacks if a fork failed *after*
+ * cgroup_can_fork() succeded.
+ */
+void cgroup_cancel_fork(struct task_struct *child,
+ void *ss_priv[CGROUP_CANFORK_COUNT])
+{
+ struct cgroup_subsys *ss;
+ int i;
+
+ for_each_subsys(ss, i)
+ if (ss->cancel_fork)
+ ss->cancel_fork(child, subsys_canfork_priv(ss_priv, i));
+}
+
+/**
* cgroup_post_fork - called on a new task after adding it to the task list
* @child: the task in question
*
@@ -5195,7 +5299,8 @@ void cgroup_fork(struct task_struct *child)
* cgroup_task_iter_start() - to guarantee that the new task ends up on its
* list.
*/
-void cgroup_post_fork(struct task_struct *child)
+void cgroup_post_fork(struct task_struct *child,
+ void *old_ss_priv[CGROUP_CANFORK_COUNT])
{
struct cgroup_subsys *ss;
int i;
@@ -5239,11 +5344,8 @@ void cgroup_post_fork(struct task_struct *child)
* css_set; otherwise, @child might change state between ->fork()
* and addition to css_set.
*/
- if (need_forkexit_callback) {
- for_each_subsys(ss, i)
- if (ss->fork)
- ss->fork(child);
- }
+ for_each_subsys_which(ss, i, &have_fork_callback)
+ ss->fork(child, subsys_canfork_priv(old_ss_priv, i));
}
/**
@@ -5287,16 +5389,12 @@ void cgroup_exit(struct task_struct *tsk)
cset = task_css_set(tsk);
RCU_INIT_POINTER(tsk->cgroups, &init_css_set);
- if (need_forkexit_callback) {
- /* see cgroup_post_fork() for details */
- for_each_subsys(ss, i) {
- if (ss->exit) {
- struct cgroup_subsys_state *old_css = cset->subsys[i];
- struct cgroup_subsys_state *css = task_css(tsk, i);
+ /* see cgroup_post_fork() for details */
+ for_each_subsys_which(ss, i, &have_exit_callback) {
+ struct cgroup_subsys_state *old_css = cset->subsys[i];
+ struct cgroup_subsys_state *css = task_css(tsk, i);
- ss->exit(css, old_css, tsk);
- }
- }
+ ss->exit(css, old_css, tsk);
}
if (put_cset)
@@ -5381,12 +5479,14 @@ static int __init cgroup_disable(char *str)
continue;
for_each_subsys(ss, i) {
- if (!strcmp(token, ss->name)) {
- ss->disabled = 1;
- printk(KERN_INFO "Disabling %s control group"
- " subsystem\n", ss->name);
- break;
- }
+ if (strcmp(token, ss->name) &&
+ strcmp(token, ss->legacy_name))
+ continue;
+
+ ss->disabled = 1;
+ printk(KERN_INFO "Disabling %s control group subsystem\n",
+ ss->name);
+ break;
}
}
return 1;
@@ -5451,7 +5551,7 @@ struct cgroup_subsys_state *css_tryget_online_from_dir(struct dentry *dentry,
struct cgroup_subsys_state *css_from_id(int id, struct cgroup_subsys *ss)
{
WARN_ON_ONCE(!rcu_read_lock_held());
- return idr_find(&ss->css_idr, id);
+ return id > 0 ? idr_find(&ss->css_idr, id) : NULL;
}
#ifdef CONFIG_CGROUP_DEBUG
diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c
index 92b98cc0ee76..f1b30ad5dc6d 100644
--- a/kernel/cgroup_freezer.c
+++ b/kernel/cgroup_freezer.c
@@ -203,7 +203,7 @@ static void freezer_attach(struct cgroup_subsys_state *new_css,
* to do anything as freezer_attach() will put @task into the appropriate
* state.
*/
-static void freezer_fork(struct task_struct *task)
+static void freezer_fork(struct task_struct *task, void *private)
{
struct freezer *freezer;
diff --git a/kernel/cgroup_pids.c b/kernel/cgroup_pids.c
new file mode 100644
index 000000000000..806cd7693ac8
--- /dev/null
+++ b/kernel/cgroup_pids.c
@@ -0,0 +1,355 @@
+/*
+ * Process number limiting controller for cgroups.
+ *
+ * Used to allow a cgroup hierarchy to stop any new processes from fork()ing
+ * after a certain limit is reached.
+ *
+ * Since it is trivial to hit the task limit without hitting any kmemcg limits
+ * in place, PIDs are a fundamental resource. As such, PID exhaustion must be
+ * preventable in the scope of a cgroup hierarchy by allowing resource limiting
+ * of the number of tasks in a cgroup.
+ *
+ * In order to use the `pids` controller, set the maximum number of tasks in
+ * pids.max (this is not available in the root cgroup for obvious reasons). The
+ * number of processes currently in the cgroup is given by pids.current.
+ * Organisational operations are not blocked by cgroup policies, so it is
+ * possible to have pids.current > pids.max. However, it is not possible to
+ * violate a cgroup policy through fork(). fork() will return -EAGAIN if forking
+ * would cause a cgroup policy to be violated.
+ *
+ * To set a cgroup to have no limit, set pids.max to "max". This is the default
+ * for all new cgroups (N.B. that PID limits are hierarchical, so the most
+ * stringent limit in the hierarchy is followed).
+ *
+ * pids.current tracks all child cgroup hierarchies, so parent/pids.current is
+ * a superset of parent/child/pids.current.
+ *
+ * Copyright (C) 2015 Aleksa Sarai <cyphar@cyphar.com>
+ *
+ * This file is subject to the terms and conditions of version 2 of the GNU
+ * General Public License. See the file COPYING in the main directory of the
+ * Linux distribution for more details.
+ */
+
+#include <linux/kernel.h>
+#include <linux/threads.h>
+#include <linux/atomic.h>
+#include <linux/cgroup.h>
+#include <linux/slab.h>
+
+#define PIDS_MAX (PID_MAX_LIMIT + 1ULL)
+#define PIDS_MAX_STR "max"
+
+struct pids_cgroup {
+ struct cgroup_subsys_state css;
+
+ /*
+ * Use 64-bit types so that we can safely represent "max" as
+ * %PIDS_MAX = (%PID_MAX_LIMIT + 1).
+ */
+ atomic64_t counter;
+ int64_t limit;
+};
+
+static struct pids_cgroup *css_pids(struct cgroup_subsys_state *css)
+{
+ return container_of(css, struct pids_cgroup, css);
+}
+
+static struct pids_cgroup *parent_pids(struct pids_cgroup *pids)
+{
+ return css_pids(pids->css.parent);
+}
+
+static struct cgroup_subsys_state *
+pids_css_alloc(struct cgroup_subsys_state *parent)
+{
+ struct pids_cgroup *pids;
+
+ pids = kzalloc(sizeof(struct pids_cgroup), GFP_KERNEL);
+ if (!pids)
+ return ERR_PTR(-ENOMEM);
+
+ pids->limit = PIDS_MAX;
+ atomic64_set(&pids->counter, 0);
+ return &pids->css;
+}
+
+static void pids_css_free(struct cgroup_subsys_state *css)
+{
+ kfree(css_pids(css));
+}
+
+/**
+ * pids_cancel - uncharge the local pid count
+ * @pids: the pid cgroup state
+ * @num: the number of pids to cancel
+ *
+ * This function will WARN if the pid count goes under 0, because such a case is
+ * a bug in the pids controller proper.
+ */
+static void pids_cancel(struct pids_cgroup *pids, int num)
+{
+ /*
+ * A negative count (or overflow for that matter) is invalid,
+ * and indicates a bug in the `pids` controller proper.
+ */
+ WARN_ON_ONCE(atomic64_add_negative(-num, &pids->counter));
+}
+
+/**
+ * pids_uncharge - hierarchically uncharge the pid count
+ * @pids: the pid cgroup state
+ * @num: the number of pids to uncharge
+ */
+static void pids_uncharge(struct pids_cgroup *pids, int num)
+{
+ struct pids_cgroup *p;
+
+ for (p = pids; p; p = parent_pids(p))
+ pids_cancel(p, num);
+}
+
+/**
+ * pids_charge - hierarchically charge the pid count
+ * @pids: the pid cgroup state
+ * @num: the number of pids to charge
+ *
+ * This function does *not* follow the pid limit set. It cannot fail and the new
+ * pid count may exceed the limit. This is only used for reverting failed
+ * attaches, where there is no other way out than violating the limit.
+ */
+static void pids_charge(struct pids_cgroup *pids, int num)
+{
+ struct pids_cgroup *p;
+
+ for (p = pids; p; p = parent_pids(p))
+ atomic64_add(num, &p->counter);
+}
+
+/**
+ * pids_try_charge - hierarchically try to charge the pid count
+ * @pids: the pid cgroup state
+ * @num: the number of pids to charge
+ *
+ * This function follows the set limit. It will fail if the charge would cause
+ * the new value to exceed the hierarchical limit. Returns 0 if the charge
+ * succeded, otherwise -EAGAIN.
+ */
+static int pids_try_charge(struct pids_cgroup *pids, int num)
+{
+ struct pids_cgroup *p, *q;
+
+ for (p = pids; p; p = parent_pids(p)) {
+ int64_t new = atomic64_add_return(num, &p->counter);
+
+ /*
+ * Since new is capped to the maximum number of pid_t, if
+ * p->limit is %PIDS_MAX then we know that this test will never
+ * fail.
+ */
+ if (new > p->limit)
+ goto revert;
+ }
+
+ return 0;
+
+revert:
+ for (q = pids; q != p; q = parent_pids(q))
+ pids_cancel(q, num);
+ pids_cancel(p, num);
+
+ return -EAGAIN;
+}
+
+static int pids_can_attach(struct cgroup_subsys_state *css,
+ struct cgroup_taskset *tset)
+{
+ struct pids_cgroup *pids = css_pids(css);
+ struct task_struct *task;
+
+ cgroup_taskset_for_each(task, tset) {
+ struct cgroup_subsys_state *old_css;
+ struct pids_cgroup *old_pids;
+
+ /*
+ * No need to pin @old_css between here and cancel_attach()
+ * because cgroup core protects it from being freed before
+ * the migration completes or fails.
+ */
+ old_css = task_css(task, pids_cgrp_id);
+ old_pids = css_pids(old_css);
+
+ pids_charge(pids, 1);
+ pids_uncharge(old_pids, 1);
+ }
+
+ return 0;
+}
+
+static void pids_cancel_attach(struct cgroup_subsys_state *css,
+ struct cgroup_taskset *tset)
+{
+ struct pids_cgroup *pids = css_pids(css);
+ struct task_struct *task;
+
+ cgroup_taskset_for_each(task, tset) {
+ struct cgroup_subsys_state *old_css;
+ struct pids_cgroup *old_pids;
+
+ old_css = task_css(task, pids_cgrp_id);
+ old_pids = css_pids(old_css);
+
+ pids_charge(old_pids, 1);
+ pids_uncharge(pids, 1);
+ }
+}
+
+static int pids_can_fork(struct task_struct *task, void **priv_p)
+{
+ struct cgroup_subsys_state *css;
+ struct pids_cgroup *pids;
+ int err;
+
+ /*
+ * Use the "current" task_css for the pids subsystem as the tentative
+ * css. It is possible we will charge the wrong hierarchy, in which
+ * case we will forcefully revert/reapply the charge on the right
+ * hierarchy after it is committed to the task proper.
+ */
+ css = task_get_css(current, pids_cgrp_id);
+ pids = css_pids(css);
+
+ err = pids_try_charge(pids, 1);
+ if (err)
+ goto err_css_put;
+
+ *priv_p = css;
+ return 0;
+
+err_css_put:
+ css_put(css);
+ return err;
+}
+
+static void pids_cancel_fork(struct task_struct *task, void *priv)
+{
+ struct cgroup_subsys_state *css = priv;
+ struct pids_cgroup *pids = css_pids(css);
+
+ pids_uncharge(pids, 1);
+ css_put(css);
+}
+
+static void pids_fork(struct task_struct *task, void *priv)
+{
+ struct cgroup_subsys_state *css;
+ struct cgroup_subsys_state *old_css = priv;
+ struct pids_cgroup *pids;
+ struct pids_cgroup *old_pids = css_pids(old_css);
+
+ css = task_get_css(task, pids_cgrp_id);
+ pids = css_pids(css);
+
+ /*
+ * If the association has changed, we have to revert and reapply the
+ * charge/uncharge on the wrong hierarchy to the current one. Since
+ * the association can only change due to an organisation event, its
+ * okay for us to ignore the limit in this case.
+ */
+ if (pids != old_pids) {
+ pids_uncharge(old_pids, 1);
+ pids_charge(pids, 1);
+ }
+
+ css_put(css);
+ css_put(old_css);
+}
+
+static void pids_exit(struct cgroup_subsys_state *css,
+ struct cgroup_subsys_state *old_css,
+ struct task_struct *task)
+{
+ struct pids_cgroup *pids = css_pids(old_css);
+
+ pids_uncharge(pids, 1);
+}
+
+static ssize_t pids_max_write(struct kernfs_open_file *of, char *buf,
+ size_t nbytes, loff_t off)
+{
+ struct cgroup_subsys_state *css = of_css(of);
+ struct pids_cgroup *pids = css_pids(css);
+ int64_t limit;
+ int err;
+
+ buf = strstrip(buf);
+ if (!strcmp(buf, PIDS_MAX_STR)) {
+ limit = PIDS_MAX;
+ goto set_limit;
+ }
+
+ err = kstrtoll(buf, 0, &limit);
+ if (err)
+ return err;
+
+ if (limit < 0 || limit >= PIDS_MAX)
+ return -EINVAL;
+
+set_limit:
+ /*
+ * Limit updates don't need to be mutex'd, since it isn't
+ * critical that any racing fork()s follow the new limit.
+ */
+ pids->limit = limit;
+ return nbytes;
+}
+
+static int pids_max_show(struct seq_file *sf, void *v)
+{
+ struct cgroup_subsys_state *css = seq_css(sf);
+ struct pids_cgroup *pids = css_pids(css);
+ int64_t limit = pids->limit;
+
+ if (limit >= PIDS_MAX)
+ seq_printf(sf, "%s\n", PIDS_MAX_STR);
+ else
+ seq_printf(sf, "%lld\n", limit);
+
+ return 0;
+}
+
+static s64 pids_current_read(struct cgroup_subsys_state *css,
+ struct cftype *cft)
+{
+ struct pids_cgroup *pids = css_pids(css);
+
+ return atomic64_read(&pids->counter);
+}
+
+static struct cftype pids_files[] = {
+ {
+ .name = "max",
+ .write = pids_max_write,
+ .seq_show = pids_max_show,
+ .flags = CFTYPE_NOT_ON_ROOT,
+ },
+ {
+ .name = "current",
+ .read_s64 = pids_current_read,
+ },
+ { } /* terminate */
+};
+
+struct cgroup_subsys pids_cgrp_subsys = {
+ .css_alloc = pids_css_alloc,
+ .css_free = pids_css_free,
+ .can_attach = pids_can_attach,
+ .cancel_attach = pids_cancel_attach,
+ .can_fork = pids_can_fork,
+ .cancel_fork = pids_cancel_fork,
+ .fork = pids_fork,
+ .exit = pids_exit,
+ .legacy_cftypes = pids_files,
+ .dfl_cftypes = pids_files,
+};
diff --git a/kernel/compat.c b/kernel/compat.c
index 24f00610c575..333d364be29d 100644
--- a/kernel/compat.c
+++ b/kernel/compat.c
@@ -912,7 +912,8 @@ long compat_get_bitmap(unsigned long *mask, const compat_ulong_t __user *umask,
* bitmap. We must however ensure the end of the
* kernel bitmap is zeroed.
*/
- if (nr_compat_longs-- > 0) {
+ if (nr_compat_longs) {
+ nr_compat_longs--;
if (__get_user(um, umask))
return -EFAULT;
} else {
@@ -954,7 +955,8 @@ long compat_put_bitmap(compat_ulong_t __user *umask, unsigned long *mask,
* We dont want to write past the end of the userspace
* bitmap.
*/
- if (nr_compat_longs-- > 0) {
+ if (nr_compat_longs) {
+ nr_compat_longs--;
if (__put_user(um, umask))
return -EFAULT;
}
diff --git a/kernel/configs/xen.config b/kernel/configs/xen.config
new file mode 100644
index 000000000000..ff756221f112
--- /dev/null
+++ b/kernel/configs/xen.config
@@ -0,0 +1,48 @@
+# global stuff - these enable us to allow some
+# of the not so generic stuff below for xen
+CONFIG_PARAVIRT=y
+CONFIG_NET=y
+CONFIG_NET_CORE=y
+CONFIG_NETDEVICES=y
+CONFIG_BLOCK=y
+CONFIG_WATCHDOG=y
+CONFIG_TARGET_CORE=y
+CONFIG_SCSI=y
+CONFIG_FB=y
+CONFIG_INPUT_MISC=y
+CONFIG_MEMORY_HOTPLUG=y
+CONFIG_TTY=y
+# Technically not required but otherwise produces
+# pretty useless systems starting from allnoconfig
+# You want TCP/IP and ELF binaries right?
+CONFIG_INET=y
+CONFIG_BINFMT_ELF=y
+# generic config
+CONFIG_XEN=y
+CONFIG_XEN_DOM0=y
+# backend drivers
+CONFIG_XEN_BACKEND=y
+CONFIG_XEN_BLKDEV_BACKEND=m
+CONFIG_XEN_NETDEV_BACKEND=m
+CONFIG_HVC_XEN=y
+CONFIG_XEN_WDT=m
+CONFIG_XEN_SCSI_BACKEND=m
+# frontend drivers
+CONFIG_XEN_FBDEV_FRONTEND=m
+CONFIG_HVC_XEN_FRONTEND=y
+CONFIG_INPUT_XEN_KBDDEV_FRONTEND=m
+CONFIG_XEN_SCSI_FRONTEND=m
+# others
+CONFIG_XEN_BALLOON=y
+CONFIG_XEN_SCRUB_PAGES=y
+CONFIG_XEN_DEV_EVTCHN=m
+CONFIG_XEN_BLKDEV_FRONTEND=m
+CONFIG_XEN_NETDEV_FRONTEND=m
+CONFIG_XENFS=m
+CONFIG_XEN_COMPAT_XENFS=y
+CONFIG_XEN_SYS_HYPERVISOR=y
+CONFIG_XEN_XENBUS_FRONTEND=y
+CONFIG_XEN_GNTDEV=m
+CONFIG_XEN_GRANT_DEV_ALLOC=m
+CONFIG_SWIOTLB_XEN=y
+CONFIG_XEN_PRIVCMD=m
diff --git a/kernel/context_tracking.c b/kernel/context_tracking.c
index 937ecdfdf258..0a495ab35bc7 100644
--- a/kernel/context_tracking.c
+++ b/kernel/context_tracking.c
@@ -30,24 +30,35 @@ EXPORT_SYMBOL_GPL(context_tracking_enabled);
DEFINE_PER_CPU(struct context_tracking, context_tracking);
EXPORT_SYMBOL_GPL(context_tracking);
-void context_tracking_cpu_set(int cpu)
+static bool context_tracking_recursion_enter(void)
{
- if (!per_cpu(context_tracking.active, cpu)) {
- per_cpu(context_tracking.active, cpu) = true;
- static_key_slow_inc(&context_tracking_enabled);
- }
+ int recursion;
+
+ recursion = __this_cpu_inc_return(context_tracking.recursion);
+ if (recursion == 1)
+ return true;
+
+ WARN_ONCE((recursion < 1), "Invalid context tracking recursion value %d\n", recursion);
+ __this_cpu_dec(context_tracking.recursion);
+
+ return false;
+}
+
+static void context_tracking_recursion_exit(void)
+{
+ __this_cpu_dec(context_tracking.recursion);
}
/**
- * context_tracking_user_enter - Inform the context tracking that the CPU is going to
- * enter userspace mode.
+ * context_tracking_enter - Inform the context tracking that the CPU is going
+ * enter user or guest space mode.
*
* This function must be called right before we switch from the kernel
- * to userspace, when it's guaranteed the remaining kernel instructions
- * to execute won't use any RCU read side critical section because this
- * function sets RCU in extended quiescent state.
+ * to user or guest space, when it's guaranteed the remaining kernel
+ * instructions to execute won't use any RCU read side critical section
+ * because this function sets RCU in extended quiescent state.
*/
-void context_tracking_user_enter(void)
+void context_tracking_enter(enum ctx_state state)
{
unsigned long flags;
@@ -75,9 +86,11 @@ void context_tracking_user_enter(void)
WARN_ON_ONCE(!current->mm);
local_irq_save(flags);
- if ( __this_cpu_read(context_tracking.state) != IN_USER) {
+ if (!context_tracking_recursion_enter())
+ goto out_irq_restore;
+
+ if ( __this_cpu_read(context_tracking.state) != state) {
if (__this_cpu_read(context_tracking.active)) {
- trace_user_enter(0);
/*
* At this stage, only low level arch entry code remains and
* then we'll run in userspace. We can assume there won't be
@@ -85,7 +98,10 @@ void context_tracking_user_enter(void)
* user_exit() or rcu_irq_enter(). Let's remove RCU's dependency
* on the tick.
*/
- vtime_user_enter(current);
+ if (state == CONTEXT_USER) {
+ trace_user_enter(0);
+ vtime_user_enter(current);
+ }
rcu_user_enter();
}
/*
@@ -101,24 +117,34 @@ void context_tracking_user_enter(void)
* OTOH we can spare the calls to vtime and RCU when context_tracking.active
* is false because we know that CPU is not tickless.
*/
- __this_cpu_write(context_tracking.state, IN_USER);
+ __this_cpu_write(context_tracking.state, state);
}
+ context_tracking_recursion_exit();
+out_irq_restore:
local_irq_restore(flags);
}
+NOKPROBE_SYMBOL(context_tracking_enter);
+EXPORT_SYMBOL_GPL(context_tracking_enter);
+
+void context_tracking_user_enter(void)
+{
+ context_tracking_enter(CONTEXT_USER);
+}
NOKPROBE_SYMBOL(context_tracking_user_enter);
/**
- * context_tracking_user_exit - Inform the context tracking that the CPU is
- * exiting userspace mode and entering the kernel.
+ * context_tracking_exit - Inform the context tracking that the CPU is
+ * exiting user or guest mode and entering the kernel.
*
- * This function must be called after we entered the kernel from userspace
- * before any use of RCU read side critical section. This potentially include
- * any high level kernel code like syscalls, exceptions, signal handling, etc...
+ * This function must be called after we entered the kernel from user or
+ * guest space before any use of RCU read side critical section. This
+ * potentially include any high level kernel code like syscalls, exceptions,
+ * signal handling, etc...
*
* This call supports re-entrancy. This way it can be called from any exception
* handler without needing to know if we came from userspace or not.
*/
-void context_tracking_user_exit(void)
+void context_tracking_exit(enum ctx_state state)
{
unsigned long flags;
@@ -129,40 +155,56 @@ void context_tracking_user_exit(void)
return;
local_irq_save(flags);
- if (__this_cpu_read(context_tracking.state) == IN_USER) {
+ if (!context_tracking_recursion_enter())
+ goto out_irq_restore;
+
+ if (__this_cpu_read(context_tracking.state) == state) {
if (__this_cpu_read(context_tracking.active)) {
/*
* We are going to run code that may use RCU. Inform
* RCU core about that (ie: we may need the tick again).
*/
rcu_user_exit();
- vtime_user_exit(current);
- trace_user_exit(0);
+ if (state == CONTEXT_USER) {
+ vtime_user_exit(current);
+ trace_user_exit(0);
+ }
}
- __this_cpu_write(context_tracking.state, IN_KERNEL);
+ __this_cpu_write(context_tracking.state, CONTEXT_KERNEL);
}
+ context_tracking_recursion_exit();
+out_irq_restore:
local_irq_restore(flags);
}
+NOKPROBE_SYMBOL(context_tracking_exit);
+EXPORT_SYMBOL_GPL(context_tracking_exit);
+
+void context_tracking_user_exit(void)
+{
+ context_tracking_exit(CONTEXT_USER);
+}
NOKPROBE_SYMBOL(context_tracking_user_exit);
-/**
- * __context_tracking_task_switch - context switch the syscall callbacks
- * @prev: the task that is being switched out
- * @next: the task that is being switched in
- *
- * The context tracking uses the syscall slow path to implement its user-kernel
- * boundaries probes on syscalls. This way it doesn't impact the syscall fast
- * path on CPUs that don't do context tracking.
- *
- * But we need to clear the flag on the previous task because it may later
- * migrate to some CPU that doesn't do the context tracking. As such the TIF
- * flag may not be desired there.
- */
-void __context_tracking_task_switch(struct task_struct *prev,
- struct task_struct *next)
+void __init context_tracking_cpu_set(int cpu)
{
- clear_tsk_thread_flag(prev, TIF_NOHZ);
- set_tsk_thread_flag(next, TIF_NOHZ);
+ static __initdata bool initialized = false;
+
+ if (!per_cpu(context_tracking.active, cpu)) {
+ per_cpu(context_tracking.active, cpu) = true;
+ static_key_slow_inc(&context_tracking_enabled);
+ }
+
+ if (initialized)
+ return;
+
+ /*
+ * Set TIF_NOHZ to init/0 and let it propagate to all tasks through fork
+ * This assumes that init is the only task at this early boot stage.
+ */
+ set_tsk_thread_flag(&init_task, TIF_NOHZ);
+ WARN_ON_ONCE(!tasklist_empty());
+
+ initialized = true;
}
#ifdef CONFIG_CONTEXT_TRACKING_FORCE
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 1972b161c61e..82cf9dff4295 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -20,6 +20,8 @@
#include <linux/gfp.h>
#include <linux/suspend.h>
#include <linux/lockdep.h>
+#include <linux/tick.h>
+#include <linux/irq.h>
#include <trace/events/power.h>
#include "smpboot.h"
@@ -189,21 +191,22 @@ void cpu_hotplug_done(void)
void cpu_hotplug_disable(void)
{
cpu_maps_update_begin();
- cpu_hotplug_disabled = 1;
+ cpu_hotplug_disabled++;
cpu_maps_update_done();
}
+EXPORT_SYMBOL_GPL(cpu_hotplug_disable);
void cpu_hotplug_enable(void)
{
cpu_maps_update_begin();
- cpu_hotplug_disabled = 0;
+ WARN_ON(--cpu_hotplug_disabled < 0);
cpu_maps_update_done();
}
-
+EXPORT_SYMBOL_GPL(cpu_hotplug_enable);
#endif /* CONFIG_HOTPLUG_CPU */
/* Need to know about CPUs going up/down? */
-int __ref register_cpu_notifier(struct notifier_block *nb)
+int register_cpu_notifier(struct notifier_block *nb)
{
int ret;
cpu_maps_update_begin();
@@ -212,7 +215,7 @@ int __ref register_cpu_notifier(struct notifier_block *nb)
return ret;
}
-int __ref __register_cpu_notifier(struct notifier_block *nb)
+int __register_cpu_notifier(struct notifier_block *nb)
{
return raw_notifier_chain_register(&cpu_chain, nb);
}
@@ -242,7 +245,7 @@ static void cpu_notify_nofail(unsigned long val, void *v)
EXPORT_SYMBOL(register_cpu_notifier);
EXPORT_SYMBOL(__register_cpu_notifier);
-void __ref unregister_cpu_notifier(struct notifier_block *nb)
+void unregister_cpu_notifier(struct notifier_block *nb)
{
cpu_maps_update_begin();
raw_notifier_chain_unregister(&cpu_chain, nb);
@@ -250,7 +253,7 @@ void __ref unregister_cpu_notifier(struct notifier_block *nb)
}
EXPORT_SYMBOL(unregister_cpu_notifier);
-void __ref __unregister_cpu_notifier(struct notifier_block *nb)
+void __unregister_cpu_notifier(struct notifier_block *nb)
{
raw_notifier_chain_unregister(&cpu_chain, nb);
}
@@ -327,7 +330,7 @@ struct take_cpu_down_param {
};
/* Take this CPU down. */
-static int __ref take_cpu_down(void *_param)
+static int take_cpu_down(void *_param)
{
struct take_cpu_down_param *param = _param;
int err;
@@ -338,13 +341,15 @@ static int __ref take_cpu_down(void *_param)
return err;
cpu_notify(CPU_DYING | param->mod, param->hcpu);
+ /* Give up timekeeping duties */
+ tick_handover_do_timer();
/* Park the stopper thread */
kthread_park(current);
return 0;
}
/* Requires cpu_add_remove_lock to be held */
-static int __ref _cpu_down(unsigned int cpu, int tasks_frozen)
+static int _cpu_down(unsigned int cpu, int tasks_frozen)
{
int err, nr_calls = 0;
void *hcpu = (void *)(long)cpu;
@@ -377,26 +382,31 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen)
* will observe it.
*
* For CONFIG_PREEMPT we have preemptible RCU and its sync_rcu() might
- * not imply sync_sched(), so explicitly call both.
+ * not imply sync_sched(), so wait for both.
*
* Do sync before park smpboot threads to take care the rcu boost case.
*/
-#ifdef CONFIG_PREEMPT
- synchronize_sched();
-#endif
- synchronize_rcu();
+ if (IS_ENABLED(CONFIG_PREEMPT))
+ synchronize_rcu_mult(call_rcu, call_rcu_sched);
+ else
+ synchronize_rcu();
smpboot_park_threads(cpu);
/*
- * So now all preempt/rcu users must observe !cpu_active().
+ * Prevent irq alloc/free while the dying cpu reorganizes the
+ * interrupt affinities.
*/
+ irq_lock_sparse();
- err = __stop_machine(take_cpu_down, &tcd_param, cpumask_of(cpu));
+ /*
+ * So now all preempt/rcu users must observe !cpu_active().
+ */
+ err = stop_machine(take_cpu_down, &tcd_param, cpumask_of(cpu));
if (err) {
/* CPU didn't die: tell everyone. Can't complain. */
- smpboot_unpark_threads(cpu);
cpu_notify_nofail(CPU_DOWN_FAILED | mod, hcpu);
+ irq_unlock_sparse();
goto out_release;
}
BUG_ON(cpu_online(cpu));
@@ -408,13 +418,20 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen)
*
* Wait for the stop thread to go away.
*/
- while (!idle_cpu(cpu))
+ while (!per_cpu(cpu_dead_idle, cpu))
cpu_relax();
+ smp_mb(); /* Read from cpu_dead_idle before __cpu_die(). */
+ per_cpu(cpu_dead_idle, cpu) = false;
+ /* Interrupts are moved away from the dying cpu, reenable alloc/free */
+ irq_unlock_sparse();
+
+ hotplug_cpu__broadcast_tick_pull(cpu);
/* This actually kills the CPU. */
__cpu_die(cpu);
/* CPU is completely dead: tell everyone. Too late to complain. */
+ tick_cleanup_dead_cpu(cpu);
cpu_notify_nofail(CPU_DEAD | mod, hcpu);
check_for_tasks(cpu);
@@ -426,7 +443,7 @@ out_release:
return err;
}
-int __ref cpu_down(unsigned int cpu)
+int cpu_down(unsigned int cpu)
{
int err;
@@ -446,6 +463,38 @@ out:
EXPORT_SYMBOL(cpu_down);
#endif /*CONFIG_HOTPLUG_CPU*/
+/*
+ * Unpark per-CPU smpboot kthreads at CPU-online time.
+ */
+static int smpboot_thread_call(struct notifier_block *nfb,
+ unsigned long action, void *hcpu)
+{
+ int cpu = (long)hcpu;
+
+ switch (action & ~CPU_TASKS_FROZEN) {
+
+ case CPU_DOWN_FAILED:
+ case CPU_ONLINE:
+ smpboot_unpark_threads(cpu);
+ break;
+
+ default:
+ break;
+ }
+
+ return NOTIFY_OK;
+}
+
+static struct notifier_block smpboot_thread_notifier = {
+ .notifier_call = smpboot_thread_call,
+ .priority = CPU_PRI_SMPBOOT,
+};
+
+void smpboot_thread_init(void)
+{
+ register_cpu_notifier(&smpboot_thread_notifier);
+}
+
/* Requires cpu_add_remove_lock to be held */
static int _cpu_up(unsigned int cpu, int tasks_frozen)
{
@@ -481,13 +530,11 @@ static int _cpu_up(unsigned int cpu, int tasks_frozen)
/* Arch-specific enabling code. */
ret = __cpu_up(cpu, idle);
+
if (ret != 0)
goto out_notify;
BUG_ON(!cpu_online(cpu));
- /* Wake the per cpu threads */
- smpboot_unpark_threads(cpu);
-
/* Now call notifier in preparation. */
cpu_notify(CPU_ONLINE | mod, hcpu);
@@ -562,13 +609,18 @@ int disable_nonboot_cpus(void)
}
}
- if (!error) {
+ if (!error)
BUG_ON(num_online_cpus() > 1);
- /* Make sure the CPUs won't be enabled by someone else */
- cpu_hotplug_disabled = 1;
- } else {
+ else
pr_err("Non-boot CPUs are not disabled\n");
- }
+
+ /*
+ * Make sure the CPUs won't be enabled by someone else. We need to do
+ * this even in case of failure as all disable_nonboot_cpus() users are
+ * supposed to do enable_nonboot_cpus() on the failure path.
+ */
+ cpu_hotplug_disabled++;
+
cpu_maps_update_done();
return error;
}
@@ -581,13 +633,13 @@ void __weak arch_enable_nonboot_cpus_end(void)
{
}
-void __ref enable_nonboot_cpus(void)
+void enable_nonboot_cpus(void)
{
int cpu, error;
/* Allow everyone to use the CPU hotplug again */
cpu_maps_update_begin();
- cpu_hotplug_disabled = 0;
+ WARN_ON(--cpu_hotplug_disabled < 0);
if (cpumask_empty(frozen_cpus))
goto out;
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index fc7f4748d34a..f0acff0f66c9 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -622,6 +622,7 @@ static int generate_sched_domains(cpumask_var_t **domains,
int csn; /* how many cpuset ptrs in csa so far */
int i, j, k; /* indices for partition finding loops */
cpumask_var_t *doms; /* resulting partition; i.e. sched domains */
+ cpumask_var_t non_isolated_cpus; /* load balanced CPUs */
struct sched_domain_attr *dattr; /* attributes for custom domains */
int ndoms = 0; /* number of sched domains in result */
int nslot; /* next empty doms[] struct cpumask slot */
@@ -631,6 +632,10 @@ static int generate_sched_domains(cpumask_var_t **domains,
dattr = NULL;
csa = NULL;
+ if (!alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL))
+ goto done;
+ cpumask_andnot(non_isolated_cpus, cpu_possible_mask, cpu_isolated_map);
+
/* Special case for the 99% of systems with one, full, sched domain */
if (is_sched_load_balance(&top_cpuset)) {
ndoms = 1;
@@ -643,7 +648,8 @@ static int generate_sched_domains(cpumask_var_t **domains,
*dattr = SD_ATTR_INIT;
update_domain_attr_tree(dattr, &top_cpuset);
}
- cpumask_copy(doms[0], top_cpuset.effective_cpus);
+ cpumask_and(doms[0], top_cpuset.effective_cpus,
+ non_isolated_cpus);
goto done;
}
@@ -666,7 +672,8 @@ static int generate_sched_domains(cpumask_var_t **domains,
* the corresponding sched domain.
*/
if (!cpumask_empty(cp->cpus_allowed) &&
- !is_sched_load_balance(cp))
+ !(is_sched_load_balance(cp) &&
+ cpumask_intersects(cp->cpus_allowed, non_isolated_cpus)))
continue;
if (is_sched_load_balance(cp))
@@ -748,6 +755,7 @@ restart:
if (apn == b->pn) {
cpumask_or(dp, dp, b->effective_cpus);
+ cpumask_and(dp, dp, non_isolated_cpus);
if (dattr)
update_domain_attr_tree(dattr + nslot, b);
@@ -760,6 +768,7 @@ restart:
BUG_ON(nslot != ndoms);
done:
+ free_cpumask_var(non_isolated_cpus);
kfree(csa);
/*
@@ -1214,7 +1223,7 @@ static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs,
spin_unlock_irq(&callback_lock);
/* use trialcs->mems_allowed as a temp variable */
- update_nodemasks_hier(cs, &cs->mems_allowed);
+ update_nodemasks_hier(cs, &trialcs->mems_allowed);
done:
return retval;
}
@@ -2444,20 +2453,12 @@ static struct cpuset *nearest_hardwall_ancestor(struct cpuset *cs)
* @node: is this an allowed node?
* @gfp_mask: memory allocation flags
*
- * If we're in interrupt, yes, we can always allocate. If __GFP_THISNODE is
- * set, yes, we can always allocate. If node is in our task's mems_allowed,
- * yes. If it's not a __GFP_HARDWALL request and this node is in the nearest
- * hardwalled cpuset ancestor to this task's cpuset, yes. If the task has been
- * OOM killed and has access to memory reserves as specified by the TIF_MEMDIE
- * flag, yes.
+ * If we're in interrupt, yes, we can always allocate. If @node is set in
+ * current's mems_allowed, yes. If it's not a __GFP_HARDWALL request and this
+ * node is set in the nearest hardwalled cpuset ancestor to current's cpuset,
+ * yes. If current has access to memory reserves due to TIF_MEMDIE, yes.
* Otherwise, no.
*
- * The __GFP_THISNODE placement logic is really handled elsewhere,
- * by forcibly using a zonelist starting at a specified node, and by
- * (in get_page_from_freelist()) refusing to consider the zones for
- * any node on the zonelist except the first. By the time any such
- * calls get to this routine, we should just shut up and say 'yes'.
- *
* GFP_USER allocations are marked with the __GFP_HARDWALL bit,
* and do not allow allocations outside the current tasks cpuset
* unless the task has been OOM killed as is marked TIF_MEMDIE.
@@ -2493,7 +2494,7 @@ int __cpuset_node_allowed(int node, gfp_t gfp_mask)
int allowed; /* is allocation in zone z allowed? */
unsigned long flags;
- if (in_interrupt() || (gfp_mask & __GFP_THISNODE))
+ if (in_interrupt())
return 1;
if (node_isset(node, current->mems_allowed))
return 1;
diff --git a/kernel/cred.c b/kernel/cred.c
index e0573a43c7df..ec1c07667ec1 100644
--- a/kernel/cred.c
+++ b/kernel/cred.c
@@ -29,6 +29,9 @@
static struct kmem_cache *cred_jar;
+/* init to 2 - one for init_task, one to ensure it is never freed */
+struct group_info init_groups = { .usage = ATOMIC_INIT(2) };
+
/*
* The initial credentials for the initial task
*/
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 2fabc0627165..e8183895691c 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -34,14 +34,16 @@
#include <linux/syscalls.h>
#include <linux/anon_inodes.h>
#include <linux/kernel_stat.h>
+#include <linux/cgroup.h>
#include <linux/perf_event.h>
-#include <linux/ftrace_event.h>
+#include <linux/trace_events.h>
#include <linux/hw_breakpoint.h>
#include <linux/mm_types.h>
-#include <linux/cgroup.h>
#include <linux/module.h>
#include <linux/mman.h>
#include <linux/compat.h>
+#include <linux/bpf.h>
+#include <linux/filter.h>
#include "internal.h"
@@ -49,9 +51,11 @@
static struct workqueue_struct *perf_wq;
+typedef int (*remote_function_f)(void *);
+
struct remote_function_call {
struct task_struct *p;
- int (*func)(void *info);
+ remote_function_f func;
void *info;
int ret;
};
@@ -84,7 +88,7 @@ static void remote_function(void *data)
* -EAGAIN - when the process moved away
*/
static int
-task_function_call(struct task_struct *p, int (*func) (void *info), void *info)
+task_function_call(struct task_struct *p, remote_function_f func, void *info)
{
struct remote_function_call data = {
.p = p,
@@ -108,7 +112,7 @@ task_function_call(struct task_struct *p, int (*func) (void *info), void *info)
*
* returns: @func return value or -ENXIO when the cpu is offline
*/
-static int cpu_function_call(int cpu, int (*func) (void *info), void *info)
+static int cpu_function_call(int cpu, remote_function_f func, void *info)
{
struct remote_function_call data = {
.p = NULL,
@@ -153,12 +157,13 @@ enum event_type_t {
*/
struct static_key_deferred perf_sched_events __read_mostly;
static DEFINE_PER_CPU(atomic_t, perf_cgroup_events);
-static DEFINE_PER_CPU(atomic_t, perf_branch_stack_events);
+static DEFINE_PER_CPU(int, perf_sched_cb_usages);
static atomic_t nr_mmap_events __read_mostly;
static atomic_t nr_comm_events __read_mostly;
static atomic_t nr_task_events __read_mostly;
static atomic_t nr_freq_events __read_mostly;
+static atomic_t nr_switch_events __read_mostly;
static LIST_HEAD(pmus);
static DEFINE_MUTEX(pmus_lock);
@@ -327,6 +332,11 @@ static inline u64 perf_clock(void)
return local_clock();
}
+static inline u64 perf_event_clock(struct perf_event *event)
+{
+ return event->clock();
+}
+
static inline struct perf_cpu_context *
__get_cpu_context(struct perf_event_context *ctx)
{
@@ -351,32 +361,6 @@ static void perf_ctx_unlock(struct perf_cpu_context *cpuctx,
#ifdef CONFIG_CGROUP_PERF
-/*
- * perf_cgroup_info keeps track of time_enabled for a cgroup.
- * This is a per-cpu dynamically allocated data structure.
- */
-struct perf_cgroup_info {
- u64 time;
- u64 timestamp;
-};
-
-struct perf_cgroup {
- struct cgroup_subsys_state css;
- struct perf_cgroup_info __percpu *info;
-};
-
-/*
- * Must ensure cgroup is pinned (css_get) before calling
- * this function. In other words, we cannot call this function
- * if there is no cgroup event for the current CPU context.
- */
-static inline struct perf_cgroup *
-perf_cgroup_from_task(struct task_struct *task)
-{
- return container_of(task_css(task, perf_event_cgrp_id),
- struct perf_cgroup, css);
-}
-
static inline bool
perf_cgroup_match(struct perf_event *event)
{
@@ -766,62 +750,31 @@ perf_cgroup_mark_enabled(struct perf_event *event,
/*
* function must be called with interrupts disbled
*/
-static enum hrtimer_restart perf_cpu_hrtimer_handler(struct hrtimer *hr)
+static enum hrtimer_restart perf_mux_hrtimer_handler(struct hrtimer *hr)
{
struct perf_cpu_context *cpuctx;
- enum hrtimer_restart ret = HRTIMER_NORESTART;
int rotations = 0;
WARN_ON(!irqs_disabled());
cpuctx = container_of(hr, struct perf_cpu_context, hrtimer);
-
rotations = perf_rotate_context(cpuctx);
- /*
- * arm timer if needed
- */
- if (rotations) {
+ raw_spin_lock(&cpuctx->hrtimer_lock);
+ if (rotations)
hrtimer_forward_now(hr, cpuctx->hrtimer_interval);
- ret = HRTIMER_RESTART;
- }
-
- return ret;
-}
-
-/* CPU is going down */
-void perf_cpu_hrtimer_cancel(int cpu)
-{
- struct perf_cpu_context *cpuctx;
- struct pmu *pmu;
- unsigned long flags;
-
- if (WARN_ON(cpu != smp_processor_id()))
- return;
-
- local_irq_save(flags);
-
- rcu_read_lock();
-
- list_for_each_entry_rcu(pmu, &pmus, entry) {
- cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
-
- if (pmu->task_ctx_nr == perf_sw_context)
- continue;
-
- hrtimer_cancel(&cpuctx->hrtimer);
- }
-
- rcu_read_unlock();
+ else
+ cpuctx->hrtimer_active = 0;
+ raw_spin_unlock(&cpuctx->hrtimer_lock);
- local_irq_restore(flags);
+ return rotations ? HRTIMER_RESTART : HRTIMER_NORESTART;
}
-static void __perf_cpu_hrtimer_init(struct perf_cpu_context *cpuctx, int cpu)
+static void __perf_mux_hrtimer_init(struct perf_cpu_context *cpuctx, int cpu)
{
- struct hrtimer *hr = &cpuctx->hrtimer;
+ struct hrtimer *timer = &cpuctx->hrtimer;
struct pmu *pmu = cpuctx->ctx.pmu;
- int timer;
+ u64 interval;
/* no multiplexing needed for SW PMU */
if (pmu->task_ctx_nr == perf_sw_context)
@@ -831,31 +784,36 @@ static void __perf_cpu_hrtimer_init(struct perf_cpu_context *cpuctx, int cpu)
* check default is sane, if not set then force to
* default interval (1/tick)
*/
- timer = pmu->hrtimer_interval_ms;
- if (timer < 1)
- timer = pmu->hrtimer_interval_ms = PERF_CPU_HRTIMER;
+ interval = pmu->hrtimer_interval_ms;
+ if (interval < 1)
+ interval = pmu->hrtimer_interval_ms = PERF_CPU_HRTIMER;
- cpuctx->hrtimer_interval = ns_to_ktime(NSEC_PER_MSEC * timer);
+ cpuctx->hrtimer_interval = ns_to_ktime(NSEC_PER_MSEC * interval);
- hrtimer_init(hr, CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED);
- hr->function = perf_cpu_hrtimer_handler;
+ raw_spin_lock_init(&cpuctx->hrtimer_lock);
+ hrtimer_init(timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED);
+ timer->function = perf_mux_hrtimer_handler;
}
-static void perf_cpu_hrtimer_restart(struct perf_cpu_context *cpuctx)
+static int perf_mux_hrtimer_restart(struct perf_cpu_context *cpuctx)
{
- struct hrtimer *hr = &cpuctx->hrtimer;
+ struct hrtimer *timer = &cpuctx->hrtimer;
struct pmu *pmu = cpuctx->ctx.pmu;
+ unsigned long flags;
/* not for SW PMU */
if (pmu->task_ctx_nr == perf_sw_context)
- return;
+ return 0;
- if (hrtimer_active(hr))
- return;
+ raw_spin_lock_irqsave(&cpuctx->hrtimer_lock, flags);
+ if (!cpuctx->hrtimer_active) {
+ cpuctx->hrtimer_active = 1;
+ hrtimer_forward_now(timer, cpuctx->hrtimer_interval);
+ hrtimer_start_expires(timer, HRTIMER_MODE_ABS_PINNED);
+ }
+ raw_spin_unlock_irqrestore(&cpuctx->hrtimer_lock, flags);
- if (!hrtimer_callback_running(hr))
- __hrtimer_start_range_ns(hr, cpuctx->hrtimer_interval,
- 0, HRTIMER_MODE_REL_PINNED, 0);
+ return 0;
}
void perf_pmu_disable(struct pmu *pmu)
@@ -905,6 +863,15 @@ static void get_ctx(struct perf_event_context *ctx)
WARN_ON(!atomic_inc_not_zero(&ctx->refcount));
}
+static void free_ctx(struct rcu_head *head)
+{
+ struct perf_event_context *ctx;
+
+ ctx = container_of(head, struct perf_event_context, rcu_head);
+ kfree(ctx->task_ctx_data);
+ kfree(ctx);
+}
+
static void put_ctx(struct perf_event_context *ctx)
{
if (atomic_dec_and_test(&ctx->refcount)) {
@@ -912,7 +879,7 @@ static void put_ctx(struct perf_event_context *ctx)
put_ctx(ctx->parent_ctx);
if (ctx->task)
put_task_struct(ctx->task);
- kfree_rcu(ctx, rcu_head);
+ call_rcu(&ctx->rcu_head, free_ctx);
}
}
@@ -923,10 +890,30 @@ static void put_ctx(struct perf_event_context *ctx)
* Those places that change perf_event::ctx will hold both
* perf_event_ctx::mutex of the 'old' and 'new' ctx value.
*
- * Lock ordering is by mutex address. There is one other site where
- * perf_event_context::mutex nests and that is put_event(). But remember that
- * that is a parent<->child context relation, and migration does not affect
- * children, therefore these two orderings should not interact.
+ * Lock ordering is by mutex address. There are two other sites where
+ * perf_event_context::mutex nests and those are:
+ *
+ * - perf_event_exit_task_context() [ child , 0 ]
+ * __perf_event_exit_task()
+ * sync_child_event()
+ * put_event() [ parent, 1 ]
+ *
+ * - perf_event_init_context() [ parent, 0 ]
+ * inherit_task_group()
+ * inherit_group()
+ * inherit_event()
+ * perf_event_alloc()
+ * perf_init_event()
+ * perf_try_init_event() [ child , 1 ]
+ *
+ * While it appears there is an obvious deadlock here -- the parent and child
+ * nesting levels are inverted between the two. This is in fact safe because
+ * life-time rules separate them. That is an exiting task cannot fork, and a
+ * spawning task cannot (yet) exit.
+ *
+ * But remember that that these are parent<->child context relations, and
+ * migration does not affect children, therefore these two orderings should not
+ * interact.
*
* The change in perf_event::ctx does not affect children (as claimed above)
* because the sys_perf_event_open() case will install a new event and break
@@ -1239,9 +1226,6 @@ list_add_event(struct perf_event *event, struct perf_event_context *ctx)
if (is_cgroup_event(event))
ctx->nr_cgroups++;
- if (has_branch_stack(event))
- ctx->nr_branch_stack++;
-
list_add_rcu(&event->event_entry, &ctx->event_list);
ctx->nr_events++;
if (event->attr.inherit_stat)
@@ -1408,9 +1392,6 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx)
cpuctx->cgrp = NULL;
}
- if (has_branch_stack(event))
- ctx->nr_branch_stack--;
-
ctx->nr_events--;
if (event->attr.inherit_stat)
ctx->nr_stat--;
@@ -1522,11 +1503,17 @@ static int __init perf_workqueue_init(void)
core_initcall(perf_workqueue_init);
+static inline int pmu_filter_match(struct perf_event *event)
+{
+ struct pmu *pmu = event->pmu;
+ return pmu->filter_match ? pmu->filter_match(event) : 1;
+}
+
static inline int
event_filter_match(struct perf_event *event)
{
return (event->cpu == -1 || event->cpu == smp_processor_id())
- && perf_cgroup_match(event);
+ && perf_cgroup_match(event) && pmu_filter_match(event);
}
static void
@@ -1847,6 +1834,7 @@ static void perf_set_shadow_time(struct perf_event *event,
#define MAX_INTERRUPTS (~0ULL)
static void perf_log_throttle(struct perf_event *event, int enable);
+static void perf_log_itrace_start(struct perf_event *event);
static int
event_sched_in(struct perf_event *event,
@@ -1881,6 +1869,10 @@ event_sched_in(struct perf_event *event,
perf_pmu_disable(event->pmu);
+ perf_set_shadow_time(event, ctx, tstamp);
+
+ perf_log_itrace_start(event);
+
if (event->pmu->add(event, PERF_EF_START)) {
event->state = PERF_EVENT_STATE_INACTIVE;
event->oncpu = -1;
@@ -1890,8 +1882,6 @@ event_sched_in(struct perf_event *event,
event->tstamp_running += tstamp - event->tstamp_stopped;
- perf_set_shadow_time(event, ctx, tstamp);
-
if (!is_software_event(event))
cpuctx->active_oncpu++;
if (!ctx->nr_active++)
@@ -1928,7 +1918,7 @@ group_sched_in(struct perf_event *group_event,
if (event_sched_in(group_event, cpuctx, ctx)) {
pmu->cancel_txn(pmu);
- perf_cpu_hrtimer_restart(cpuctx);
+ perf_mux_hrtimer_restart(cpuctx);
return -EAGAIN;
}
@@ -1975,7 +1965,7 @@ group_error:
pmu->cancel_txn(pmu);
- perf_cpu_hrtimer_restart(cpuctx);
+ perf_mux_hrtimer_restart(cpuctx);
return -EAGAIN;
}
@@ -2248,7 +2238,7 @@ static int __perf_event_enable(void *info)
*/
if (leader != event) {
group_sched_out(leader, cpuctx, ctx);
- perf_cpu_hrtimer_restart(cpuctx);
+ perf_mux_hrtimer_restart(cpuctx);
}
if (leader->attr.pinned) {
update_group_times(leader);
@@ -2559,6 +2549,9 @@ static void perf_event_context_sched_out(struct task_struct *task, int ctxn,
next->perf_event_ctxp[ctxn] = ctx;
ctx->task = next;
next_ctx->task = task;
+
+ swap(ctx->task_ctx_data, next_ctx->task_ctx_data);
+
do_switch = 0;
perf_event_sync_stat(ctx, next_ctx);
@@ -2577,6 +2570,59 @@ unlock:
}
}
+void perf_sched_cb_dec(struct pmu *pmu)
+{
+ this_cpu_dec(perf_sched_cb_usages);
+}
+
+void perf_sched_cb_inc(struct pmu *pmu)
+{
+ this_cpu_inc(perf_sched_cb_usages);
+}
+
+/*
+ * This function provides the context switch callback to the lower code
+ * layer. It is invoked ONLY when the context switch callback is enabled.
+ */
+static void perf_pmu_sched_task(struct task_struct *prev,
+ struct task_struct *next,
+ bool sched_in)
+{
+ struct perf_cpu_context *cpuctx;
+ struct pmu *pmu;
+ unsigned long flags;
+
+ if (prev == next)
+ return;
+
+ local_irq_save(flags);
+
+ rcu_read_lock();
+
+ list_for_each_entry_rcu(pmu, &pmus, entry) {
+ if (pmu->sched_task) {
+ cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
+
+ perf_ctx_lock(cpuctx, cpuctx->task_ctx);
+
+ perf_pmu_disable(pmu);
+
+ pmu->sched_task(cpuctx->task_ctx, sched_in);
+
+ perf_pmu_enable(pmu);
+
+ perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
+ }
+ }
+
+ rcu_read_unlock();
+
+ local_irq_restore(flags);
+}
+
+static void perf_event_switch(struct task_struct *task,
+ struct task_struct *next_prev, bool sched_in);
+
#define for_each_task_context_nr(ctxn) \
for ((ctxn) = 0; (ctxn) < perf_nr_task_contexts; (ctxn)++)
@@ -2596,6 +2642,12 @@ void __perf_event_task_sched_out(struct task_struct *task,
{
int ctxn;
+ if (__this_cpu_read(perf_sched_cb_usages))
+ perf_pmu_sched_task(task, next, false);
+
+ if (atomic_read(&nr_switch_events))
+ perf_event_switch(task, next, false);
+
for_each_task_context_nr(ctxn)
perf_event_context_sched_out(task, ctxn, next);
@@ -2755,64 +2807,6 @@ static void perf_event_context_sched_in(struct perf_event_context *ctx,
}
/*
- * When sampling the branck stack in system-wide, it may be necessary
- * to flush the stack on context switch. This happens when the branch
- * stack does not tag its entries with the pid of the current task.
- * Otherwise it becomes impossible to associate a branch entry with a
- * task. This ambiguity is more likely to appear when the branch stack
- * supports priv level filtering and the user sets it to monitor only
- * at the user level (which could be a useful measurement in system-wide
- * mode). In that case, the risk is high of having a branch stack with
- * branch from multiple tasks. Flushing may mean dropping the existing
- * entries or stashing them somewhere in the PMU specific code layer.
- *
- * This function provides the context switch callback to the lower code
- * layer. It is invoked ONLY when there is at least one system-wide context
- * with at least one active event using taken branch sampling.
- */
-static void perf_branch_stack_sched_in(struct task_struct *prev,
- struct task_struct *task)
-{
- struct perf_cpu_context *cpuctx;
- struct pmu *pmu;
- unsigned long flags;
-
- /* no need to flush branch stack if not changing task */
- if (prev == task)
- return;
-
- local_irq_save(flags);
-
- rcu_read_lock();
-
- list_for_each_entry_rcu(pmu, &pmus, entry) {
- cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
-
- /*
- * check if the context has at least one
- * event using PERF_SAMPLE_BRANCH_STACK
- */
- if (cpuctx->ctx.nr_branch_stack > 0
- && pmu->flush_branch_stack) {
-
- perf_ctx_lock(cpuctx, cpuctx->task_ctx);
-
- perf_pmu_disable(pmu);
-
- pmu->flush_branch_stack();
-
- perf_pmu_enable(pmu);
-
- perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
- }
- }
-
- rcu_read_unlock();
-
- local_irq_restore(flags);
-}
-
-/*
* Called from scheduler to add the events of the current task
* with interrupts disabled.
*
@@ -2844,9 +2838,11 @@ void __perf_event_task_sched_in(struct task_struct *prev,
if (atomic_read(this_cpu_ptr(&perf_cgroup_events)))
perf_cgroup_sched_in(prev, task);
- /* check for system-wide branch_stack events */
- if (atomic_read(this_cpu_ptr(&perf_branch_stack_events)))
- perf_branch_stack_sched_in(prev, task);
+ if (atomic_read(&nr_switch_events))
+ perf_event_switch(task, prev, true);
+
+ if (__this_cpu_read(perf_sched_cb_usages))
+ perf_pmu_sched_task(prev, task, true);
}
static u64 perf_calculate_period(struct perf_event *event, u64 nsec, u64 count)
@@ -3220,7 +3216,63 @@ static void __perf_event_read(void *info)
static inline u64 perf_event_count(struct perf_event *event)
{
- return local64_read(&event->count) + atomic64_read(&event->child_count);
+ if (event->pmu->count)
+ return event->pmu->count(event);
+
+ return __perf_event_count(event);
+}
+
+/*
+ * NMI-safe method to read a local event, that is an event that
+ * is:
+ * - either for the current task, or for this CPU
+ * - does not have inherit set, for inherited task events
+ * will not be local and we cannot read them atomically
+ * - must not have a pmu::count method
+ */
+u64 perf_event_read_local(struct perf_event *event)
+{
+ unsigned long flags;
+ u64 val;
+
+ /*
+ * Disabling interrupts avoids all counter scheduling (context
+ * switches, timer based rotation and IPIs).
+ */
+ local_irq_save(flags);
+
+ /* If this is a per-task event, it must be for current */
+ WARN_ON_ONCE((event->attach_state & PERF_ATTACH_TASK) &&
+ event->hw.target != current);
+
+ /* If this is a per-CPU event, it must be for this CPU */
+ WARN_ON_ONCE(!(event->attach_state & PERF_ATTACH_TASK) &&
+ event->cpu != smp_processor_id());
+
+ /*
+ * It must not be an event with inherit set, we cannot read
+ * all child counters from atomic context.
+ */
+ WARN_ON_ONCE(event->attr.inherit);
+
+ /*
+ * It must not have a pmu::count method, those are not
+ * NMI safe.
+ */
+ WARN_ON_ONCE(event->pmu->count);
+
+ /*
+ * If the event is currently on this CPU, its either a per-task event,
+ * or local to this CPU. Furthermore it means its ACTIVE (otherwise
+ * oncpu == -1).
+ */
+ if (event->oncpu == smp_processor_id())
+ event->pmu->read(event);
+
+ val = local64_read(&event->count);
+ local_irq_restore(flags);
+
+ return val;
}
static u64 perf_event_read(struct perf_event *event)
@@ -3321,12 +3373,15 @@ errout:
* Returns a matching context with refcount and pincount.
*/
static struct perf_event_context *
-find_get_context(struct pmu *pmu, struct task_struct *task, int cpu)
+find_get_context(struct pmu *pmu, struct task_struct *task,
+ struct perf_event *event)
{
struct perf_event_context *ctx, *clone_ctx = NULL;
struct perf_cpu_context *cpuctx;
+ void *task_ctx_data = NULL;
unsigned long flags;
int ctxn, err;
+ int cpu = event->cpu;
if (!task) {
/* Must be root to operate on a CPU event: */
@@ -3354,11 +3409,24 @@ find_get_context(struct pmu *pmu, struct task_struct *task, int cpu)
if (ctxn < 0)
goto errout;
+ if (event->attach_state & PERF_ATTACH_TASK_DATA) {
+ task_ctx_data = kzalloc(pmu->task_ctx_size, GFP_KERNEL);
+ if (!task_ctx_data) {
+ err = -ENOMEM;
+ goto errout;
+ }
+ }
+
retry:
ctx = perf_lock_task_context(task, ctxn, &flags);
if (ctx) {
clone_ctx = unclone_ctx(ctx);
++ctx->pin_count;
+
+ if (task_ctx_data && !ctx->task_ctx_data) {
+ ctx->task_ctx_data = task_ctx_data;
+ task_ctx_data = NULL;
+ }
raw_spin_unlock_irqrestore(&ctx->lock, flags);
if (clone_ctx)
@@ -3369,6 +3437,11 @@ retry:
if (!ctx)
goto errout;
+ if (task_ctx_data) {
+ ctx->task_ctx_data = task_ctx_data;
+ task_ctx_data = NULL;
+ }
+
err = 0;
mutex_lock(&task->perf_event_mutex);
/*
@@ -3395,13 +3468,16 @@ retry:
}
}
+ kfree(task_ctx_data);
return ctx;
errout:
+ kfree(task_ctx_data);
return ERR_PTR(err);
}
static void perf_event_free_filter(struct perf_event *event);
+static void perf_event_free_bpf_prog(struct perf_event *event);
static void free_event_rcu(struct rcu_head *head)
{
@@ -3414,7 +3490,6 @@ static void free_event_rcu(struct rcu_head *head)
kfree(event);
}
-static void ring_buffer_put(struct ring_buffer *rb);
static void ring_buffer_attach(struct perf_event *event,
struct ring_buffer *rb);
@@ -3423,10 +3498,6 @@ static void unaccount_event_cpu(struct perf_event *event, int cpu)
if (event->parent)
return;
- if (has_branch_stack(event)) {
- if (!(event->attach_state & PERF_ATTACH_TASK))
- atomic_dec(&per_cpu(perf_branch_stack_events, cpu));
- }
if (is_cgroup_event(event))
atomic_dec(&per_cpu(perf_cgroup_events, cpu));
}
@@ -3446,6 +3517,10 @@ static void unaccount_event(struct perf_event *event)
atomic_dec(&nr_task_events);
if (event->attr.freq)
atomic_dec(&nr_freq_events);
+ if (event->attr.context_switch) {
+ static_key_slow_dec_deferred(&perf_sched_events);
+ atomic_dec(&nr_switch_events);
+ }
if (is_cgroup_event(event))
static_key_slow_dec_deferred(&perf_sched_events);
if (has_branch_stack(event))
@@ -3454,6 +3529,91 @@ static void unaccount_event(struct perf_event *event)
unaccount_event_cpu(event, event->cpu);
}
+/*
+ * The following implement mutual exclusion of events on "exclusive" pmus
+ * (PERF_PMU_CAP_EXCLUSIVE). Such pmus can only have one event scheduled
+ * at a time, so we disallow creating events that might conflict, namely:
+ *
+ * 1) cpu-wide events in the presence of per-task events,
+ * 2) per-task events in the presence of cpu-wide events,
+ * 3) two matching events on the same context.
+ *
+ * The former two cases are handled in the allocation path (perf_event_alloc(),
+ * __free_event()), the latter -- before the first perf_install_in_context().
+ */
+static int exclusive_event_init(struct perf_event *event)
+{
+ struct pmu *pmu = event->pmu;
+
+ if (!(pmu->capabilities & PERF_PMU_CAP_EXCLUSIVE))
+ return 0;
+
+ /*
+ * Prevent co-existence of per-task and cpu-wide events on the
+ * same exclusive pmu.
+ *
+ * Negative pmu::exclusive_cnt means there are cpu-wide
+ * events on this "exclusive" pmu, positive means there are
+ * per-task events.
+ *
+ * Since this is called in perf_event_alloc() path, event::ctx
+ * doesn't exist yet; it is, however, safe to use PERF_ATTACH_TASK
+ * to mean "per-task event", because unlike other attach states it
+ * never gets cleared.
+ */
+ if (event->attach_state & PERF_ATTACH_TASK) {
+ if (!atomic_inc_unless_negative(&pmu->exclusive_cnt))
+ return -EBUSY;
+ } else {
+ if (!atomic_dec_unless_positive(&pmu->exclusive_cnt))
+ return -EBUSY;
+ }
+
+ return 0;
+}
+
+static void exclusive_event_destroy(struct perf_event *event)
+{
+ struct pmu *pmu = event->pmu;
+
+ if (!(pmu->capabilities & PERF_PMU_CAP_EXCLUSIVE))
+ return;
+
+ /* see comment in exclusive_event_init() */
+ if (event->attach_state & PERF_ATTACH_TASK)
+ atomic_dec(&pmu->exclusive_cnt);
+ else
+ atomic_inc(&pmu->exclusive_cnt);
+}
+
+static bool exclusive_event_match(struct perf_event *e1, struct perf_event *e2)
+{
+ if ((e1->pmu->capabilities & PERF_PMU_CAP_EXCLUSIVE) &&
+ (e1->cpu == e2->cpu ||
+ e1->cpu == -1 ||
+ e2->cpu == -1))
+ return true;
+ return false;
+}
+
+/* Called under the same ctx::mutex as perf_install_in_context() */
+static bool exclusive_event_installable(struct perf_event *event,
+ struct perf_event_context *ctx)
+{
+ struct perf_event *iter_event;
+ struct pmu *pmu = event->pmu;
+
+ if (!(pmu->capabilities & PERF_PMU_CAP_EXCLUSIVE))
+ return true;
+
+ list_for_each_entry(iter_event, &ctx->event_list, event_entry) {
+ if (exclusive_event_match(iter_event, event))
+ return false;
+ }
+
+ return true;
+}
+
static void __free_event(struct perf_event *event)
{
if (!event->parent) {
@@ -3461,14 +3621,18 @@ static void __free_event(struct perf_event *event)
put_callchain_buffers();
}
+ perf_event_free_bpf_prog(event);
+
if (event->destroy)
event->destroy(event);
if (event->ctx)
put_ctx(event->ctx);
- if (event->pmu)
+ if (event->pmu) {
+ exclusive_event_destroy(event);
module_put(event->pmu->module);
+ }
call_rcu(&event->rcu_head, free_event_rcu);
}
@@ -3563,9 +3727,6 @@ static void perf_remove_from_owner(struct perf_event *event)
}
}
-/*
- * Called when the last reference to the file is gone.
- */
static void put_event(struct perf_event *event)
{
struct perf_event_context *ctx;
@@ -3603,6 +3764,9 @@ int perf_event_release_kernel(struct perf_event *event)
}
EXPORT_SYMBOL_GPL(perf_event_release_kernel);
+/*
+ * Called when the last reference to the file is gone.
+ */
static int perf_release(struct inode *inode, struct file *file)
{
put_event(file->private_data);
@@ -3861,28 +4025,21 @@ static void perf_event_for_each(struct perf_event *event,
perf_event_for_each_child(sibling, func);
}
-static int perf_event_period(struct perf_event *event, u64 __user *arg)
-{
- struct perf_event_context *ctx = event->ctx;
- int ret = 0, active;
+struct period_event {
+ struct perf_event *event;
u64 value;
+};
- if (!is_sampling_event(event))
- return -EINVAL;
-
- if (copy_from_user(&value, arg, sizeof(value)))
- return -EFAULT;
-
- if (!value)
- return -EINVAL;
+static int __perf_event_period(void *info)
+{
+ struct period_event *pe = info;
+ struct perf_event *event = pe->event;
+ struct perf_event_context *ctx = event->ctx;
+ u64 value = pe->value;
+ bool active;
- raw_spin_lock_irq(&ctx->lock);
+ raw_spin_lock(&ctx->lock);
if (event->attr.freq) {
- if (value > sysctl_perf_event_sample_rate) {
- ret = -EINVAL;
- goto unlock;
- }
-
event->attr.sample_freq = value;
} else {
event->attr.sample_period = value;
@@ -3901,11 +4058,53 @@ static int perf_event_period(struct perf_event *event, u64 __user *arg)
event->pmu->start(event, PERF_EF_RELOAD);
perf_pmu_enable(ctx->pmu);
}
+ raw_spin_unlock(&ctx->lock);
-unlock:
+ return 0;
+}
+
+static int perf_event_period(struct perf_event *event, u64 __user *arg)
+{
+ struct period_event pe = { .event = event, };
+ struct perf_event_context *ctx = event->ctx;
+ struct task_struct *task;
+ u64 value;
+
+ if (!is_sampling_event(event))
+ return -EINVAL;
+
+ if (copy_from_user(&value, arg, sizeof(value)))
+ return -EFAULT;
+
+ if (!value)
+ return -EINVAL;
+
+ if (event->attr.freq && value > sysctl_perf_event_sample_rate)
+ return -EINVAL;
+
+ task = ctx->task;
+ pe.value = value;
+
+ if (!task) {
+ cpu_function_call(event->cpu, __perf_event_period, &pe);
+ return 0;
+ }
+
+retry:
+ if (!task_function_call(task, __perf_event_period, &pe))
+ return 0;
+
+ raw_spin_lock_irq(&ctx->lock);
+ if (ctx->is_active) {
+ raw_spin_unlock_irq(&ctx->lock);
+ task = ctx->task;
+ goto retry;
+ }
+
+ __perf_event_period(&pe);
raw_spin_unlock_irq(&ctx->lock);
- return ret;
+ return 0;
}
static const struct file_operations perf_fops;
@@ -3927,6 +4126,7 @@ static inline int perf_fget_light(int fd, struct fd *p)
static int perf_event_set_output(struct perf_event *event,
struct perf_event *output_event);
static int perf_event_set_filter(struct perf_event *event, void __user *arg);
+static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd);
static long _perf_ioctl(struct perf_event *event, unsigned int cmd, unsigned long arg)
{
@@ -3980,6 +4180,9 @@ static long _perf_ioctl(struct perf_event *event, unsigned int cmd, unsigned lon
case PERF_EVENT_IOC_SET_FILTER:
return perf_event_set_filter(event, (void __user *)arg);
+ case PERF_EVENT_IOC_SET_BPF:
+ return perf_event_set_bpf_prog(event, arg);
+
default:
return -ENOTTY;
}
@@ -4096,6 +4299,8 @@ static void perf_event_init_userpage(struct perf_event *event)
/* Allow new userspace to detect that bit 0 is deprecated */
userpg->cap_bit0_is_deprecated = 1;
userpg->size = offsetof(struct perf_event_mmap_page, __reserved);
+ userpg->data_offset = PAGE_SIZE;
+ userpg->data_size = perf_data_size(rb);
unlock:
rcu_read_unlock();
@@ -4210,20 +4415,20 @@ static void ring_buffer_attach(struct perf_event *event,
WARN_ON_ONCE(event->rcu_pending);
old_rb = event->rb;
- event->rcu_batches = get_state_synchronize_rcu();
- event->rcu_pending = 1;
-
spin_lock_irqsave(&old_rb->event_lock, flags);
list_del_rcu(&event->rb_entry);
spin_unlock_irqrestore(&old_rb->event_lock, flags);
- }
- if (event->rcu_pending && rb) {
- cond_synchronize_rcu(event->rcu_batches);
- event->rcu_pending = 0;
+ event->rcu_batches = get_state_synchronize_rcu();
+ event->rcu_pending = 1;
}
if (rb) {
+ if (event->rcu_pending) {
+ cond_synchronize_rcu(event->rcu_batches);
+ event->rcu_pending = 0;
+ }
+
spin_lock_irqsave(&rb->event_lock, flags);
list_add_rcu(&event->rb_entry, &rb->event_list);
spin_unlock_irqrestore(&rb->event_lock, flags);
@@ -4255,15 +4460,7 @@ static void ring_buffer_wakeup(struct perf_event *event)
rcu_read_unlock();
}
-static void rb_free_rcu(struct rcu_head *rcu_head)
-{
- struct ring_buffer *rb;
-
- rb = container_of(rcu_head, struct ring_buffer, rcu_head);
- rb_free(rb);
-}
-
-static struct ring_buffer *ring_buffer_get(struct perf_event *event)
+struct ring_buffer *ring_buffer_get(struct perf_event *event)
{
struct ring_buffer *rb;
@@ -4278,7 +4475,7 @@ static struct ring_buffer *ring_buffer_get(struct perf_event *event)
return rb;
}
-static void ring_buffer_put(struct ring_buffer *rb)
+void ring_buffer_put(struct ring_buffer *rb)
{
if (!atomic_dec_and_test(&rb->refcount))
return;
@@ -4295,6 +4492,9 @@ static void perf_mmap_open(struct vm_area_struct *vma)
atomic_inc(&event->mmap_count);
atomic_inc(&event->rb->mmap_count);
+ if (vma->vm_pgoff)
+ atomic_inc(&event->rb->aux_mmap_count);
+
if (event->pmu->event_mapped)
event->pmu->event_mapped(event);
}
@@ -4319,6 +4519,20 @@ static void perf_mmap_close(struct vm_area_struct *vma)
if (event->pmu->event_unmapped)
event->pmu->event_unmapped(event);
+ /*
+ * rb->aux_mmap_count will always drop before rb->mmap_count and
+ * event->mmap_count, so it is ok to use event->mmap_mutex to
+ * serialize with perf_mmap here.
+ */
+ if (rb_has_aux(rb) && vma->vm_pgoff == rb->aux_pgoff &&
+ atomic_dec_and_mutex_lock(&rb->aux_mmap_count, &event->mmap_mutex)) {
+ atomic_long_sub(rb->aux_nr_pages, &mmap_user->locked_vm);
+ vma->vm_mm->pinned_vm -= rb->aux_mmap_locked;
+
+ rb_free_aux(rb);
+ mutex_unlock(&event->mmap_mutex);
+ }
+
atomic_dec(&rb->mmap_count);
if (!atomic_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex))
@@ -4392,7 +4606,7 @@ out_put:
static const struct vm_operations_struct perf_mmap_vmops = {
.open = perf_mmap_open,
- .close = perf_mmap_close,
+ .close = perf_mmap_close, /* non mergable */
.fault = perf_mmap_fault,
.page_mkwrite = perf_mmap_fault,
};
@@ -4403,10 +4617,10 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
unsigned long user_locked, user_lock_limit;
struct user_struct *user = current_user();
unsigned long locked, lock_limit;
- struct ring_buffer *rb;
+ struct ring_buffer *rb = NULL;
unsigned long vma_size;
unsigned long nr_pages;
- long user_extra, extra;
+ long user_extra = 0, extra = 0;
int ret = 0, flags = 0;
/*
@@ -4421,7 +4635,66 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
return -EINVAL;
vma_size = vma->vm_end - vma->vm_start;
- nr_pages = (vma_size / PAGE_SIZE) - 1;
+
+ if (vma->vm_pgoff == 0) {
+ nr_pages = (vma_size / PAGE_SIZE) - 1;
+ } else {
+ /*
+ * AUX area mapping: if rb->aux_nr_pages != 0, it's already
+ * mapped, all subsequent mappings should have the same size
+ * and offset. Must be above the normal perf buffer.
+ */
+ u64 aux_offset, aux_size;
+
+ if (!event->rb)
+ return -EINVAL;
+
+ nr_pages = vma_size / PAGE_SIZE;
+
+ mutex_lock(&event->mmap_mutex);
+ ret = -EINVAL;
+
+ rb = event->rb;
+ if (!rb)
+ goto aux_unlock;
+
+ aux_offset = ACCESS_ONCE(rb->user_page->aux_offset);
+ aux_size = ACCESS_ONCE(rb->user_page->aux_size);
+
+ if (aux_offset < perf_data_size(rb) + PAGE_SIZE)
+ goto aux_unlock;
+
+ if (aux_offset != vma->vm_pgoff << PAGE_SHIFT)
+ goto aux_unlock;
+
+ /* already mapped with a different offset */
+ if (rb_has_aux(rb) && rb->aux_pgoff != vma->vm_pgoff)
+ goto aux_unlock;
+
+ if (aux_size != vma_size || aux_size != nr_pages * PAGE_SIZE)
+ goto aux_unlock;
+
+ /* already mapped with a different size */
+ if (rb_has_aux(rb) && rb->aux_nr_pages != nr_pages)
+ goto aux_unlock;
+
+ if (!is_power_of_2(nr_pages))
+ goto aux_unlock;
+
+ if (!atomic_inc_not_zero(&rb->mmap_count))
+ goto aux_unlock;
+
+ if (rb_has_aux(rb)) {
+ atomic_inc(&rb->aux_mmap_count);
+ ret = 0;
+ goto unlock;
+ }
+
+ atomic_set(&rb->aux_mmap_count, 1);
+ user_extra = nr_pages;
+
+ goto accounting;
+ }
/*
* If we have rb pages ensure they're a power-of-two number, so we
@@ -4433,9 +4706,6 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
if (vma_size != PAGE_SIZE * (1 + nr_pages))
return -EINVAL;
- if (vma->vm_pgoff != 0)
- return -EINVAL;
-
WARN_ON_ONCE(event->ctx->parent_ctx);
again:
mutex_lock(&event->mmap_mutex);
@@ -4459,6 +4729,8 @@ again:
}
user_extra = nr_pages + 1;
+
+accounting:
user_lock_limit = sysctl_perf_event_mlock >> (PAGE_SHIFT - 10);
/*
@@ -4468,7 +4740,6 @@ again:
user_locked = atomic_long_read(&user->locked_vm) + user_extra;
- extra = 0;
if (user_locked > user_lock_limit)
extra = user_locked - user_lock_limit;
@@ -4482,35 +4753,46 @@ again:
goto unlock;
}
- WARN_ON(event->rb);
+ WARN_ON(!rb && event->rb);
if (vma->vm_flags & VM_WRITE)
flags |= RING_BUFFER_WRITABLE;
- rb = rb_alloc(nr_pages,
- event->attr.watermark ? event->attr.wakeup_watermark : 0,
- event->cpu, flags);
-
if (!rb) {
- ret = -ENOMEM;
- goto unlock;
- }
+ rb = rb_alloc(nr_pages,
+ event->attr.watermark ? event->attr.wakeup_watermark : 0,
+ event->cpu, flags);
- atomic_set(&rb->mmap_count, 1);
- rb->mmap_locked = extra;
- rb->mmap_user = get_current_user();
+ if (!rb) {
+ ret = -ENOMEM;
+ goto unlock;
+ }
- atomic_long_add(user_extra, &user->locked_vm);
- vma->vm_mm->pinned_vm += extra;
+ atomic_set(&rb->mmap_count, 1);
+ rb->mmap_user = get_current_user();
+ rb->mmap_locked = extra;
- ring_buffer_attach(event, rb);
+ ring_buffer_attach(event, rb);
- perf_event_init_userpage(event);
- perf_event_update_userpage(event);
+ perf_event_init_userpage(event);
+ perf_event_update_userpage(event);
+ } else {
+ ret = rb_alloc_aux(rb, event, vma->vm_pgoff, nr_pages,
+ event->attr.aux_watermark, flags);
+ if (!ret)
+ rb->aux_mmap_locked = extra;
+ }
unlock:
- if (!ret)
+ if (!ret) {
+ atomic_long_add(user_extra, &user->locked_vm);
+ vma->vm_mm->pinned_vm += extra;
+
atomic_inc(&event->mmap_count);
+ } else if (rb) {
+ atomic_dec(&rb->mmap_count);
+ }
+aux_unlock:
mutex_unlock(&event->mmap_mutex);
/*
@@ -4560,12 +4842,20 @@ static const struct file_operations perf_fops = {
* to user-space before waking everybody up.
*/
+static inline struct fasync_struct **perf_event_fasync(struct perf_event *event)
+{
+ /* only the parent has fasync state */
+ if (event->parent)
+ event = event->parent;
+ return &event->fasync;
+}
+
void perf_event_wakeup(struct perf_event *event)
{
ring_buffer_wakeup(event);
if (event->pending_kill) {
- kill_fasync(&event->fasync, SIGIO, event->pending_kill);
+ kill_fasync(perf_event_fasync(event), SIGIO, event->pending_kill);
event->pending_kill = 0;
}
}
@@ -4766,7 +5056,7 @@ static void __perf_event_header__init_id(struct perf_event_header *header,
}
if (sample_type & PERF_SAMPLE_TIME)
- data->time = perf_clock();
+ data->time = perf_event_clock(event);
if (sample_type & (PERF_SAMPLE_ID | PERF_SAMPLE_IDENTIFIER))
data->id = primary_event_id(event);
@@ -5175,9 +5465,9 @@ void perf_prepare_sample(struct perf_event_header *header,
}
}
-static void perf_event_output(struct perf_event *event,
- struct perf_sample_data *data,
- struct pt_regs *regs)
+void perf_event_output(struct perf_event *event,
+ struct perf_sample_data *data,
+ struct pt_regs *regs)
{
struct perf_output_handle handle;
struct perf_event_header header;
@@ -5344,6 +5634,8 @@ static void perf_event_task_output(struct perf_event *event,
task_event->event_id.tid = perf_event_tid(event, task);
task_event->event_id.ptid = perf_event_tid(event, current);
+ task_event->event_id.time = perf_event_clock(event);
+
perf_output_put(&handle, task_event->event_id);
perf_event__output_id_sample(event, &handle, &sample);
@@ -5377,7 +5669,7 @@ static void perf_event_task(struct task_struct *task,
/* .ppid */
/* .tid */
/* .ptid */
- .time = perf_clock(),
+ /* .time */
},
};
@@ -5604,7 +5896,7 @@ static void perf_event_mmap_event(struct perf_mmap_event *mmap_event)
* need to add enough zero bytes after the string to handle
* the 64bit alignment we do later.
*/
- name = d_path(&file->f_path, buf, PATH_MAX - sizeof(u64));
+ name = file_path(file, buf, PATH_MAX - sizeof(u64));
if (IS_ERR(name)) {
name = "//toolong";
goto cpy_name;
@@ -5732,6 +6024,158 @@ void perf_event_mmap(struct vm_area_struct *vma)
perf_event_mmap_event(&mmap_event);
}
+void perf_event_aux_event(struct perf_event *event, unsigned long head,
+ unsigned long size, u64 flags)
+{
+ struct perf_output_handle handle;
+ struct perf_sample_data sample;
+ struct perf_aux_event {
+ struct perf_event_header header;
+ u64 offset;
+ u64 size;
+ u64 flags;
+ } rec = {
+ .header = {
+ .type = PERF_RECORD_AUX,
+ .misc = 0,
+ .size = sizeof(rec),
+ },
+ .offset = head,
+ .size = size,
+ .flags = flags,
+ };
+ int ret;
+
+ perf_event_header__init_id(&rec.header, &sample, event);
+ ret = perf_output_begin(&handle, event, rec.header.size);
+
+ if (ret)
+ return;
+
+ perf_output_put(&handle, rec);
+ perf_event__output_id_sample(event, &handle, &sample);
+
+ perf_output_end(&handle);
+}
+
+/*
+ * Lost/dropped samples logging
+ */
+void perf_log_lost_samples(struct perf_event *event, u64 lost)
+{
+ struct perf_output_handle handle;
+ struct perf_sample_data sample;
+ int ret;
+
+ struct {
+ struct perf_event_header header;
+ u64 lost;
+ } lost_samples_event = {
+ .header = {
+ .type = PERF_RECORD_LOST_SAMPLES,
+ .misc = 0,
+ .size = sizeof(lost_samples_event),
+ },
+ .lost = lost,
+ };
+
+ perf_event_header__init_id(&lost_samples_event.header, &sample, event);
+
+ ret = perf_output_begin(&handle, event,
+ lost_samples_event.header.size);
+ if (ret)
+ return;
+
+ perf_output_put(&handle, lost_samples_event);
+ perf_event__output_id_sample(event, &handle, &sample);
+ perf_output_end(&handle);
+}
+
+/*
+ * context_switch tracking
+ */
+
+struct perf_switch_event {
+ struct task_struct *task;
+ struct task_struct *next_prev;
+
+ struct {
+ struct perf_event_header header;
+ u32 next_prev_pid;
+ u32 next_prev_tid;
+ } event_id;
+};
+
+static int perf_event_switch_match(struct perf_event *event)
+{
+ return event->attr.context_switch;
+}
+
+static void perf_event_switch_output(struct perf_event *event, void *data)
+{
+ struct perf_switch_event *se = data;
+ struct perf_output_handle handle;
+ struct perf_sample_data sample;
+ int ret;
+
+ if (!perf_event_switch_match(event))
+ return;
+
+ /* Only CPU-wide events are allowed to see next/prev pid/tid */
+ if (event->ctx->task) {
+ se->event_id.header.type = PERF_RECORD_SWITCH;
+ se->event_id.header.size = sizeof(se->event_id.header);
+ } else {
+ se->event_id.header.type = PERF_RECORD_SWITCH_CPU_WIDE;
+ se->event_id.header.size = sizeof(se->event_id);
+ se->event_id.next_prev_pid =
+ perf_event_pid(event, se->next_prev);
+ se->event_id.next_prev_tid =
+ perf_event_tid(event, se->next_prev);
+ }
+
+ perf_event_header__init_id(&se->event_id.header, &sample, event);
+
+ ret = perf_output_begin(&handle, event, se->event_id.header.size);
+ if (ret)
+ return;
+
+ if (event->ctx->task)
+ perf_output_put(&handle, se->event_id.header);
+ else
+ perf_output_put(&handle, se->event_id);
+
+ perf_event__output_id_sample(event, &handle, &sample);
+
+ perf_output_end(&handle);
+}
+
+static void perf_event_switch(struct task_struct *task,
+ struct task_struct *next_prev, bool sched_in)
+{
+ struct perf_switch_event switch_event;
+
+ /* N.B. caller checks nr_switch_events != 0 */
+
+ switch_event = (struct perf_switch_event){
+ .task = task,
+ .next_prev = next_prev,
+ .event_id = {
+ .header = {
+ /* .type */
+ .misc = sched_in ? 0 : PERF_RECORD_MISC_SWITCH_OUT,
+ /* .size */
+ },
+ /* .next_prev_pid */
+ /* .next_prev_tid */
+ },
+ };
+
+ perf_event_aux(perf_event_switch_output,
+ &switch_event,
+ NULL);
+}
+
/*
* IRQ throttle logging
*/
@@ -5753,7 +6197,7 @@ static void perf_log_throttle(struct perf_event *event, int enable)
.misc = 0,
.size = sizeof(throttle_event),
},
- .time = perf_clock(),
+ .time = perf_event_clock(event),
.id = primary_event_id(event),
.stream_id = event->id,
};
@@ -5773,6 +6217,42 @@ static void perf_log_throttle(struct perf_event *event, int enable)
perf_output_end(&handle);
}
+static void perf_log_itrace_start(struct perf_event *event)
+{
+ struct perf_output_handle handle;
+ struct perf_sample_data sample;
+ struct perf_aux_event {
+ struct perf_event_header header;
+ u32 pid;
+ u32 tid;
+ } rec;
+ int ret;
+
+ if (event->parent)
+ event = event->parent;
+
+ if (!(event->pmu->capabilities & PERF_PMU_CAP_ITRACE) ||
+ event->hw.itrace_started)
+ return;
+
+ rec.header.type = PERF_RECORD_ITRACE_START;
+ rec.header.misc = 0;
+ rec.header.size = sizeof(rec);
+ rec.pid = perf_event_pid(event, current);
+ rec.tid = perf_event_tid(event, current);
+
+ perf_event_header__init_id(&rec.header, &sample, event);
+ ret = perf_output_begin(&handle, event, rec.header.size);
+
+ if (ret)
+ return;
+
+ perf_output_put(&handle, rec);
+ perf_event__output_id_sample(event, &handle, &sample);
+
+ perf_output_end(&handle);
+}
+
/*
* Generic event overflow handling, sampling.
*/
@@ -5837,7 +6317,7 @@ static int __perf_event_overflow(struct perf_event *event,
else
perf_event_output(event, data, regs);
- if (event->fasync && event->pending_kill) {
+ if (*perf_event_fasync(event) && event->pending_kill) {
event->pending_wakeup = 1;
irq_work_queue(&event->pending);
}
@@ -6133,6 +6613,7 @@ static int perf_swevent_add(struct perf_event *event, int flags)
}
hlist_add_head_rcu(&event->hlist_entry, head);
+ perf_event_update_userpage(event);
return 0;
}
@@ -6296,6 +6777,8 @@ static int perf_swevent_init(struct perf_event *event)
static struct pmu perf_swevent = {
.task_ctx_nr = perf_sw_context,
+ .capabilities = PERF_PMU_CAP_NO_NMI,
+
.event_init = perf_swevent_init,
.add = perf_swevent_add,
.del = perf_swevent_del,
@@ -6449,6 +6932,49 @@ static void perf_event_free_filter(struct perf_event *event)
ftrace_profile_free_filter(event);
}
+static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd)
+{
+ struct bpf_prog *prog;
+
+ if (event->attr.type != PERF_TYPE_TRACEPOINT)
+ return -EINVAL;
+
+ if (event->tp_event->prog)
+ return -EEXIST;
+
+ if (!(event->tp_event->flags & TRACE_EVENT_FL_UKPROBE))
+ /* bpf programs can only be attached to u/kprobes */
+ return -EINVAL;
+
+ prog = bpf_prog_get(prog_fd);
+ if (IS_ERR(prog))
+ return PTR_ERR(prog);
+
+ if (prog->type != BPF_PROG_TYPE_KPROBE) {
+ /* valid fd, but invalid bpf program type */
+ bpf_prog_put(prog);
+ return -EINVAL;
+ }
+
+ event->tp_event->prog = prog;
+
+ return 0;
+}
+
+static void perf_event_free_bpf_prog(struct perf_event *event)
+{
+ struct bpf_prog *prog;
+
+ if (!event->tp_event)
+ return;
+
+ prog = event->tp_event->prog;
+ if (prog) {
+ event->tp_event->prog = NULL;
+ bpf_prog_put(prog);
+ }
+}
+
#else
static inline void perf_tp_register(void)
@@ -6464,6 +6990,14 @@ static void perf_event_free_filter(struct perf_event *event)
{
}
+static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd)
+{
+ return -ENOENT;
+}
+
+static void perf_event_free_bpf_prog(struct perf_event *event)
+{
+}
#endif /* CONFIG_EVENT_TRACING */
#ifdef CONFIG_HAVE_HW_BREAKPOINT
@@ -6530,9 +7064,8 @@ static void perf_swevent_start_hrtimer(struct perf_event *event)
} else {
period = max_t(u64, 10000, hwc->sample_period);
}
- __hrtimer_start_range_ns(&hwc->hrtimer,
- ns_to_ktime(period), 0,
- HRTIMER_MODE_REL_PINNED, 0);
+ hrtimer_start(&hwc->hrtimer, ns_to_ktime(period),
+ HRTIMER_MODE_REL_PINNED);
}
static void perf_swevent_cancel_hrtimer(struct perf_event *event)
@@ -6602,6 +7135,7 @@ static int cpu_clock_event_add(struct perf_event *event, int flags)
{
if (flags & PERF_EF_START)
cpu_clock_event_start(event, flags);
+ perf_event_update_userpage(event);
return 0;
}
@@ -6638,6 +7172,8 @@ static int cpu_clock_event_init(struct perf_event *event)
static struct pmu perf_cpu_clock = {
.task_ctx_nr = perf_sw_context,
+ .capabilities = PERF_PMU_CAP_NO_NMI,
+
.event_init = cpu_clock_event_init,
.add = cpu_clock_event_add,
.del = cpu_clock_event_del,
@@ -6676,6 +7212,7 @@ static int task_clock_event_add(struct perf_event *event, int flags)
{
if (flags & PERF_EF_START)
task_clock_event_start(event, flags);
+ perf_event_update_userpage(event);
return 0;
}
@@ -6716,6 +7253,8 @@ static int task_clock_event_init(struct perf_event *event)
static struct pmu perf_task_clock = {
.task_ctx_nr = perf_sw_context,
+ .capabilities = PERF_PMU_CAP_NO_NMI,
+
.event_init = task_clock_event_init,
.add = task_clock_event_add,
.del = task_clock_event_del,
@@ -6827,6 +7366,8 @@ perf_event_mux_interval_ms_show(struct device *dev,
return snprintf(page, PAGE_SIZE-1, "%d\n", pmu->hrtimer_interval_ms);
}
+static DEFINE_MUTEX(mux_interval_mutex);
+
static ssize_t
perf_event_mux_interval_ms_store(struct device *dev,
struct device_attribute *attr,
@@ -6846,17 +7387,21 @@ perf_event_mux_interval_ms_store(struct device *dev,
if (timer == pmu->hrtimer_interval_ms)
return count;
+ mutex_lock(&mux_interval_mutex);
pmu->hrtimer_interval_ms = timer;
/* update all cpuctx for this PMU */
- for_each_possible_cpu(cpu) {
+ get_online_cpus();
+ for_each_online_cpu(cpu) {
struct perf_cpu_context *cpuctx;
cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
cpuctx->hrtimer_interval = ns_to_ktime(NSEC_PER_MSEC * timer);
- if (hrtimer_active(&cpuctx->hrtimer))
- hrtimer_forward_now(&cpuctx->hrtimer, cpuctx->hrtimer_interval);
+ cpu_function_call(cpu,
+ (remote_function_f)perf_mux_hrtimer_restart, cpuctx);
}
+ put_online_cpus();
+ mutex_unlock(&mux_interval_mutex);
return count;
}
@@ -6961,7 +7506,7 @@ skip_type:
lockdep_set_class(&cpuctx->ctx.lock, &cpuctx_lock);
cpuctx->ctx.pmu = pmu;
- __perf_cpu_hrtimer_init(cpuctx, cpu);
+ __perf_mux_hrtimer_init(cpuctx, cpu);
cpuctx->unique_pmu = pmu;
}
@@ -6993,6 +7538,7 @@ got_cpu_context:
pmu->event_idx = perf_event_idx_default;
list_add_rcu(&pmu->entry, &pmus);
+ atomic_set(&pmu->exclusive_cnt, 0);
ret = 0;
unlock:
mutex_unlock(&pmus_lock);
@@ -7037,12 +7583,28 @@ EXPORT_SYMBOL_GPL(perf_pmu_unregister);
static int perf_try_init_event(struct pmu *pmu, struct perf_event *event)
{
+ struct perf_event_context *ctx = NULL;
int ret;
if (!try_module_get(pmu->module))
return -ENODEV;
+
+ if (event->group_leader != event) {
+ /*
+ * This ctx->mutex can nest when we're called through
+ * inheritance. See the perf_event_ctx_lock_nested() comment.
+ */
+ ctx = perf_event_ctx_lock_nested(event->group_leader,
+ SINGLE_DEPTH_NESTING);
+ BUG_ON(!ctx);
+ }
+
event->pmu = pmu;
ret = pmu->event_init(event);
+
+ if (ctx)
+ perf_event_ctx_unlock(event->group_leader, ctx);
+
if (ret)
module_put(pmu->module);
@@ -7089,10 +7651,6 @@ static void account_event_cpu(struct perf_event *event, int cpu)
if (event->parent)
return;
- if (has_branch_stack(event)) {
- if (!(event->attach_state & PERF_ATTACH_TASK))
- atomic_inc(&per_cpu(perf_branch_stack_events, cpu));
- }
if (is_cgroup_event(event))
atomic_inc(&per_cpu(perf_cgroup_events, cpu));
}
@@ -7114,6 +7672,10 @@ static void account_event(struct perf_event *event)
if (atomic_inc_return(&nr_freq_events) == 1)
tick_nohz_full_kick_all();
}
+ if (event->attr.context_switch) {
+ atomic_inc(&nr_switch_events);
+ static_key_slow_inc(&perf_sched_events.key);
+ }
if (has_branch_stack(event))
static_key_slow_inc(&perf_sched_events.key);
if (is_cgroup_event(event))
@@ -7131,7 +7693,7 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
struct perf_event *group_leader,
struct perf_event *parent_event,
perf_overflow_handler_t overflow_handler,
- void *context)
+ void *context, int cgroup_fd)
{
struct pmu *pmu;
struct perf_event *event;
@@ -7186,18 +7748,18 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
if (task) {
event->attach_state = PERF_ATTACH_TASK;
-
- if (attr->type == PERF_TYPE_TRACEPOINT)
- event->hw.tp_target = task;
-#ifdef CONFIG_HAVE_HW_BREAKPOINT
/*
- * hw_breakpoint is a bit difficult here..
+ * XXX pmu::event_init needs to know what task to account to
+ * and we cannot use the ctx information because we need the
+ * pmu before we get a ctx.
*/
- else if (attr->type == PERF_TYPE_BREAKPOINT)
- event->hw.bp_target = task;
-#endif
+ event->hw.target = task;
}
+ event->clock = &local_clock;
+ if (parent_event)
+ event->clock = parent_event->clock;
+
if (!overflow_handler && parent_event) {
overflow_handler = parent_event->overflow_handler;
context = parent_event->overflow_handler_context;
@@ -7224,6 +7786,15 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
if (attr->inherit && (attr->read_format & PERF_FORMAT_GROUP))
goto err_ns;
+ if (!has_branch_stack(event))
+ event->attr.branch_sample_type = 0;
+
+ if (cgroup_fd != -1) {
+ err = perf_cgroup_connect(cgroup_fd, event, attr, group_leader);
+ if (err)
+ goto err_ns;
+ }
+
pmu = perf_init_event(event);
if (!pmu)
goto err_ns;
@@ -7232,21 +7803,30 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
goto err_ns;
}
+ err = exclusive_event_init(event);
+ if (err)
+ goto err_pmu;
+
if (!event->parent) {
if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) {
err = get_callchain_buffers();
if (err)
- goto err_pmu;
+ goto err_per_task;
}
}
return event;
+err_per_task:
+ exclusive_event_destroy(event);
+
err_pmu:
if (event->destroy)
event->destroy(event);
module_put(pmu->module);
err_ns:
+ if (is_cgroup_event(event))
+ perf_detach_cgroup(event);
if (event->ns)
put_pid_ns(event->ns);
kfree(event);
@@ -7409,6 +7989,19 @@ perf_event_set_output(struct perf_event *event, struct perf_event *output_event)
if (output_event->cpu == -1 && output_event->ctx != event->ctx)
goto out;
+ /*
+ * Mixing clocks in the same buffer is trouble you don't need.
+ */
+ if (output_event->clock != event->clock)
+ goto out;
+
+ /*
+ * If both events generate aux data, they must be on the same PMU
+ */
+ if (has_aux(event) && has_aux(output_event) &&
+ event->pmu != output_event->pmu)
+ goto out;
+
set:
mutex_lock(&event->mmap_mutex);
/* Can't redirect output if we've got an active mmap() */
@@ -7441,6 +8034,43 @@ static void mutex_lock_double(struct mutex *a, struct mutex *b)
mutex_lock_nested(b, SINGLE_DEPTH_NESTING);
}
+static int perf_event_set_clock(struct perf_event *event, clockid_t clk_id)
+{
+ bool nmi_safe = false;
+
+ switch (clk_id) {
+ case CLOCK_MONOTONIC:
+ event->clock = &ktime_get_mono_fast_ns;
+ nmi_safe = true;
+ break;
+
+ case CLOCK_MONOTONIC_RAW:
+ event->clock = &ktime_get_raw_fast_ns;
+ nmi_safe = true;
+ break;
+
+ case CLOCK_REALTIME:
+ event->clock = &ktime_get_real_ns;
+ break;
+
+ case CLOCK_BOOTTIME:
+ event->clock = &ktime_get_boot_ns;
+ break;
+
+ case CLOCK_TAI:
+ event->clock = &ktime_get_tai_ns;
+ break;
+
+ default:
+ return -EINVAL;
+ }
+
+ if (!nmi_safe && !(event->pmu->capabilities & PERF_PMU_CAP_NO_NMI))
+ return -EINVAL;
+
+ return 0;
+}
+
/**
* sys_perf_event_open - open a performance event, associate it to a task/cpu
*
@@ -7465,6 +8095,7 @@ SYSCALL_DEFINE5(perf_event_open,
int move_group = 0;
int err;
int f_flags = O_RDWR;
+ int cgroup_fd = -1;
/* for future expandability... */
if (flags & ~PERF_FLAG_ALL)
@@ -7530,21 +8161,16 @@ SYSCALL_DEFINE5(perf_event_open,
get_online_cpus();
+ if (flags & PERF_FLAG_PID_CGROUP)
+ cgroup_fd = pid;
+
event = perf_event_alloc(&attr, cpu, task, group_leader, NULL,
- NULL, NULL);
+ NULL, NULL, cgroup_fd);
if (IS_ERR(event)) {
err = PTR_ERR(event);
goto err_cpus;
}
- if (flags & PERF_FLAG_PID_CGROUP) {
- err = perf_cgroup_connect(pid, event, &attr, group_leader);
- if (err) {
- __free_event(event);
- goto err_cpus;
- }
- }
-
if (is_sampling_event(event)) {
if (event->pmu->capabilities & PERF_PMU_CAP_NO_INTERRUPT) {
err = -ENOTSUPP;
@@ -7560,6 +8186,12 @@ SYSCALL_DEFINE5(perf_event_open,
*/
pmu = event->pmu;
+ if (attr.use_clockid) {
+ err = perf_event_set_clock(event, attr.clockid);
+ if (err)
+ goto err_alloc;
+ }
+
if (group_leader &&
(is_software_event(event) != is_software_event(group_leader))) {
if (is_software_event(event)) {
@@ -7586,12 +8218,17 @@ SYSCALL_DEFINE5(perf_event_open,
/*
* Get the target context (task or percpu):
*/
- ctx = find_get_context(pmu, task, event->cpu);
+ ctx = find_get_context(pmu, task, event);
if (IS_ERR(ctx)) {
err = PTR_ERR(ctx);
goto err_alloc;
}
+ if ((pmu->capabilities & PERF_PMU_CAP_EXCLUSIVE) && group_leader) {
+ err = -EBUSY;
+ goto err_context;
+ }
+
if (task) {
put_task_struct(task);
task = NULL;
@@ -7609,6 +8246,11 @@ SYSCALL_DEFINE5(perf_event_open,
*/
if (group_leader->group_leader != group_leader)
goto err_context;
+
+ /* All events in a group should have the same clock */
+ if (group_leader->clock != event->clock)
+ goto err_context;
+
/*
* Do not allow to attach to a group in a different
* task or CPU context:
@@ -7709,6 +8351,13 @@ SYSCALL_DEFINE5(perf_event_open,
get_ctx(ctx);
}
+ if (!exclusive_event_installable(event, ctx)) {
+ err = -EBUSY;
+ mutex_unlock(&ctx->mutex);
+ fput(event_file);
+ goto err_context;
+ }
+
perf_install_in_context(ctx, event, event->cpu);
perf_unpin_context(ctx);
@@ -7781,7 +8430,7 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu,
*/
event = perf_event_alloc(attr, cpu, task, NULL, NULL,
- overflow_handler, context);
+ overflow_handler, context, -1);
if (IS_ERR(event)) {
err = PTR_ERR(event);
goto err;
@@ -7792,7 +8441,7 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu,
account_event(event);
- ctx = find_get_context(event->pmu, task, cpu);
+ ctx = find_get_context(event->pmu, task, event);
if (IS_ERR(ctx)) {
err = PTR_ERR(ctx);
goto err_free;
@@ -7800,6 +8449,14 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu,
WARN_ON_ONCE(ctx->parent_ctx);
mutex_lock(&ctx->mutex);
+ if (!exclusive_event_installable(event, ctx)) {
+ mutex_unlock(&ctx->mutex);
+ perf_unpin_context(ctx);
+ put_ctx(ctx);
+ err = -EBUSY;
+ goto err_free;
+ }
+
perf_install_in_context(ctx, event, cpu);
perf_unpin_context(ctx);
mutex_unlock(&ctx->mutex);
@@ -8114,6 +8771,31 @@ void perf_event_delayed_put(struct task_struct *task)
WARN_ON_ONCE(task->perf_event_ctxp[ctxn]);
}
+struct perf_event *perf_event_get(unsigned int fd)
+{
+ int err;
+ struct fd f;
+ struct perf_event *event;
+
+ err = perf_fget_light(fd, &f);
+ if (err)
+ return ERR_PTR(err);
+
+ event = f.file->private_data;
+ atomic_long_inc(&event->refcount);
+ fdput(f);
+
+ return event;
+}
+
+const struct perf_event_attr *perf_event_attrs(struct perf_event *event)
+{
+ if (!event)
+ return ERR_PTR(-EINVAL);
+
+ return &event->attr;
+}
+
/*
* inherit a event from parent task to child task:
*/
@@ -8142,7 +8824,7 @@ inherit_event(struct perf_event *parent_event,
parent_event->cpu,
child,
group_leader, parent_event,
- NULL, NULL);
+ NULL, NULL, -1);
if (IS_ERR(child_event))
return child_event;
diff --git a/kernel/events/hw_breakpoint.c b/kernel/events/hw_breakpoint.c
index 9803a6600d49..92ce5f4ccc26 100644
--- a/kernel/events/hw_breakpoint.c
+++ b/kernel/events/hw_breakpoint.c
@@ -116,12 +116,12 @@ static unsigned int max_task_bp_pinned(int cpu, enum bp_type_idx type)
*/
static int task_bp_pinned(int cpu, struct perf_event *bp, enum bp_type_idx type)
{
- struct task_struct *tsk = bp->hw.bp_target;
+ struct task_struct *tsk = bp->hw.target;
struct perf_event *iter;
int count = 0;
list_for_each_entry(iter, &bp_task_head, hw.bp_list) {
- if (iter->hw.bp_target == tsk &&
+ if (iter->hw.target == tsk &&
find_slot_idx(iter) == type &&
(iter->cpu < 0 || cpu == iter->cpu))
count += hw_breakpoint_weight(iter);
@@ -153,7 +153,7 @@ fetch_bp_busy_slots(struct bp_busy_slots *slots, struct perf_event *bp,
int nr;
nr = info->cpu_pinned;
- if (!bp->hw.bp_target)
+ if (!bp->hw.target)
nr += max_task_bp_pinned(cpu, type);
else
nr += task_bp_pinned(cpu, bp, type);
@@ -210,7 +210,7 @@ toggle_bp_slot(struct perf_event *bp, bool enable, enum bp_type_idx type,
weight = -weight;
/* Pinned counter cpu profiling */
- if (!bp->hw.bp_target) {
+ if (!bp->hw.target) {
get_bp_info(bp->cpu, type)->cpu_pinned += weight;
return;
}
diff --git a/kernel/events/internal.h b/kernel/events/internal.h
index 569b218782ad..2bbad9c1274c 100644
--- a/kernel/events/internal.h
+++ b/kernel/events/internal.h
@@ -11,6 +11,7 @@
struct ring_buffer {
atomic_t refcount;
struct rcu_head rcu_head;
+ struct irq_work irq_work;
#ifdef CONFIG_PERF_USE_VMALLOC
struct work_struct work;
int page_order; /* allocation order */
@@ -27,6 +28,7 @@ struct ring_buffer {
local_t lost; /* nr records lost */
long watermark; /* wakeup watermark */
+ long aux_watermark;
/* poll crap */
spinlock_t event_lock;
struct list_head event_list;
@@ -35,23 +37,50 @@ struct ring_buffer {
unsigned long mmap_locked;
struct user_struct *mmap_user;
+ /* AUX area */
+ local_t aux_head;
+ local_t aux_nest;
+ local_t aux_wakeup;
+ unsigned long aux_pgoff;
+ int aux_nr_pages;
+ int aux_overwrite;
+ atomic_t aux_mmap_count;
+ unsigned long aux_mmap_locked;
+ void (*free_aux)(void *);
+ atomic_t aux_refcount;
+ void **aux_pages;
+ void *aux_priv;
+
struct perf_event_mmap_page *user_page;
void *data_pages[0];
};
extern void rb_free(struct ring_buffer *rb);
+
+static inline void rb_free_rcu(struct rcu_head *rcu_head)
+{
+ struct ring_buffer *rb;
+
+ rb = container_of(rcu_head, struct ring_buffer, rcu_head);
+ rb_free(rb);
+}
+
extern struct ring_buffer *
rb_alloc(int nr_pages, long watermark, int cpu, int flags);
extern void perf_event_wakeup(struct perf_event *event);
+extern int rb_alloc_aux(struct ring_buffer *rb, struct perf_event *event,
+ pgoff_t pgoff, int nr_pages, long watermark, int flags);
+extern void rb_free_aux(struct ring_buffer *rb);
+extern struct ring_buffer *ring_buffer_get(struct perf_event *event);
+extern void ring_buffer_put(struct ring_buffer *rb);
-extern void
-perf_event_header__init_id(struct perf_event_header *header,
- struct perf_sample_data *data,
- struct perf_event *event);
-extern void
-perf_event__output_id_sample(struct perf_event *event,
- struct perf_output_handle *handle,
- struct perf_sample_data *sample);
+static inline bool rb_has_aux(struct ring_buffer *rb)
+{
+ return !!rb->aux_nr_pages;
+}
+
+void perf_event_aux_event(struct perf_event *event, unsigned long head,
+ unsigned long size, u64 flags);
extern struct page *
perf_mmap_to_page(struct ring_buffer *rb, unsigned long pgoff);
@@ -81,6 +110,11 @@ static inline unsigned long perf_data_size(struct ring_buffer *rb)
return rb->nr_pages << (PAGE_SHIFT + page_order(rb));
}
+static inline unsigned long perf_aux_size(struct ring_buffer *rb)
+{
+ return rb->aux_nr_pages << PAGE_SHIFT;
+}
+
#define DEFINE_OUTPUT_COPY(func_name, memcpy_func) \
static inline unsigned long \
func_name(struct perf_output_handle *handle, \
diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c
index eadb95ce7aac..182bc30899d5 100644
--- a/kernel/events/ring_buffer.c
+++ b/kernel/events/ring_buffer.c
@@ -141,7 +141,7 @@ int perf_output_begin(struct perf_output_handle *handle,
perf_output_get_handle(handle);
do {
- tail = ACCESS_ONCE(rb->user_page->data_tail);
+ tail = READ_ONCE_CTRL(rb->user_page->data_tail);
offset = head = local_read(&rb->head);
if (!rb->overwrite &&
unlikely(CIRC_SPACE(head, tail, perf_data_size(rb)) < size))
@@ -221,6 +221,8 @@ void perf_output_end(struct perf_output_handle *handle)
rcu_read_unlock();
}
+static void rb_irq_work(struct irq_work *work);
+
static void
ring_buffer_init(struct ring_buffer *rb, long watermark, int flags)
{
@@ -241,6 +243,349 @@ ring_buffer_init(struct ring_buffer *rb, long watermark, int flags)
INIT_LIST_HEAD(&rb->event_list);
spin_lock_init(&rb->event_lock);
+ init_irq_work(&rb->irq_work, rb_irq_work);
+}
+
+static void ring_buffer_put_async(struct ring_buffer *rb)
+{
+ if (!atomic_dec_and_test(&rb->refcount))
+ return;
+
+ rb->rcu_head.next = (void *)rb;
+ irq_work_queue(&rb->irq_work);
+}
+
+/*
+ * This is called before hardware starts writing to the AUX area to
+ * obtain an output handle and make sure there's room in the buffer.
+ * When the capture completes, call perf_aux_output_end() to commit
+ * the recorded data to the buffer.
+ *
+ * The ordering is similar to that of perf_output_{begin,end}, with
+ * the exception of (B), which should be taken care of by the pmu
+ * driver, since ordering rules will differ depending on hardware.
+ */
+void *perf_aux_output_begin(struct perf_output_handle *handle,
+ struct perf_event *event)
+{
+ struct perf_event *output_event = event;
+ unsigned long aux_head, aux_tail;
+ struct ring_buffer *rb;
+
+ if (output_event->parent)
+ output_event = output_event->parent;
+
+ /*
+ * Since this will typically be open across pmu::add/pmu::del, we
+ * grab ring_buffer's refcount instead of holding rcu read lock
+ * to make sure it doesn't disappear under us.
+ */
+ rb = ring_buffer_get(output_event);
+ if (!rb)
+ return NULL;
+
+ if (!rb_has_aux(rb) || !atomic_inc_not_zero(&rb->aux_refcount))
+ goto err;
+
+ /*
+ * Nesting is not supported for AUX area, make sure nested
+ * writers are caught early
+ */
+ if (WARN_ON_ONCE(local_xchg(&rb->aux_nest, 1)))
+ goto err_put;
+
+ aux_head = local_read(&rb->aux_head);
+
+ handle->rb = rb;
+ handle->event = event;
+ handle->head = aux_head;
+ handle->size = 0;
+
+ /*
+ * In overwrite mode, AUX data stores do not depend on aux_tail,
+ * therefore (A) control dependency barrier does not exist. The
+ * (B) <-> (C) ordering is still observed by the pmu driver.
+ */
+ if (!rb->aux_overwrite) {
+ aux_tail = ACCESS_ONCE(rb->user_page->aux_tail);
+ handle->wakeup = local_read(&rb->aux_wakeup) + rb->aux_watermark;
+ if (aux_head - aux_tail < perf_aux_size(rb))
+ handle->size = CIRC_SPACE(aux_head, aux_tail, perf_aux_size(rb));
+
+ /*
+ * handle->size computation depends on aux_tail load; this forms a
+ * control dependency barrier separating aux_tail load from aux data
+ * store that will be enabled on successful return
+ */
+ if (!handle->size) { /* A, matches D */
+ event->pending_disable = 1;
+ perf_output_wakeup(handle);
+ local_set(&rb->aux_nest, 0);
+ goto err_put;
+ }
+ }
+
+ return handle->rb->aux_priv;
+
+err_put:
+ rb_free_aux(rb);
+
+err:
+ ring_buffer_put_async(rb);
+ handle->event = NULL;
+
+ return NULL;
+}
+
+/*
+ * Commit the data written by hardware into the ring buffer by adjusting
+ * aux_head and posting a PERF_RECORD_AUX into the perf buffer. It is the
+ * pmu driver's responsibility to observe ordering rules of the hardware,
+ * so that all the data is externally visible before this is called.
+ */
+void perf_aux_output_end(struct perf_output_handle *handle, unsigned long size,
+ bool truncated)
+{
+ struct ring_buffer *rb = handle->rb;
+ unsigned long aux_head;
+ u64 flags = 0;
+
+ if (truncated)
+ flags |= PERF_AUX_FLAG_TRUNCATED;
+
+ /* in overwrite mode, driver provides aux_head via handle */
+ if (rb->aux_overwrite) {
+ flags |= PERF_AUX_FLAG_OVERWRITE;
+
+ aux_head = handle->head;
+ local_set(&rb->aux_head, aux_head);
+ } else {
+ aux_head = local_read(&rb->aux_head);
+ local_add(size, &rb->aux_head);
+ }
+
+ if (size || flags) {
+ /*
+ * Only send RECORD_AUX if we have something useful to communicate
+ */
+
+ perf_event_aux_event(handle->event, aux_head, size, flags);
+ }
+
+ aux_head = rb->user_page->aux_head = local_read(&rb->aux_head);
+
+ if (aux_head - local_read(&rb->aux_wakeup) >= rb->aux_watermark) {
+ perf_output_wakeup(handle);
+ local_add(rb->aux_watermark, &rb->aux_wakeup);
+ }
+ handle->event = NULL;
+
+ local_set(&rb->aux_nest, 0);
+ rb_free_aux(rb);
+ ring_buffer_put_async(rb);
+}
+
+/*
+ * Skip over a given number of bytes in the AUX buffer, due to, for example,
+ * hardware's alignment constraints.
+ */
+int perf_aux_output_skip(struct perf_output_handle *handle, unsigned long size)
+{
+ struct ring_buffer *rb = handle->rb;
+ unsigned long aux_head;
+
+ if (size > handle->size)
+ return -ENOSPC;
+
+ local_add(size, &rb->aux_head);
+
+ aux_head = rb->user_page->aux_head = local_read(&rb->aux_head);
+ if (aux_head - local_read(&rb->aux_wakeup) >= rb->aux_watermark) {
+ perf_output_wakeup(handle);
+ local_add(rb->aux_watermark, &rb->aux_wakeup);
+ handle->wakeup = local_read(&rb->aux_wakeup) +
+ rb->aux_watermark;
+ }
+
+ handle->head = aux_head;
+ handle->size -= size;
+
+ return 0;
+}
+
+void *perf_get_aux(struct perf_output_handle *handle)
+{
+ /* this is only valid between perf_aux_output_begin and *_end */
+ if (!handle->event)
+ return NULL;
+
+ return handle->rb->aux_priv;
+}
+
+#define PERF_AUX_GFP (GFP_KERNEL | __GFP_ZERO | __GFP_NOWARN | __GFP_NORETRY)
+
+static struct page *rb_alloc_aux_page(int node, int order)
+{
+ struct page *page;
+
+ if (order > MAX_ORDER)
+ order = MAX_ORDER;
+
+ do {
+ page = alloc_pages_node(node, PERF_AUX_GFP, order);
+ } while (!page && order--);
+
+ if (page && order) {
+ /*
+ * Communicate the allocation size to the driver:
+ * if we managed to secure a high-order allocation,
+ * set its first page's private to this order;
+ * !PagePrivate(page) means it's just a normal page.
+ */
+ split_page(page, order);
+ SetPagePrivate(page);
+ set_page_private(page, order);
+ }
+
+ return page;
+}
+
+static void rb_free_aux_page(struct ring_buffer *rb, int idx)
+{
+ struct page *page = virt_to_page(rb->aux_pages[idx]);
+
+ ClearPagePrivate(page);
+ page->mapping = NULL;
+ __free_page(page);
+}
+
+int rb_alloc_aux(struct ring_buffer *rb, struct perf_event *event,
+ pgoff_t pgoff, int nr_pages, long watermark, int flags)
+{
+ bool overwrite = !(flags & RING_BUFFER_WRITABLE);
+ int node = (event->cpu == -1) ? -1 : cpu_to_node(event->cpu);
+ int ret = -ENOMEM, max_order = 0;
+
+ if (!has_aux(event))
+ return -ENOTSUPP;
+
+ if (event->pmu->capabilities & PERF_PMU_CAP_AUX_NO_SG) {
+ /*
+ * We need to start with the max_order that fits in nr_pages,
+ * not the other way around, hence ilog2() and not get_order.
+ */
+ max_order = ilog2(nr_pages);
+
+ /*
+ * PMU requests more than one contiguous chunks of memory
+ * for SW double buffering
+ */
+ if ((event->pmu->capabilities & PERF_PMU_CAP_AUX_SW_DOUBLEBUF) &&
+ !overwrite) {
+ if (!max_order)
+ return -EINVAL;
+
+ max_order--;
+ }
+ }
+
+ rb->aux_pages = kzalloc_node(nr_pages * sizeof(void *), GFP_KERNEL, node);
+ if (!rb->aux_pages)
+ return -ENOMEM;
+
+ rb->free_aux = event->pmu->free_aux;
+ for (rb->aux_nr_pages = 0; rb->aux_nr_pages < nr_pages;) {
+ struct page *page;
+ int last, order;
+
+ order = min(max_order, ilog2(nr_pages - rb->aux_nr_pages));
+ page = rb_alloc_aux_page(node, order);
+ if (!page)
+ goto out;
+
+ for (last = rb->aux_nr_pages + (1 << page_private(page));
+ last > rb->aux_nr_pages; rb->aux_nr_pages++)
+ rb->aux_pages[rb->aux_nr_pages] = page_address(page++);
+ }
+
+ /*
+ * In overwrite mode, PMUs that don't support SG may not handle more
+ * than one contiguous allocation, since they rely on PMI to do double
+ * buffering. In this case, the entire buffer has to be one contiguous
+ * chunk.
+ */
+ if ((event->pmu->capabilities & PERF_PMU_CAP_AUX_NO_SG) &&
+ overwrite) {
+ struct page *page = virt_to_page(rb->aux_pages[0]);
+
+ if (page_private(page) != max_order)
+ goto out;
+ }
+
+ rb->aux_priv = event->pmu->setup_aux(event->cpu, rb->aux_pages, nr_pages,
+ overwrite);
+ if (!rb->aux_priv)
+ goto out;
+
+ ret = 0;
+
+ /*
+ * aux_pages (and pmu driver's private data, aux_priv) will be
+ * referenced in both producer's and consumer's contexts, thus
+ * we keep a refcount here to make sure either of the two can
+ * reference them safely.
+ */
+ atomic_set(&rb->aux_refcount, 1);
+
+ rb->aux_overwrite = overwrite;
+ rb->aux_watermark = watermark;
+
+ if (!rb->aux_watermark && !rb->aux_overwrite)
+ rb->aux_watermark = nr_pages << (PAGE_SHIFT - 1);
+
+out:
+ if (!ret)
+ rb->aux_pgoff = pgoff;
+ else
+ rb_free_aux(rb);
+
+ return ret;
+}
+
+static void __rb_free_aux(struct ring_buffer *rb)
+{
+ int pg;
+
+ if (rb->aux_priv) {
+ rb->free_aux(rb->aux_priv);
+ rb->free_aux = NULL;
+ rb->aux_priv = NULL;
+ }
+
+ if (rb->aux_nr_pages) {
+ for (pg = 0; pg < rb->aux_nr_pages; pg++)
+ rb_free_aux_page(rb, pg);
+
+ kfree(rb->aux_pages);
+ rb->aux_nr_pages = 0;
+ }
+}
+
+void rb_free_aux(struct ring_buffer *rb)
+{
+ if (atomic_dec_and_test(&rb->aux_refcount))
+ irq_work_queue(&rb->irq_work);
+}
+
+static void rb_irq_work(struct irq_work *work)
+{
+ struct ring_buffer *rb = container_of(work, struct ring_buffer, irq_work);
+
+ if (!atomic_read(&rb->aux_refcount))
+ __rb_free_aux(rb);
+
+ if (rb->rcu_head.next == (void *)rb)
+ call_rcu(&rb->rcu_head, rb_free_rcu);
}
#ifndef CONFIG_PERF_USE_VMALLOC
@@ -249,8 +594,8 @@ ring_buffer_init(struct ring_buffer *rb, long watermark, int flags)
* Back perf_mmap() with regular GFP_KERNEL-0 pages.
*/
-struct page *
-perf_mmap_to_page(struct ring_buffer *rb, unsigned long pgoff)
+static struct page *
+__perf_mmap_to_page(struct ring_buffer *rb, unsigned long pgoff)
{
if (pgoff > rb->nr_pages)
return NULL;
@@ -340,8 +685,8 @@ static int data_page_nr(struct ring_buffer *rb)
return rb->nr_pages << page_order(rb);
}
-struct page *
-perf_mmap_to_page(struct ring_buffer *rb, unsigned long pgoff)
+static struct page *
+__perf_mmap_to_page(struct ring_buffer *rb, unsigned long pgoff)
{
/* The '>' counts in the user page. */
if (pgoff > data_page_nr(rb))
@@ -416,3 +761,19 @@ fail:
}
#endif
+
+struct page *
+perf_mmap_to_page(struct ring_buffer *rb, unsigned long pgoff)
+{
+ if (rb->aux_nr_pages) {
+ /* above AUX space */
+ if (pgoff > rb->aux_pgoff + rb->aux_nr_pages)
+ return NULL;
+
+ /* AUX space */
+ if (pgoff >= rb->aux_pgoff)
+ return virt_to_page(rb->aux_pages[pgoff - rb->aux_pgoff]);
+ }
+
+ return __perf_mmap_to_page(rb, pgoff);
+}
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index cb346f26a22d..4e5e9798aa0c 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -86,15 +86,6 @@ struct uprobe {
struct arch_uprobe arch;
};
-struct return_instance {
- struct uprobe *uprobe;
- unsigned long func;
- unsigned long orig_ret_vaddr; /* original return address */
- bool chained; /* true, if instance is nested */
-
- struct return_instance *next; /* keep as stack */
-};
-
/*
* Execute out of line area: anonymous executable mapping installed
* by the probed task to execute the copy of the original instruction
@@ -105,17 +96,18 @@ struct return_instance {
* allocated.
*/
struct xol_area {
- wait_queue_head_t wq; /* if all slots are busy */
- atomic_t slot_count; /* number of in-use slots */
- unsigned long *bitmap; /* 0 = free slot */
- struct page *page;
+ wait_queue_head_t wq; /* if all slots are busy */
+ atomic_t slot_count; /* number of in-use slots */
+ unsigned long *bitmap; /* 0 = free slot */
+ struct vm_special_mapping xol_mapping;
+ struct page *pages[2];
/*
* We keep the vma's vm_start rather than a pointer to the vma
* itself. The probed process or a naughty kernel module could make
* the vma go away, and we must handle that reasonably gracefully.
*/
- unsigned long vaddr; /* Page(s) of instruction slots */
+ unsigned long vaddr; /* Page(s) of instruction slots */
};
/*
@@ -366,6 +358,18 @@ set_orig_insn(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long v
return uprobe_write_opcode(mm, vaddr, *(uprobe_opcode_t *)&auprobe->insn);
}
+static struct uprobe *get_uprobe(struct uprobe *uprobe)
+{
+ atomic_inc(&uprobe->ref);
+ return uprobe;
+}
+
+static void put_uprobe(struct uprobe *uprobe)
+{
+ if (atomic_dec_and_test(&uprobe->ref))
+ kfree(uprobe);
+}
+
static int match_uprobe(struct uprobe *l, struct uprobe *r)
{
if (l->inode < r->inode)
@@ -393,10 +397,8 @@ static struct uprobe *__find_uprobe(struct inode *inode, loff_t offset)
while (n) {
uprobe = rb_entry(n, struct uprobe, rb_node);
match = match_uprobe(&u, uprobe);
- if (!match) {
- atomic_inc(&uprobe->ref);
- return uprobe;
- }
+ if (!match)
+ return get_uprobe(uprobe);
if (match < 0)
n = n->rb_left;
@@ -432,10 +434,8 @@ static struct uprobe *__insert_uprobe(struct uprobe *uprobe)
parent = *p;
u = rb_entry(parent, struct uprobe, rb_node);
match = match_uprobe(uprobe, u);
- if (!match) {
- atomic_inc(&u->ref);
- return u;
- }
+ if (!match)
+ return get_uprobe(u);
if (match < 0)
p = &parent->rb_left;
@@ -472,12 +472,6 @@ static struct uprobe *insert_uprobe(struct uprobe *uprobe)
return u;
}
-static void put_uprobe(struct uprobe *uprobe)
-{
- if (atomic_dec_and_test(&uprobe->ref))
- kfree(uprobe);
-}
-
static struct uprobe *alloc_uprobe(struct inode *inode, loff_t offset)
{
struct uprobe *uprobe, *cur_uprobe;
@@ -1039,14 +1033,14 @@ static void build_probe_list(struct inode *inode,
if (u->inode != inode || u->offset < min)
break;
list_add(&u->pending_list, head);
- atomic_inc(&u->ref);
+ get_uprobe(u);
}
for (t = n; (t = rb_next(t)); ) {
u = rb_entry(t, struct uprobe, rb_node);
if (u->inode != inode || u->offset > max)
break;
list_add(&u->pending_list, head);
- atomic_inc(&u->ref);
+ get_uprobe(u);
}
}
spin_unlock(&uprobes_treelock);
@@ -1132,11 +1126,14 @@ void uprobe_munmap(struct vm_area_struct *vma, unsigned long start, unsigned lon
/* Slot allocation for XOL */
static int xol_add_vma(struct mm_struct *mm, struct xol_area *area)
{
- int ret = -EALREADY;
+ struct vm_area_struct *vma;
+ int ret;
down_write(&mm->mmap_sem);
- if (mm->uprobes_state.xol_area)
+ if (mm->uprobes_state.xol_area) {
+ ret = -EALREADY;
goto fail;
+ }
if (!area->vaddr) {
/* Try to map as high as possible, this is only a hint. */
@@ -1148,11 +1145,15 @@ static int xol_add_vma(struct mm_struct *mm, struct xol_area *area)
}
}
- ret = install_special_mapping(mm, area->vaddr, PAGE_SIZE,
- VM_EXEC|VM_MAYEXEC|VM_DONTCOPY|VM_IO, &area->page);
- if (ret)
+ vma = _install_special_mapping(mm, area->vaddr, PAGE_SIZE,
+ VM_EXEC|VM_MAYEXEC|VM_DONTCOPY|VM_IO,
+ &area->xol_mapping);
+ if (IS_ERR(vma)) {
+ ret = PTR_ERR(vma);
goto fail;
+ }
+ ret = 0;
smp_wmb(); /* pairs with get_xol_area() */
mm->uprobes_state.xol_area = area;
fail:
@@ -1175,21 +1176,24 @@ static struct xol_area *__create_xol_area(unsigned long vaddr)
if (!area->bitmap)
goto free_area;
- area->page = alloc_page(GFP_HIGHUSER);
- if (!area->page)
+ area->xol_mapping.name = "[uprobes]";
+ area->xol_mapping.pages = area->pages;
+ area->pages[0] = alloc_page(GFP_HIGHUSER);
+ if (!area->pages[0])
goto free_bitmap;
+ area->pages[1] = NULL;
area->vaddr = vaddr;
init_waitqueue_head(&area->wq);
/* Reserve the 1st slot for get_trampoline_vaddr() */
set_bit(0, area->bitmap);
atomic_set(&area->slot_count, 1);
- copy_to_page(area->page, 0, &insn, UPROBE_SWBP_INSN_SIZE);
+ copy_to_page(area->pages[0], 0, &insn, UPROBE_SWBP_INSN_SIZE);
if (!xol_add_vma(mm, area))
return area;
- __free_page(area->page);
+ __free_page(area->pages[0]);
free_bitmap:
kfree(area->bitmap);
free_area:
@@ -1227,7 +1231,7 @@ void uprobe_clear_state(struct mm_struct *mm)
if (!area)
return;
- put_page(area->page);
+ put_page(area->pages[0]);
kfree(area->bitmap);
kfree(area);
}
@@ -1296,7 +1300,7 @@ static unsigned long xol_get_insn_slot(struct uprobe *uprobe)
if (unlikely(!xol_vaddr))
return 0;
- arch_uprobe_copy_ixol(area->page, xol_vaddr,
+ arch_uprobe_copy_ixol(area->pages[0], xol_vaddr,
&uprobe->arch.ixol, sizeof(uprobe->arch.ixol));
return xol_vaddr;
@@ -1333,6 +1337,7 @@ static void xol_free_insn_slot(struct task_struct *tsk)
clear_bit(slot_nr, area->bitmap);
atomic_dec(&area->slot_count);
+ smp_mb__after_atomic(); /* pairs with prepare_to_wait() */
if (waitqueue_active(&area->wq))
wake_up(&area->wq);
@@ -1376,6 +1381,14 @@ unsigned long uprobe_get_trap_addr(struct pt_regs *regs)
return instruction_pointer(regs);
}
+static struct return_instance *free_ret_instance(struct return_instance *ri)
+{
+ struct return_instance *next = ri->next;
+ put_uprobe(ri->uprobe);
+ kfree(ri);
+ return next;
+}
+
/*
* Called with no locks held.
* Called in context of a exiting or a exec-ing thread.
@@ -1383,7 +1396,7 @@ unsigned long uprobe_get_trap_addr(struct pt_regs *regs)
void uprobe_free_utask(struct task_struct *t)
{
struct uprobe_task *utask = t->utask;
- struct return_instance *ri, *tmp;
+ struct return_instance *ri;
if (!utask)
return;
@@ -1392,13 +1405,8 @@ void uprobe_free_utask(struct task_struct *t)
put_uprobe(utask->active_uprobe);
ri = utask->return_instances;
- while (ri) {
- tmp = ri;
- ri = ri->next;
-
- put_uprobe(tmp->uprobe);
- kfree(tmp);
- }
+ while (ri)
+ ri = free_ret_instance(ri);
xol_free_insn_slot(t);
kfree(utask);
@@ -1437,7 +1445,7 @@ static int dup_utask(struct task_struct *t, struct uprobe_task *o_utask)
return -ENOMEM;
*n = *o;
- atomic_inc(&n->uprobe->ref);
+ get_uprobe(n->uprobe);
n->next = NULL;
*p = n;
@@ -1515,12 +1523,25 @@ static unsigned long get_trampoline_vaddr(void)
return trampoline_vaddr;
}
+static void cleanup_return_instances(struct uprobe_task *utask, bool chained,
+ struct pt_regs *regs)
+{
+ struct return_instance *ri = utask->return_instances;
+ enum rp_check ctx = chained ? RP_CHECK_CHAIN_CALL : RP_CHECK_CALL;
+
+ while (ri && !arch_uretprobe_is_alive(ri, ctx, regs)) {
+ ri = free_ret_instance(ri);
+ utask->depth--;
+ }
+ utask->return_instances = ri;
+}
+
static void prepare_uretprobe(struct uprobe *uprobe, struct pt_regs *regs)
{
struct return_instance *ri;
struct uprobe_task *utask;
unsigned long orig_ret_vaddr, trampoline_vaddr;
- bool chained = false;
+ bool chained;
if (!get_xol_area())
return;
@@ -1536,49 +1557,47 @@ static void prepare_uretprobe(struct uprobe *uprobe, struct pt_regs *regs)
return;
}
- ri = kzalloc(sizeof(struct return_instance), GFP_KERNEL);
+ ri = kmalloc(sizeof(struct return_instance), GFP_KERNEL);
if (!ri)
- goto fail;
+ return;
trampoline_vaddr = get_trampoline_vaddr();
orig_ret_vaddr = arch_uretprobe_hijack_return_addr(trampoline_vaddr, regs);
if (orig_ret_vaddr == -1)
goto fail;
+ /* drop the entries invalidated by longjmp() */
+ chained = (orig_ret_vaddr == trampoline_vaddr);
+ cleanup_return_instances(utask, chained, regs);
+
/*
* We don't want to keep trampoline address in stack, rather keep the
* original return address of first caller thru all the consequent
* instances. This also makes breakpoint unwrapping easier.
*/
- if (orig_ret_vaddr == trampoline_vaddr) {
+ if (chained) {
if (!utask->return_instances) {
/*
* This situation is not possible. Likely we have an
* attack from user-space.
*/
- pr_warn("uprobe: unable to set uretprobe pid/tgid=%d/%d\n",
- current->pid, current->tgid);
+ uprobe_warn(current, "handle tail call");
goto fail;
}
-
- chained = true;
orig_ret_vaddr = utask->return_instances->orig_ret_vaddr;
}
- atomic_inc(&uprobe->ref);
- ri->uprobe = uprobe;
+ ri->uprobe = get_uprobe(uprobe);
ri->func = instruction_pointer(regs);
+ ri->stack = user_stack_pointer(regs);
ri->orig_ret_vaddr = orig_ret_vaddr;
ri->chained = chained;
utask->depth++;
-
- /* add instance to the stack */
ri->next = utask->return_instances;
utask->return_instances = ri;
return;
-
fail:
kfree(ri);
}
@@ -1766,46 +1785,58 @@ handle_uretprobe_chain(struct return_instance *ri, struct pt_regs *regs)
up_read(&uprobe->register_rwsem);
}
-static bool handle_trampoline(struct pt_regs *regs)
+static struct return_instance *find_next_ret_chain(struct return_instance *ri)
{
- struct uprobe_task *utask;
- struct return_instance *ri, *tmp;
bool chained;
+ do {
+ chained = ri->chained;
+ ri = ri->next; /* can't be NULL if chained */
+ } while (chained);
+
+ return ri;
+}
+
+static void handle_trampoline(struct pt_regs *regs)
+{
+ struct uprobe_task *utask;
+ struct return_instance *ri, *next;
+ bool valid;
+
utask = current->utask;
if (!utask)
- return false;
+ goto sigill;
ri = utask->return_instances;
if (!ri)
- return false;
-
- /*
- * TODO: we should throw out return_instance's invalidated by
- * longjmp(), currently we assume that the probed function always
- * returns.
- */
- instruction_pointer_set(regs, ri->orig_ret_vaddr);
-
- for (;;) {
- handle_uretprobe_chain(ri, regs);
-
- chained = ri->chained;
- put_uprobe(ri->uprobe);
-
- tmp = ri;
- ri = ri->next;
- kfree(tmp);
- utask->depth--;
+ goto sigill;
- if (!chained)
- break;
- BUG_ON(!ri);
- }
+ do {
+ /*
+ * We should throw out the frames invalidated by longjmp().
+ * If this chain is valid, then the next one should be alive
+ * or NULL; the latter case means that nobody but ri->func
+ * could hit this trampoline on return. TODO: sigaltstack().
+ */
+ next = find_next_ret_chain(ri);
+ valid = !next || arch_uretprobe_is_alive(next, RP_CHECK_RET, regs);
+
+ instruction_pointer_set(regs, ri->orig_ret_vaddr);
+ do {
+ if (valid)
+ handle_uretprobe_chain(ri, regs);
+ ri = free_ret_instance(ri);
+ utask->depth--;
+ } while (ri != next);
+ } while (!valid);
utask->return_instances = ri;
+ return;
+
+ sigill:
+ uprobe_warn(current, "handle uretprobe, sending SIGILL.");
+ force_sig_info(SIGILL, SEND_SIG_FORCED, current);
- return true;
}
bool __weak arch_uprobe_ignore(struct arch_uprobe *aup, struct pt_regs *regs)
@@ -1813,6 +1844,12 @@ bool __weak arch_uprobe_ignore(struct arch_uprobe *aup, struct pt_regs *regs)
return false;
}
+bool __weak arch_uretprobe_is_alive(struct return_instance *ret, enum rp_check ctx,
+ struct pt_regs *regs)
+{
+ return true;
+}
+
/*
* Run handler and ask thread to singlestep.
* Ensure all non-fatal signals cannot interrupt thread while it singlesteps.
@@ -1824,13 +1861,8 @@ static void handle_swbp(struct pt_regs *regs)
int uninitialized_var(is_swbp);
bp_vaddr = uprobe_get_swbp_addr(regs);
- if (bp_vaddr == get_trampoline_vaddr()) {
- if (handle_trampoline(regs))
- return;
-
- pr_warn("uprobe: unable to handle uretprobe pid/tgid=%d/%d\n",
- current->pid, current->tgid);
- }
+ if (bp_vaddr == get_trampoline_vaddr())
+ return handle_trampoline(regs);
uprobe = find_active_uprobe(bp_vaddr, &is_swbp);
if (!uprobe) {
diff --git a/kernel/exec_domain.c b/kernel/exec_domain.c
index 83d4382f5699..6873bb3e6b7e 100644
--- a/kernel/exec_domain.c
+++ b/kernel/exec_domain.c
@@ -20,145 +20,10 @@
#include <linux/types.h>
#include <linux/fs_struct.h>
-
-static void default_handler(int, struct pt_regs *);
-
-static struct exec_domain *exec_domains = &default_exec_domain;
-static DEFINE_RWLOCK(exec_domains_lock);
-
-
-static unsigned long ident_map[32] = {
- 0, 1, 2, 3, 4, 5, 6, 7,
- 8, 9, 10, 11, 12, 13, 14, 15,
- 16, 17, 18, 19, 20, 21, 22, 23,
- 24, 25, 26, 27, 28, 29, 30, 31
-};
-
-struct exec_domain default_exec_domain = {
- .name = "Linux", /* name */
- .handler = default_handler, /* lcall7 causes a seg fault. */
- .pers_low = 0, /* PER_LINUX personality. */
- .pers_high = 0, /* PER_LINUX personality. */
- .signal_map = ident_map, /* Identity map signals. */
- .signal_invmap = ident_map, /* - both ways. */
-};
-
-
-static void
-default_handler(int segment, struct pt_regs *regp)
-{
- set_personality(0);
-
- if (current_thread_info()->exec_domain->handler != default_handler)
- current_thread_info()->exec_domain->handler(segment, regp);
- else
- send_sig(SIGSEGV, current, 1);
-}
-
-static struct exec_domain *
-lookup_exec_domain(unsigned int personality)
-{
- unsigned int pers = personality(personality);
- struct exec_domain *ep;
-
- read_lock(&exec_domains_lock);
- for (ep = exec_domains; ep; ep = ep->next) {
- if (pers >= ep->pers_low && pers <= ep->pers_high)
- if (try_module_get(ep->module))
- goto out;
- }
-
-#ifdef CONFIG_MODULES
- read_unlock(&exec_domains_lock);
- request_module("personality-%d", pers);
- read_lock(&exec_domains_lock);
-
- for (ep = exec_domains; ep; ep = ep->next) {
- if (pers >= ep->pers_low && pers <= ep->pers_high)
- if (try_module_get(ep->module))
- goto out;
- }
-#endif
-
- ep = &default_exec_domain;
-out:
- read_unlock(&exec_domains_lock);
- return ep;
-}
-
-int
-register_exec_domain(struct exec_domain *ep)
-{
- struct exec_domain *tmp;
- int err = -EBUSY;
-
- if (ep == NULL)
- return -EINVAL;
-
- if (ep->next != NULL)
- return -EBUSY;
-
- write_lock(&exec_domains_lock);
- for (tmp = exec_domains; tmp; tmp = tmp->next) {
- if (tmp == ep)
- goto out;
- }
-
- ep->next = exec_domains;
- exec_domains = ep;
- err = 0;
-
-out:
- write_unlock(&exec_domains_lock);
- return err;
-}
-EXPORT_SYMBOL(register_exec_domain);
-
-int
-unregister_exec_domain(struct exec_domain *ep)
-{
- struct exec_domain **epp;
-
- epp = &exec_domains;
- write_lock(&exec_domains_lock);
- for (epp = &exec_domains; *epp; epp = &(*epp)->next) {
- if (ep == *epp)
- goto unregister;
- }
- write_unlock(&exec_domains_lock);
- return -EINVAL;
-
-unregister:
- *epp = ep->next;
- ep->next = NULL;
- write_unlock(&exec_domains_lock);
- return 0;
-}
-EXPORT_SYMBOL(unregister_exec_domain);
-
-int __set_personality(unsigned int personality)
-{
- struct exec_domain *oep = current_thread_info()->exec_domain;
-
- current_thread_info()->exec_domain = lookup_exec_domain(personality);
- current->personality = personality;
- module_put(oep->module);
-
- return 0;
-}
-EXPORT_SYMBOL(__set_personality);
-
#ifdef CONFIG_PROC_FS
static int execdomains_proc_show(struct seq_file *m, void *v)
{
- struct exec_domain *ep;
-
- read_lock(&exec_domains_lock);
- for (ep = exec_domains; ep; ep = ep->next)
- seq_printf(m, "%d-%d\t%-16s\t[%s]\n",
- ep->pers_low, ep->pers_high, ep->name,
- module_name(ep->module));
- read_unlock(&exec_domains_lock);
+ seq_puts(m, "0-0\tLinux \t[kernel]\n");
return 0;
}
diff --git a/kernel/exit.c b/kernel/exit.c
index feff10bbb307..ea95ee1b5ef7 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -436,7 +436,7 @@ static void exit_mm(struct task_struct *tsk)
mm_update_next_owner(mm);
mmput(mm);
if (test_thread_flag(TIF_MEMDIE))
- unmark_oom_victim();
+ exit_oom_victim();
}
static struct task_struct *find_alive_thread(struct task_struct *p)
@@ -711,10 +711,10 @@ void do_exit(long code)
current->comm, task_pid_nr(current),
preempt_count());
- acct_update_integrals(tsk);
/* sync mm's RSS info before statistics gathering */
if (tsk->mm)
sync_mm_rss(tsk->mm);
+ acct_update_integrals(tsk);
group_dead = atomic_dec_and_test(&tsk->signal->live);
if (group_dead) {
hrtimer_cancel(&tsk->signal->real_timer);
@@ -756,8 +756,6 @@ void do_exit(long code)
cgroup_exit(tsk);
- module_put(task_thread_info(tsk)->exec_domain->module);
-
/*
* FIXME: do that only when needed, using sched_exit tracepoint
*/
@@ -1473,7 +1471,7 @@ static long do_wait(struct wait_opts *wo)
add_wait_queue(&current->signal->wait_chldexit, &wo->child_wait);
repeat:
/*
- * If there is nothing that can match our critiera just get out.
+ * If there is nothing that can match our criteria, just get out.
* We will clear ->notask_error to zero if we see any child that
* might later match our criteria, even if we are not able to reap
* it yet.
diff --git a/kernel/fork.c b/kernel/fork.c
index cf65139615a0..7d5f0f118a63 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -74,6 +74,7 @@
#include <linux/uprobes.h>
#include <linux/aio.h>
#include <linux/compiler.h>
+#include <linux/sysctl.h>
#include <asm/pgtable.h>
#include <asm/pgalloc.h>
@@ -88,6 +89,16 @@
#include <trace/events/task.h>
/*
+ * Minimum number of threads to boot the kernel
+ */
+#define MIN_THREADS 20
+
+/*
+ * Maximum number of threads
+ */
+#define MAX_THREADS FUTEX_TID_MASK
+
+/*
* Protected counters by write_lock_irq(&tasklist_lock)
*/
unsigned long total_forks; /* Handle normal Linux uptimes. */
@@ -253,7 +264,35 @@ EXPORT_SYMBOL_GPL(__put_task_struct);
void __init __weak arch_task_cache_init(void) { }
-void __init fork_init(unsigned long mempages)
+/*
+ * set_max_threads
+ */
+static void set_max_threads(unsigned int max_threads_suggested)
+{
+ u64 threads;
+
+ /*
+ * The number of threads shall be limited such that the thread
+ * structures may only consume a small part of the available memory.
+ */
+ if (fls64(totalram_pages) + fls64(PAGE_SIZE) > 64)
+ threads = MAX_THREADS;
+ else
+ threads = div64_u64((u64) totalram_pages * (u64) PAGE_SIZE,
+ (u64) THREAD_SIZE * 8UL);
+
+ if (threads > max_threads_suggested)
+ threads = max_threads_suggested;
+
+ max_threads = clamp_t(u64, threads, MIN_THREADS, MAX_THREADS);
+}
+
+#ifdef CONFIG_ARCH_WANTS_DYNAMIC_TASK_STRUCT
+/* Initialized by the architecture: */
+int arch_task_struct_size __read_mostly;
+#endif
+
+void __init fork_init(void)
{
#ifndef CONFIG_ARCH_TASK_STRUCT_ALLOCATOR
#ifndef ARCH_MIN_TASKALIGN
@@ -261,25 +300,14 @@ void __init fork_init(unsigned long mempages)
#endif
/* create a slab on which task_structs can be allocated */
task_struct_cachep =
- kmem_cache_create("task_struct", sizeof(struct task_struct),
+ kmem_cache_create("task_struct", arch_task_struct_size,
ARCH_MIN_TASKALIGN, SLAB_PANIC | SLAB_NOTRACK, NULL);
#endif
/* do the arch specific task caches init */
arch_task_cache_init();
- /*
- * The default maximum number of threads is set to a safe
- * value: the thread structures can take up at most half
- * of memory.
- */
- max_threads = mempages / (8 * THREAD_SIZE / PAGE_SIZE);
-
- /*
- * we need to allow at least 20 threads to boot a system
- */
- if (max_threads < 20)
- max_threads = 20;
+ set_max_threads(MAX_THREADS);
init_task.signal->rlim[RLIMIT_NPROC].rlim_cur = max_threads/2;
init_task.signal->rlim[RLIMIT_NPROC].rlim_max = max_threads/2;
@@ -380,6 +408,9 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
*/
down_write_nested(&mm->mmap_sem, SINGLE_DEPTH_NESTING);
+ /* No ordering required: file already has been exposed. */
+ RCU_INIT_POINTER(mm->exe_file, get_mm_exe_file(oldmm));
+
mm->total_vm = oldmm->total_vm;
mm->shared_vm = oldmm->shared_vm;
mm->exec_vm = oldmm->exec_vm;
@@ -423,8 +454,9 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
tmp->vm_mm = mm;
if (anon_vma_fork(tmp, mpnt))
goto fail_nomem_anon_vma_fork;
- tmp->vm_flags &= ~VM_LOCKED;
+ tmp->vm_flags &= ~(VM_LOCKED|VM_UFFD_MISSING|VM_UFFD_WP);
tmp->vm_next = tmp->vm_prev = NULL;
+ tmp->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
file = tmp->vm_file;
if (file) {
struct inode *inode = file_inode(file);
@@ -505,7 +537,13 @@ static inline void mm_free_pgd(struct mm_struct *mm)
pgd_free(mm, mm->pgd);
}
#else
-#define dup_mmap(mm, oldmm) (0)
+static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
+{
+ down_write(&oldmm->mmap_sem);
+ RCU_INIT_POINTER(mm->exe_file, get_mm_exe_file(oldmm));
+ up_write(&oldmm->mmap_sem);
+ return 0;
+}
#define mm_alloc_pgd(mm) (0)
#define mm_free_pgd(mm)
#endif /* CONFIG_MMU */
@@ -674,34 +712,53 @@ void mmput(struct mm_struct *mm)
}
EXPORT_SYMBOL_GPL(mmput);
+/**
+ * set_mm_exe_file - change a reference to the mm's executable file
+ *
+ * This changes mm's executable file (shown as symlink /proc/[pid]/exe).
+ *
+ * Main users are mmput() and sys_execve(). Callers prevent concurrent
+ * invocations: in mmput() nobody alive left, in execve task is single
+ * threaded. sys_prctl(PR_SET_MM_MAP/EXE_FILE) also needs to set the
+ * mm->exe_file, but does so without using set_mm_exe_file() in order
+ * to do avoid the need for any locks.
+ */
void set_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file)
{
+ struct file *old_exe_file;
+
+ /*
+ * It is safe to dereference the exe_file without RCU as
+ * this function is only called if nobody else can access
+ * this mm -- see comment above for justification.
+ */
+ old_exe_file = rcu_dereference_raw(mm->exe_file);
+
if (new_exe_file)
get_file(new_exe_file);
- if (mm->exe_file)
- fput(mm->exe_file);
- mm->exe_file = new_exe_file;
+ rcu_assign_pointer(mm->exe_file, new_exe_file);
+ if (old_exe_file)
+ fput(old_exe_file);
}
+/**
+ * get_mm_exe_file - acquire a reference to the mm's executable file
+ *
+ * Returns %NULL if mm has no associated executable file.
+ * User must release file via fput().
+ */
struct file *get_mm_exe_file(struct mm_struct *mm)
{
struct file *exe_file;
- /* We need mmap_sem to protect against races with removal of exe_file */
- down_read(&mm->mmap_sem);
- exe_file = mm->exe_file;
- if (exe_file)
- get_file(exe_file);
- up_read(&mm->mmap_sem);
+ rcu_read_lock();
+ exe_file = rcu_dereference(mm->exe_file);
+ if (exe_file && !get_file_rcu(exe_file))
+ exe_file = NULL;
+ rcu_read_unlock();
return exe_file;
}
-
-static void dup_mm_exe_file(struct mm_struct *oldmm, struct mm_struct *newmm)
-{
- /* It's safe to write the exe_file pointer without exe_file_lock because
- * this is called during fork when the task is not yet in /proc */
- newmm->exe_file = get_mm_exe_file(oldmm);
-}
+EXPORT_SYMBOL(get_mm_exe_file);
/**
* get_task_mm - acquire a reference to the task's mm
@@ -864,8 +921,6 @@ static struct mm_struct *dup_mm(struct task_struct *tsk)
if (!mm_init(mm, tsk))
goto fail_nomem;
- dup_mm_exe_file(oldmm, mm);
-
err = dup_mmap(mm, oldmm);
if (err)
goto free_pt;
@@ -1018,6 +1073,7 @@ static int copy_sighand(unsigned long clone_flags, struct task_struct *tsk)
rcu_assign_pointer(tsk->sighand, sig);
if (!sig)
return -ENOMEM;
+
atomic_set(&sig->count, 1);
memcpy(sig->action, current->sighand->action, sizeof(sig->action));
return 0;
@@ -1042,10 +1098,7 @@ static void posix_cpu_timers_init_group(struct signal_struct *sig)
{
unsigned long cpu_limit;
- /* Thread group counters. */
- thread_group_cputime_init(sig);
-
- cpu_limit = ACCESS_ONCE(sig->rlim[RLIMIT_CPU].rlim_cur);
+ cpu_limit = READ_ONCE(sig->rlim[RLIMIT_CPU].rlim_cur);
if (cpu_limit != RLIM_INFINITY) {
sig->cputime_expires.prof_exp = secs_to_cputime(cpu_limit);
sig->cputimer.running = 1;
@@ -1082,6 +1135,7 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
init_sigpending(&sig->shared_pending);
INIT_LIST_HEAD(&sig->posix_timers);
seqlock_init(&sig->stats_lock);
+ prev_cputime_init(&sig->prev_cputime);
hrtimer_init(&sig->real_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
sig->real_timer.function = it_real_fn;
@@ -1095,10 +1149,6 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
tty_audit_fork(sig);
sched_autogroup_fork(sig);
-#ifdef CONFIG_CGROUPS
- init_rwsem(&sig->group_rwsem);
-#endif
-
sig->oom_score_adj = current->signal->oom_score_adj;
sig->oom_score_adj_min = current->signal->oom_score_adj_min;
@@ -1192,10 +1242,12 @@ static struct task_struct *copy_process(unsigned long clone_flags,
unsigned long stack_size,
int __user *child_tidptr,
struct pid *pid,
- int trace)
+ int trace,
+ unsigned long tls)
{
int retval;
struct task_struct *p;
+ void *cgrp_ss_priv[CGROUP_CANFORK_COUNT] = {};
if ((clone_flags & (CLONE_NEWNS|CLONE_FS)) == (CLONE_NEWNS|CLONE_FS))
return ERR_PTR(-EINVAL);
@@ -1230,10 +1282,9 @@ static struct task_struct *copy_process(unsigned long clone_flags,
/*
* If the new process will be in a different pid or user namespace
- * do not allow it to share a thread group or signal handlers or
- * parent with the forking task.
+ * do not allow it to share a thread group with the forking task.
*/
- if (clone_flags & CLONE_SIGHAND) {
+ if (clone_flags & CLONE_THREAD) {
if ((clone_flags & (CLONE_NEWUSER | CLONE_NEWPID)) ||
(task_active_pid_ns(current) !=
current->nsproxy->pid_ns_for_children))
@@ -1279,9 +1330,6 @@ static struct task_struct *copy_process(unsigned long clone_flags,
if (nr_threads >= max_threads)
goto bad_fork_cleanup_count;
- if (!try_module_get(task_thread_info(p)->exec_domain->module))
- goto bad_fork_cleanup_count;
-
delayacct_tsk_init(p); /* Must remain after dup_task_struct() */
p->flags &= ~(PF_SUPERPRIV | PF_WQ_WORKER);
p->flags |= PF_FORKNOEXEC;
@@ -1295,9 +1343,8 @@ static struct task_struct *copy_process(unsigned long clone_flags,
p->utime = p->stime = p->gtime = 0;
p->utimescaled = p->stimescaled = 0;
-#ifndef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
- p->prev_cputime.utime = p->prev_cputime.stime = 0;
-#endif
+ prev_cputime_init(&p->prev_cputime);
+
#ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
seqlock_init(&p->vtime_seqlock);
p->vtime_snap = 0;
@@ -1350,6 +1397,9 @@ static struct task_struct *copy_process(unsigned long clone_flags,
p->hardirq_context = 0;
p->softirq_context = 0;
#endif
+
+ p->pagefault_disabled = 0;
+
#ifdef CONFIG_LOCKDEP
p->lockdep_depth = 0; /* no locks held yet */
p->curr_chain_key = 0;
@@ -1401,15 +1451,16 @@ static struct task_struct *copy_process(unsigned long clone_flags,
retval = copy_io(clone_flags, p);
if (retval)
goto bad_fork_cleanup_namespaces;
- retval = copy_thread(clone_flags, stack_start, stack_size, p);
+ retval = copy_thread_tls(clone_flags, stack_start, stack_size, p, tls);
if (retval)
goto bad_fork_cleanup_io;
if (pid != &init_struct_pid) {
- retval = -ENOMEM;
pid = alloc_pid(p->nsproxy->pid_ns_for_children);
- if (!pid)
+ if (IS_ERR(pid)) {
+ retval = PTR_ERR(pid);
goto bad_fork_cleanup_io;
+ }
}
p->set_child_tid = (clone_flags & CLONE_CHILD_SETTID) ? child_tidptr : NULL;
@@ -1469,6 +1520,16 @@ static struct task_struct *copy_process(unsigned long clone_flags,
p->task_works = NULL;
/*
+ * Ensure that the cgroup subsystem policies allow the new process to be
+ * forked. It should be noted the the new process's css_set can be changed
+ * between here and cgroup_post_fork() if an organisation operation is in
+ * progress.
+ */
+ retval = cgroup_can_fork(p, cgrp_ss_priv);
+ if (retval)
+ goto bad_fork_free_pid;
+
+ /*
* Make it visible to the rest of the system, but dont wake it up yet.
* Need tasklist lock for parent etc handling!
*/
@@ -1504,7 +1565,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
spin_unlock(&current->sighand->siglock);
write_unlock_irq(&tasklist_lock);
retval = -ERESTARTNOINTR;
- goto bad_fork_free_pid;
+ goto bad_fork_cancel_cgroup;
}
if (likely(p->pid)) {
@@ -1546,7 +1607,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
write_unlock_irq(&tasklist_lock);
proc_fork_connector(p);
- cgroup_post_fork(p);
+ cgroup_post_fork(p, cgrp_ss_priv);
if (clone_flags & CLONE_THREAD)
threadgroup_change_end(current);
perf_event_fork(p);
@@ -1556,6 +1617,8 @@ static struct task_struct *copy_process(unsigned long clone_flags,
return p;
+bad_fork_cancel_cgroup:
+ cgroup_cancel_fork(p, cgrp_ss_priv);
bad_fork_free_pid:
if (pid != &init_struct_pid)
free_pid(pid);
@@ -1590,7 +1653,6 @@ bad_fork_cleanup_threadgroup_lock:
if (clone_flags & CLONE_THREAD)
threadgroup_change_end(current);
delayacct_tsk_free(p);
- module_put(task_thread_info(p)->exec_domain->module);
bad_fork_cleanup_count:
atomic_dec(&p->cred->user->processes);
exit_creds(p);
@@ -1613,7 +1675,7 @@ static inline void init_idle_pids(struct pid_link *links)
struct task_struct *fork_idle(int cpu)
{
struct task_struct *task;
- task = copy_process(CLONE_VM, 0, 0, NULL, &init_struct_pid, 0);
+ task = copy_process(CLONE_VM, 0, 0, NULL, &init_struct_pid, 0, 0);
if (!IS_ERR(task)) {
init_idle_pids(task->pids);
init_idle(task, cpu);
@@ -1628,11 +1690,12 @@ struct task_struct *fork_idle(int cpu)
* It copies the process, and if successful kick-starts
* it and waits for it to finish using the VM if required.
*/
-long do_fork(unsigned long clone_flags,
+long _do_fork(unsigned long clone_flags,
unsigned long stack_start,
unsigned long stack_size,
int __user *parent_tidptr,
- int __user *child_tidptr)
+ int __user *child_tidptr,
+ unsigned long tls)
{
struct task_struct *p;
int trace = 0;
@@ -1657,7 +1720,7 @@ long do_fork(unsigned long clone_flags,
}
p = copy_process(clone_flags, stack_start, stack_size,
- child_tidptr, NULL, trace);
+ child_tidptr, NULL, trace, tls);
/*
* Do this prior waking up the new thread - the thread pointer
* might get invalid after that point, if the thread exits quickly.
@@ -1698,20 +1761,34 @@ long do_fork(unsigned long clone_flags,
return nr;
}
+#ifndef CONFIG_HAVE_COPY_THREAD_TLS
+/* For compatibility with architectures that call do_fork directly rather than
+ * using the syscall entry points below. */
+long do_fork(unsigned long clone_flags,
+ unsigned long stack_start,
+ unsigned long stack_size,
+ int __user *parent_tidptr,
+ int __user *child_tidptr)
+{
+ return _do_fork(clone_flags, stack_start, stack_size,
+ parent_tidptr, child_tidptr, 0);
+}
+#endif
+
/*
* Create a kernel thread.
*/
pid_t kernel_thread(int (*fn)(void *), void *arg, unsigned long flags)
{
- return do_fork(flags|CLONE_VM|CLONE_UNTRACED, (unsigned long)fn,
- (unsigned long)arg, NULL, NULL);
+ return _do_fork(flags|CLONE_VM|CLONE_UNTRACED, (unsigned long)fn,
+ (unsigned long)arg, NULL, NULL, 0);
}
#ifdef __ARCH_WANT_SYS_FORK
SYSCALL_DEFINE0(fork)
{
#ifdef CONFIG_MMU
- return do_fork(SIGCHLD, 0, 0, NULL, NULL);
+ return _do_fork(SIGCHLD, 0, 0, NULL, NULL, 0);
#else
/* can not support in nommu mode */
return -EINVAL;
@@ -1722,8 +1799,8 @@ SYSCALL_DEFINE0(fork)
#ifdef __ARCH_WANT_SYS_VFORK
SYSCALL_DEFINE0(vfork)
{
- return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, 0,
- 0, NULL, NULL);
+ return _do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, 0,
+ 0, NULL, NULL, 0);
}
#endif
@@ -1731,27 +1808,27 @@ SYSCALL_DEFINE0(vfork)
#ifdef CONFIG_CLONE_BACKWARDS
SYSCALL_DEFINE5(clone, unsigned long, clone_flags, unsigned long, newsp,
int __user *, parent_tidptr,
- int, tls_val,
+ unsigned long, tls,
int __user *, child_tidptr)
#elif defined(CONFIG_CLONE_BACKWARDS2)
SYSCALL_DEFINE5(clone, unsigned long, newsp, unsigned long, clone_flags,
int __user *, parent_tidptr,
int __user *, child_tidptr,
- int, tls_val)
+ unsigned long, tls)
#elif defined(CONFIG_CLONE_BACKWARDS3)
SYSCALL_DEFINE6(clone, unsigned long, clone_flags, unsigned long, newsp,
int, stack_size,
int __user *, parent_tidptr,
int __user *, child_tidptr,
- int, tls_val)
+ unsigned long, tls)
#else
SYSCALL_DEFINE5(clone, unsigned long, clone_flags, unsigned long, newsp,
int __user *, parent_tidptr,
int __user *, child_tidptr,
- int, tls_val)
+ unsigned long, tls)
#endif
{
- return do_fork(clone_flags, newsp, 0, parent_tidptr, child_tidptr);
+ return _do_fork(clone_flags, newsp, 0, parent_tidptr, child_tidptr, tls);
}
#endif
@@ -1808,13 +1885,21 @@ static int check_unshare_flags(unsigned long unshare_flags)
CLONE_NEWUSER|CLONE_NEWPID))
return -EINVAL;
/*
- * Not implemented, but pretend it works if there is nothing to
- * unshare. Note that unsharing CLONE_THREAD or CLONE_SIGHAND
- * needs to unshare vm.
+ * Not implemented, but pretend it works if there is nothing
+ * to unshare. Note that unsharing the address space or the
+ * signal handlers also need to unshare the signal queues (aka
+ * CLONE_THREAD).
*/
if (unshare_flags & (CLONE_THREAD | CLONE_SIGHAND | CLONE_VM)) {
- /* FIXME: get_task_mm() increments ->mm_users */
- if (atomic_read(&current->mm->mm_users) > 1)
+ if (!thread_group_empty(current))
+ return -EINVAL;
+ }
+ if (unshare_flags & (CLONE_SIGHAND | CLONE_VM)) {
+ if (atomic_read(&current->sighand->count) > 1)
+ return -EINVAL;
+ }
+ if (unshare_flags & CLONE_VM) {
+ if (!current_is_single_threaded())
return -EINVAL;
}
@@ -1878,21 +1963,22 @@ SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags)
int err;
/*
- * If unsharing a user namespace must also unshare the thread.
+ * If unsharing a user namespace must also unshare the thread group
+ * and unshare the filesystem root and working directories.
*/
if (unshare_flags & CLONE_NEWUSER)
unshare_flags |= CLONE_THREAD | CLONE_FS;
/*
- * If unsharing a thread from a thread group, must also unshare vm.
- */
- if (unshare_flags & CLONE_THREAD)
- unshare_flags |= CLONE_VM;
- /*
* If unsharing vm, must also unshare signal handlers.
*/
if (unshare_flags & CLONE_VM)
unshare_flags |= CLONE_SIGHAND;
/*
+ * If unsharing a signal handlers, must also unshare the signal queues.
+ */
+ if (unshare_flags & CLONE_SIGHAND)
+ unshare_flags |= CLONE_THREAD;
+ /*
* If unsharing namespace, must also unshare filesystem information.
*/
if (unshare_flags & CLONE_NEWNS)
@@ -2004,3 +2090,26 @@ int unshare_files(struct files_struct **displaced)
task_unlock(task);
return 0;
}
+
+int sysctl_max_threads(struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+ struct ctl_table t;
+ int ret;
+ int threads = max_threads;
+ int min = MIN_THREADS;
+ int max = MAX_THREADS;
+
+ t = *table;
+ t.data = &threads;
+ t.extra1 = &min;
+ t.extra2 = &max;
+
+ ret = proc_dointvec_minmax(&t, write, buffer, lenp, ppos);
+ if (ret || !write)
+ return ret;
+
+ set_max_threads(threads);
+
+ return 0;
+}
diff --git a/kernel/futex.c b/kernel/futex.c
index 2a5e3830e953..6e443efc65f4 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -64,6 +64,7 @@
#include <linux/hugetlb.h>
#include <linux/freezer.h>
#include <linux/bootmem.h>
+#include <linux/fault-inject.h>
#include <asm/futex.h>
@@ -258,6 +259,66 @@ static unsigned long __read_mostly futex_hashsize;
static struct futex_hash_bucket *futex_queues;
+/*
+ * Fault injections for futexes.
+ */
+#ifdef CONFIG_FAIL_FUTEX
+
+static struct {
+ struct fault_attr attr;
+
+ u32 ignore_private;
+} fail_futex = {
+ .attr = FAULT_ATTR_INITIALIZER,
+ .ignore_private = 0,
+};
+
+static int __init setup_fail_futex(char *str)
+{
+ return setup_fault_attr(&fail_futex.attr, str);
+}
+__setup("fail_futex=", setup_fail_futex);
+
+static bool should_fail_futex(bool fshared)
+{
+ if (fail_futex.ignore_private && !fshared)
+ return false;
+
+ return should_fail(&fail_futex.attr, 1);
+}
+
+#ifdef CONFIG_FAULT_INJECTION_DEBUG_FS
+
+static int __init fail_futex_debugfs(void)
+{
+ umode_t mode = S_IFREG | S_IRUSR | S_IWUSR;
+ struct dentry *dir;
+
+ dir = fault_create_debugfs_attr("fail_futex", NULL,
+ &fail_futex.attr);
+ if (IS_ERR(dir))
+ return PTR_ERR(dir);
+
+ if (!debugfs_create_bool("ignore-private", mode, dir,
+ &fail_futex.ignore_private)) {
+ debugfs_remove_recursive(dir);
+ return -ENOMEM;
+ }
+
+ return 0;
+}
+
+late_initcall(fail_futex_debugfs);
+
+#endif /* CONFIG_FAULT_INJECTION_DEBUG_FS */
+
+#else
+static inline bool should_fail_futex(bool fshared)
+{
+ return false;
+}
+#endif /* CONFIG_FAIL_FUTEX */
+
static inline void futex_get_mm(union futex_key *key)
{
atomic_inc(&key->private.mm->mm_count);
@@ -413,6 +474,9 @@ get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key, int rw)
if (unlikely(!access_ok(rw, uaddr, sizeof(u32))))
return -EFAULT;
+ if (unlikely(should_fail_futex(fshared)))
+ return -EFAULT;
+
/*
* PROCESS_PRIVATE futexes are fast.
* As the mm cannot disappear under us and the 'key' only needs
@@ -428,6 +492,10 @@ get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key, int rw)
}
again:
+ /* Ignore any VERIFY_READ mapping (futex common case) */
+ if (unlikely(should_fail_futex(fshared)))
+ return -EFAULT;
+
err = get_user_pages_fast(address, 1, 1, &page);
/*
* If write access is not required (eg. FUTEX_WAIT), try
@@ -516,7 +584,7 @@ again:
* A RO anonymous page will never change and thus doesn't make
* sense for futex operations.
*/
- if (ro) {
+ if (unlikely(should_fail_futex(fshared)) || ro) {
err = -EFAULT;
goto out;
}
@@ -900,7 +968,7 @@ static int attach_to_pi_owner(u32 uval, union futex_key *key,
if (!p)
return -ESRCH;
- if (!p->mm) {
+ if (unlikely(p->flags & PF_KTHREAD)) {
put_task_struct(p);
return -EPERM;
}
@@ -974,6 +1042,9 @@ static int lock_pi_update_atomic(u32 __user *uaddr, u32 uval, u32 newval)
{
u32 uninitialized_var(curval);
+ if (unlikely(should_fail_futex(true)))
+ return -EFAULT;
+
if (unlikely(cmpxchg_futex_value_locked(&curval, uaddr, uval, newval)))
return -EFAULT;
@@ -1015,12 +1086,18 @@ static int futex_lock_pi_atomic(u32 __user *uaddr, struct futex_hash_bucket *hb,
if (get_futex_value_locked(&uval, uaddr))
return -EFAULT;
+ if (unlikely(should_fail_futex(true)))
+ return -EFAULT;
+
/*
* Detect deadlocks.
*/
if ((unlikely((uval & FUTEX_TID_MASK) == vpid)))
return -EDEADLK;
+ if ((unlikely(should_fail_futex(true))))
+ return -EDEADLK;
+
/*
* Lookup existing state first. If it exists, try to attach to
* its pi_state.
@@ -1090,9 +1167,11 @@ static void __unqueue_futex(struct futex_q *q)
/*
* The hash bucket lock must be held when this is called.
- * Afterwards, the futex_q must not be accessed.
+ * Afterwards, the futex_q must not be accessed. Callers
+ * must ensure to later call wake_up_q() for the actual
+ * wakeups to occur.
*/
-static void wake_futex(struct futex_q *q)
+static void mark_wake_futex(struct wake_q_head *wake_q, struct futex_q *q)
{
struct task_struct *p = q->task;
@@ -1100,14 +1179,10 @@ static void wake_futex(struct futex_q *q)
return;
/*
- * We set q->lock_ptr = NULL _before_ we wake up the task. If
- * a non-futex wake up happens on another CPU then the task
- * might exit and p would dereference a non-existing task
- * struct. Prevent this by holding a reference on p across the
- * wake up.
+ * Queue the task for later wakeup for after we've released
+ * the hb->lock. wake_q_add() grabs reference to p.
*/
- get_task_struct(p);
-
+ wake_q_add(wake_q, p);
__unqueue_futex(q);
/*
* The waiting task can free the futex_q as soon as
@@ -1117,16 +1192,16 @@ static void wake_futex(struct futex_q *q)
*/
smp_wmb();
q->lock_ptr = NULL;
-
- wake_up_state(p, TASK_NORMAL);
- put_task_struct(p);
}
-static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this)
+static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this,
+ struct futex_hash_bucket *hb)
{
struct task_struct *new_owner;
struct futex_pi_state *pi_state = this->pi_state;
u32 uninitialized_var(curval), newval;
+ WAKE_Q(wake_q);
+ bool deboost;
int ret = 0;
if (!pi_state)
@@ -1157,6 +1232,9 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this)
*/
newval = FUTEX_WAITERS | task_pid_vnr(new_owner);
+ if (unlikely(should_fail_futex(true)))
+ ret = -EFAULT;
+
if (cmpxchg_futex_value_locked(&curval, uaddr, uval, newval))
ret = -EFAULT;
else if (curval != uval)
@@ -1178,7 +1256,19 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this)
raw_spin_unlock_irq(&new_owner->pi_lock);
raw_spin_unlock(&pi_state->pi_mutex.wait_lock);
- rt_mutex_unlock(&pi_state->pi_mutex);
+
+ deboost = rt_mutex_futex_unlock(&pi_state->pi_mutex, &wake_q);
+
+ /*
+ * First unlock HB so the waiter does not spin on it once he got woken
+ * up. Second wake up the waiter before the priority is adjusted. If we
+ * deboost first (and lose our higher priority), then the task might get
+ * scheduled away before the wake up can take place.
+ */
+ spin_unlock(&hb->lock);
+ wake_up_q(&wake_q);
+ if (deboost)
+ rt_mutex_adjust_prio(current);
return 0;
}
@@ -1217,6 +1307,7 @@ futex_wake(u32 __user *uaddr, unsigned int flags, int nr_wake, u32 bitset)
struct futex_q *this, *next;
union futex_key key = FUTEX_KEY_INIT;
int ret;
+ WAKE_Q(wake_q);
if (!bitset)
return -EINVAL;
@@ -1244,13 +1335,14 @@ futex_wake(u32 __user *uaddr, unsigned int flags, int nr_wake, u32 bitset)
if (!(this->bitset & bitset))
continue;
- wake_futex(this);
+ mark_wake_futex(&wake_q, this);
if (++ret >= nr_wake)
break;
}
}
spin_unlock(&hb->lock);
+ wake_up_q(&wake_q);
out_put_key:
put_futex_key(&key);
out:
@@ -1269,6 +1361,7 @@ futex_wake_op(u32 __user *uaddr1, unsigned int flags, u32 __user *uaddr2,
struct futex_hash_bucket *hb1, *hb2;
struct futex_q *this, *next;
int ret, op_ret;
+ WAKE_Q(wake_q);
retry:
ret = get_futex_key(uaddr1, flags & FLAGS_SHARED, &key1, VERIFY_READ);
@@ -1320,7 +1413,7 @@ retry_private:
ret = -EINVAL;
goto out_unlock;
}
- wake_futex(this);
+ mark_wake_futex(&wake_q, this);
if (++ret >= nr_wake)
break;
}
@@ -1334,7 +1427,7 @@ retry_private:
ret = -EINVAL;
goto out_unlock;
}
- wake_futex(this);
+ mark_wake_futex(&wake_q, this);
if (++op_ret >= nr_wake2)
break;
}
@@ -1344,6 +1437,7 @@ retry_private:
out_unlock:
double_unlock_hb(hb1, hb2);
+ wake_up_q(&wake_q);
out_put_keys:
put_futex_key(&key2);
out_put_key1:
@@ -1443,6 +1537,9 @@ static int futex_proxy_trylock_atomic(u32 __user *pifutex,
if (get_futex_value_locked(&curval, pifutex))
return -EFAULT;
+ if (unlikely(should_fail_futex(true)))
+ return -EFAULT;
+
/*
* Find the top_waiter and determine if there are additional waiters.
* If the caller intends to requeue more than 1 waiter to pifutex,
@@ -1503,6 +1600,7 @@ static int futex_requeue(u32 __user *uaddr1, unsigned int flags,
struct futex_pi_state *pi_state = NULL;
struct futex_hash_bucket *hb1, *hb2;
struct futex_q *this, *next;
+ WAKE_Q(wake_q);
if (requeue_pi) {
/*
@@ -1679,7 +1777,7 @@ retry_private:
* woken by futex_unlock_pi().
*/
if (++task_count <= nr_wake && !requeue_pi) {
- wake_futex(this);
+ mark_wake_futex(&wake_q, this);
continue;
}
@@ -1719,6 +1817,7 @@ retry_private:
out_unlock:
free_pi_state(pi_state);
double_unlock_hb(hb1, hb2);
+ wake_up_q(&wake_q);
hb_waiters_dec(hb2);
/*
@@ -2055,7 +2154,7 @@ static void futex_wait_queue_me(struct futex_hash_bucket *hb, struct futex_q *q,
{
/*
* The task state is guaranteed to be set before another task can
- * wake it. set_current_state() is implemented using set_mb() and
+ * wake it. set_current_state() is implemented using smp_store_mb() and
* queue_me() calls spin_unlock() upon completion, both serializing
* access to the hash list and forcing another memory barrier.
*/
@@ -2063,11 +2162,8 @@ static void futex_wait_queue_me(struct futex_hash_bucket *hb, struct futex_q *q,
queue_me(q, hb);
/* Arm the timer */
- if (timeout) {
+ if (timeout)
hrtimer_start_expires(&timeout->timer, HRTIMER_MODE_ABS);
- if (!hrtimer_active(&timeout->timer))
- timeout->task = NULL;
- }
/*
* If we have been removed from the hash list, then another task
@@ -2255,8 +2351,11 @@ static long futex_wait_restart(struct restart_block *restart)
/*
* Userspace tried a 0 -> TID atomic transition of the futex value
* and failed. The kernel side here does the whole locking operation:
- * if there are waiters then it will block, it does PI, etc. (Due to
- * races the kernel might see a 0 value of the futex too.)
+ * if there are waiters then it will block as a consequence of relying
+ * on rt-mutexes, it does PI, etc. (Due to races the kernel might see
+ * a 0 value of the futex too.).
+ *
+ * Also serves as futex trylock_pi()'ing, and due semantics.
*/
static int futex_lock_pi(u32 __user *uaddr, unsigned int flags,
ktime_t *time, int trylock)
@@ -2287,6 +2386,10 @@ retry_private:
ret = futex_lock_pi_atomic(uaddr, hb, &q.key, &q.pi_state, current, 0);
if (unlikely(ret)) {
+ /*
+ * Atomic work succeeded and we got the lock,
+ * or failed. Either way, we do _not_ block.
+ */
switch (ret) {
case 1:
/* We got the lock. */
@@ -2412,13 +2515,23 @@ retry:
*/
match = futex_top_waiter(hb, &key);
if (match) {
- ret = wake_futex_pi(uaddr, uval, match);
+ ret = wake_futex_pi(uaddr, uval, match, hb);
+ /*
+ * In case of success wake_futex_pi dropped the hash
+ * bucket lock.
+ */
+ if (!ret)
+ goto out_putkey;
/*
* The atomic access to the futex value generated a
* pagefault, so retry the user-access and the wakeup:
*/
if (ret == -EFAULT)
goto pi_faulted;
+ /*
+ * wake_futex_pi has detected invalid state. Tell user
+ * space.
+ */
goto out_unlock;
}
@@ -2439,6 +2552,7 @@ retry:
out_unlock:
spin_unlock(&hb->lock);
+out_putkey:
put_futex_key(&key);
return ret;
@@ -2506,7 +2620,7 @@ int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb,
* futex_wait_requeue_pi() - Wait on uaddr and take uaddr2
* @uaddr: the futex we initially wait on (non-pi)
* @flags: futex flags (FLAGS_SHARED, FLAGS_CLOCKRT, etc.), they must be
- * the same type, no requeueing from private to shared, etc.
+ * the same type, no requeueing from private to shared, etc.
* @val: the expected value of uaddr
* @abs_time: absolute timeout
* @bitset: 32 bit wakeup bitset set by userspace, defaults to all
@@ -2981,6 +3095,8 @@ SYSCALL_DEFINE6(futex, u32 __user *, uaddr, int, op, u32, val,
if (utime && (cmd == FUTEX_WAIT || cmd == FUTEX_LOCK_PI ||
cmd == FUTEX_WAIT_BITSET ||
cmd == FUTEX_WAIT_REQUEUE_PI)) {
+ if (unlikely(should_fail_futex(!(op & FUTEX_PRIVATE_FLAG))))
+ return -EFAULT;
if (copy_from_user(&ts, utime, sizeof(ts)) != 0)
return -EFAULT;
if (!timespec_valid(&ts))
diff --git a/kernel/gcov/base.c b/kernel/gcov/base.c
index b358a802fd18..7080ae1eb6c1 100644
--- a/kernel/gcov/base.c
+++ b/kernel/gcov/base.c
@@ -18,6 +18,7 @@
#include <linux/init.h>
#include <linux/module.h>
#include <linux/mutex.h>
+#include <linux/sched.h>
#include "gcov.h"
static int gcov_events_enabled;
@@ -91,6 +92,12 @@ void __gcov_merge_time_profile(gcov_type *counters, unsigned int n_counters)
}
EXPORT_SYMBOL(__gcov_merge_time_profile);
+void __gcov_merge_icall_topn(gcov_type *counters, unsigned int n_counters)
+{
+ /* Unused. */
+}
+EXPORT_SYMBOL(__gcov_merge_icall_topn);
+
/**
* gcov_enable_events - enable event reporting through gcov_event()
*
@@ -107,8 +114,10 @@ void gcov_enable_events(void)
gcov_events_enabled = 1;
/* Perform event callback for previously registered entries. */
- while ((info = gcov_info_next(info)))
+ while ((info = gcov_info_next(info))) {
gcov_event(GCOV_ADD, info);
+ cond_resched();
+ }
mutex_unlock(&gcov_lock);
}
diff --git a/kernel/gcov/gcc_4_7.c b/kernel/gcov/gcc_4_7.c
index 826ba9fb5e32..e25e92fb44fa 100644
--- a/kernel/gcov/gcc_4_7.c
+++ b/kernel/gcov/gcc_4_7.c
@@ -18,7 +18,9 @@
#include <linux/vmalloc.h>
#include "gcov.h"
-#if __GNUC__ == 4 && __GNUC_MINOR__ >= 9
+#if __GNUC__ == 5 && __GNUC_MINOR__ >= 1
+#define GCOV_COUNTERS 10
+#elif __GNUC__ == 4 && __GNUC_MINOR__ >= 9
#define GCOV_COUNTERS 9
#else
#define GCOV_COUNTERS 8
diff --git a/kernel/groups.c b/kernel/groups.c
index 664411f171b5..74d431d25251 100644
--- a/kernel/groups.c
+++ b/kernel/groups.c
@@ -9,9 +9,6 @@
#include <linux/user_namespace.h>
#include <asm/uaccess.h>
-/* init to 2 - one for init_task, one to ensure it is never freed */
-struct group_info init_groups = { .usage = ATOMIC_INIT(2) };
-
struct group_info *groups_alloc(int gidsetsize)
{
struct group_info *group_info;
diff --git a/kernel/hung_task.c b/kernel/hung_task.c
index 06db12434d72..e0f90c2b57aa 100644
--- a/kernel/hung_task.c
+++ b/kernel/hung_task.c
@@ -169,7 +169,7 @@ static void check_hung_uninterruptible_tasks(unsigned long timeout)
return;
rcu_read_lock();
- do_each_thread(g, t) {
+ for_each_process_thread(g, t) {
if (!max_count--)
goto unlock;
if (!--batch_count) {
@@ -180,7 +180,7 @@ static void check_hung_uninterruptible_tasks(unsigned long timeout)
/* use "==" to skip the TASK_KILLABLE tasks waiting on NFS */
if (t->state == TASK_UNINTERRUPTIBLE)
check_hung_task(t, timeout);
- } while_each_thread(g, t);
+ }
unlock:
rcu_read_unlock();
}
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index 6f1c7a566b95..6e40a9539763 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -63,7 +63,7 @@ int irq_set_irq_type(unsigned int irq, unsigned int type)
return -EINVAL;
type &= IRQ_TYPE_SENSE_MASK;
- ret = __irq_set_trigger(desc, irq, type);
+ ret = __irq_set_trigger(desc, type);
irq_put_desc_busunlock(desc, flags);
return ret;
}
@@ -187,7 +187,7 @@ int irq_startup(struct irq_desc *desc, bool resend)
irq_enable(desc);
}
if (resend)
- check_irq_resend(desc, desc->irq_data.irq);
+ check_irq_resend(desc);
return ret;
}
@@ -315,7 +315,7 @@ void handle_nested_irq(unsigned int irq)
raw_spin_lock_irq(&desc->lock);
desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING);
- kstat_incr_irqs_this_cpu(irq, desc);
+ kstat_incr_irqs_this_cpu(desc);
action = desc->action;
if (unlikely(!action || irqd_irq_disabled(&desc->irq_data))) {
@@ -328,7 +328,7 @@ void handle_nested_irq(unsigned int irq)
action_ret = action->thread_fn(action->irq, action->dev_id);
if (!noirqdebug)
- note_interrupt(irq, desc, action_ret);
+ note_interrupt(desc, action_ret);
raw_spin_lock_irq(&desc->lock);
irqd_clear(&desc->irq_data, IRQD_IRQ_INPROGRESS);
@@ -391,7 +391,7 @@ handle_simple_irq(unsigned int irq, struct irq_desc *desc)
goto out_unlock;
desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING);
- kstat_incr_irqs_this_cpu(irq, desc);
+ kstat_incr_irqs_this_cpu(desc);
if (unlikely(!desc->action || irqd_irq_disabled(&desc->irq_data))) {
desc->istate |= IRQS_PENDING;
@@ -443,7 +443,7 @@ handle_level_irq(unsigned int irq, struct irq_desc *desc)
goto out_unlock;
desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING);
- kstat_incr_irqs_this_cpu(irq, desc);
+ kstat_incr_irqs_this_cpu(desc);
/*
* If its disabled or no action available
@@ -515,7 +515,7 @@ handle_fasteoi_irq(unsigned int irq, struct irq_desc *desc)
goto out;
desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING);
- kstat_incr_irqs_this_cpu(irq, desc);
+ kstat_incr_irqs_this_cpu(desc);
/*
* If its disabled or no action available
@@ -583,7 +583,7 @@ handle_edge_irq(unsigned int irq, struct irq_desc *desc)
goto out_unlock;
}
- kstat_incr_irqs_this_cpu(irq, desc);
+ kstat_incr_irqs_this_cpu(desc);
/* Start handling the irq */
desc->irq_data.chip->irq_ack(&desc->irq_data);
@@ -646,7 +646,7 @@ void handle_edge_eoi_irq(unsigned int irq, struct irq_desc *desc)
goto out_eoi;
}
- kstat_incr_irqs_this_cpu(irq, desc);
+ kstat_incr_irqs_this_cpu(desc);
do {
if (unlikely(!desc->action))
@@ -675,7 +675,7 @@ handle_percpu_irq(unsigned int irq, struct irq_desc *desc)
{
struct irq_chip *chip = irq_desc_get_chip(desc);
- kstat_incr_irqs_this_cpu(irq, desc);
+ kstat_incr_irqs_this_cpu(desc);
if (chip->irq_ack)
chip->irq_ack(&desc->irq_data);
@@ -705,7 +705,7 @@ void handle_percpu_devid_irq(unsigned int irq, struct irq_desc *desc)
void *dev_id = raw_cpu_ptr(action->percpu_dev_id);
irqreturn_t res;
- kstat_incr_irqs_this_cpu(irq, desc);
+ kstat_incr_irqs_this_cpu(desc);
if (chip->irq_ack)
chip->irq_ack(&desc->irq_data);
@@ -719,15 +719,9 @@ void handle_percpu_devid_irq(unsigned int irq, struct irq_desc *desc)
}
void
-__irq_set_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained,
- const char *name)
+__irq_do_set_handler(struct irq_desc *desc, irq_flow_handler_t handle,
+ int is_chained, const char *name)
{
- unsigned long flags;
- struct irq_desc *desc = irq_get_desc_buslock(irq, &flags, 0);
-
- if (!desc)
- return;
-
if (!handle) {
handle = handle_bad_irq;
} else {
@@ -749,13 +743,13 @@ __irq_set_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained,
* right away.
*/
if (WARN_ON(is_chained))
- goto out;
+ return;
/* Try the parent */
irq_data = irq_data->parent_data;
}
#endif
if (WARN_ON(!irq_data || irq_data->chip == &no_irq_chip))
- goto out;
+ return;
}
/* Uninstall? */
@@ -774,12 +768,41 @@ __irq_set_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained,
irq_settings_set_nothread(desc);
irq_startup(desc, true);
}
-out:
+}
+
+void
+__irq_set_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained,
+ const char *name)
+{
+ unsigned long flags;
+ struct irq_desc *desc = irq_get_desc_buslock(irq, &flags, 0);
+
+ if (!desc)
+ return;
+
+ __irq_do_set_handler(desc, handle, is_chained, name);
irq_put_desc_busunlock(desc, flags);
}
EXPORT_SYMBOL_GPL(__irq_set_handler);
void
+irq_set_chained_handler_and_data(unsigned int irq, irq_flow_handler_t handle,
+ void *data)
+{
+ unsigned long flags;
+ struct irq_desc *desc = irq_get_desc_buslock(irq, &flags, 0);
+
+ if (!desc)
+ return;
+
+ __irq_do_set_handler(desc, handle, 1, NULL);
+ desc->irq_data.handler_data = data;
+
+ irq_put_desc_busunlock(desc, flags);
+}
+EXPORT_SYMBOL_GPL(irq_set_chained_handler_and_data);
+
+void
irq_set_chip_and_handler_name(unsigned int irq, struct irq_chip *chip,
irq_flow_handler_t handle, const char *name)
{
@@ -876,6 +899,34 @@ void irq_cpu_offline(void)
#ifdef CONFIG_IRQ_DOMAIN_HIERARCHY
/**
+ * irq_chip_enable_parent - Enable the parent interrupt (defaults to unmask if
+ * NULL)
+ * @data: Pointer to interrupt specific data
+ */
+void irq_chip_enable_parent(struct irq_data *data)
+{
+ data = data->parent_data;
+ if (data->chip->irq_enable)
+ data->chip->irq_enable(data);
+ else
+ data->chip->irq_unmask(data);
+}
+
+/**
+ * irq_chip_disable_parent - Disable the parent interrupt (defaults to mask if
+ * NULL)
+ * @data: Pointer to interrupt specific data
+ */
+void irq_chip_disable_parent(struct irq_data *data)
+{
+ data = data->parent_data;
+ if (data->chip->irq_disable)
+ data->chip->irq_disable(data);
+ else
+ data->chip->irq_mask(data);
+}
+
+/**
* irq_chip_ack_parent - Acknowledge the parent interrupt
* @data: Pointer to interrupt specific data
*/
@@ -934,6 +985,23 @@ int irq_chip_set_affinity_parent(struct irq_data *data,
}
/**
+ * irq_chip_set_type_parent - Set IRQ type on the parent interrupt
+ * @data: Pointer to interrupt specific data
+ * @type: IRQ_TYPE_{LEVEL,EDGE}_* value - see include/linux/irq.h
+ *
+ * Conditional, as the underlying parent chip might not implement it.
+ */
+int irq_chip_set_type_parent(struct irq_data *data, unsigned int type)
+{
+ data = data->parent_data;
+
+ if (data->chip->irq_set_type)
+ return data->chip->irq_set_type(data, type);
+
+ return -ENOSYS;
+}
+
+/**
* irq_chip_retrigger_hierarchy - Retrigger an interrupt in hardware
* @data: Pointer to interrupt specific data
*
@@ -946,6 +1014,36 @@ int irq_chip_retrigger_hierarchy(struct irq_data *data)
if (data->chip && data->chip->irq_retrigger)
return data->chip->irq_retrigger(data);
+ return 0;
+}
+
+/**
+ * irq_chip_set_vcpu_affinity_parent - Set vcpu affinity on the parent interrupt
+ * @data: Pointer to interrupt specific data
+ * @vcpu_info: The vcpu affinity information
+ */
+int irq_chip_set_vcpu_affinity_parent(struct irq_data *data, void *vcpu_info)
+{
+ data = data->parent_data;
+ if (data->chip->irq_set_vcpu_affinity)
+ return data->chip->irq_set_vcpu_affinity(data, vcpu_info);
+
+ return -ENOSYS;
+}
+
+/**
+ * irq_chip_set_wake_parent - Set/reset wake-up on the parent interrupt
+ * @data: Pointer to interrupt specific data
+ * @on: Whether to set or reset the wake-up capability of this irq
+ *
+ * Conditional, as the underlying parent chip might not implement it.
+ */
+int irq_chip_set_wake_parent(struct irq_data *data, unsigned int on)
+{
+ data = data->parent_data;
+ if (data->chip->irq_set_wake)
+ return data->chip->irq_set_wake(data, on);
+
return -ENOSYS;
}
#endif
diff --git a/kernel/irq/devres.c b/kernel/irq/devres.c
index d5d0f7345c54..74d90a754268 100644
--- a/kernel/irq/devres.c
+++ b/kernel/irq/devres.c
@@ -104,7 +104,7 @@ int devm_request_any_context_irq(struct device *dev, unsigned int irq,
return -ENOMEM;
rc = request_any_context_irq(irq, handler, irqflags, devname, dev_id);
- if (rc) {
+ if (rc < 0) {
devres_free(dr);
return rc;
}
@@ -113,7 +113,7 @@ int devm_request_any_context_irq(struct device *dev, unsigned int irq,
dr->dev_id = dev_id;
devres_add(dev, dr);
- return 0;
+ return rc;
}
EXPORT_SYMBOL(devm_request_any_context_irq);
diff --git a/kernel/irq/dummychip.c b/kernel/irq/dummychip.c
index 988dc58e8847..326a67f2410b 100644
--- a/kernel/irq/dummychip.c
+++ b/kernel/irq/dummychip.c
@@ -42,6 +42,7 @@ struct irq_chip no_irq_chip = {
.irq_enable = noop,
.irq_disable = noop,
.irq_ack = ack_bad,
+ .flags = IRQCHIP_SKIP_SET_WAKE,
};
/*
@@ -57,5 +58,6 @@ struct irq_chip dummy_irq_chip = {
.irq_ack = noop,
.irq_mask = noop,
.irq_unmask = noop,
+ .flags = IRQCHIP_SKIP_SET_WAKE,
};
EXPORT_SYMBOL_GPL(dummy_irq_chip);
diff --git a/kernel/irq/generic-chip.c b/kernel/irq/generic-chip.c
index 61024e8abdef..abd286afbd27 100644
--- a/kernel/irq/generic-chip.c
+++ b/kernel/irq/generic-chip.c
@@ -360,7 +360,7 @@ static struct lock_class_key irq_nested_lock_class;
int irq_map_generic_chip(struct irq_domain *d, unsigned int virq,
irq_hw_number_t hw_irq)
{
- struct irq_data *data = irq_get_irq_data(virq);
+ struct irq_data *data = irq_domain_get_irq_data(d, virq);
struct irq_domain_chip_generic *dgc = d->gc;
struct irq_chip_generic *gc;
struct irq_chip_type *ct;
@@ -405,8 +405,7 @@ int irq_map_generic_chip(struct irq_domain *d, unsigned int virq,
else
data->mask = 1 << idx;
- irq_set_chip_and_handler(virq, chip, ct->handler);
- irq_set_chip_data(virq, gc);
+ irq_domain_set_info(d, virq, hw_irq, chip, gc, ct->handler, NULL, NULL);
irq_modify_status(virq, dgc->irq_flags_to_clear, dgc->irq_flags_to_set);
return 0;
}
@@ -554,6 +553,9 @@ static int irq_gc_suspend(void)
if (data)
ct->chip.irq_suspend(data);
}
+
+ if (gc->suspend)
+ gc->suspend(gc);
}
return 0;
}
@@ -565,6 +567,9 @@ static void irq_gc_resume(void)
list_for_each_entry(gc, &gc_list, list) {
struct irq_chip_type *ct = gc->chip_types;
+ if (gc->resume)
+ gc->resume(gc);
+
if (ct->chip.irq_resume) {
struct irq_data *data = irq_gc_get_irq_data(gc);
diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
index 635480270858..b6eeea8a80c5 100644
--- a/kernel/irq/handle.c
+++ b/kernel/irq/handle.c
@@ -30,7 +30,7 @@
void handle_bad_irq(unsigned int irq, struct irq_desc *desc)
{
print_irq_desc(irq, desc);
- kstat_incr_irqs_this_cpu(irq, desc);
+ kstat_incr_irqs_this_cpu(desc);
ack_bad_irq(irq);
}
@@ -176,7 +176,7 @@ handle_irq_event_percpu(struct irq_desc *desc, struct irqaction *action)
add_interrupt_randomness(irq, flags);
if (!noirqdebug)
- note_interrupt(irq, desc, retval);
+ note_interrupt(desc, retval);
return retval;
}
diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h
index df553b0af936..eee4b385cffb 100644
--- a/kernel/irq/internals.h
+++ b/kernel/irq/internals.h
@@ -59,12 +59,9 @@ enum {
#include "debug.h"
#include "settings.h"
-#define irq_data_to_desc(data) container_of(data, struct irq_desc, irq_data)
-
-extern int __irq_set_trigger(struct irq_desc *desc, unsigned int irq,
- unsigned long flags);
-extern void __disable_irq(struct irq_desc *desc, unsigned int irq);
-extern void __enable_irq(struct irq_desc *desc, unsigned int irq);
+extern int __irq_set_trigger(struct irq_desc *desc, unsigned long flags);
+extern void __disable_irq(struct irq_desc *desc);
+extern void __enable_irq(struct irq_desc *desc);
extern int irq_startup(struct irq_desc *desc, bool resend);
extern void irq_shutdown(struct irq_desc *desc);
@@ -78,12 +75,8 @@ extern void unmask_threaded_irq(struct irq_desc *desc);
#ifdef CONFIG_SPARSE_IRQ
static inline void irq_mark_irq(unsigned int irq) { }
-extern void irq_lock_sparse(void);
-extern void irq_unlock_sparse(void);
#else
extern void irq_mark_irq(unsigned int irq);
-static inline void irq_lock_sparse(void) { }
-static inline void irq_unlock_sparse(void) { }
#endif
extern void init_kstat_irqs(struct irq_desc *desc, int node, int nr);
@@ -92,7 +85,7 @@ irqreturn_t handle_irq_event_percpu(struct irq_desc *desc, struct irqaction *act
irqreturn_t handle_irq_event(struct irq_desc *desc);
/* Resending of interrupts :*/
-void check_irq_resend(struct irq_desc *desc, unsigned int irq);
+void check_irq_resend(struct irq_desc *desc);
bool irq_wait_for_poll(struct irq_desc *desc);
void __irq_wake_thread(struct irq_desc *desc, struct irqaction *action);
@@ -170,35 +163,40 @@ irq_put_desc_unlock(struct irq_desc *desc, unsigned long flags)
*/
static inline void irqd_set_move_pending(struct irq_data *d)
{
- d->state_use_accessors |= IRQD_SETAFFINITY_PENDING;
+ __irqd_to_state(d) |= IRQD_SETAFFINITY_PENDING;
}
static inline void irqd_clr_move_pending(struct irq_data *d)
{
- d->state_use_accessors &= ~IRQD_SETAFFINITY_PENDING;
+ __irqd_to_state(d) &= ~IRQD_SETAFFINITY_PENDING;
}
static inline void irqd_clear(struct irq_data *d, unsigned int mask)
{
- d->state_use_accessors &= ~mask;
+ __irqd_to_state(d) &= ~mask;
}
static inline void irqd_set(struct irq_data *d, unsigned int mask)
{
- d->state_use_accessors |= mask;
+ __irqd_to_state(d) |= mask;
}
static inline bool irqd_has_set(struct irq_data *d, unsigned int mask)
{
- return d->state_use_accessors & mask;
+ return __irqd_to_state(d) & mask;
}
-static inline void kstat_incr_irqs_this_cpu(unsigned int irq, struct irq_desc *desc)
+static inline void kstat_incr_irqs_this_cpu(struct irq_desc *desc)
{
__this_cpu_inc(*desc->kstat_irqs);
__this_cpu_inc(kstat.irqs_sum);
}
+static inline int irq_desc_get_node(struct irq_desc *desc)
+{
+ return irq_data_get_node(&desc->irq_data);
+}
+
#ifdef CONFIG_PM_SLEEP
bool irq_pm_check_wakeup(struct irq_desc *desc);
void irq_pm_install_action(struct irq_desc *desc, struct irqaction *action);
diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c
index 99793b9b6d23..0a2a4b697bcb 100644
--- a/kernel/irq/irqdesc.c
+++ b/kernel/irq/irqdesc.c
@@ -59,16 +59,10 @@ static void desc_smp_init(struct irq_desc *desc, int node)
#endif
}
-static inline int desc_node(struct irq_desc *desc)
-{
- return desc->irq_data.node;
-}
-
#else
static inline int
alloc_masks(struct irq_desc *desc, gfp_t gfp, int node) { return 0; }
static inline void desc_smp_init(struct irq_desc *desc, int node) { }
-static inline int desc_node(struct irq_desc *desc) { return 0; }
#endif
static void desc_set_defaults(unsigned int irq, struct irq_desc *desc, int node,
@@ -76,6 +70,7 @@ static void desc_set_defaults(unsigned int irq, struct irq_desc *desc, int node,
{
int cpu;
+ desc->irq_data.common = &desc->irq_common_data;
desc->irq_data.irq = irq;
desc->irq_data.chip = &no_irq_chip;
desc->irq_data.chip_data = NULL;
@@ -299,7 +294,7 @@ static void free_desc(unsigned int irq)
unsigned long flags;
raw_spin_lock_irqsave(&desc->lock, flags);
- desc_set_defaults(irq, desc, desc_node(desc), NULL);
+ desc_set_defaults(irq, desc, irq_desc_get_node(desc), NULL);
raw_spin_unlock_irqrestore(&desc->lock, flags);
}
@@ -587,7 +582,7 @@ int irq_set_percpu_devid(unsigned int irq)
void kstat_incr_irq_this_cpu(unsigned int irq)
{
- kstat_incr_irqs_this_cpu(irq, irq_to_desc(irq));
+ kstat_incr_irqs_this_cpu(irq_to_desc(irq));
}
/**
@@ -619,7 +614,7 @@ unsigned int kstat_irqs(unsigned int irq)
{
struct irq_desc *desc = irq_to_desc(irq);
int cpu;
- int sum = 0;
+ unsigned int sum = 0;
if (!desc || !desc->kstat_irqs)
return 0;
@@ -639,7 +634,7 @@ unsigned int kstat_irqs(unsigned int irq)
*/
unsigned int kstat_irqs_usr(unsigned int irq)
{
- int sum;
+ unsigned int sum;
irq_lock_sparse();
sum = kstat_irqs(irq);
diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c
index 7fac311057b8..79baaf8a7813 100644
--- a/kernel/irq/irqdomain.c
+++ b/kernel/irq/irqdomain.c
@@ -187,10 +187,12 @@ struct irq_domain *irq_domain_add_legacy(struct device_node *of_node,
EXPORT_SYMBOL_GPL(irq_domain_add_legacy);
/**
- * irq_find_host() - Locates a domain for a given device node
+ * irq_find_matching_host() - Locates a domain for a given device node
* @node: device-tree node of the interrupt controller
+ * @bus_token: domain-specific data
*/
-struct irq_domain *irq_find_host(struct device_node *node)
+struct irq_domain *irq_find_matching_host(struct device_node *node,
+ enum irq_domain_bus_token bus_token)
{
struct irq_domain *h, *found = NULL;
int rc;
@@ -199,13 +201,19 @@ struct irq_domain *irq_find_host(struct device_node *node)
* it might potentially be set to match all interrupts in
* the absence of a device node. This isn't a problem so far
* yet though...
+ *
+ * bus_token == DOMAIN_BUS_ANY matches any domain, any other
+ * values must generate an exact match for the domain to be
+ * selected.
*/
mutex_lock(&irq_domain_mutex);
list_for_each_entry(h, &irq_domain_list, link) {
if (h->ops->match)
- rc = h->ops->match(h, node);
+ rc = h->ops->match(h, node, bus_token);
else
- rc = (h->of_node != NULL) && (h->of_node == node);
+ rc = ((h->of_node != NULL) && (h->of_node == node) &&
+ ((bus_token == DOMAIN_BUS_ANY) ||
+ (h->bus_token == bus_token)));
if (rc) {
found = h;
@@ -215,7 +223,7 @@ struct irq_domain *irq_find_host(struct device_node *node)
mutex_unlock(&irq_domain_mutex);
return found;
}
-EXPORT_SYMBOL_GPL(irq_find_host);
+EXPORT_SYMBOL_GPL(irq_find_matching_host);
/**
* irq_set_default_host() - Set a "default" irq domain
@@ -830,10 +838,12 @@ static struct irq_data *irq_domain_insert_irq_data(struct irq_domain *domain,
{
struct irq_data *irq_data;
- irq_data = kzalloc_node(sizeof(*irq_data), GFP_KERNEL, child->node);
+ irq_data = kzalloc_node(sizeof(*irq_data), GFP_KERNEL,
+ irq_data_get_node(child));
if (irq_data) {
child->parent_data = irq_data;
irq_data->irq = child->irq;
+ irq_data->common = child->common;
irq_data->node = child->node;
irq_data->domain = domain;
}
@@ -1232,6 +1242,27 @@ struct irq_data *irq_domain_get_irq_data(struct irq_domain *domain,
return (irq_data && irq_data->domain == domain) ? irq_data : NULL;
}
+/**
+ * irq_domain_set_info - Set the complete data for a @virq in @domain
+ * @domain: Interrupt domain to match
+ * @virq: IRQ number
+ * @hwirq: The hardware interrupt number
+ * @chip: The associated interrupt chip
+ * @chip_data: The associated interrupt chip data
+ * @handler: The interrupt flow handler
+ * @handler_data: The interrupt flow handler data
+ * @handler_name: The interrupt handler name
+ */
+void irq_domain_set_info(struct irq_domain *domain, unsigned int virq,
+ irq_hw_number_t hwirq, struct irq_chip *chip,
+ void *chip_data, irq_flow_handler_t handler,
+ void *handler_data, const char *handler_name)
+{
+ irq_set_chip_and_handler_name(virq, chip, handler, handler_name);
+ irq_set_chip_data(virq, chip_data);
+ irq_set_handler_data(virq, handler_data);
+}
+
static void irq_domain_check_hierarchy(struct irq_domain *domain)
{
}
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 886d09e691d5..ad1b064f94fe 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -68,14 +68,20 @@ static void __synchronize_hardirq(struct irq_desc *desc)
* Do not use this for shutdown scenarios where you must be sure
* that all parts (hardirq and threaded handler) have completed.
*
+ * Returns: false if a threaded handler is active.
+ *
* This function may be called - with care - from IRQ context.
*/
-void synchronize_hardirq(unsigned int irq)
+bool synchronize_hardirq(unsigned int irq)
{
struct irq_desc *desc = irq_to_desc(irq);
- if (desc)
+ if (desc) {
__synchronize_hardirq(desc);
+ return !atomic_read(&desc->threads_active);
+ }
+
+ return true;
}
EXPORT_SYMBOL(synchronize_hardirq);
@@ -109,6 +115,14 @@ EXPORT_SYMBOL(synchronize_irq);
#ifdef CONFIG_SMP
cpumask_var_t irq_default_affinity;
+static int __irq_can_set_affinity(struct irq_desc *desc)
+{
+ if (!desc || !irqd_can_balance(&desc->irq_data) ||
+ !desc->irq_data.chip || !desc->irq_data.chip->irq_set_affinity)
+ return 0;
+ return 1;
+}
+
/**
* irq_can_set_affinity - Check if the affinity of a given irq can be set
* @irq: Interrupt to check
@@ -116,13 +130,7 @@ cpumask_var_t irq_default_affinity;
*/
int irq_can_set_affinity(unsigned int irq)
{
- struct irq_desc *desc = irq_to_desc(irq);
-
- if (!desc || !irqd_can_balance(&desc->irq_data) ||
- !desc->irq_data.chip || !desc->irq_data.chip->irq_set_affinity)
- return 0;
-
- return 1;
+ return __irq_can_set_affinity(irq_to_desc(irq));
}
/**
@@ -250,6 +258,37 @@ int irq_set_affinity_hint(unsigned int irq, const struct cpumask *m)
}
EXPORT_SYMBOL_GPL(irq_set_affinity_hint);
+/**
+ * irq_set_vcpu_affinity - Set vcpu affinity for the interrupt
+ * @irq: interrupt number to set affinity
+ * @vcpu_info: vCPU specific data
+ *
+ * This function uses the vCPU specific data to set the vCPU
+ * affinity for an irq. The vCPU specific data is passed from
+ * outside, such as KVM. One example code path is as below:
+ * KVM -> IOMMU -> irq_set_vcpu_affinity().
+ */
+int irq_set_vcpu_affinity(unsigned int irq, void *vcpu_info)
+{
+ unsigned long flags;
+ struct irq_desc *desc = irq_get_desc_lock(irq, &flags, 0);
+ struct irq_data *data;
+ struct irq_chip *chip;
+ int ret = -ENOSYS;
+
+ if (!desc)
+ return -EINVAL;
+
+ data = irq_desc_get_irq_data(desc);
+ chip = irq_data_get_irq_chip(data);
+ if (chip && chip->irq_set_vcpu_affinity)
+ ret = chip->irq_set_vcpu_affinity(data, vcpu_info);
+ irq_put_desc_unlock(desc, flags);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(irq_set_vcpu_affinity);
+
static void irq_affinity_notify(struct work_struct *work)
{
struct irq_affinity_notify *notify =
@@ -322,14 +361,13 @@ EXPORT_SYMBOL_GPL(irq_set_affinity_notifier);
/*
* Generic version of the affinity autoselector.
*/
-static int
-setup_affinity(unsigned int irq, struct irq_desc *desc, struct cpumask *mask)
+static int setup_affinity(struct irq_desc *desc, struct cpumask *mask)
{
struct cpumask *set = irq_default_affinity;
- int node = desc->irq_data.node;
+ int node = irq_desc_get_node(desc);
/* Excludes PER_CPU and NO_BALANCE interrupts */
- if (!irq_can_set_affinity(irq))
+ if (!__irq_can_set_affinity(desc))
return 0;
/*
@@ -356,10 +394,10 @@ setup_affinity(unsigned int irq, struct irq_desc *desc, struct cpumask *mask)
return 0;
}
#else
-static inline int
-setup_affinity(unsigned int irq, struct irq_desc *d, struct cpumask *mask)
+/* Wrapper for ALPHA specific affinity selector magic */
+static inline int setup_affinity(struct irq_desc *d, struct cpumask *mask)
{
- return irq_select_affinity(irq);
+ return irq_select_affinity(irq_desc_get_irq(d));
}
#endif
@@ -373,20 +411,20 @@ int irq_select_affinity_usr(unsigned int irq, struct cpumask *mask)
int ret;
raw_spin_lock_irqsave(&desc->lock, flags);
- ret = setup_affinity(irq, desc, mask);
+ ret = setup_affinity(desc, mask);
raw_spin_unlock_irqrestore(&desc->lock, flags);
return ret;
}
#else
static inline int
-setup_affinity(unsigned int irq, struct irq_desc *desc, struct cpumask *mask)
+setup_affinity(struct irq_desc *desc, struct cpumask *mask)
{
return 0;
}
#endif
-void __disable_irq(struct irq_desc *desc, unsigned int irq)
+void __disable_irq(struct irq_desc *desc)
{
if (!desc->depth++)
irq_disable(desc);
@@ -399,7 +437,7 @@ static int __disable_irq_nosync(unsigned int irq)
if (!desc)
return -EINVAL;
- __disable_irq(desc, irq);
+ __disable_irq(desc);
irq_put_desc_busunlock(desc, flags);
return 0;
}
@@ -440,12 +478,39 @@ void disable_irq(unsigned int irq)
}
EXPORT_SYMBOL(disable_irq);
-void __enable_irq(struct irq_desc *desc, unsigned int irq)
+/**
+ * disable_hardirq - disables an irq and waits for hardirq completion
+ * @irq: Interrupt to disable
+ *
+ * Disable the selected interrupt line. Enables and Disables are
+ * nested.
+ * This function waits for any pending hard IRQ handlers for this
+ * interrupt to complete before returning. If you use this function while
+ * holding a resource the hard IRQ handler may need you will deadlock.
+ *
+ * When used to optimistically disable an interrupt from atomic context
+ * the return value must be checked.
+ *
+ * Returns: false if a threaded handler is active.
+ *
+ * This function may be called - with care - from IRQ context.
+ */
+bool disable_hardirq(unsigned int irq)
+{
+ if (!__disable_irq_nosync(irq))
+ return synchronize_hardirq(irq);
+
+ return false;
+}
+EXPORT_SYMBOL_GPL(disable_hardirq);
+
+void __enable_irq(struct irq_desc *desc)
{
switch (desc->depth) {
case 0:
err_out:
- WARN(1, KERN_WARNING "Unbalanced enable for IRQ %d\n", irq);
+ WARN(1, KERN_WARNING "Unbalanced enable for IRQ %d\n",
+ irq_desc_get_irq(desc));
break;
case 1: {
if (desc->istate & IRQS_SUSPENDED)
@@ -453,7 +518,7 @@ void __enable_irq(struct irq_desc *desc, unsigned int irq)
/* Prevent probing on this irq: */
irq_settings_set_noprobe(desc);
irq_enable(desc);
- check_irq_resend(desc, irq);
+ check_irq_resend(desc);
/* fall-through */
}
default:
@@ -483,7 +548,7 @@ void enable_irq(unsigned int irq)
KERN_ERR "enable_irq before setup/request_irq: irq %u\n", irq))
goto out;
- __enable_irq(desc, irq);
+ __enable_irq(desc);
out:
irq_put_desc_busunlock(desc, flags);
}
@@ -574,8 +639,7 @@ int can_request_irq(unsigned int irq, unsigned long irqflags)
return canrequest;
}
-int __irq_set_trigger(struct irq_desc *desc, unsigned int irq,
- unsigned long flags)
+int __irq_set_trigger(struct irq_desc *desc, unsigned long flags)
{
struct irq_chip *chip = desc->irq_data.chip;
int ret, unmask = 0;
@@ -585,7 +649,8 @@ int __irq_set_trigger(struct irq_desc *desc, unsigned int irq,
* IRQF_TRIGGER_* but the PIC does not support multiple
* flow-types?
*/
- pr_debug("No set_type function for IRQ %d (%s)\n", irq,
+ pr_debug("No set_type function for IRQ %d (%s)\n",
+ irq_desc_get_irq(desc),
chip ? (chip->name ? : "unknown") : "unknown");
return 0;
}
@@ -622,7 +687,7 @@ int __irq_set_trigger(struct irq_desc *desc, unsigned int irq,
break;
default:
pr_err("Setting trigger mode %lu for irq %u failed (%pF)\n",
- flags, irq, chip->irq_set_type);
+ flags, irq_desc_get_irq(desc), chip->irq_set_type);
}
if (unmask)
unmask_irq(desc);
@@ -1158,8 +1223,8 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
/* Setup the type (level, edge polarity) if configured: */
if (new->flags & IRQF_TRIGGER_MASK) {
- ret = __irq_set_trigger(desc, irq,
- new->flags & IRQF_TRIGGER_MASK);
+ ret = __irq_set_trigger(desc,
+ new->flags & IRQF_TRIGGER_MASK);
if (ret)
goto out_mask;
@@ -1190,7 +1255,7 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
}
/* Set default affinity mask once everything is setup */
- setup_affinity(irq, desc, mask);
+ setup_affinity(desc, mask);
} else if (new->flags & IRQF_TRIGGER_MASK) {
unsigned int nmsk = new->flags & IRQF_TRIGGER_MASK;
@@ -1217,7 +1282,7 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
*/
if (shared && (desc->istate & IRQS_SPURIOUS_DISABLED)) {
desc->istate &= ~IRQS_SPURIOUS_DISABLED;
- __enable_irq(desc, irq);
+ __enable_irq(desc);
}
raw_spin_unlock_irqrestore(&desc->lock, flags);
@@ -1587,7 +1652,7 @@ void enable_percpu_irq(unsigned int irq, unsigned int type)
if (type != IRQ_TYPE_NONE) {
int ret;
- ret = __irq_set_trigger(desc, irq, type);
+ ret = __irq_set_trigger(desc, type);
if (ret) {
WARN(1, "failed to set type for IRQ%d\n", irq);
@@ -1766,3 +1831,96 @@ int request_percpu_irq(unsigned int irq, irq_handler_t handler,
return retval;
}
+
+/**
+ * irq_get_irqchip_state - returns the irqchip state of a interrupt.
+ * @irq: Interrupt line that is forwarded to a VM
+ * @which: One of IRQCHIP_STATE_* the caller wants to know about
+ * @state: a pointer to a boolean where the state is to be storeed
+ *
+ * This call snapshots the internal irqchip state of an
+ * interrupt, returning into @state the bit corresponding to
+ * stage @which
+ *
+ * This function should be called with preemption disabled if the
+ * interrupt controller has per-cpu registers.
+ */
+int irq_get_irqchip_state(unsigned int irq, enum irqchip_irq_state which,
+ bool *state)
+{
+ struct irq_desc *desc;
+ struct irq_data *data;
+ struct irq_chip *chip;
+ unsigned long flags;
+ int err = -EINVAL;
+
+ desc = irq_get_desc_buslock(irq, &flags, 0);
+ if (!desc)
+ return err;
+
+ data = irq_desc_get_irq_data(desc);
+
+ do {
+ chip = irq_data_get_irq_chip(data);
+ if (chip->irq_get_irqchip_state)
+ break;
+#ifdef CONFIG_IRQ_DOMAIN_HIERARCHY
+ data = data->parent_data;
+#else
+ data = NULL;
+#endif
+ } while (data);
+
+ if (data)
+ err = chip->irq_get_irqchip_state(data, which, state);
+
+ irq_put_desc_busunlock(desc, flags);
+ return err;
+}
+EXPORT_SYMBOL_GPL(irq_get_irqchip_state);
+
+/**
+ * irq_set_irqchip_state - set the state of a forwarded interrupt.
+ * @irq: Interrupt line that is forwarded to a VM
+ * @which: State to be restored (one of IRQCHIP_STATE_*)
+ * @val: Value corresponding to @which
+ *
+ * This call sets the internal irqchip state of an interrupt,
+ * depending on the value of @which.
+ *
+ * This function should be called with preemption disabled if the
+ * interrupt controller has per-cpu registers.
+ */
+int irq_set_irqchip_state(unsigned int irq, enum irqchip_irq_state which,
+ bool val)
+{
+ struct irq_desc *desc;
+ struct irq_data *data;
+ struct irq_chip *chip;
+ unsigned long flags;
+ int err = -EINVAL;
+
+ desc = irq_get_desc_buslock(irq, &flags, 0);
+ if (!desc)
+ return err;
+
+ data = irq_desc_get_irq_data(desc);
+
+ do {
+ chip = irq_data_get_irq_chip(data);
+ if (chip->irq_set_irqchip_state)
+ break;
+#ifdef CONFIG_IRQ_DOMAIN_HIERARCHY
+ data = data->parent_data;
+#else
+ data = NULL;
+#endif
+ } while (data);
+
+ if (data)
+ err = chip->irq_set_irqchip_state(data, which, val);
+
+ irq_put_desc_busunlock(desc, flags);
+ return err;
+}
+EXPORT_SYMBOL_GPL(irq_set_irqchip_state);
diff --git a/kernel/irq/migration.c b/kernel/irq/migration.c
index ca3f4aaff707..37ddb7bda651 100644
--- a/kernel/irq/migration.c
+++ b/kernel/irq/migration.c
@@ -7,21 +7,21 @@
void irq_move_masked_irq(struct irq_data *idata)
{
struct irq_desc *desc = irq_data_to_desc(idata);
- struct irq_chip *chip = idata->chip;
+ struct irq_chip *chip = desc->irq_data.chip;
if (likely(!irqd_is_setaffinity_pending(&desc->irq_data)))
return;
+ irqd_clr_move_pending(&desc->irq_data);
+
/*
* Paranoia: cpu-local interrupts shouldn't be calling in here anyway.
*/
- if (!irqd_can_balance(&desc->irq_data)) {
+ if (irqd_is_per_cpu(&desc->irq_data)) {
WARN_ON(1);
return;
}
- irqd_clr_move_pending(&desc->irq_data);
-
if (unlikely(cpumask_empty(desc->pending_mask)))
return;
@@ -52,6 +52,13 @@ void irq_move_irq(struct irq_data *idata)
{
bool masked;
+ /*
+ * Get top level irq_data when CONFIG_IRQ_DOMAIN_HIERARCHY is enabled,
+ * and it should be optimized away when CONFIG_IRQ_DOMAIN_HIERARCHY is
+ * disabled. So we avoid an "#ifdef CONFIG_IRQ_DOMAIN_HIERARCHY" here.
+ */
+ idata = irq_desc_get_irq_data(irq_data_to_desc(idata));
+
if (likely(!irqd_is_setaffinity_pending(idata)))
return;
diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c
index 3e18163f336f..7e6512b9dc1f 100644
--- a/kernel/irq/msi.c
+++ b/kernel/irq/msi.c
@@ -18,6 +18,23 @@
/* Temparory solution for building, will be removed later */
#include <linux/pci.h>
+struct msi_desc *alloc_msi_entry(struct device *dev)
+{
+ struct msi_desc *desc = kzalloc(sizeof(*desc), GFP_KERNEL);
+ if (!desc)
+ return NULL;
+
+ INIT_LIST_HEAD(&desc->list);
+ desc->dev = dev;
+
+ return desc;
+}
+
+void free_msi_entry(struct msi_desc *entry)
+{
+ kfree(entry);
+}
+
void __get_cached_msi_msg(struct msi_desc *entry, struct msi_msg *msg)
{
*msg = entry->msg;
@@ -124,7 +141,7 @@ static void msi_domain_free(struct irq_domain *domain, unsigned int virq,
irq_domain_free_irqs_top(domain, virq, nr_irqs);
}
-static struct irq_domain_ops msi_domain_ops = {
+static const struct irq_domain_ops msi_domain_ops = {
.alloc = msi_domain_alloc,
.free = msi_domain_free,
.activate = msi_domain_activate,
@@ -310,8 +327,15 @@ void msi_domain_free_irqs(struct irq_domain *domain, struct device *dev)
struct msi_desc *desc;
for_each_msi_entry(desc, dev) {
- irq_domain_free_irqs(desc->irq, desc->nvec_used);
- desc->irq = 0;
+ /*
+ * We might have failed to allocate an MSI early
+ * enough that there is no IRQ associated to this
+ * entry. If that's the case, don't do anything.
+ */
+ if (desc->irq) {
+ irq_domain_free_irqs(desc->irq, desc->nvec_used);
+ desc->irq = 0;
+ }
}
}
diff --git a/kernel/irq/pm.c b/kernel/irq/pm.c
index 5204a6d1b985..21c62617a35a 100644
--- a/kernel/irq/pm.c
+++ b/kernel/irq/pm.c
@@ -68,7 +68,7 @@ void irq_pm_remove_action(struct irq_desc *desc, struct irqaction *action)
desc->cond_suspend_depth--;
}
-static bool suspend_device_irq(struct irq_desc *desc, int irq)
+static bool suspend_device_irq(struct irq_desc *desc)
{
if (!desc->action || desc->no_suspend_depth)
return false;
@@ -85,7 +85,7 @@ static bool suspend_device_irq(struct irq_desc *desc, int irq)
}
desc->istate |= IRQS_SUSPENDED;
- __disable_irq(desc, irq);
+ __disable_irq(desc);
/*
* Hardware which has no wakeup source configuration facility
@@ -123,8 +123,10 @@ void suspend_device_irqs(void)
unsigned long flags;
bool sync;
+ if (irq_settings_is_nested_thread(desc))
+ continue;
raw_spin_lock_irqsave(&desc->lock, flags);
- sync = suspend_device_irq(desc, irq);
+ sync = suspend_device_irq(desc);
raw_spin_unlock_irqrestore(&desc->lock, flags);
if (sync)
@@ -133,7 +135,7 @@ void suspend_device_irqs(void)
}
EXPORT_SYMBOL_GPL(suspend_device_irqs);
-static void resume_irq(struct irq_desc *desc, int irq)
+static void resume_irq(struct irq_desc *desc)
{
irqd_clear(&desc->irq_data, IRQD_WAKEUP_ARMED);
@@ -148,7 +150,7 @@ static void resume_irq(struct irq_desc *desc, int irq)
desc->depth++;
resume:
desc->istate &= ~IRQS_SUSPENDED;
- __enable_irq(desc, irq);
+ __enable_irq(desc);
}
static void resume_irqs(bool want_early)
@@ -163,9 +165,11 @@ static void resume_irqs(bool want_early)
if (!is_early && want_early)
continue;
+ if (irq_settings_is_nested_thread(desc))
+ continue;
raw_spin_lock_irqsave(&desc->lock, flags);
- resume_irq(desc, irq);
+ resume_irq(desc);
raw_spin_unlock_irqrestore(&desc->lock, flags);
}
}
diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c
index df2f4642d1e7..0e97c142ce40 100644
--- a/kernel/irq/proc.c
+++ b/kernel/irq/proc.c
@@ -241,7 +241,7 @@ static int irq_node_proc_show(struct seq_file *m, void *v)
{
struct irq_desc *desc = irq_to_desc((long) m->private);
- seq_printf(m, "%d\n", desc->irq_data.node);
+ seq_printf(m, "%d\n", irq_desc_get_node(desc));
return 0;
}
diff --git a/kernel/irq/resend.c b/kernel/irq/resend.c
index 9065107f083e..dd95f44f99b2 100644
--- a/kernel/irq/resend.c
+++ b/kernel/irq/resend.c
@@ -53,7 +53,7 @@ static DECLARE_TASKLET(resend_tasklet, resend_irqs, 0);
*
* Is called with interrupts disabled and desc->lock held.
*/
-void check_irq_resend(struct irq_desc *desc, unsigned int irq)
+void check_irq_resend(struct irq_desc *desc)
{
/*
* We do not resend level type interrupts. Level type
@@ -74,14 +74,24 @@ void check_irq_resend(struct irq_desc *desc, unsigned int irq)
if (!desc->irq_data.chip->irq_retrigger ||
!desc->irq_data.chip->irq_retrigger(&desc->irq_data)) {
#ifdef CONFIG_HARDIRQS_SW_RESEND
+ unsigned int irq = irq_desc_get_irq(desc);
+
/*
- * If the interrupt has a parent irq and runs
- * in the thread context of the parent irq,
- * retrigger the parent.
+ * If the interrupt is running in the thread
+ * context of the parent irq we need to be
+ * careful, because we cannot trigger it
+ * directly.
*/
- if (desc->parent_irq &&
- irq_settings_is_nested_thread(desc))
+ if (irq_settings_is_nested_thread(desc)) {
+ /*
+ * If the parent_irq is valid, we
+ * retrigger the parent, otherwise we
+ * do nothing.
+ */
+ if (!desc->parent_irq)
+ return;
irq = desc->parent_irq;
+ }
/* Set it pending and activate the softirq: */
set_bit(irq, irqs_resend);
tasklet_schedule(&resend_tasklet);
diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c
index e2514b0e439e..32144175458d 100644
--- a/kernel/irq/spurious.c
+++ b/kernel/irq/spurious.c
@@ -60,7 +60,7 @@ bool irq_wait_for_poll(struct irq_desc *desc)
/*
* Recovery handler for misrouted interrupts.
*/
-static int try_one_irq(int irq, struct irq_desc *desc, bool force)
+static int try_one_irq(struct irq_desc *desc, bool force)
{
irqreturn_t ret = IRQ_NONE;
struct irqaction *action;
@@ -133,7 +133,7 @@ static int misrouted_irq(int irq)
if (i == irq) /* Already tried */
continue;
- if (try_one_irq(i, desc, false))
+ if (try_one_irq(desc, false))
ok = 1;
}
out:
@@ -164,7 +164,7 @@ static void poll_spurious_irqs(unsigned long dummy)
continue;
local_irq_disable();
- try_one_irq(i, desc, true);
+ try_one_irq(desc, true);
local_irq_enable();
}
out:
@@ -188,10 +188,9 @@ static inline int bad_action_ret(irqreturn_t action_ret)
* (The other 100-of-100,000 interrupts may have been a correctly
* functioning device sharing an IRQ with the failing one)
*/
-static void
-__report_bad_irq(unsigned int irq, struct irq_desc *desc,
- irqreturn_t action_ret)
+static void __report_bad_irq(struct irq_desc *desc, irqreturn_t action_ret)
{
+ unsigned int irq = irq_desc_get_irq(desc);
struct irqaction *action;
unsigned long flags;
@@ -224,14 +223,13 @@ __report_bad_irq(unsigned int irq, struct irq_desc *desc,
raw_spin_unlock_irqrestore(&desc->lock, flags);
}
-static void
-report_bad_irq(unsigned int irq, struct irq_desc *desc, irqreturn_t action_ret)
+static void report_bad_irq(struct irq_desc *desc, irqreturn_t action_ret)
{
static int count = 100;
if (count > 0) {
count--;
- __report_bad_irq(irq, desc, action_ret);
+ __report_bad_irq(desc, action_ret);
}
}
@@ -272,15 +270,16 @@ try_misrouted_irq(unsigned int irq, struct irq_desc *desc,
#define SPURIOUS_DEFERRED 0x80000000
-void note_interrupt(unsigned int irq, struct irq_desc *desc,
- irqreturn_t action_ret)
+void note_interrupt(struct irq_desc *desc, irqreturn_t action_ret)
{
+ unsigned int irq;
+
if (desc->istate & IRQS_POLL_INPROGRESS ||
irq_settings_is_polled(desc))
return;
if (bad_action_ret(action_ret)) {
- report_bad_irq(irq, desc, action_ret);
+ report_bad_irq(desc, action_ret);
return;
}
@@ -398,6 +397,7 @@ void note_interrupt(unsigned int irq, struct irq_desc *desc,
desc->last_unhandled = jiffies;
}
+ irq = irq_desc_get_irq(desc);
if (unlikely(try_misrouted_irq(irq, desc, action_ret))) {
int ok = misrouted_irq(irq);
if (action_ret == IRQ_NONE)
@@ -413,7 +413,7 @@ void note_interrupt(unsigned int irq, struct irq_desc *desc,
/*
* The interrupt is stuck
*/
- __report_bad_irq(irq, desc, action_ret);
+ __report_bad_irq(desc, action_ret);
/*
* Now kill the IRQ
*/
diff --git a/kernel/jump_label.c b/kernel/jump_label.c
index 9019f15deab2..f7dd15d537f9 100644
--- a/kernel/jump_label.c
+++ b/kernel/jump_label.c
@@ -54,7 +54,7 @@ jump_label_sort_entries(struct jump_entry *start, struct jump_entry *stop)
sort(start, size, sizeof(struct jump_entry), jump_label_cmp, NULL);
}
-static void jump_label_update(struct static_key *key, int enable);
+static void jump_label_update(struct static_key *key);
void static_key_slow_inc(struct static_key *key)
{
@@ -63,13 +63,8 @@ void static_key_slow_inc(struct static_key *key)
return;
jump_label_lock();
- if (atomic_read(&key->enabled) == 0) {
- if (!jump_label_get_branch_default(key))
- jump_label_update(key, JUMP_LABEL_ENABLE);
- else
- jump_label_update(key, JUMP_LABEL_DISABLE);
- }
- atomic_inc(&key->enabled);
+ if (atomic_inc_return(&key->enabled) == 1)
+ jump_label_update(key);
jump_label_unlock();
}
EXPORT_SYMBOL_GPL(static_key_slow_inc);
@@ -87,10 +82,7 @@ static void __static_key_slow_dec(struct static_key *key,
atomic_inc(&key->enabled);
schedule_delayed_work(work, rate_limit);
} else {
- if (!jump_label_get_branch_default(key))
- jump_label_update(key, JUMP_LABEL_DISABLE);
- else
- jump_label_update(key, JUMP_LABEL_ENABLE);
+ jump_label_update(key);
}
jump_label_unlock();
}
@@ -149,7 +141,7 @@ static int __jump_label_text_reserved(struct jump_entry *iter_start,
return 0;
}
-/*
+/*
* Update code which is definitely not currently executing.
* Architectures which need heavyweight synchronization to modify
* running code can override this to make the non-live update case
@@ -158,37 +150,54 @@ static int __jump_label_text_reserved(struct jump_entry *iter_start,
void __weak __init_or_module arch_jump_label_transform_static(struct jump_entry *entry,
enum jump_label_type type)
{
- arch_jump_label_transform(entry, type);
+ arch_jump_label_transform(entry, type);
+}
+
+static inline struct jump_entry *static_key_entries(struct static_key *key)
+{
+ return (struct jump_entry *)((unsigned long)key->entries & ~JUMP_TYPE_MASK);
+}
+
+static inline bool static_key_type(struct static_key *key)
+{
+ return (unsigned long)key->entries & JUMP_TYPE_MASK;
+}
+
+static inline struct static_key *jump_entry_key(struct jump_entry *entry)
+{
+ return (struct static_key *)((unsigned long)entry->key & ~1UL);
+}
+
+static bool jump_entry_branch(struct jump_entry *entry)
+{
+ return (unsigned long)entry->key & 1UL;
+}
+
+static enum jump_label_type jump_label_type(struct jump_entry *entry)
+{
+ struct static_key *key = jump_entry_key(entry);
+ bool enabled = static_key_enabled(key);
+ bool branch = jump_entry_branch(entry);
+
+ /* See the comment in linux/jump_label.h */
+ return enabled ^ branch;
}
static void __jump_label_update(struct static_key *key,
struct jump_entry *entry,
- struct jump_entry *stop, int enable)
+ struct jump_entry *stop)
{
- for (; (entry < stop) &&
- (entry->key == (jump_label_t)(unsigned long)key);
- entry++) {
+ for (; (entry < stop) && (jump_entry_key(entry) == key); entry++) {
/*
* entry->code set to 0 invalidates module init text sections
* kernel_text_address() verifies we are not in core kernel
* init code, see jump_label_invalidate_module_init().
*/
if (entry->code && kernel_text_address(entry->code))
- arch_jump_label_transform(entry, enable);
+ arch_jump_label_transform(entry, jump_label_type(entry));
}
}
-static enum jump_label_type jump_label_type(struct static_key *key)
-{
- bool true_branch = jump_label_get_branch_default(key);
- bool state = static_key_enabled(key);
-
- if ((!true_branch && state) || (true_branch && !state))
- return JUMP_LABEL_ENABLE;
-
- return JUMP_LABEL_DISABLE;
-}
-
void __init jump_label_init(void)
{
struct jump_entry *iter_start = __start___jump_table;
@@ -202,8 +211,11 @@ void __init jump_label_init(void)
for (iter = iter_start; iter < iter_stop; iter++) {
struct static_key *iterk;
- iterk = (struct static_key *)(unsigned long)iter->key;
- arch_jump_label_transform_static(iter, jump_label_type(iterk));
+ /* rewrite NOPs */
+ if (jump_label_type(iter) == JUMP_LABEL_NOP)
+ arch_jump_label_transform_static(iter, JUMP_LABEL_NOP);
+
+ iterk = jump_entry_key(iter);
if (iterk == key)
continue;
@@ -222,6 +234,16 @@ void __init jump_label_init(void)
#ifdef CONFIG_MODULES
+static enum jump_label_type jump_label_init_type(struct jump_entry *entry)
+{
+ struct static_key *key = jump_entry_key(entry);
+ bool type = static_key_type(key);
+ bool branch = jump_entry_branch(entry);
+
+ /* See the comment in linux/jump_label.h */
+ return type ^ branch;
+}
+
struct static_key_mod {
struct static_key_mod *next;
struct jump_entry *entries;
@@ -243,17 +265,15 @@ static int __jump_label_mod_text_reserved(void *start, void *end)
start, end);
}
-static void __jump_label_mod_update(struct static_key *key, int enable)
+static void __jump_label_mod_update(struct static_key *key)
{
- struct static_key_mod *mod = key->next;
+ struct static_key_mod *mod;
- while (mod) {
+ for (mod = key->next; mod; mod = mod->next) {
struct module *m = mod->mod;
__jump_label_update(key, mod->entries,
- m->jump_entries + m->num_jump_entries,
- enable);
- mod = mod->next;
+ m->jump_entries + m->num_jump_entries);
}
}
@@ -276,7 +296,9 @@ void jump_label_apply_nops(struct module *mod)
return;
for (iter = iter_start; iter < iter_stop; iter++) {
- arch_jump_label_transform_static(iter, JUMP_LABEL_DISABLE);
+ /* Only write NOPs for arch_branch_static(). */
+ if (jump_label_init_type(iter) == JUMP_LABEL_NOP)
+ arch_jump_label_transform_static(iter, JUMP_LABEL_NOP);
}
}
@@ -297,12 +319,12 @@ static int jump_label_add_module(struct module *mod)
for (iter = iter_start; iter < iter_stop; iter++) {
struct static_key *iterk;
- iterk = (struct static_key *)(unsigned long)iter->key;
+ iterk = jump_entry_key(iter);
if (iterk == key)
continue;
key = iterk;
- if (__module_address(iter->key) == mod) {
+ if (within_module(iter->key, mod)) {
/*
* Set key->entries to iter, but preserve JUMP_LABEL_TRUE_BRANCH.
*/
@@ -318,8 +340,9 @@ static int jump_label_add_module(struct module *mod)
jlm->next = key->next;
key->next = jlm;
- if (jump_label_type(key) == JUMP_LABEL_ENABLE)
- __jump_label_update(key, iter, iter_stop, JUMP_LABEL_ENABLE);
+ /* Only update if we've changed from our initial state */
+ if (jump_label_type(iter) != jump_label_init_type(iter))
+ __jump_label_update(key, iter, iter_stop);
}
return 0;
@@ -334,12 +357,12 @@ static void jump_label_del_module(struct module *mod)
struct static_key_mod *jlm, **prev;
for (iter = iter_start; iter < iter_stop; iter++) {
- if (iter->key == (jump_label_t)(unsigned long)key)
+ if (jump_entry_key(iter) == key)
continue;
- key = (struct static_key *)(unsigned long)iter->key;
+ key = jump_entry_key(iter);
- if (__module_address(iter->key) == mod)
+ if (within_module(iter->key, mod))
continue;
prev = &key->next;
@@ -439,22 +462,61 @@ int jump_label_text_reserved(void *start, void *end)
return ret;
}
-static void jump_label_update(struct static_key *key, int enable)
+static void jump_label_update(struct static_key *key)
{
struct jump_entry *stop = __stop___jump_table;
- struct jump_entry *entry = jump_label_get_entries(key);
-
+ struct jump_entry *entry = static_key_entries(key);
#ifdef CONFIG_MODULES
- struct module *mod = __module_address((unsigned long)key);
+ struct module *mod;
- __jump_label_mod_update(key, enable);
+ __jump_label_mod_update(key);
+ preempt_disable();
+ mod = __module_address((unsigned long)key);
if (mod)
stop = mod->jump_entries + mod->num_jump_entries;
+ preempt_enable();
#endif
/* if there are no users, entry can be NULL */
if (entry)
- __jump_label_update(key, entry, stop, enable);
+ __jump_label_update(key, entry, stop);
}
-#endif
+#ifdef CONFIG_STATIC_KEYS_SELFTEST
+static DEFINE_STATIC_KEY_TRUE(sk_true);
+static DEFINE_STATIC_KEY_FALSE(sk_false);
+
+static __init int jump_label_test(void)
+{
+ int i;
+
+ for (i = 0; i < 2; i++) {
+ WARN_ON(static_key_enabled(&sk_true.key) != true);
+ WARN_ON(static_key_enabled(&sk_false.key) != false);
+
+ WARN_ON(!static_branch_likely(&sk_true));
+ WARN_ON(!static_branch_unlikely(&sk_true));
+ WARN_ON(static_branch_likely(&sk_false));
+ WARN_ON(static_branch_unlikely(&sk_false));
+
+ static_branch_disable(&sk_true);
+ static_branch_enable(&sk_false);
+
+ WARN_ON(static_key_enabled(&sk_true.key) == true);
+ WARN_ON(static_key_enabled(&sk_false.key) == false);
+
+ WARN_ON(static_branch_likely(&sk_true));
+ WARN_ON(static_branch_unlikely(&sk_true));
+ WARN_ON(!static_branch_likely(&sk_false));
+ WARN_ON(!static_branch_unlikely(&sk_false));
+
+ static_branch_enable(&sk_true);
+ static_branch_disable(&sk_false);
+ }
+
+ return 0;
+}
+late_initcall(jump_label_test);
+#endif /* STATIC_KEYS_SELFTEST */
+
+#endif /* HAVE_JUMP_LABEL */
diff --git a/kernel/kexec.c b/kernel/kexec.c
index 38c25b1f2fd5..a785c1015e25 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -84,6 +84,17 @@ struct resource crashk_low_res = {
int kexec_should_crash(struct task_struct *p)
{
+ /*
+ * If crash_kexec_post_notifiers is enabled, don't run
+ * crash_kexec() here yet, which must be run after panic
+ * notifiers in panic().
+ */
+ if (crash_kexec_post_notifiers)
+ return 0;
+ /*
+ * There are 4 panic() calls in do_exit() path, each of which
+ * corresponds to each of these 4 conditions.
+ */
if (in_interrupt() || !p->pid || is_global_init(p) || panic_on_oops)
return 1;
return 0;
@@ -707,7 +718,7 @@ static struct page *kimage_alloc_normal_control_pages(struct kimage *image,
do {
unsigned long pfn, epfn, addr, eaddr;
- pages = kimage_alloc_pages(GFP_KERNEL, order);
+ pages = kimage_alloc_pages(KEXEC_CONTROL_MEMORY_GFP, order);
if (!pages)
break;
pfn = page_to_pfn(pages);
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index c90e417bb963..d10ab6b9b5e0 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -1332,7 +1332,7 @@ bool __weak arch_within_kprobe_blacklist(unsigned long addr)
addr < (unsigned long)__kprobes_text_end;
}
-static bool within_kprobe_blacklist(unsigned long addr)
+bool within_kprobe_blacklist(unsigned long addr)
{
struct kprobe_blacklist_entry *ent;
diff --git a/kernel/kthread.c b/kernel/kthread.c
index 10e489c448fe..9ff173dca1ae 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -97,6 +97,7 @@ bool kthread_should_park(void)
{
return test_bit(KTHREAD_SHOULD_PARK, &to_kthread(current)->flags);
}
+EXPORT_SYMBOL_GPL(kthread_should_park);
/**
* kthread_freezable_should_stop - should this freezable kthread return now?
@@ -171,6 +172,7 @@ void kthread_parkme(void)
{
__kthread_parkme(to_kthread(current));
}
+EXPORT_SYMBOL_GPL(kthread_parkme);
static int kthread(void *_create)
{
@@ -246,15 +248,16 @@ static void create_kthread(struct kthread_create_info *create)
* kthread_create_on_node - create a kthread.
* @threadfn: the function to run until signal_pending(current).
* @data: data ptr for @threadfn.
- * @node: memory node number.
+ * @node: task and thread structures for the thread are allocated on this node
* @namefmt: printf-style name for the thread.
*
* Description: This helper function creates and names a kernel
* thread. The thread will be stopped: use wake_up_process() to start
- * it. See also kthread_run().
+ * it. See also kthread_run(). The new thread has SCHED_NORMAL policy and
+ * is affine to all CPUs.
*
* If thread is going to be bound on a particular cpu, give its node
- * in @node, to get NUMA affinity for kthread stack, or else give -1.
+ * in @node, to get NUMA affinity for kthread stack, or else give NUMA_NO_NODE.
* When woken, the thread will run @threadfn() with @data as its
* argument. @threadfn() can either call do_exit() directly if it is a
* standalone thread for which no one will call kthread_stop(), or
@@ -325,16 +328,30 @@ struct task_struct *kthread_create_on_node(int (*threadfn)(void *data),
}
EXPORT_SYMBOL(kthread_create_on_node);
-static void __kthread_bind(struct task_struct *p, unsigned int cpu, long state)
+static void __kthread_bind_mask(struct task_struct *p, const struct cpumask *mask, long state)
{
- /* Must have done schedule() in kthread() before we set_task_cpu */
+ unsigned long flags;
+
if (!wait_task_inactive(p, state)) {
WARN_ON(1);
return;
}
+
/* It's safe because the task is inactive. */
- do_set_cpus_allowed(p, cpumask_of(cpu));
+ raw_spin_lock_irqsave(&p->pi_lock, flags);
+ do_set_cpus_allowed(p, mask);
p->flags |= PF_NO_SETAFFINITY;
+ raw_spin_unlock_irqrestore(&p->pi_lock, flags);
+}
+
+static void __kthread_bind(struct task_struct *p, unsigned int cpu, long state)
+{
+ __kthread_bind_mask(p, cpumask_of(cpu), state);
+}
+
+void kthread_bind_mask(struct task_struct *p, const struct cpumask *mask)
+{
+ __kthread_bind_mask(p, mask, TASK_UNINTERRUPTIBLE);
}
/**
@@ -411,6 +428,7 @@ void kthread_unpark(struct task_struct *k)
if (kthread)
__kthread_unpark(k, kthread);
}
+EXPORT_SYMBOL_GPL(kthread_unpark);
/**
* kthread_park - park a thread created by kthread_create().
@@ -441,6 +459,7 @@ int kthread_park(struct task_struct *k)
}
return ret;
}
+EXPORT_SYMBOL_GPL(kthread_park);
/**
* kthread_stop - stop a thread created by kthread_create().
diff --git a/kernel/livepatch/core.c b/kernel/livepatch/core.c
index 3f9f1d6b4c2e..6e5344112419 100644
--- a/kernel/livepatch/core.c
+++ b/kernel/livepatch/core.c
@@ -128,7 +128,7 @@ static bool klp_is_patch_registered(struct klp_patch *patch)
static bool klp_initialized(void)
{
- return klp_root_kobj;
+ return !!klp_root_kobj;
}
struct klp_find_arg {
@@ -179,7 +179,9 @@ static int klp_find_object_symbol(const char *objname, const char *name,
.count = 0
};
+ mutex_lock(&module_mutex);
kallsyms_on_each_symbol(klp_find_callback, &args);
+ mutex_unlock(&module_mutex);
if (args.count == 0)
pr_err("symbol '%s' not found in symbol table\n", name);
@@ -219,13 +221,19 @@ static int klp_verify_vmlinux_symbol(const char *name, unsigned long addr)
.name = name,
.addr = addr,
};
+ int ret;
- if (kallsyms_on_each_symbol(klp_verify_callback, &args))
- return 0;
+ mutex_lock(&module_mutex);
+ ret = kallsyms_on_each_symbol(klp_verify_callback, &args);
+ mutex_unlock(&module_mutex);
- pr_err("symbol '%s' not found at specified address 0x%016lx, kernel mismatch?\n",
- name, addr);
- return -EINVAL;
+ if (!ret) {
+ pr_err("symbol '%s' not found at specified address 0x%016lx, kernel mismatch?\n",
+ name, addr);
+ return -EINVAL;
+ }
+
+ return 0;
}
static int klp_find_verify_func_addr(struct klp_object *obj,
@@ -234,8 +242,9 @@ static int klp_find_verify_func_addr(struct klp_object *obj,
int ret;
#if defined(CONFIG_RANDOMIZE_BASE)
- /* KASLR is enabled, disregard old_addr from user */
- func->old_addr = 0;
+ /* If KASLR has been enabled, adjust old_addr accordingly */
+ if (kaslr_enabled() && func->old_addr)
+ func->old_addr += kaslr_offset();
#endif
if (!func->old_addr || klp_is_module(obj))
@@ -335,32 +344,22 @@ unlock:
rcu_read_unlock();
}
-static int klp_disable_func(struct klp_func *func)
+static void klp_disable_func(struct klp_func *func)
{
struct klp_ops *ops;
- int ret;
if (WARN_ON(func->state != KLP_ENABLED))
- return -EINVAL;
-
+ return;
if (WARN_ON(!func->old_addr))
- return -EINVAL;
+ return;
ops = klp_find_ops(func->old_addr);
if (WARN_ON(!ops))
- return -EINVAL;
+ return;
if (list_is_singular(&ops->func_stack)) {
- ret = unregister_ftrace_function(&ops->fops);
- if (ret) {
- pr_err("failed to unregister ftrace handler for function '%s' (%d)\n",
- func->old_name, ret);
- return ret;
- }
-
- ret = ftrace_set_filter_ip(&ops->fops, func->old_addr, 1, 0);
- if (ret)
- pr_warn("function unregister succeeded but failed to clear the filter\n");
+ WARN_ON(unregister_ftrace_function(&ops->fops));
+ WARN_ON(ftrace_set_filter_ip(&ops->fops, func->old_addr, 1, 0));
list_del_rcu(&func->stack_node);
list_del(&ops->node);
@@ -370,8 +369,6 @@ static int klp_disable_func(struct klp_func *func)
}
func->state = KLP_DISABLED;
-
- return 0;
}
static int klp_enable_func(struct klp_func *func)
@@ -432,23 +429,15 @@ err:
return ret;
}
-static int klp_disable_object(struct klp_object *obj)
+static void klp_disable_object(struct klp_object *obj)
{
struct klp_func *func;
- int ret;
- for (func = obj->funcs; func->old_name; func++) {
- if (func->state != KLP_ENABLED)
- continue;
-
- ret = klp_disable_func(func);
- if (ret)
- return ret;
- }
+ klp_for_each_func(obj, func)
+ if (func->state == KLP_ENABLED)
+ klp_disable_func(func);
obj->state = KLP_DISABLED;
-
- return 0;
}
static int klp_enable_object(struct klp_object *obj)
@@ -462,24 +451,21 @@ static int klp_enable_object(struct klp_object *obj)
if (WARN_ON(!klp_is_object_loaded(obj)))
return -EINVAL;
- for (func = obj->funcs; func->old_name; func++) {
+ klp_for_each_func(obj, func) {
ret = klp_enable_func(func);
- if (ret)
- goto unregister;
+ if (ret) {
+ klp_disable_object(obj);
+ return ret;
+ }
}
obj->state = KLP_ENABLED;
return 0;
-
-unregister:
- WARN_ON(klp_disable_object(obj));
- return ret;
}
static int __klp_disable_patch(struct klp_patch *patch)
{
struct klp_object *obj;
- int ret;
/* enforce stacking: only the last enabled patch can be disabled */
if (!list_is_last(&patch->list, &klp_patches) &&
@@ -488,13 +474,9 @@ static int __klp_disable_patch(struct klp_patch *patch)
pr_notice("disabling patch '%s'\n", patch->mod->name);
- for (obj = patch->objs; obj->funcs; obj++) {
- if (obj->state != KLP_ENABLED)
- continue;
-
- ret = klp_disable_object(obj);
- if (ret)
- return ret;
+ klp_for_each_object(patch, obj) {
+ if (obj->state == KLP_ENABLED)
+ klp_disable_object(obj);
}
patch->state = KLP_DISABLED;
@@ -552,9 +534,7 @@ static int __klp_enable_patch(struct klp_patch *patch)
pr_notice("enabling patch '%s'\n", patch->mod->name);
- for (obj = patch->objs; obj->funcs; obj++) {
- klp_find_object_module(obj);
-
+ klp_for_each_object(patch, obj) {
if (!klp_is_object_loaded(obj))
continue;
@@ -682,6 +662,15 @@ static struct kobj_type klp_ktype_patch = {
.default_attrs = klp_patch_attrs,
};
+static void klp_kobj_release_object(struct kobject *kobj)
+{
+}
+
+static struct kobj_type klp_ktype_object = {
+ .release = klp_kobj_release_object,
+ .sysfs_ops = &kobj_sysfs_ops,
+};
+
static void klp_kobj_release_func(struct kobject *kobj)
{
}
@@ -711,7 +700,7 @@ static void klp_free_object_loaded(struct klp_object *obj)
obj->mod = NULL;
- for (func = obj->funcs; func->old_name; func++)
+ klp_for_each_func(obj, func)
func->old_addr = 0;
}
@@ -726,7 +715,7 @@ static void klp_free_objects_limited(struct klp_patch *patch,
for (obj = patch->objs; obj->funcs && obj != limit; obj++) {
klp_free_funcs_limited(obj, NULL);
- kobject_put(obj->kobj);
+ kobject_put(&obj->kobj);
}
}
@@ -744,7 +733,7 @@ static int klp_init_func(struct klp_object *obj, struct klp_func *func)
func->state = KLP_DISABLED;
return kobject_init_and_add(&func->kobj, &klp_ktype_func,
- obj->kobj, "%s", func->old_name);
+ &obj->kobj, "%s", func->old_name);
}
/* parts of the initialization that is done only when the object is loaded */
@@ -760,7 +749,7 @@ static int klp_init_object_loaded(struct klp_patch *patch,
return ret;
}
- for (func = obj->funcs; func->old_name; func++) {
+ klp_for_each_func(obj, func) {
ret = klp_find_verify_func_addr(obj, func);
if (ret)
return ret;
@@ -784,11 +773,12 @@ static int klp_init_object(struct klp_patch *patch, struct klp_object *obj)
klp_find_object_module(obj);
name = klp_is_module(obj) ? obj->name : "vmlinux";
- obj->kobj = kobject_create_and_add(name, &patch->kobj);
- if (!obj->kobj)
- return -ENOMEM;
+ ret = kobject_init_and_add(&obj->kobj, &klp_ktype_object,
+ &patch->kobj, "%s", name);
+ if (ret)
+ return ret;
- for (func = obj->funcs; func->old_name; func++) {
+ klp_for_each_func(obj, func) {
ret = klp_init_func(obj, func);
if (ret)
goto free;
@@ -804,7 +794,7 @@ static int klp_init_object(struct klp_patch *patch, struct klp_object *obj)
free:
klp_free_funcs_limited(obj, func);
- kobject_put(obj->kobj);
+ kobject_put(&obj->kobj);
return ret;
}
@@ -825,7 +815,7 @@ static int klp_init_patch(struct klp_patch *patch)
if (ret)
goto unlock;
- for (obj = patch->objs; obj->funcs; obj++) {
+ klp_for_each_object(patch, obj) {
ret = klp_init_object(patch, obj);
if (ret)
goto free;
@@ -914,7 +904,7 @@ int klp_register_patch(struct klp_patch *patch)
}
EXPORT_SYMBOL_GPL(klp_register_patch);
-static void klp_module_notify_coming(struct klp_patch *patch,
+static int klp_module_notify_coming(struct klp_patch *patch,
struct klp_object *obj)
{
struct module *pmod = patch->mod;
@@ -922,22 +912,23 @@ static void klp_module_notify_coming(struct klp_patch *patch,
int ret;
ret = klp_init_object_loaded(patch, obj);
- if (ret)
- goto err;
+ if (ret) {
+ pr_warn("failed to initialize patch '%s' for module '%s' (%d)\n",
+ pmod->name, mod->name, ret);
+ return ret;
+ }
if (patch->state == KLP_DISABLED)
- return;
+ return 0;
pr_notice("applying patch '%s' to loading module '%s'\n",
pmod->name, mod->name);
ret = klp_enable_object(obj);
- if (!ret)
- return;
-
-err:
- pr_warn("failed to apply patch '%s' to module '%s' (%d)\n",
- pmod->name, mod->name, ret);
+ if (ret)
+ pr_warn("failed to apply patch '%s' to module '%s' (%d)\n",
+ pmod->name, mod->name, ret);
+ return ret;
}
static void klp_module_notify_going(struct klp_patch *patch,
@@ -945,7 +936,6 @@ static void klp_module_notify_going(struct klp_patch *patch,
{
struct module *pmod = patch->mod;
struct module *mod = obj->mod;
- int ret;
if (patch->state == KLP_DISABLED)
goto disabled;
@@ -953,10 +943,7 @@ static void klp_module_notify_going(struct klp_patch *patch,
pr_notice("reverting patch '%s' on unloading module '%s'\n",
pmod->name, mod->name);
- ret = klp_disable_object(obj);
- if (ret)
- pr_warn("failed to revert patch '%s' on module '%s' (%d)\n",
- pmod->name, mod->name, ret);
+ klp_disable_object(obj);
disabled:
klp_free_object_loaded(obj);
@@ -965,6 +952,7 @@ disabled:
static int klp_module_notify(struct notifier_block *nb, unsigned long action,
void *data)
{
+ int ret;
struct module *mod = data;
struct klp_patch *patch;
struct klp_object *obj;
@@ -984,13 +972,18 @@ static int klp_module_notify(struct notifier_block *nb, unsigned long action,
mod->klp_alive = false;
list_for_each_entry(patch, &klp_patches, list) {
- for (obj = patch->objs; obj->funcs; obj++) {
+ klp_for_each_object(patch, obj) {
if (!klp_is_module(obj) || strcmp(obj->name, mod->name))
continue;
if (action == MODULE_STATE_COMING) {
obj->mod = mod;
- klp_module_notify_coming(patch, obj);
+ ret = klp_module_notify_coming(patch, obj);
+ if (ret) {
+ obj->mod = NULL;
+ pr_warn("patch '%s' is in an inconsistent state!\n",
+ patch->mod->name);
+ }
} else /* MODULE_STATE_GOING */
klp_module_notify_going(patch, obj);
@@ -1008,7 +1001,7 @@ static struct notifier_block klp_module_nb = {
.priority = INT_MIN+1, /* called late but before ftrace notifier */
};
-static int klp_init(void)
+static int __init klp_init(void)
{
int ret;
diff --git a/kernel/locking/Makefile b/kernel/locking/Makefile
index de7a416cca2a..8e96f6cc2a4a 100644
--- a/kernel/locking/Makefile
+++ b/kernel/locking/Makefile
@@ -1,5 +1,5 @@
-obj-y += mutex.o semaphore.o rwsem.o
+obj-y += mutex.o semaphore.o rwsem.o percpu-rwsem.o
ifdef CONFIG_FUNCTION_TRACER
CFLAGS_REMOVE_lockdep.o = $(CC_FLAGS_FTRACE)
@@ -17,13 +17,12 @@ obj-$(CONFIG_SMP) += spinlock.o
obj-$(CONFIG_LOCK_SPIN_ON_OWNER) += osq_lock.o
obj-$(CONFIG_SMP) += lglock.o
obj-$(CONFIG_PROVE_LOCKING) += spinlock.o
+obj-$(CONFIG_QUEUED_SPINLOCKS) += qspinlock.o
obj-$(CONFIG_RT_MUTEXES) += rtmutex.o
obj-$(CONFIG_DEBUG_RT_MUTEXES) += rtmutex-debug.o
-obj-$(CONFIG_RT_MUTEX_TESTER) += rtmutex-tester.o
obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock.o
obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock_debug.o
obj-$(CONFIG_RWSEM_GENERIC_SPINLOCK) += rwsem-spinlock.o
obj-$(CONFIG_RWSEM_XCHGADD_ALGORITHM) += rwsem-xadd.o
-obj-$(CONFIG_PERCPU_RWSEM) += percpu-rwsem.o
-obj-$(CONFIG_QUEUE_RWLOCK) += qrwlock.o
+obj-$(CONFIG_QUEUED_RWLOCKS) += qrwlock.o
obj-$(CONFIG_LOCK_TORTURE_TEST) += locktorture.o
diff --git a/kernel/locking/lglock.c b/kernel/locking/lglock.c
index 86ae2aebf004..951cfcd10b4a 100644
--- a/kernel/locking/lglock.c
+++ b/kernel/locking/lglock.c
@@ -60,6 +60,28 @@ void lg_local_unlock_cpu(struct lglock *lg, int cpu)
}
EXPORT_SYMBOL(lg_local_unlock_cpu);
+void lg_double_lock(struct lglock *lg, int cpu1, int cpu2)
+{
+ BUG_ON(cpu1 == cpu2);
+
+ /* lock in cpu order, just like lg_global_lock */
+ if (cpu2 < cpu1)
+ swap(cpu1, cpu2);
+
+ preempt_disable();
+ lock_acquire_shared(&lg->lock_dep_map, 0, 0, NULL, _RET_IP_);
+ arch_spin_lock(per_cpu_ptr(lg->lock, cpu1));
+ arch_spin_lock(per_cpu_ptr(lg->lock, cpu2));
+}
+
+void lg_double_unlock(struct lglock *lg, int cpu1, int cpu2)
+{
+ lock_release(&lg->lock_dep_map, 1, _RET_IP_);
+ arch_spin_unlock(per_cpu_ptr(lg->lock, cpu1));
+ arch_spin_unlock(per_cpu_ptr(lg->lock, cpu2));
+ preempt_enable();
+}
+
void lg_global_lock(struct lglock *lg)
{
int i;
diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c
index ba77ab5f64dd..8acfbf773e06 100644
--- a/kernel/locking/lockdep.c
+++ b/kernel/locking/lockdep.c
@@ -551,7 +551,21 @@ static void print_lockdep_cache(struct lockdep_map *lock)
static void print_lock(struct held_lock *hlock)
{
- print_lock_name(hlock_class(hlock));
+ /*
+ * We can be called locklessly through debug_show_all_locks() so be
+ * extra careful, the hlock might have been released and cleared.
+ */
+ unsigned int class_idx = hlock->class_idx;
+
+ /* Don't re-read hlock->class_idx, can't use READ_ONCE() on bitfields: */
+ barrier();
+
+ if (!class_idx || (class_idx - 1) >= MAX_LOCKDEP_KEYS) {
+ printk("<RELEASED>\n");
+ return;
+ }
+
+ print_lock_name(lock_classes + class_idx - 1);
printk(", at: ");
print_ip_sym(hlock->acquire_ip);
}
@@ -3143,6 +3157,7 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
hlock->waittime_stamp = 0;
hlock->holdtime_stamp = lockstat_clock();
#endif
+ hlock->pin_count = 0;
if (check && !mark_irqflags(curr, hlock))
return 0;
@@ -3246,26 +3261,6 @@ print_unlock_imbalance_bug(struct task_struct *curr, struct lockdep_map *lock,
return 0;
}
-/*
- * Common debugging checks for both nested and non-nested unlock:
- */
-static int check_unlock(struct task_struct *curr, struct lockdep_map *lock,
- unsigned long ip)
-{
- if (unlikely(!debug_locks))
- return 0;
- /*
- * Lockdep should run with IRQs disabled, recursion, head-ache, etc..
- */
- if (DEBUG_LOCKS_WARN_ON(!irqs_disabled()))
- return 0;
-
- if (curr->lockdep_depth <= 0)
- return print_unlock_imbalance_bug(curr, lock, ip);
-
- return 1;
-}
-
static int match_held_lock(struct held_lock *hlock, struct lockdep_map *lock)
{
if (hlock->instance == lock)
@@ -3362,31 +3357,35 @@ found_it:
}
/*
- * Remove the lock to the list of currently held locks in a
- * potentially non-nested (out of order) manner. This is a
- * relatively rare operation, as all the unlock APIs default
- * to nested mode (which uses lock_release()):
+ * Remove the lock to the list of currently held locks - this gets
+ * called on mutex_unlock()/spin_unlock*() (or on a failed
+ * mutex_lock_interruptible()).
+ *
+ * @nested is an hysterical artifact, needs a tree wide cleanup.
*/
static int
-lock_release_non_nested(struct task_struct *curr,
- struct lockdep_map *lock, unsigned long ip)
+__lock_release(struct lockdep_map *lock, int nested, unsigned long ip)
{
+ struct task_struct *curr = current;
struct held_lock *hlock, *prev_hlock;
unsigned int depth;
int i;
- /*
- * Check whether the lock exists in the current stack
- * of held locks:
- */
+ if (unlikely(!debug_locks))
+ return 0;
+
depth = curr->lockdep_depth;
/*
* So we're all set to release this lock.. wait what lock? We don't
* own any locks, you've been drinking again?
*/
- if (DEBUG_LOCKS_WARN_ON(!depth))
- return 0;
+ if (DEBUG_LOCKS_WARN_ON(depth <= 0))
+ return print_unlock_imbalance_bug(curr, lock, ip);
+ /*
+ * Check whether the lock exists in the current stack
+ * of held locks:
+ */
prev_hlock = NULL;
for (i = depth-1; i >= 0; i--) {
hlock = curr->held_locks + i;
@@ -3405,6 +3404,8 @@ found_it:
if (hlock->instance == lock)
lock_release_holdtime(hlock);
+ WARN(hlock->pin_count, "releasing a pinned lock\n");
+
if (hlock->references) {
hlock->references--;
if (hlock->references) {
@@ -3442,91 +3443,66 @@ found_it:
*/
if (DEBUG_LOCKS_WARN_ON(curr->lockdep_depth != depth - 1))
return 0;
+
return 1;
}
-/*
- * Remove the lock to the list of currently held locks - this gets
- * called on mutex_unlock()/spin_unlock*() (or on a failed
- * mutex_lock_interruptible()). This is done for unlocks that nest
- * perfectly. (i.e. the current top of the lock-stack is unlocked)
- */
-static int lock_release_nested(struct task_struct *curr,
- struct lockdep_map *lock, unsigned long ip)
+static int __lock_is_held(struct lockdep_map *lock)
{
- struct held_lock *hlock;
- unsigned int depth;
-
- /*
- * Pop off the top of the lock stack:
- */
- depth = curr->lockdep_depth - 1;
- hlock = curr->held_locks + depth;
-
- /*
- * Is the unlock non-nested:
- */
- if (hlock->instance != lock || hlock->references)
- return lock_release_non_nested(curr, lock, ip);
- curr->lockdep_depth--;
-
- /*
- * No more locks, but somehow we've got hash left over, who left it?
- */
- if (DEBUG_LOCKS_WARN_ON(!depth && (hlock->prev_chain_key != 0)))
- return 0;
+ struct task_struct *curr = current;
+ int i;
- curr->curr_chain_key = hlock->prev_chain_key;
+ for (i = 0; i < curr->lockdep_depth; i++) {
+ struct held_lock *hlock = curr->held_locks + i;
- lock_release_holdtime(hlock);
+ if (match_held_lock(hlock, lock))
+ return 1;
+ }
-#ifdef CONFIG_DEBUG_LOCKDEP
- hlock->prev_chain_key = 0;
- hlock->class_idx = 0;
- hlock->acquire_ip = 0;
- hlock->irq_context = 0;
-#endif
- return 1;
+ return 0;
}
-/*
- * Remove the lock to the list of currently held locks - this gets
- * called on mutex_unlock()/spin_unlock*() (or on a failed
- * mutex_lock_interruptible()). This is done for unlocks that nest
- * perfectly. (i.e. the current top of the lock-stack is unlocked)
- */
-static void
-__lock_release(struct lockdep_map *lock, int nested, unsigned long ip)
+static void __lock_pin_lock(struct lockdep_map *lock)
{
struct task_struct *curr = current;
+ int i;
- if (!check_unlock(curr, lock, ip))
+ if (unlikely(!debug_locks))
return;
- if (nested) {
- if (!lock_release_nested(curr, lock, ip))
- return;
- } else {
- if (!lock_release_non_nested(curr, lock, ip))
+ for (i = 0; i < curr->lockdep_depth; i++) {
+ struct held_lock *hlock = curr->held_locks + i;
+
+ if (match_held_lock(hlock, lock)) {
+ hlock->pin_count++;
return;
+ }
}
- check_chain_key(curr);
+ WARN(1, "pinning an unheld lock\n");
}
-static int __lock_is_held(struct lockdep_map *lock)
+static void __lock_unpin_lock(struct lockdep_map *lock)
{
struct task_struct *curr = current;
int i;
+ if (unlikely(!debug_locks))
+ return;
+
for (i = 0; i < curr->lockdep_depth; i++) {
struct held_lock *hlock = curr->held_locks + i;
- if (match_held_lock(hlock, lock))
- return 1;
+ if (match_held_lock(hlock, lock)) {
+ if (WARN(!hlock->pin_count, "unpinning an unpinned lock\n"))
+ return;
+
+ hlock->pin_count--;
+ return;
+ }
}
- return 0;
+ WARN(1, "unpinning an unheld lock\n");
}
/*
@@ -3625,7 +3601,8 @@ void lock_release(struct lockdep_map *lock, int nested,
check_flags(flags);
current->lockdep_recursion = 1;
trace_lock_release(lock, ip);
- __lock_release(lock, nested, ip);
+ if (__lock_release(lock, nested, ip))
+ check_chain_key(current);
current->lockdep_recursion = 0;
raw_local_irq_restore(flags);
}
@@ -3651,6 +3628,40 @@ int lock_is_held(struct lockdep_map *lock)
}
EXPORT_SYMBOL_GPL(lock_is_held);
+void lock_pin_lock(struct lockdep_map *lock)
+{
+ unsigned long flags;
+
+ if (unlikely(current->lockdep_recursion))
+ return;
+
+ raw_local_irq_save(flags);
+ check_flags(flags);
+
+ current->lockdep_recursion = 1;
+ __lock_pin_lock(lock);
+ current->lockdep_recursion = 0;
+ raw_local_irq_restore(flags);
+}
+EXPORT_SYMBOL_GPL(lock_pin_lock);
+
+void lock_unpin_lock(struct lockdep_map *lock)
+{
+ unsigned long flags;
+
+ if (unlikely(current->lockdep_recursion))
+ return;
+
+ raw_local_irq_save(flags);
+ check_flags(flags);
+
+ current->lockdep_recursion = 1;
+ __lock_unpin_lock(lock);
+ current->lockdep_recursion = 0;
+ raw_local_irq_restore(flags);
+}
+EXPORT_SYMBOL_GPL(lock_unpin_lock);
+
void lockdep_set_current_reclaim_state(gfp_t gfp_mask)
{
current->lockdep_reclaim_gfp = gfp_mask;
@@ -3886,7 +3897,8 @@ static void zap_class(struct lock_class *class)
list_del_rcu(&class->hash_entry);
list_del_rcu(&class->lock_entry);
- class->key = NULL;
+ RCU_INIT_POINTER(class->key, NULL);
+ RCU_INIT_POINTER(class->name, NULL);
}
static inline int within(const void *addr, void *start, unsigned long size)
@@ -4052,8 +4064,7 @@ void __init lockdep_info(void)
#ifdef CONFIG_DEBUG_LOCKDEP
if (lockdep_init_error) {
- printk("WARNING: lockdep init error! lock-%s was acquired"
- "before lockdep_init\n", lock_init_error);
+ printk("WARNING: lockdep init error: lock '%s' was acquired before lockdep_init().\n", lock_init_error);
printk("Call stack leading to lockdep invocation was:\n");
print_stack_trace(&lockdep_init_trace, 0);
}
diff --git a/kernel/locking/lockdep_proc.c b/kernel/locking/lockdep_proc.c
index ef43ac4bafb5..d83d798bef95 100644
--- a/kernel/locking/lockdep_proc.c
+++ b/kernel/locking/lockdep_proc.c
@@ -426,10 +426,12 @@ static void seq_lock_time(struct seq_file *m, struct lock_time *lt)
static void seq_stats(struct seq_file *m, struct lock_stat_data *data)
{
- char name[39];
- struct lock_class *class;
+ struct lockdep_subclass_key *ckey;
struct lock_class_stats *stats;
+ struct lock_class *class;
+ const char *cname;
int i, namelen;
+ char name[39];
class = data->class;
stats = &data->stats;
@@ -440,15 +442,25 @@ static void seq_stats(struct seq_file *m, struct lock_stat_data *data)
if (class->subclass)
namelen -= 2;
- if (!class->name) {
+ rcu_read_lock_sched();
+ cname = rcu_dereference_sched(class->name);
+ ckey = rcu_dereference_sched(class->key);
+
+ if (!cname && !ckey) {
+ rcu_read_unlock_sched();
+ return;
+
+ } else if (!cname) {
char str[KSYM_NAME_LEN];
const char *key_name;
- key_name = __get_key_name(class->key, str);
+ key_name = __get_key_name(ckey, str);
snprintf(name, namelen, "%s", key_name);
} else {
- snprintf(name, namelen, "%s", class->name);
+ snprintf(name, namelen, "%s", cname);
}
+ rcu_read_unlock_sched();
+
namelen = strlen(name);
if (class->name_version > 1) {
snprintf(name+namelen, 3, "#%d", class->name_version);
diff --git a/kernel/locking/locktorture.c b/kernel/locking/locktorture.c
index ec8cce259779..32244186f1f2 100644
--- a/kernel/locking/locktorture.c
+++ b/kernel/locking/locktorture.c
@@ -122,12 +122,12 @@ static int torture_lock_busted_write_lock(void)
static void torture_lock_busted_write_delay(struct torture_random_state *trsp)
{
- const unsigned long longdelay_us = 100;
+ const unsigned long longdelay_ms = 100;
/* We want a long delay occasionally to force massive contention. */
if (!(torture_random(trsp) %
- (cxt.nrealwriters_stress * 2000 * longdelay_us)))
- mdelay(longdelay_us);
+ (cxt.nrealwriters_stress * 2000 * longdelay_ms)))
+ mdelay(longdelay_ms);
#ifdef CONFIG_PREEMPT
if (!(torture_random(trsp) % (cxt.nrealwriters_stress * 20000)))
preempt_schedule(); /* Allow test to be preempted. */
@@ -160,14 +160,14 @@ static int torture_spin_lock_write_lock(void) __acquires(torture_spinlock)
static void torture_spin_lock_write_delay(struct torture_random_state *trsp)
{
const unsigned long shortdelay_us = 2;
- const unsigned long longdelay_us = 100;
+ const unsigned long longdelay_ms = 100;
/* We want a short delay mostly to emulate likely code, and
* we want a long delay occasionally to force massive contention.
*/
if (!(torture_random(trsp) %
- (cxt.nrealwriters_stress * 2000 * longdelay_us)))
- mdelay(longdelay_us);
+ (cxt.nrealwriters_stress * 2000 * longdelay_ms)))
+ mdelay(longdelay_ms);
if (!(torture_random(trsp) %
(cxt.nrealwriters_stress * 2 * shortdelay_us)))
udelay(shortdelay_us);
@@ -309,7 +309,7 @@ static int torture_rwlock_read_lock_irq(void) __acquires(torture_rwlock)
static void torture_rwlock_read_unlock_irq(void)
__releases(torture_rwlock)
{
- write_unlock_irqrestore(&torture_rwlock, cxt.cur_ops->flags);
+ read_unlock_irqrestore(&torture_rwlock, cxt.cur_ops->flags);
}
static struct lock_torture_ops rw_lock_irq_ops = {
diff --git a/kernel/locking/mcs_spinlock.h b/kernel/locking/mcs_spinlock.h
index d1fe2ba5bac9..fd91aaa4554c 100644
--- a/kernel/locking/mcs_spinlock.h
+++ b/kernel/locking/mcs_spinlock.h
@@ -17,6 +17,7 @@
struct mcs_spinlock {
struct mcs_spinlock *next;
int locked; /* 1 if lock acquired */
+ int count; /* nesting count, see qspinlock.c */
};
#ifndef arch_mcs_spin_lock_contended
@@ -78,7 +79,7 @@ void mcs_spin_lock(struct mcs_spinlock **lock, struct mcs_spinlock *node)
*/
return;
}
- ACCESS_ONCE(prev->next) = node;
+ WRITE_ONCE(prev->next, node);
/* Wait until the lock holder passes the lock down. */
arch_mcs_spin_lock_contended(&node->locked);
@@ -91,7 +92,7 @@ void mcs_spin_lock(struct mcs_spinlock **lock, struct mcs_spinlock *node)
static inline
void mcs_spin_unlock(struct mcs_spinlock **lock, struct mcs_spinlock *node)
{
- struct mcs_spinlock *next = ACCESS_ONCE(node->next);
+ struct mcs_spinlock *next = READ_ONCE(node->next);
if (likely(!next)) {
/*
@@ -100,7 +101,7 @@ void mcs_spin_unlock(struct mcs_spinlock **lock, struct mcs_spinlock *node)
if (likely(cmpxchg(lock, node, NULL) == node))
return;
/* Wait until the next pointer is set */
- while (!(next = ACCESS_ONCE(node->next)))
+ while (!(next = READ_ONCE(node->next)))
cpu_relax_lowlatency();
}
diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c
index 94674e5919cb..4cccea6b8934 100644
--- a/kernel/locking/mutex.c
+++ b/kernel/locking/mutex.c
@@ -25,7 +25,7 @@
#include <linux/spinlock.h>
#include <linux/interrupt.h>
#include <linux/debug_locks.h>
-#include "mcs_spinlock.h"
+#include <linux/osq_lock.h>
/*
* In the DEBUG case we are using the "NULL fastpath" for mutexes,
@@ -217,44 +217,35 @@ ww_mutex_set_context_slowpath(struct ww_mutex *lock,
}
#ifdef CONFIG_MUTEX_SPIN_ON_OWNER
-static inline bool owner_running(struct mutex *lock, struct task_struct *owner)
-{
- if (lock->owner != owner)
- return false;
-
- /*
- * Ensure we emit the owner->on_cpu, dereference _after_ checking
- * lock->owner still matches owner, if that fails, owner might
- * point to free()d memory, if it still matches, the rcu_read_lock()
- * ensures the memory stays valid.
- */
- barrier();
-
- return owner->on_cpu;
-}
-
/*
* Look out! "owner" is an entirely speculative pointer
* access and not reliable.
*/
static noinline
-int mutex_spin_on_owner(struct mutex *lock, struct task_struct *owner)
+bool mutex_spin_on_owner(struct mutex *lock, struct task_struct *owner)
{
+ bool ret = true;
+
rcu_read_lock();
- while (owner_running(lock, owner)) {
- if (need_resched())
+ while (lock->owner == owner) {
+ /*
+ * Ensure we emit the owner->on_cpu, dereference _after_
+ * checking lock->owner still matches owner. If that fails,
+ * owner might point to freed memory. If it still matches,
+ * the rcu_read_lock() ensures the memory stays valid.
+ */
+ barrier();
+
+ if (!owner->on_cpu || need_resched()) {
+ ret = false;
break;
+ }
cpu_relax_lowlatency();
}
rcu_read_unlock();
- /*
- * We break out the loop above on need_resched() and when the
- * owner changed, which is a sign for heavy contention. Return
- * success only when lock->owner is NULL.
- */
- return lock->owner == NULL;
+ return ret;
}
/*
@@ -269,7 +260,7 @@ static inline int mutex_can_spin_on_owner(struct mutex *lock)
return 0;
rcu_read_lock();
- owner = ACCESS_ONCE(lock->owner);
+ owner = READ_ONCE(lock->owner);
if (owner)
retval = owner->on_cpu;
rcu_read_unlock();
@@ -343,7 +334,7 @@ static bool mutex_optimistic_spin(struct mutex *lock,
* As such, when deadlock detection needs to be
* performed the optimistic spinning cannot be done.
*/
- if (ACCESS_ONCE(ww->ctx))
+ if (READ_ONCE(ww->ctx))
break;
}
@@ -351,7 +342,7 @@ static bool mutex_optimistic_spin(struct mutex *lock,
* If there's an owner, wait for it to either
* release the lock or go to sleep.
*/
- owner = ACCESS_ONCE(lock->owner);
+ owner = READ_ONCE(lock->owner);
if (owner && !mutex_spin_on_owner(lock, owner))
break;
@@ -490,7 +481,7 @@ static inline int __sched
__ww_mutex_lock_check_stamp(struct mutex *lock, struct ww_acquire_ctx *ctx)
{
struct ww_mutex *ww = container_of(lock, struct ww_mutex, base);
- struct ww_acquire_ctx *hold_ctx = ACCESS_ONCE(ww->ctx);
+ struct ww_acquire_ctx *hold_ctx = READ_ONCE(ww->ctx);
if (!hold_ctx)
return 0;
diff --git a/kernel/locking/osq_lock.c b/kernel/locking/osq_lock.c
index c112d00341b0..dc85ee23a26f 100644
--- a/kernel/locking/osq_lock.c
+++ b/kernel/locking/osq_lock.c
@@ -98,7 +98,7 @@ bool osq_lock(struct optimistic_spin_queue *lock)
prev = decode_cpu(old);
node->prev = prev;
- ACCESS_ONCE(prev->next) = node;
+ WRITE_ONCE(prev->next, node);
/*
* Normally @prev is untouchable after the above store; because at that
@@ -109,7 +109,7 @@ bool osq_lock(struct optimistic_spin_queue *lock)
* cmpxchg in an attempt to undo our queueing.
*/
- while (!ACCESS_ONCE(node->locked)) {
+ while (!READ_ONCE(node->locked)) {
/*
* If we need to reschedule bail... so we can block.
*/
@@ -148,7 +148,7 @@ unqueue:
* Or we race against a concurrent unqueue()'s step-B, in which
* case its step-C will write us a new @node->prev pointer.
*/
- prev = ACCESS_ONCE(node->prev);
+ prev = READ_ONCE(node->prev);
}
/*
@@ -170,8 +170,8 @@ unqueue:
* it will wait in Step-A.
*/
- ACCESS_ONCE(next->prev) = prev;
- ACCESS_ONCE(prev->next) = next;
+ WRITE_ONCE(next->prev, prev);
+ WRITE_ONCE(prev->next, next);
return false;
}
@@ -193,11 +193,11 @@ void osq_unlock(struct optimistic_spin_queue *lock)
node = this_cpu_ptr(&osq_node);
next = xchg(&node->next, NULL);
if (next) {
- ACCESS_ONCE(next->locked) = 1;
+ WRITE_ONCE(next->locked, 1);
return;
}
next = osq_wait_next(lock, node, NULL);
if (next)
- ACCESS_ONCE(next->locked) = 1;
+ WRITE_ONCE(next->locked, 1);
}
diff --git a/kernel/locking/percpu-rwsem.c b/kernel/locking/percpu-rwsem.c
index 652a8ee8efe9..f32567254867 100644
--- a/kernel/locking/percpu-rwsem.c
+++ b/kernel/locking/percpu-rwsem.c
@@ -88,6 +88,19 @@ void percpu_down_read(struct percpu_rw_semaphore *brw)
__up_read(&brw->rw_sem);
}
+int percpu_down_read_trylock(struct percpu_rw_semaphore *brw)
+{
+ if (unlikely(!update_fast_ctr(brw, +1))) {
+ if (!__down_read_trylock(&brw->rw_sem))
+ return 0;
+ atomic_inc(&brw->slow_read_ctr);
+ __up_read(&brw->rw_sem);
+ }
+
+ rwsem_acquire_read(&brw->rw_sem.dep_map, 0, 1, _RET_IP_);
+ return 1;
+}
+
void percpu_up_read(struct percpu_rw_semaphore *brw)
{
rwsem_release(&brw->rw_sem.dep_map, 1, _RET_IP_);
diff --git a/kernel/locking/qrwlock.c b/kernel/locking/qrwlock.c
index f956ede7f90d..f17a3e3b3550 100644
--- a/kernel/locking/qrwlock.c
+++ b/kernel/locking/qrwlock.c
@@ -1,5 +1,5 @@
/*
- * Queue read/write lock
+ * Queued read/write locks
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
@@ -22,6 +22,26 @@
#include <linux/hardirq.h>
#include <asm/qrwlock.h>
+/*
+ * This internal data structure is used for optimizing access to some of
+ * the subfields within the atomic_t cnts.
+ */
+struct __qrwlock {
+ union {
+ atomic_t cnts;
+ struct {
+#ifdef __LITTLE_ENDIAN
+ u8 wmode; /* Writer mode */
+ u8 rcnts[3]; /* Reader counts */
+#else
+ u8 rcnts[3]; /* Reader counts */
+ u8 wmode; /* Writer mode */
+#endif
+ };
+ };
+ arch_spinlock_t lock;
+};
+
/**
* rspin_until_writer_unlock - inc reader count & spin until writer is gone
* @lock : Pointer to queue rwlock structure
@@ -35,27 +55,29 @@ rspin_until_writer_unlock(struct qrwlock *lock, u32 cnts)
{
while ((cnts & _QW_WMASK) == _QW_LOCKED) {
cpu_relax_lowlatency();
- cnts = smp_load_acquire((u32 *)&lock->cnts);
+ cnts = atomic_read_acquire(&lock->cnts);
}
}
/**
- * queue_read_lock_slowpath - acquire read lock of a queue rwlock
+ * queued_read_lock_slowpath - acquire read lock of a queue rwlock
* @lock: Pointer to queue rwlock structure
+ * @cnts: Current qrwlock lock value
*/
-void queue_read_lock_slowpath(struct qrwlock *lock)
+void queued_read_lock_slowpath(struct qrwlock *lock, u32 cnts)
{
- u32 cnts;
-
/*
* Readers come here when they cannot get the lock without waiting
*/
if (unlikely(in_interrupt())) {
/*
- * Readers in interrupt context will spin until the lock is
- * available without waiting in the queue.
+ * Readers in interrupt context will get the lock immediately
+ * if the writer is just waiting (not holding the lock yet).
+ * The rspin_until_writer_unlock() function returns immediately
+ * in this case. Otherwise, they will spin (with ACQUIRE
+ * semantics) until the lock is available without waiting in
+ * the queue.
*/
- cnts = smp_load_acquire((u32 *)&lock->cnts);
rspin_until_writer_unlock(lock, cnts);
return;
}
@@ -67,16 +89,11 @@ void queue_read_lock_slowpath(struct qrwlock *lock)
arch_spin_lock(&lock->lock);
/*
- * At the head of the wait queue now, wait until the writer state
- * goes to 0 and then try to increment the reader count and get
- * the lock. It is possible that an incoming writer may steal the
- * lock in the interim, so it is necessary to check the writer byte
- * to make sure that the write lock isn't taken.
+ * The ACQUIRE semantics of the following spinning code ensure
+ * that accesses can't leak upwards out of our subsequent critical
+ * section in the case that the lock is currently held for write.
*/
- while (atomic_read(&lock->cnts) & _QW_WMASK)
- cpu_relax_lowlatency();
-
- cnts = atomic_add_return(_QR_BIAS, &lock->cnts) - _QR_BIAS;
+ cnts = atomic_add_return_acquire(_QR_BIAS, &lock->cnts) - _QR_BIAS;
rspin_until_writer_unlock(lock, cnts);
/*
@@ -84,13 +101,13 @@ void queue_read_lock_slowpath(struct qrwlock *lock)
*/
arch_spin_unlock(&lock->lock);
}
-EXPORT_SYMBOL(queue_read_lock_slowpath);
+EXPORT_SYMBOL(queued_read_lock_slowpath);
/**
- * queue_write_lock_slowpath - acquire write lock of a queue rwlock
+ * queued_write_lock_slowpath - acquire write lock of a queue rwlock
* @lock : Pointer to queue rwlock structure
*/
-void queue_write_lock_slowpath(struct qrwlock *lock)
+void queued_write_lock_slowpath(struct qrwlock *lock)
{
u32 cnts;
@@ -99,7 +116,7 @@ void queue_write_lock_slowpath(struct qrwlock *lock)
/* Try to acquire the lock directly if no reader is present */
if (!atomic_read(&lock->cnts) &&
- (atomic_cmpxchg(&lock->cnts, 0, _QW_LOCKED) == 0))
+ (atomic_cmpxchg_acquire(&lock->cnts, 0, _QW_LOCKED) == 0))
goto unlock;
/*
@@ -107,10 +124,10 @@ void queue_write_lock_slowpath(struct qrwlock *lock)
* or wait for a previous writer to go away.
*/
for (;;) {
- cnts = atomic_read(&lock->cnts);
- if (!(cnts & _QW_WMASK) &&
- (atomic_cmpxchg(&lock->cnts, cnts,
- cnts | _QW_WAITING) == cnts))
+ struct __qrwlock *l = (struct __qrwlock *)lock;
+
+ if (!READ_ONCE(l->wmode) &&
+ (cmpxchg_relaxed(&l->wmode, 0, _QW_WAITING) == 0))
break;
cpu_relax_lowlatency();
@@ -120,8 +137,8 @@ void queue_write_lock_slowpath(struct qrwlock *lock)
for (;;) {
cnts = atomic_read(&lock->cnts);
if ((cnts == _QW_WAITING) &&
- (atomic_cmpxchg(&lock->cnts, _QW_WAITING,
- _QW_LOCKED) == _QW_WAITING))
+ (atomic_cmpxchg_acquire(&lock->cnts, _QW_WAITING,
+ _QW_LOCKED) == _QW_WAITING))
break;
cpu_relax_lowlatency();
@@ -129,4 +146,4 @@ void queue_write_lock_slowpath(struct qrwlock *lock)
unlock:
arch_spin_unlock(&lock->lock);
}
-EXPORT_SYMBOL(queue_write_lock_slowpath);
+EXPORT_SYMBOL(queued_write_lock_slowpath);
diff --git a/kernel/locking/qspinlock.c b/kernel/locking/qspinlock.c
new file mode 100644
index 000000000000..337c8818541d
--- /dev/null
+++ b/kernel/locking/qspinlock.c
@@ -0,0 +1,473 @@
+/*
+ * Queued spinlock
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * (C) Copyright 2013-2015 Hewlett-Packard Development Company, L.P.
+ * (C) Copyright 2013-2014 Red Hat, Inc.
+ * (C) Copyright 2015 Intel Corp.
+ *
+ * Authors: Waiman Long <waiman.long@hp.com>
+ * Peter Zijlstra <peterz@infradead.org>
+ */
+
+#ifndef _GEN_PV_LOCK_SLOWPATH
+
+#include <linux/smp.h>
+#include <linux/bug.h>
+#include <linux/cpumask.h>
+#include <linux/percpu.h>
+#include <linux/hardirq.h>
+#include <linux/mutex.h>
+#include <asm/byteorder.h>
+#include <asm/qspinlock.h>
+
+/*
+ * The basic principle of a queue-based spinlock can best be understood
+ * by studying a classic queue-based spinlock implementation called the
+ * MCS lock. The paper below provides a good description for this kind
+ * of lock.
+ *
+ * http://www.cise.ufl.edu/tr/DOC/REP-1992-71.pdf
+ *
+ * This queued spinlock implementation is based on the MCS lock, however to make
+ * it fit the 4 bytes we assume spinlock_t to be, and preserve its existing
+ * API, we must modify it somehow.
+ *
+ * In particular; where the traditional MCS lock consists of a tail pointer
+ * (8 bytes) and needs the next pointer (another 8 bytes) of its own node to
+ * unlock the next pending (next->locked), we compress both these: {tail,
+ * next->locked} into a single u32 value.
+ *
+ * Since a spinlock disables recursion of its own context and there is a limit
+ * to the contexts that can nest; namely: task, softirq, hardirq, nmi. As there
+ * are at most 4 nesting levels, it can be encoded by a 2-bit number. Now
+ * we can encode the tail by combining the 2-bit nesting level with the cpu
+ * number. With one byte for the lock value and 3 bytes for the tail, only a
+ * 32-bit word is now needed. Even though we only need 1 bit for the lock,
+ * we extend it to a full byte to achieve better performance for architectures
+ * that support atomic byte write.
+ *
+ * We also change the first spinner to spin on the lock bit instead of its
+ * node; whereby avoiding the need to carry a node from lock to unlock, and
+ * preserving existing lock API. This also makes the unlock code simpler and
+ * faster.
+ *
+ * N.B. The current implementation only supports architectures that allow
+ * atomic operations on smaller 8-bit and 16-bit data types.
+ *
+ */
+
+#include "mcs_spinlock.h"
+
+#ifdef CONFIG_PARAVIRT_SPINLOCKS
+#define MAX_NODES 8
+#else
+#define MAX_NODES 4
+#endif
+
+/*
+ * Per-CPU queue node structures; we can never have more than 4 nested
+ * contexts: task, softirq, hardirq, nmi.
+ *
+ * Exactly fits one 64-byte cacheline on a 64-bit architecture.
+ *
+ * PV doubles the storage and uses the second cacheline for PV state.
+ */
+static DEFINE_PER_CPU_ALIGNED(struct mcs_spinlock, mcs_nodes[MAX_NODES]);
+
+/*
+ * We must be able to distinguish between no-tail and the tail at 0:0,
+ * therefore increment the cpu number by one.
+ */
+
+static inline u32 encode_tail(int cpu, int idx)
+{
+ u32 tail;
+
+#ifdef CONFIG_DEBUG_SPINLOCK
+ BUG_ON(idx > 3);
+#endif
+ tail = (cpu + 1) << _Q_TAIL_CPU_OFFSET;
+ tail |= idx << _Q_TAIL_IDX_OFFSET; /* assume < 4 */
+
+ return tail;
+}
+
+static inline struct mcs_spinlock *decode_tail(u32 tail)
+{
+ int cpu = (tail >> _Q_TAIL_CPU_OFFSET) - 1;
+ int idx = (tail & _Q_TAIL_IDX_MASK) >> _Q_TAIL_IDX_OFFSET;
+
+ return per_cpu_ptr(&mcs_nodes[idx], cpu);
+}
+
+#define _Q_LOCKED_PENDING_MASK (_Q_LOCKED_MASK | _Q_PENDING_MASK)
+
+/*
+ * By using the whole 2nd least significant byte for the pending bit, we
+ * can allow better optimization of the lock acquisition for the pending
+ * bit holder.
+ *
+ * This internal structure is also used by the set_locked function which
+ * is not restricted to _Q_PENDING_BITS == 8.
+ */
+struct __qspinlock {
+ union {
+ atomic_t val;
+#ifdef __LITTLE_ENDIAN
+ struct {
+ u8 locked;
+ u8 pending;
+ };
+ struct {
+ u16 locked_pending;
+ u16 tail;
+ };
+#else
+ struct {
+ u16 tail;
+ u16 locked_pending;
+ };
+ struct {
+ u8 reserved[2];
+ u8 pending;
+ u8 locked;
+ };
+#endif
+ };
+};
+
+#if _Q_PENDING_BITS == 8
+/**
+ * clear_pending_set_locked - take ownership and clear the pending bit.
+ * @lock: Pointer to queued spinlock structure
+ *
+ * *,1,0 -> *,0,1
+ *
+ * Lock stealing is not allowed if this function is used.
+ */
+static __always_inline void clear_pending_set_locked(struct qspinlock *lock)
+{
+ struct __qspinlock *l = (void *)lock;
+
+ WRITE_ONCE(l->locked_pending, _Q_LOCKED_VAL);
+}
+
+/*
+ * xchg_tail - Put in the new queue tail code word & retrieve previous one
+ * @lock : Pointer to queued spinlock structure
+ * @tail : The new queue tail code word
+ * Return: The previous queue tail code word
+ *
+ * xchg(lock, tail)
+ *
+ * p,*,* -> n,*,* ; prev = xchg(lock, node)
+ */
+static __always_inline u32 xchg_tail(struct qspinlock *lock, u32 tail)
+{
+ struct __qspinlock *l = (void *)lock;
+
+ return (u32)xchg(&l->tail, tail >> _Q_TAIL_OFFSET) << _Q_TAIL_OFFSET;
+}
+
+#else /* _Q_PENDING_BITS == 8 */
+
+/**
+ * clear_pending_set_locked - take ownership and clear the pending bit.
+ * @lock: Pointer to queued spinlock structure
+ *
+ * *,1,0 -> *,0,1
+ */
+static __always_inline void clear_pending_set_locked(struct qspinlock *lock)
+{
+ atomic_add(-_Q_PENDING_VAL + _Q_LOCKED_VAL, &lock->val);
+}
+
+/**
+ * xchg_tail - Put in the new queue tail code word & retrieve previous one
+ * @lock : Pointer to queued spinlock structure
+ * @tail : The new queue tail code word
+ * Return: The previous queue tail code word
+ *
+ * xchg(lock, tail)
+ *
+ * p,*,* -> n,*,* ; prev = xchg(lock, node)
+ */
+static __always_inline u32 xchg_tail(struct qspinlock *lock, u32 tail)
+{
+ u32 old, new, val = atomic_read(&lock->val);
+
+ for (;;) {
+ new = (val & _Q_LOCKED_PENDING_MASK) | tail;
+ old = atomic_cmpxchg(&lock->val, val, new);
+ if (old == val)
+ break;
+
+ val = old;
+ }
+ return old;
+}
+#endif /* _Q_PENDING_BITS == 8 */
+
+/**
+ * set_locked - Set the lock bit and own the lock
+ * @lock: Pointer to queued spinlock structure
+ *
+ * *,*,0 -> *,0,1
+ */
+static __always_inline void set_locked(struct qspinlock *lock)
+{
+ struct __qspinlock *l = (void *)lock;
+
+ WRITE_ONCE(l->locked, _Q_LOCKED_VAL);
+}
+
+
+/*
+ * Generate the native code for queued_spin_unlock_slowpath(); provide NOPs for
+ * all the PV callbacks.
+ */
+
+static __always_inline void __pv_init_node(struct mcs_spinlock *node) { }
+static __always_inline void __pv_wait_node(struct mcs_spinlock *node) { }
+static __always_inline void __pv_kick_node(struct qspinlock *lock,
+ struct mcs_spinlock *node) { }
+static __always_inline void __pv_wait_head(struct qspinlock *lock,
+ struct mcs_spinlock *node) { }
+
+#define pv_enabled() false
+
+#define pv_init_node __pv_init_node
+#define pv_wait_node __pv_wait_node
+#define pv_kick_node __pv_kick_node
+#define pv_wait_head __pv_wait_head
+
+#ifdef CONFIG_PARAVIRT_SPINLOCKS
+#define queued_spin_lock_slowpath native_queued_spin_lock_slowpath
+#endif
+
+#endif /* _GEN_PV_LOCK_SLOWPATH */
+
+/**
+ * queued_spin_lock_slowpath - acquire the queued spinlock
+ * @lock: Pointer to queued spinlock structure
+ * @val: Current value of the queued spinlock 32-bit word
+ *
+ * (queue tail, pending bit, lock value)
+ *
+ * fast : slow : unlock
+ * : :
+ * uncontended (0,0,0) -:--> (0,0,1) ------------------------------:--> (*,*,0)
+ * : | ^--------.------. / :
+ * : v \ \ | :
+ * pending : (0,1,1) +--> (0,1,0) \ | :
+ * : | ^--' | | :
+ * : v | | :
+ * uncontended : (n,x,y) +--> (n,0,0) --' | :
+ * queue : | ^--' | :
+ * : v | :
+ * contended : (*,x,y) +--> (*,0,0) ---> (*,0,1) -' :
+ * queue : ^--' :
+ */
+void queued_spin_lock_slowpath(struct qspinlock *lock, u32 val)
+{
+ struct mcs_spinlock *prev, *next, *node;
+ u32 new, old, tail;
+ int idx;
+
+ BUILD_BUG_ON(CONFIG_NR_CPUS >= (1U << _Q_TAIL_CPU_BITS));
+
+ if (pv_enabled())
+ goto queue;
+
+ if (virt_queued_spin_lock(lock))
+ return;
+
+ /*
+ * wait for in-progress pending->locked hand-overs
+ *
+ * 0,1,0 -> 0,0,1
+ */
+ if (val == _Q_PENDING_VAL) {
+ while ((val = atomic_read(&lock->val)) == _Q_PENDING_VAL)
+ cpu_relax();
+ }
+
+ /*
+ * trylock || pending
+ *
+ * 0,0,0 -> 0,0,1 ; trylock
+ * 0,0,1 -> 0,1,1 ; pending
+ */
+ for (;;) {
+ /*
+ * If we observe any contention; queue.
+ */
+ if (val & ~_Q_LOCKED_MASK)
+ goto queue;
+
+ new = _Q_LOCKED_VAL;
+ if (val == new)
+ new |= _Q_PENDING_VAL;
+
+ old = atomic_cmpxchg(&lock->val, val, new);
+ if (old == val)
+ break;
+
+ val = old;
+ }
+
+ /*
+ * we won the trylock
+ */
+ if (new == _Q_LOCKED_VAL)
+ return;
+
+ /*
+ * we're pending, wait for the owner to go away.
+ *
+ * *,1,1 -> *,1,0
+ *
+ * this wait loop must be a load-acquire such that we match the
+ * store-release that clears the locked bit and create lock
+ * sequentiality; this is because not all clear_pending_set_locked()
+ * implementations imply full barriers.
+ */
+ while ((val = smp_load_acquire(&lock->val.counter)) & _Q_LOCKED_MASK)
+ cpu_relax();
+
+ /*
+ * take ownership and clear the pending bit.
+ *
+ * *,1,0 -> *,0,1
+ */
+ clear_pending_set_locked(lock);
+ return;
+
+ /*
+ * End of pending bit optimistic spinning and beginning of MCS
+ * queuing.
+ */
+queue:
+ node = this_cpu_ptr(&mcs_nodes[0]);
+ idx = node->count++;
+ tail = encode_tail(smp_processor_id(), idx);
+
+ node += idx;
+ node->locked = 0;
+ node->next = NULL;
+ pv_init_node(node);
+
+ /*
+ * We touched a (possibly) cold cacheline in the per-cpu queue node;
+ * attempt the trylock once more in the hope someone let go while we
+ * weren't watching.
+ */
+ if (queued_spin_trylock(lock))
+ goto release;
+
+ /*
+ * We have already touched the queueing cacheline; don't bother with
+ * pending stuff.
+ *
+ * p,*,* -> n,*,*
+ */
+ old = xchg_tail(lock, tail);
+
+ /*
+ * if there was a previous node; link it and wait until reaching the
+ * head of the waitqueue.
+ */
+ if (old & _Q_TAIL_MASK) {
+ prev = decode_tail(old);
+ WRITE_ONCE(prev->next, node);
+
+ pv_wait_node(node);
+ arch_mcs_spin_lock_contended(&node->locked);
+ }
+
+ /*
+ * we're at the head of the waitqueue, wait for the owner & pending to
+ * go away.
+ *
+ * *,x,y -> *,0,0
+ *
+ * this wait loop must use a load-acquire such that we match the
+ * store-release that clears the locked bit and create lock
+ * sequentiality; this is because the set_locked() function below
+ * does not imply a full barrier.
+ *
+ */
+ pv_wait_head(lock, node);
+ while ((val = smp_load_acquire(&lock->val.counter)) & _Q_LOCKED_PENDING_MASK)
+ cpu_relax();
+
+ /*
+ * claim the lock:
+ *
+ * n,0,0 -> 0,0,1 : lock, uncontended
+ * *,0,0 -> *,0,1 : lock, contended
+ *
+ * If the queue head is the only one in the queue (lock value == tail),
+ * clear the tail code and grab the lock. Otherwise, we only need
+ * to grab the lock.
+ */
+ for (;;) {
+ if (val != tail) {
+ set_locked(lock);
+ break;
+ }
+ old = atomic_cmpxchg(&lock->val, val, _Q_LOCKED_VAL);
+ if (old == val)
+ goto release; /* No contention */
+
+ val = old;
+ }
+
+ /*
+ * contended path; wait for next, release.
+ */
+ while (!(next = READ_ONCE(node->next)))
+ cpu_relax();
+
+ arch_mcs_spin_unlock_contended(&next->locked);
+ pv_kick_node(lock, next);
+
+release:
+ /*
+ * release the node
+ */
+ this_cpu_dec(mcs_nodes[0].count);
+}
+EXPORT_SYMBOL(queued_spin_lock_slowpath);
+
+/*
+ * Generate the paravirt code for queued_spin_unlock_slowpath().
+ */
+#if !defined(_GEN_PV_LOCK_SLOWPATH) && defined(CONFIG_PARAVIRT_SPINLOCKS)
+#define _GEN_PV_LOCK_SLOWPATH
+
+#undef pv_enabled
+#define pv_enabled() true
+
+#undef pv_init_node
+#undef pv_wait_node
+#undef pv_kick_node
+#undef pv_wait_head
+
+#undef queued_spin_lock_slowpath
+#define queued_spin_lock_slowpath __pv_queued_spin_lock_slowpath
+
+#include "qspinlock_paravirt.h"
+#include "qspinlock.c"
+
+#endif
diff --git a/kernel/locking/qspinlock_paravirt.h b/kernel/locking/qspinlock_paravirt.h
new file mode 100644
index 000000000000..c8e6e9a596f5
--- /dev/null
+++ b/kernel/locking/qspinlock_paravirt.h
@@ -0,0 +1,378 @@
+#ifndef _GEN_PV_LOCK_SLOWPATH
+#error "do not include this file"
+#endif
+
+#include <linux/hash.h>
+#include <linux/bootmem.h>
+#include <linux/debug_locks.h>
+
+/*
+ * Implement paravirt qspinlocks; the general idea is to halt the vcpus instead
+ * of spinning them.
+ *
+ * This relies on the architecture to provide two paravirt hypercalls:
+ *
+ * pv_wait(u8 *ptr, u8 val) -- suspends the vcpu if *ptr == val
+ * pv_kick(cpu) -- wakes a suspended vcpu
+ *
+ * Using these we implement __pv_queued_spin_lock_slowpath() and
+ * __pv_queued_spin_unlock() to replace native_queued_spin_lock_slowpath() and
+ * native_queued_spin_unlock().
+ */
+
+#define _Q_SLOW_VAL (3U << _Q_LOCKED_OFFSET)
+
+/*
+ * Queue node uses: vcpu_running & vcpu_halted.
+ * Queue head uses: vcpu_running & vcpu_hashed.
+ */
+enum vcpu_state {
+ vcpu_running = 0,
+ vcpu_halted, /* Used only in pv_wait_node */
+ vcpu_hashed, /* = pv_hash'ed + vcpu_halted */
+};
+
+struct pv_node {
+ struct mcs_spinlock mcs;
+ struct mcs_spinlock __res[3];
+
+ int cpu;
+ u8 state;
+};
+
+/*
+ * Lock and MCS node addresses hash table for fast lookup
+ *
+ * Hashing is done on a per-cacheline basis to minimize the need to access
+ * more than one cacheline.
+ *
+ * Dynamically allocate a hash table big enough to hold at least 4X the
+ * number of possible cpus in the system. Allocation is done on page
+ * granularity. So the minimum number of hash buckets should be at least
+ * 256 (64-bit) or 512 (32-bit) to fully utilize a 4k page.
+ *
+ * Since we should not be holding locks from NMI context (very rare indeed) the
+ * max load factor is 0.75, which is around the point where open addressing
+ * breaks down.
+ *
+ */
+struct pv_hash_entry {
+ struct qspinlock *lock;
+ struct pv_node *node;
+};
+
+#define PV_HE_PER_LINE (SMP_CACHE_BYTES / sizeof(struct pv_hash_entry))
+#define PV_HE_MIN (PAGE_SIZE / sizeof(struct pv_hash_entry))
+
+static struct pv_hash_entry *pv_lock_hash;
+static unsigned int pv_lock_hash_bits __read_mostly;
+
+/*
+ * Allocate memory for the PV qspinlock hash buckets
+ *
+ * This function should be called from the paravirt spinlock initialization
+ * routine.
+ */
+void __init __pv_init_lock_hash(void)
+{
+ int pv_hash_size = ALIGN(4 * num_possible_cpus(), PV_HE_PER_LINE);
+
+ if (pv_hash_size < PV_HE_MIN)
+ pv_hash_size = PV_HE_MIN;
+
+ /*
+ * Allocate space from bootmem which should be page-size aligned
+ * and hence cacheline aligned.
+ */
+ pv_lock_hash = alloc_large_system_hash("PV qspinlock",
+ sizeof(struct pv_hash_entry),
+ pv_hash_size, 0, HASH_EARLY,
+ &pv_lock_hash_bits, NULL,
+ pv_hash_size, pv_hash_size);
+}
+
+#define for_each_hash_entry(he, offset, hash) \
+ for (hash &= ~(PV_HE_PER_LINE - 1), he = &pv_lock_hash[hash], offset = 0; \
+ offset < (1 << pv_lock_hash_bits); \
+ offset++, he = &pv_lock_hash[(hash + offset) & ((1 << pv_lock_hash_bits) - 1)])
+
+static struct qspinlock **pv_hash(struct qspinlock *lock, struct pv_node *node)
+{
+ unsigned long offset, hash = hash_ptr(lock, pv_lock_hash_bits);
+ struct pv_hash_entry *he;
+
+ for_each_hash_entry(he, offset, hash) {
+ if (!cmpxchg(&he->lock, NULL, lock)) {
+ WRITE_ONCE(he->node, node);
+ return &he->lock;
+ }
+ }
+ /*
+ * Hard assume there is a free entry for us.
+ *
+ * This is guaranteed by ensuring every blocked lock only ever consumes
+ * a single entry, and since we only have 4 nesting levels per CPU
+ * and allocated 4*nr_possible_cpus(), this must be so.
+ *
+ * The single entry is guaranteed by having the lock owner unhash
+ * before it releases.
+ */
+ BUG();
+}
+
+static struct pv_node *pv_unhash(struct qspinlock *lock)
+{
+ unsigned long offset, hash = hash_ptr(lock, pv_lock_hash_bits);
+ struct pv_hash_entry *he;
+ struct pv_node *node;
+
+ for_each_hash_entry(he, offset, hash) {
+ if (READ_ONCE(he->lock) == lock) {
+ node = READ_ONCE(he->node);
+ WRITE_ONCE(he->lock, NULL);
+ return node;
+ }
+ }
+ /*
+ * Hard assume we'll find an entry.
+ *
+ * This guarantees a limited lookup time and is itself guaranteed by
+ * having the lock owner do the unhash -- IFF the unlock sees the
+ * SLOW flag, there MUST be a hash entry.
+ */
+ BUG();
+}
+
+/*
+ * Initialize the PV part of the mcs_spinlock node.
+ */
+static void pv_init_node(struct mcs_spinlock *node)
+{
+ struct pv_node *pn = (struct pv_node *)node;
+
+ BUILD_BUG_ON(sizeof(struct pv_node) > 5*sizeof(struct mcs_spinlock));
+
+ pn->cpu = smp_processor_id();
+ pn->state = vcpu_running;
+}
+
+/*
+ * Wait for node->locked to become true, halt the vcpu after a short spin.
+ * pv_kick_node() is used to set _Q_SLOW_VAL and fill in hash table on its
+ * behalf.
+ */
+static void pv_wait_node(struct mcs_spinlock *node)
+{
+ struct pv_node *pn = (struct pv_node *)node;
+ int loop;
+
+ for (;;) {
+ for (loop = SPIN_THRESHOLD; loop; loop--) {
+ if (READ_ONCE(node->locked))
+ return;
+ cpu_relax();
+ }
+
+ /*
+ * Order pn->state vs pn->locked thusly:
+ *
+ * [S] pn->state = vcpu_halted [S] next->locked = 1
+ * MB MB
+ * [L] pn->locked [RmW] pn->state = vcpu_hashed
+ *
+ * Matches the cmpxchg() from pv_kick_node().
+ */
+ smp_store_mb(pn->state, vcpu_halted);
+
+ if (!READ_ONCE(node->locked))
+ pv_wait(&pn->state, vcpu_halted);
+
+ /*
+ * If pv_kick_node() changed us to vcpu_hashed, retain that value
+ * so that pv_wait_head() knows to not also try to hash this lock.
+ */
+ cmpxchg(&pn->state, vcpu_halted, vcpu_running);
+
+ /*
+ * If the locked flag is still not set after wakeup, it is a
+ * spurious wakeup and the vCPU should wait again. However,
+ * there is a pretty high overhead for CPU halting and kicking.
+ * So it is better to spin for a while in the hope that the
+ * MCS lock will be released soon.
+ */
+ }
+
+ /*
+ * By now our node->locked should be 1 and our caller will not actually
+ * spin-wait for it. We do however rely on our caller to do a
+ * load-acquire for us.
+ */
+}
+
+/*
+ * Called after setting next->locked = 1 when we're the lock owner.
+ *
+ * Instead of waking the waiters stuck in pv_wait_node() advance their state such
+ * that they're waiting in pv_wait_head(), this avoids a wake/sleep cycle.
+ */
+static void pv_kick_node(struct qspinlock *lock, struct mcs_spinlock *node)
+{
+ struct pv_node *pn = (struct pv_node *)node;
+ struct __qspinlock *l = (void *)lock;
+
+ /*
+ * If the vCPU is indeed halted, advance its state to match that of
+ * pv_wait_node(). If OTOH this fails, the vCPU was running and will
+ * observe its next->locked value and advance itself.
+ *
+ * Matches with smp_store_mb() and cmpxchg() in pv_wait_node()
+ */
+ if (cmpxchg(&pn->state, vcpu_halted, vcpu_hashed) != vcpu_halted)
+ return;
+
+ /*
+ * Put the lock into the hash table and set the _Q_SLOW_VAL.
+ *
+ * As this is the same vCPU that will check the _Q_SLOW_VAL value and
+ * the hash table later on at unlock time, no atomic instruction is
+ * needed.
+ */
+ WRITE_ONCE(l->locked, _Q_SLOW_VAL);
+ (void)pv_hash(lock, pn);
+}
+
+/*
+ * Wait for l->locked to become clear; halt the vcpu after a short spin.
+ * __pv_queued_spin_unlock() will wake us.
+ */
+static void pv_wait_head(struct qspinlock *lock, struct mcs_spinlock *node)
+{
+ struct pv_node *pn = (struct pv_node *)node;
+ struct __qspinlock *l = (void *)lock;
+ struct qspinlock **lp = NULL;
+ int loop;
+
+ /*
+ * If pv_kick_node() already advanced our state, we don't need to
+ * insert ourselves into the hash table anymore.
+ */
+ if (READ_ONCE(pn->state) == vcpu_hashed)
+ lp = (struct qspinlock **)1;
+
+ for (;;) {
+ for (loop = SPIN_THRESHOLD; loop; loop--) {
+ if (!READ_ONCE(l->locked))
+ return;
+ cpu_relax();
+ }
+
+ if (!lp) { /* ONCE */
+ WRITE_ONCE(pn->state, vcpu_hashed);
+ lp = pv_hash(lock, pn);
+
+ /*
+ * We must hash before setting _Q_SLOW_VAL, such that
+ * when we observe _Q_SLOW_VAL in __pv_queued_spin_unlock()
+ * we'll be sure to be able to observe our hash entry.
+ *
+ * [S] pn->state
+ * [S] <hash> [Rmw] l->locked == _Q_SLOW_VAL
+ * MB RMB
+ * [RmW] l->locked = _Q_SLOW_VAL [L] <unhash>
+ * [L] pn->state
+ *
+ * Matches the smp_rmb() in __pv_queued_spin_unlock().
+ */
+ if (!cmpxchg(&l->locked, _Q_LOCKED_VAL, _Q_SLOW_VAL)) {
+ /*
+ * The lock is free and _Q_SLOW_VAL has never
+ * been set. Therefore we need to unhash before
+ * getting the lock.
+ */
+ WRITE_ONCE(*lp, NULL);
+ return;
+ }
+ }
+ pv_wait(&l->locked, _Q_SLOW_VAL);
+
+ /*
+ * The unlocker should have freed the lock before kicking the
+ * CPU. So if the lock is still not free, it is a spurious
+ * wakeup and so the vCPU should wait again after spinning for
+ * a while.
+ */
+ }
+
+ /*
+ * Lock is unlocked now; the caller will acquire it without waiting.
+ * As with pv_wait_node() we rely on the caller to do a load-acquire
+ * for us.
+ */
+}
+
+/*
+ * PV version of the unlock function to be used in stead of
+ * queued_spin_unlock().
+ */
+__visible void __pv_queued_spin_unlock(struct qspinlock *lock)
+{
+ struct __qspinlock *l = (void *)lock;
+ struct pv_node *node;
+ u8 locked;
+
+ /*
+ * We must not unlock if SLOW, because in that case we must first
+ * unhash. Otherwise it would be possible to have multiple @lock
+ * entries, which would be BAD.
+ */
+ locked = cmpxchg(&l->locked, _Q_LOCKED_VAL, 0);
+ if (likely(locked == _Q_LOCKED_VAL))
+ return;
+
+ if (unlikely(locked != _Q_SLOW_VAL)) {
+ WARN(!debug_locks_silent,
+ "pvqspinlock: lock 0x%lx has corrupted value 0x%x!\n",
+ (unsigned long)lock, atomic_read(&lock->val));
+ return;
+ }
+
+ /*
+ * A failed cmpxchg doesn't provide any memory-ordering guarantees,
+ * so we need a barrier to order the read of the node data in
+ * pv_unhash *after* we've read the lock being _Q_SLOW_VAL.
+ *
+ * Matches the cmpxchg() in pv_wait_head() setting _Q_SLOW_VAL.
+ */
+ smp_rmb();
+
+ /*
+ * Since the above failed to release, this must be the SLOW path.
+ * Therefore start by looking up the blocked node and unhashing it.
+ */
+ node = pv_unhash(lock);
+
+ /*
+ * Now that we have a reference to the (likely) blocked pv_node,
+ * release the lock.
+ */
+ smp_store_release(&l->locked, 0);
+
+ /*
+ * At this point the memory pointed at by lock can be freed/reused,
+ * however we can still use the pv_node to kick the CPU.
+ * The other vCPU may not really be halted, but kicking an active
+ * vCPU is harmless other than the additional latency in completing
+ * the unlock.
+ */
+ if (READ_ONCE(node->state) == vcpu_hashed)
+ pv_kick(node->cpu);
+}
+/*
+ * Include the architecture specific callee-save thunk of the
+ * __pv_queued_spin_unlock(). This thunk is put together with
+ * __pv_queued_spin_unlock() near the top of the file to make sure
+ * that the callee-save thunk and the real unlock function are close
+ * to each other sharing consecutive instruction cachelines.
+ */
+#include <asm/qspinlock_paravirt.h>
+
diff --git a/kernel/locking/rtmutex-tester.c b/kernel/locking/rtmutex-tester.c
deleted file mode 100644
index 1d96dd0d93c1..000000000000
--- a/kernel/locking/rtmutex-tester.c
+++ /dev/null
@@ -1,420 +0,0 @@
-/*
- * RT-Mutex-tester: scriptable tester for rt mutexes
- *
- * started by Thomas Gleixner:
- *
- * Copyright (C) 2006, Timesys Corp., Thomas Gleixner <tglx@timesys.com>
- *
- */
-#include <linux/device.h>
-#include <linux/kthread.h>
-#include <linux/export.h>
-#include <linux/sched.h>
-#include <linux/sched/rt.h>
-#include <linux/spinlock.h>
-#include <linux/timer.h>
-#include <linux/freezer.h>
-#include <linux/stat.h>
-
-#include "rtmutex.h"
-
-#define MAX_RT_TEST_THREADS 8
-#define MAX_RT_TEST_MUTEXES 8
-
-static spinlock_t rttest_lock;
-static atomic_t rttest_event;
-
-struct test_thread_data {
- int opcode;
- int opdata;
- int mutexes[MAX_RT_TEST_MUTEXES];
- int event;
- struct device dev;
-};
-
-static struct test_thread_data thread_data[MAX_RT_TEST_THREADS];
-static struct task_struct *threads[MAX_RT_TEST_THREADS];
-static struct rt_mutex mutexes[MAX_RT_TEST_MUTEXES];
-
-enum test_opcodes {
- RTTEST_NOP = 0,
- RTTEST_SCHEDOT, /* 1 Sched other, data = nice */
- RTTEST_SCHEDRT, /* 2 Sched fifo, data = prio */
- RTTEST_LOCK, /* 3 Lock uninterruptible, data = lockindex */
- RTTEST_LOCKNOWAIT, /* 4 Lock uninterruptible no wait in wakeup, data = lockindex */
- RTTEST_LOCKINT, /* 5 Lock interruptible, data = lockindex */
- RTTEST_LOCKINTNOWAIT, /* 6 Lock interruptible no wait in wakeup, data = lockindex */
- RTTEST_LOCKCONT, /* 7 Continue locking after the wakeup delay */
- RTTEST_UNLOCK, /* 8 Unlock, data = lockindex */
- /* 9, 10 - reserved for BKL commemoration */
- RTTEST_SIGNAL = 11, /* 11 Signal other test thread, data = thread id */
- RTTEST_RESETEVENT = 98, /* 98 Reset event counter */
- RTTEST_RESET = 99, /* 99 Reset all pending operations */
-};
-
-static int handle_op(struct test_thread_data *td, int lockwakeup)
-{
- int i, id, ret = -EINVAL;
-
- switch(td->opcode) {
-
- case RTTEST_NOP:
- return 0;
-
- case RTTEST_LOCKCONT:
- td->mutexes[td->opdata] = 1;
- td->event = atomic_add_return(1, &rttest_event);
- return 0;
-
- case RTTEST_RESET:
- for (i = 0; i < MAX_RT_TEST_MUTEXES; i++) {
- if (td->mutexes[i] == 4) {
- rt_mutex_unlock(&mutexes[i]);
- td->mutexes[i] = 0;
- }
- }
- return 0;
-
- case RTTEST_RESETEVENT:
- atomic_set(&rttest_event, 0);
- return 0;
-
- default:
- if (lockwakeup)
- return ret;
- }
-
- switch(td->opcode) {
-
- case RTTEST_LOCK:
- case RTTEST_LOCKNOWAIT:
- id = td->opdata;
- if (id < 0 || id >= MAX_RT_TEST_MUTEXES)
- return ret;
-
- td->mutexes[id] = 1;
- td->event = atomic_add_return(1, &rttest_event);
- rt_mutex_lock(&mutexes[id]);
- td->event = atomic_add_return(1, &rttest_event);
- td->mutexes[id] = 4;
- return 0;
-
- case RTTEST_LOCKINT:
- case RTTEST_LOCKINTNOWAIT:
- id = td->opdata;
- if (id < 0 || id >= MAX_RT_TEST_MUTEXES)
- return ret;
-
- td->mutexes[id] = 1;
- td->event = atomic_add_return(1, &rttest_event);
- ret = rt_mutex_lock_interruptible(&mutexes[id], 0);
- td->event = atomic_add_return(1, &rttest_event);
- td->mutexes[id] = ret ? 0 : 4;
- return ret ? -EINTR : 0;
-
- case RTTEST_UNLOCK:
- id = td->opdata;
- if (id < 0 || id >= MAX_RT_TEST_MUTEXES || td->mutexes[id] != 4)
- return ret;
-
- td->event = atomic_add_return(1, &rttest_event);
- rt_mutex_unlock(&mutexes[id]);
- td->event = atomic_add_return(1, &rttest_event);
- td->mutexes[id] = 0;
- return 0;
-
- default:
- break;
- }
- return ret;
-}
-
-/*
- * Schedule replacement for rtsem_down(). Only called for threads with
- * PF_MUTEX_TESTER set.
- *
- * This allows us to have finegrained control over the event flow.
- *
- */
-void schedule_rt_mutex_test(struct rt_mutex *mutex)
-{
- int tid, op, dat;
- struct test_thread_data *td;
-
- /* We have to lookup the task */
- for (tid = 0; tid < MAX_RT_TEST_THREADS; tid++) {
- if (threads[tid] == current)
- break;
- }
-
- BUG_ON(tid == MAX_RT_TEST_THREADS);
-
- td = &thread_data[tid];
-
- op = td->opcode;
- dat = td->opdata;
-
- switch (op) {
- case RTTEST_LOCK:
- case RTTEST_LOCKINT:
- case RTTEST_LOCKNOWAIT:
- case RTTEST_LOCKINTNOWAIT:
- if (mutex != &mutexes[dat])
- break;
-
- if (td->mutexes[dat] != 1)
- break;
-
- td->mutexes[dat] = 2;
- td->event = atomic_add_return(1, &rttest_event);
- break;
-
- default:
- break;
- }
-
- schedule();
-
-
- switch (op) {
- case RTTEST_LOCK:
- case RTTEST_LOCKINT:
- if (mutex != &mutexes[dat])
- return;
-
- if (td->mutexes[dat] != 2)
- return;
-
- td->mutexes[dat] = 3;
- td->event = atomic_add_return(1, &rttest_event);
- break;
-
- case RTTEST_LOCKNOWAIT:
- case RTTEST_LOCKINTNOWAIT:
- if (mutex != &mutexes[dat])
- return;
-
- if (td->mutexes[dat] != 2)
- return;
-
- td->mutexes[dat] = 1;
- td->event = atomic_add_return(1, &rttest_event);
- return;
-
- default:
- return;
- }
-
- td->opcode = 0;
-
- for (;;) {
- set_current_state(TASK_INTERRUPTIBLE);
-
- if (td->opcode > 0) {
- int ret;
-
- set_current_state(TASK_RUNNING);
- ret = handle_op(td, 1);
- set_current_state(TASK_INTERRUPTIBLE);
- if (td->opcode == RTTEST_LOCKCONT)
- break;
- td->opcode = ret;
- }
-
- /* Wait for the next command to be executed */
- schedule();
- }
-
- /* Restore previous command and data */
- td->opcode = op;
- td->opdata = dat;
-}
-
-static int test_func(void *data)
-{
- struct test_thread_data *td = data;
- int ret;
-
- current->flags |= PF_MUTEX_TESTER;
- set_freezable();
- allow_signal(SIGHUP);
-
- for(;;) {
-
- set_current_state(TASK_INTERRUPTIBLE);
-
- if (td->opcode > 0) {
- set_current_state(TASK_RUNNING);
- ret = handle_op(td, 0);
- set_current_state(TASK_INTERRUPTIBLE);
- td->opcode = ret;
- }
-
- /* Wait for the next command to be executed */
- schedule();
- try_to_freeze();
-
- if (signal_pending(current))
- flush_signals(current);
-
- if(kthread_should_stop())
- break;
- }
- return 0;
-}
-
-/**
- * sysfs_test_command - interface for test commands
- * @dev: thread reference
- * @buf: command for actual step
- * @count: length of buffer
- *
- * command syntax:
- *
- * opcode:data
- */
-static ssize_t sysfs_test_command(struct device *dev, struct device_attribute *attr,
- const char *buf, size_t count)
-{
- struct sched_param schedpar;
- struct test_thread_data *td;
- char cmdbuf[32];
- int op, dat, tid, ret;
-
- td = container_of(dev, struct test_thread_data, dev);
- tid = td->dev.id;
-
- /* strings from sysfs write are not 0 terminated! */
- if (count >= sizeof(cmdbuf))
- return -EINVAL;
-
- /* strip of \n: */
- if (buf[count-1] == '\n')
- count--;
- if (count < 1)
- return -EINVAL;
-
- memcpy(cmdbuf, buf, count);
- cmdbuf[count] = 0;
-
- if (sscanf(cmdbuf, "%d:%d", &op, &dat) != 2)
- return -EINVAL;
-
- switch (op) {
- case RTTEST_SCHEDOT:
- schedpar.sched_priority = 0;
- ret = sched_setscheduler(threads[tid], SCHED_NORMAL, &schedpar);
- if (ret)
- return ret;
- set_user_nice(current, 0);
- break;
-
- case RTTEST_SCHEDRT:
- schedpar.sched_priority = dat;
- ret = sched_setscheduler(threads[tid], SCHED_FIFO, &schedpar);
- if (ret)
- return ret;
- break;
-
- case RTTEST_SIGNAL:
- send_sig(SIGHUP, threads[tid], 0);
- break;
-
- default:
- if (td->opcode > 0)
- return -EBUSY;
- td->opdata = dat;
- td->opcode = op;
- wake_up_process(threads[tid]);
- }
-
- return count;
-}
-
-/**
- * sysfs_test_status - sysfs interface for rt tester
- * @dev: thread to query
- * @buf: char buffer to be filled with thread status info
- */
-static ssize_t sysfs_test_status(struct device *dev, struct device_attribute *attr,
- char *buf)
-{
- struct test_thread_data *td;
- struct task_struct *tsk;
- char *curr = buf;
- int i;
-
- td = container_of(dev, struct test_thread_data, dev);
- tsk = threads[td->dev.id];
-
- spin_lock(&rttest_lock);
-
- curr += sprintf(curr,
- "O: %4d, E:%8d, S: 0x%08lx, P: %4d, N: %4d, B: %p, M:",
- td->opcode, td->event, tsk->state,
- (MAX_RT_PRIO - 1) - tsk->prio,
- (MAX_RT_PRIO - 1) - tsk->normal_prio,
- tsk->pi_blocked_on);
-
- for (i = MAX_RT_TEST_MUTEXES - 1; i >=0 ; i--)
- curr += sprintf(curr, "%d", td->mutexes[i]);
-
- spin_unlock(&rttest_lock);
-
- curr += sprintf(curr, ", T: %p, R: %p\n", tsk,
- mutexes[td->dev.id].owner);
-
- return curr - buf;
-}
-
-static DEVICE_ATTR(status, S_IRUSR, sysfs_test_status, NULL);
-static DEVICE_ATTR(command, S_IWUSR, NULL, sysfs_test_command);
-
-static struct bus_type rttest_subsys = {
- .name = "rttest",
- .dev_name = "rttest",
-};
-
-static int init_test_thread(int id)
-{
- thread_data[id].dev.bus = &rttest_subsys;
- thread_data[id].dev.id = id;
-
- threads[id] = kthread_run(test_func, &thread_data[id], "rt-test-%d", id);
- if (IS_ERR(threads[id]))
- return PTR_ERR(threads[id]);
-
- return device_register(&thread_data[id].dev);
-}
-
-static int init_rttest(void)
-{
- int ret, i;
-
- spin_lock_init(&rttest_lock);
-
- for (i = 0; i < MAX_RT_TEST_MUTEXES; i++)
- rt_mutex_init(&mutexes[i]);
-
- ret = subsys_system_register(&rttest_subsys, NULL);
- if (ret)
- return ret;
-
- for (i = 0; i < MAX_RT_TEST_THREADS; i++) {
- ret = init_test_thread(i);
- if (ret)
- break;
- ret = device_create_file(&thread_data[i].dev, &dev_attr_status);
- if (ret)
- break;
- ret = device_create_file(&thread_data[i].dev, &dev_attr_command);
- if (ret)
- break;
- }
-
- printk("Initializing RT-Tester: %s\n", ret ? "Failed" : "OK" );
-
- return ret;
-}
-
-device_initcall(init_rttest);
diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c
index 6357265a31ad..7781d801212f 100644
--- a/kernel/locking/rtmutex.c
+++ b/kernel/locking/rtmutex.c
@@ -70,10 +70,10 @@ static void fixup_rt_mutex_waiters(struct rt_mutex *lock)
}
/*
- * We can speed up the acquire/release, if the architecture
- * supports cmpxchg and if there's no debugging state to be set up
+ * We can speed up the acquire/release, if there's no debugging state to be
+ * set up.
*/
-#if defined(__HAVE_ARCH_CMPXCHG) && !defined(CONFIG_DEBUG_RT_MUTEXES)
+#ifndef CONFIG_DEBUG_RT_MUTEXES
# define rt_mutex_cmpxchg(l,c,n) (cmpxchg(&l->owner, c, n) == c)
static inline void mark_rt_mutex_waiters(struct rt_mutex *lock)
{
@@ -265,15 +265,17 @@ struct task_struct *rt_mutex_get_top_task(struct task_struct *task)
}
/*
- * Called by sched_setscheduler() to check whether the priority change
- * is overruled by a possible priority boosting.
+ * Called by sched_setscheduler() to get the priority which will be
+ * effective after the change.
*/
-int rt_mutex_check_prio(struct task_struct *task, int newprio)
+int rt_mutex_get_effective_prio(struct task_struct *task, int newprio)
{
if (!task_has_pi_waiters(task))
- return 0;
+ return newprio;
- return task_top_pi_waiter(task)->task->prio <= newprio;
+ if (task_top_pi_waiter(task)->task->prio <= newprio)
+ return task_top_pi_waiter(task)->task->prio;
+ return newprio;
}
/*
@@ -298,7 +300,7 @@ static void __rt_mutex_adjust_prio(struct task_struct *task)
* of task. We do not use the spin_xx_mutex() variants here as we are
* outside of the debug path.)
*/
-static void rt_mutex_adjust_prio(struct task_struct *task)
+void rt_mutex_adjust_prio(struct task_struct *task)
{
unsigned long flags;
@@ -349,7 +351,7 @@ static inline struct rt_mutex *task_blocked_on_lock(struct task_struct *p)
*
* @task: the task owning the mutex (owner) for which a chain walk is
* probably needed
- * @deadlock_detect: do we have to carry out deadlock detection?
+ * @chwalk: do we have to carry out deadlock detection?
* @orig_lock: the mutex (can be NULL if we are walking the chain to recheck
* things for a task that has just got its priority adjusted, and
* is waiting on a mutex)
@@ -622,7 +624,7 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
*/
prerequeue_top_waiter = rt_mutex_top_waiter(lock);
- /* [7] Requeue the waiter in the lock waiter list. */
+ /* [7] Requeue the waiter in the lock waiter tree. */
rt_mutex_dequeue(lock, waiter);
waiter->prio = task->prio;
rt_mutex_enqueue(lock, waiter);
@@ -660,7 +662,7 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
/*
* The waiter became the new top (highest priority)
* waiter on the lock. Replace the previous top waiter
- * in the owner tasks pi waiters list with this waiter
+ * in the owner tasks pi waiters tree with this waiter
* and adjust the priority of the owner.
*/
rt_mutex_dequeue_pi(task, prerequeue_top_waiter);
@@ -671,7 +673,7 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
/*
* The waiter was the top waiter on the lock, but is
* no longer the top prority waiter. Replace waiter in
- * the owner tasks pi waiters list with the new top
+ * the owner tasks pi waiters tree with the new top
* (highest priority) waiter and adjust the priority
* of the owner.
* The new top waiter is stored in @waiter so that
@@ -745,7 +747,7 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
*
* @lock: The lock to be acquired.
* @task: The task which wants to acquire the lock
- * @waiter: The waiter that is queued to the lock's wait list if the
+ * @waiter: The waiter that is queued to the lock's wait tree if the
* callsite called task_blocked_on_lock(), otherwise NULL
*/
static int try_to_take_rt_mutex(struct rt_mutex *lock, struct task_struct *task,
@@ -780,7 +782,7 @@ static int try_to_take_rt_mutex(struct rt_mutex *lock, struct task_struct *task,
/*
* If @waiter != NULL, @task has already enqueued the waiter
- * into @lock waiter list. If @waiter == NULL then this is a
+ * into @lock waiter tree. If @waiter == NULL then this is a
* trylock attempt.
*/
if (waiter) {
@@ -793,7 +795,7 @@ static int try_to_take_rt_mutex(struct rt_mutex *lock, struct task_struct *task,
/*
* We can acquire the lock. Remove the waiter from the
- * lock waiters list.
+ * lock waiters tree.
*/
rt_mutex_dequeue(lock, waiter);
@@ -825,7 +827,7 @@ static int try_to_take_rt_mutex(struct rt_mutex *lock, struct task_struct *task,
* No waiters. Take the lock without the
* pi_lock dance.@task->pi_blocked_on is NULL
* and we have no waiters to enqueue in @task
- * pi waiters list.
+ * pi waiters tree.
*/
goto takeit;
}
@@ -842,7 +844,7 @@ static int try_to_take_rt_mutex(struct rt_mutex *lock, struct task_struct *task,
/*
* Finish the lock acquisition. @task is the new owner. If
* other waiters exist we have to insert the highest priority
- * waiter into @task->pi_waiters list.
+ * waiter into @task->pi_waiters tree.
*/
if (rt_mutex_has_waiters(lock))
rt_mutex_enqueue_pi(task, rt_mutex_top_waiter(lock));
@@ -953,14 +955,13 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
}
/*
- * Wake up the next waiter on the lock.
- *
- * Remove the top waiter from the current tasks pi waiter list and
- * wake it up.
+ * Remove the top waiter from the current tasks pi waiter tree and
+ * queue it up.
*
* Called with lock->wait_lock held.
*/
-static void wakeup_next_waiter(struct rt_mutex *lock)
+static void mark_wakeup_next_waiter(struct wake_q_head *wake_q,
+ struct rt_mutex *lock)
{
struct rt_mutex_waiter *waiter;
unsigned long flags;
@@ -989,12 +990,7 @@ static void wakeup_next_waiter(struct rt_mutex *lock)
raw_spin_unlock_irqrestore(&current->pi_lock, flags);
- /*
- * It's safe to dereference waiter as it cannot go away as
- * long as we hold lock->wait_lock. The waiter task needs to
- * acquire it in order to dequeue the waiter.
- */
- wake_up_process(waiter->task);
+ wake_q_add(wake_q, waiter->task);
}
/*
@@ -1124,7 +1120,7 @@ __rt_mutex_slowlock(struct rt_mutex *lock, int state,
debug_rt_mutex_print_deadlock(waiter);
- schedule_rt_mutex(lock);
+ schedule();
raw_spin_lock(&lock->wait_lock);
set_current_state(state);
@@ -1180,11 +1176,8 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state,
set_current_state(state);
/* Setup the timer, when timeout != NULL */
- if (unlikely(timeout)) {
+ if (unlikely(timeout))
hrtimer_start_expires(&timeout->timer, HRTIMER_MODE_ABS);
- if (!hrtimer_active(&timeout->timer))
- timeout->task = NULL;
- }
ret = task_blocks_on_rt_mutex(lock, &waiter, current, chwalk);
@@ -1251,10 +1244,11 @@ static inline int rt_mutex_slowtrylock(struct rt_mutex *lock)
}
/*
- * Slow path to release a rt-mutex:
+ * Slow path to release a rt-mutex.
+ * Return whether the current task needs to undo a potential priority boosting.
*/
-static void __sched
-rt_mutex_slowunlock(struct rt_mutex *lock)
+static bool __sched rt_mutex_slowunlock(struct rt_mutex *lock,
+ struct wake_q_head *wake_q)
{
raw_spin_lock(&lock->wait_lock);
@@ -1296,7 +1290,7 @@ rt_mutex_slowunlock(struct rt_mutex *lock)
while (!rt_mutex_has_waiters(lock)) {
/* Drops lock->wait_lock ! */
if (unlock_rt_mutex_safe(lock) == true)
- return;
+ return false;
/* Relock the rtmutex and try again */
raw_spin_lock(&lock->wait_lock);
}
@@ -1304,13 +1298,15 @@ rt_mutex_slowunlock(struct rt_mutex *lock)
/*
* The wakeup next waiter path does not suffer from the above
* race. See the comments there.
+ *
+ * Queue the next waiter for wakeup once we release the wait_lock.
*/
- wakeup_next_waiter(lock);
+ mark_wakeup_next_waiter(wake_q, lock);
raw_spin_unlock(&lock->wait_lock);
- /* Undo pi boosting if necessary: */
- rt_mutex_adjust_prio(current);
+ /* check PI boosting */
+ return true;
}
/*
@@ -1361,12 +1357,23 @@ rt_mutex_fasttrylock(struct rt_mutex *lock,
static inline void
rt_mutex_fastunlock(struct rt_mutex *lock,
- void (*slowfn)(struct rt_mutex *lock))
+ bool (*slowfn)(struct rt_mutex *lock,
+ struct wake_q_head *wqh))
{
- if (likely(rt_mutex_cmpxchg(lock, current, NULL)))
+ WAKE_Q(wake_q);
+
+ if (likely(rt_mutex_cmpxchg(lock, current, NULL))) {
rt_mutex_deadlock_account_unlock(current);
- else
- slowfn(lock);
+
+ } else {
+ bool deboost = slowfn(lock, &wake_q);
+
+ wake_up_q(&wake_q);
+
+ /* Undo pi boosting if necessary: */
+ if (deboost)
+ rt_mutex_adjust_prio(current);
+ }
}
/**
@@ -1441,10 +1448,17 @@ EXPORT_SYMBOL_GPL(rt_mutex_timed_lock);
*
* @lock: the rt_mutex to be locked
*
+ * This function can only be called in thread context. It's safe to
+ * call it from atomic regions, but not from hard interrupt or soft
+ * interrupt context.
+ *
* Returns 1 on success and 0 on contention
*/
int __sched rt_mutex_trylock(struct rt_mutex *lock)
{
+ if (WARN_ON(in_irq() || in_nmi() || in_serving_softirq()))
+ return 0;
+
return rt_mutex_fasttrylock(lock, rt_mutex_slowtrylock);
}
EXPORT_SYMBOL_GPL(rt_mutex_trylock);
@@ -1461,6 +1475,23 @@ void __sched rt_mutex_unlock(struct rt_mutex *lock)
EXPORT_SYMBOL_GPL(rt_mutex_unlock);
/**
+ * rt_mutex_futex_unlock - Futex variant of rt_mutex_unlock
+ * @lock: the rt_mutex to be unlocked
+ *
+ * Returns: true/false indicating whether priority adjustment is
+ * required or not.
+ */
+bool __sched rt_mutex_futex_unlock(struct rt_mutex *lock,
+ struct wake_q_head *wqh)
+{
+ if (likely(rt_mutex_cmpxchg(lock, current, NULL))) {
+ rt_mutex_deadlock_account_unlock(current);
+ return false;
+ }
+ return rt_mutex_slowunlock(lock, wqh);
+}
+
+/**
* rt_mutex_destroy - mark a mutex unusable
* @lock: the mutex to be destroyed
*
diff --git a/kernel/locking/rtmutex_common.h b/kernel/locking/rtmutex_common.h
index 855212501407..4f5f83c7d2d3 100644
--- a/kernel/locking/rtmutex_common.h
+++ b/kernel/locking/rtmutex_common.h
@@ -15,28 +15,6 @@
#include <linux/rtmutex.h>
/*
- * The rtmutex in kernel tester is independent of rtmutex debugging. We
- * call schedule_rt_mutex_test() instead of schedule() for the tasks which
- * belong to the tester. That way we can delay the wakeup path of those
- * threads to provoke lock stealing and testing of complex boosting scenarios.
- */
-#ifdef CONFIG_RT_MUTEX_TESTER
-
-extern void schedule_rt_mutex_test(struct rt_mutex *lock);
-
-#define schedule_rt_mutex(_lock) \
- do { \
- if (!(current->flags & PF_MUTEX_TESTER)) \
- schedule(); \
- else \
- schedule_rt_mutex_test(_lock); \
- } while (0)
-
-#else
-# define schedule_rt_mutex(_lock) schedule()
-#endif
-
-/*
* This is the control structure for tasks blocked on a rt_mutex,
* which is allocated on the kernel stack on of the blocked task.
*
@@ -131,6 +109,9 @@ extern int rt_mutex_finish_proxy_lock(struct rt_mutex *lock,
struct hrtimer_sleeper *to,
struct rt_mutex_waiter *waiter);
extern int rt_mutex_timed_futex_lock(struct rt_mutex *l, struct hrtimer_sleeper *to);
+extern bool rt_mutex_futex_unlock(struct rt_mutex *lock,
+ struct wake_q_head *wqh);
+extern void rt_mutex_adjust_prio(struct task_struct *task);
#ifdef CONFIG_DEBUG_RT_MUTEXES
# include "rtmutex-debug.h"
diff --git a/kernel/locking/rwsem-spinlock.c b/kernel/locking/rwsem-spinlock.c
index 2555ae15ec14..3a5048572065 100644
--- a/kernel/locking/rwsem-spinlock.c
+++ b/kernel/locking/rwsem-spinlock.c
@@ -85,6 +85,13 @@ __rwsem_do_wake(struct rw_semaphore *sem, int wakewrite)
list_del(&waiter->list);
tsk = waiter->task;
+ /*
+ * Make sure we do not wakeup the next reader before
+ * setting the nil condition to grant the next reader;
+ * otherwise we could miss the wakeup on the other
+ * side and end up sleeping again. See the pairing
+ * in rwsem_down_read_failed().
+ */
smp_mb();
waiter->task = NULL;
wake_up_process(tsk);
diff --git a/kernel/locking/rwsem-xadd.c b/kernel/locking/rwsem-xadd.c
index 2f7cc4076f50..0f189714e457 100644
--- a/kernel/locking/rwsem-xadd.c
+++ b/kernel/locking/rwsem-xadd.c
@@ -14,8 +14,9 @@
#include <linux/init.h>
#include <linux/export.h>
#include <linux/sched/rt.h>
+#include <linux/osq_lock.h>
-#include "mcs_spinlock.h"
+#include "rwsem.h"
/*
* Guide to the rw_semaphore's count field for common values.
@@ -186,6 +187,13 @@ __rwsem_do_wake(struct rw_semaphore *sem, enum rwsem_wake_type wake_type)
waiter = list_entry(next, struct rwsem_waiter, list);
next = waiter->list.next;
tsk = waiter->task;
+ /*
+ * Make sure we do not wakeup the next reader before
+ * setting the nil condition to grant the next reader;
+ * otherwise we could miss the wakeup on the other
+ * side and end up sleeping again. See the pairing
+ * in rwsem_down_read_failed().
+ */
smp_mb();
waiter->task = NULL;
wake_up_process(tsk);
@@ -258,6 +266,7 @@ static inline bool rwsem_try_write_lock(long count, struct rw_semaphore *sem)
RWSEM_ACTIVE_WRITE_BIAS) == RWSEM_WAITING_BIAS) {
if (!list_is_singular(&sem->wait_list))
rwsem_atomic_update(RWSEM_WAITING_BIAS, sem);
+ rwsem_set_owner(sem);
return true;
}
@@ -270,15 +279,17 @@ static inline bool rwsem_try_write_lock(long count, struct rw_semaphore *sem)
*/
static inline bool rwsem_try_write_lock_unqueued(struct rw_semaphore *sem)
{
- long old, count = ACCESS_ONCE(sem->count);
+ long old, count = READ_ONCE(sem->count);
while (true) {
if (!(count == 0 || count == RWSEM_WAITING_BIAS))
return false;
old = cmpxchg(&sem->count, count, count + RWSEM_ACTIVE_WRITE_BIAS);
- if (old == count)
+ if (old == count) {
+ rwsem_set_owner(sem);
return true;
+ }
count = old;
}
@@ -287,60 +298,67 @@ static inline bool rwsem_try_write_lock_unqueued(struct rw_semaphore *sem)
static inline bool rwsem_can_spin_on_owner(struct rw_semaphore *sem)
{
struct task_struct *owner;
- bool on_cpu = false;
+ bool ret = true;
if (need_resched())
return false;
rcu_read_lock();
- owner = ACCESS_ONCE(sem->owner);
- if (owner)
- on_cpu = owner->on_cpu;
- rcu_read_unlock();
-
- /*
- * If sem->owner is not set, yet we have just recently entered the
- * slowpath, then there is a possibility reader(s) may have the lock.
- * To be safe, avoid spinning in these situations.
- */
- return on_cpu;
-}
-
-static inline bool owner_running(struct rw_semaphore *sem,
- struct task_struct *owner)
-{
- if (sem->owner != owner)
- return false;
-
- /*
- * Ensure we emit the owner->on_cpu, dereference _after_ checking
- * sem->owner still matches owner, if that fails, owner might
- * point to free()d memory, if it still matches, the rcu_read_lock()
- * ensures the memory stays valid.
- */
- barrier();
+ owner = READ_ONCE(sem->owner);
+ if (!owner) {
+ long count = READ_ONCE(sem->count);
+ /*
+ * If sem->owner is not set, yet we have just recently entered the
+ * slowpath with the lock being active, then there is a possibility
+ * reader(s) may have the lock. To be safe, bail spinning in these
+ * situations.
+ */
+ if (count & RWSEM_ACTIVE_MASK)
+ ret = false;
+ goto done;
+ }
- return owner->on_cpu;
+ ret = owner->on_cpu;
+done:
+ rcu_read_unlock();
+ return ret;
}
static noinline
bool rwsem_spin_on_owner(struct rw_semaphore *sem, struct task_struct *owner)
{
+ long count;
+
rcu_read_lock();
- while (owner_running(sem, owner)) {
- if (need_resched())
- break;
+ while (sem->owner == owner) {
+ /*
+ * Ensure we emit the owner->on_cpu, dereference _after_
+ * checking sem->owner still matches owner, if that fails,
+ * owner might point to free()d memory, if it still matches,
+ * the rcu_read_lock() ensures the memory stays valid.
+ */
+ barrier();
+
+ /* abort spinning when need_resched or owner is not running */
+ if (!owner->on_cpu || need_resched()) {
+ rcu_read_unlock();
+ return false;
+ }
cpu_relax_lowlatency();
}
rcu_read_unlock();
+ if (READ_ONCE(sem->owner))
+ return true; /* new owner, continue spinning */
+
/*
- * We break out the loop above on need_resched() or when the
- * owner changed, which is a sign for heavy contention. Return
- * success only when sem->owner is NULL.
+ * When the owner is not set, the lock could be free or
+ * held by readers. Check the counter to verify the
+ * state.
*/
- return sem->owner == NULL;
+ count = READ_ONCE(sem->count);
+ return (count == 0 || count == RWSEM_WAITING_BIAS);
}
static bool rwsem_optimistic_spin(struct rw_semaphore *sem)
@@ -358,7 +376,7 @@ static bool rwsem_optimistic_spin(struct rw_semaphore *sem)
goto done;
while (true) {
- owner = ACCESS_ONCE(sem->owner);
+ owner = READ_ONCE(sem->owner);
if (owner && !rwsem_spin_on_owner(sem, owner))
break;
@@ -391,11 +409,24 @@ done:
return taken;
}
+/*
+ * Return true if the rwsem has active spinner
+ */
+static inline bool rwsem_has_spinner(struct rw_semaphore *sem)
+{
+ return osq_is_locked(&sem->osq);
+}
+
#else
static bool rwsem_optimistic_spin(struct rw_semaphore *sem)
{
return false;
}
+
+static inline bool rwsem_has_spinner(struct rw_semaphore *sem)
+{
+ return false;
+}
#endif
/*
@@ -432,7 +463,7 @@ struct rw_semaphore __sched *rwsem_down_write_failed(struct rw_semaphore *sem)
/* we're now waiting on the lock, but no longer actively locking */
if (waiting) {
- count = ACCESS_ONCE(sem->count);
+ count = READ_ONCE(sem->count);
/*
* If there were already threads queued before us and there are
@@ -478,7 +509,38 @@ struct rw_semaphore *rwsem_wake(struct rw_semaphore *sem)
{
unsigned long flags;
+ /*
+ * If a spinner is present, it is not necessary to do the wakeup.
+ * Try to do wakeup only if the trylock succeeds to minimize
+ * spinlock contention which may introduce too much delay in the
+ * unlock operation.
+ *
+ * spinning writer up_write/up_read caller
+ * --------------- -----------------------
+ * [S] osq_unlock() [L] osq
+ * MB RMB
+ * [RmW] rwsem_try_write_lock() [RmW] spin_trylock(wait_lock)
+ *
+ * Here, it is important to make sure that there won't be a missed
+ * wakeup while the rwsem is free and the only spinning writer goes
+ * to sleep without taking the rwsem. Even when the spinning writer
+ * is just going to break out of the waiting loop, it will still do
+ * a trylock in rwsem_down_write_failed() before sleeping. IOW, if
+ * rwsem_has_spinner() is true, it will guarantee at least one
+ * trylock attempt on the rwsem later on.
+ */
+ if (rwsem_has_spinner(sem)) {
+ /*
+ * The smp_rmb() here is to make sure that the spinner
+ * state is consulted before reading the wait_lock.
+ */
+ smp_rmb();
+ if (!raw_spin_trylock_irqsave(&sem->wait_lock, flags))
+ return sem;
+ goto locked;
+ }
raw_spin_lock_irqsave(&sem->wait_lock, flags);
+locked:
/* do nothing if list empty */
if (!list_empty(&sem->wait_list))
diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c
index e2d3bc7f03b4..205be0ce34de 100644
--- a/kernel/locking/rwsem.c
+++ b/kernel/locking/rwsem.c
@@ -9,29 +9,9 @@
#include <linux/sched.h>
#include <linux/export.h>
#include <linux/rwsem.h>
-
#include <linux/atomic.h>
-#ifdef CONFIG_RWSEM_SPIN_ON_OWNER
-static inline void rwsem_set_owner(struct rw_semaphore *sem)
-{
- sem->owner = current;
-}
-
-static inline void rwsem_clear_owner(struct rw_semaphore *sem)
-{
- sem->owner = NULL;
-}
-
-#else
-static inline void rwsem_set_owner(struct rw_semaphore *sem)
-{
-}
-
-static inline void rwsem_clear_owner(struct rw_semaphore *sem)
-{
-}
-#endif
+#include "rwsem.h"
/*
* lock for reading
diff --git a/kernel/locking/rwsem.h b/kernel/locking/rwsem.h
new file mode 100644
index 000000000000..870ed9a5b426
--- /dev/null
+++ b/kernel/locking/rwsem.h
@@ -0,0 +1,20 @@
+#ifdef CONFIG_RWSEM_SPIN_ON_OWNER
+static inline void rwsem_set_owner(struct rw_semaphore *sem)
+{
+ sem->owner = current;
+}
+
+static inline void rwsem_clear_owner(struct rw_semaphore *sem)
+{
+ sem->owner = NULL;
+}
+
+#else
+static inline void rwsem_set_owner(struct rw_semaphore *sem)
+{
+}
+
+static inline void rwsem_clear_owner(struct rw_semaphore *sem)
+{
+}
+#endif
diff --git a/kernel/module.c b/kernel/module.c
index ec53f594e9c9..b86b7bf1be38 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -18,7 +18,7 @@
*/
#include <linux/export.h>
#include <linux/moduleloader.h>
-#include <linux/ftrace_event.h>
+#include <linux/trace_events.h>
#include <linux/init.h>
#include <linux/kallsyms.h>
#include <linux/file.h>
@@ -101,48 +101,201 @@
DEFINE_MUTEX(module_mutex);
EXPORT_SYMBOL_GPL(module_mutex);
static LIST_HEAD(modules);
-#ifdef CONFIG_KGDB_KDB
-struct list_head *kdb_modules = &modules; /* kdb needs the list of modules */
-#endif /* CONFIG_KGDB_KDB */
-#ifdef CONFIG_MODULE_SIG
-#ifdef CONFIG_MODULE_SIG_FORCE
-static bool sig_enforce = true;
-#else
-static bool sig_enforce = false;
+#ifdef CONFIG_MODULES_TREE_LOOKUP
-static int param_set_bool_enable_only(const char *val,
- const struct kernel_param *kp)
+/*
+ * Use a latched RB-tree for __module_address(); this allows us to use
+ * RCU-sched lookups of the address from any context.
+ *
+ * Because modules have two address ranges: init and core, we need two
+ * latch_tree_nodes entries. Therefore we need the back-pointer from
+ * mod_tree_node.
+ *
+ * Because init ranges are short lived we mark them unlikely and have placed
+ * them outside the critical cacheline in struct module.
+ *
+ * This is conditional on PERF_EVENTS || TRACING because those can really hit
+ * __module_address() hard by doing a lot of stack unwinding; potentially from
+ * NMI context.
+ */
+
+static __always_inline unsigned long __mod_tree_val(struct latch_tree_node *n)
{
- int err;
- bool test;
- struct kernel_param dummy_kp = *kp;
+ struct mod_tree_node *mtn = container_of(n, struct mod_tree_node, node);
+ struct module *mod = mtn->mod;
- dummy_kp.arg = &test;
+ if (unlikely(mtn == &mod->mtn_init))
+ return (unsigned long)mod->module_init;
- err = param_set_bool(val, &dummy_kp);
- if (err)
- return err;
+ return (unsigned long)mod->module_core;
+}
- /* Don't let them unset it once it's set! */
- if (!test && sig_enforce)
- return -EROFS;
+static __always_inline unsigned long __mod_tree_size(struct latch_tree_node *n)
+{
+ struct mod_tree_node *mtn = container_of(n, struct mod_tree_node, node);
+ struct module *mod = mtn->mod;
+
+ if (unlikely(mtn == &mod->mtn_init))
+ return (unsigned long)mod->init_size;
+
+ return (unsigned long)mod->core_size;
+}
+
+static __always_inline bool
+mod_tree_less(struct latch_tree_node *a, struct latch_tree_node *b)
+{
+ return __mod_tree_val(a) < __mod_tree_val(b);
+}
+
+static __always_inline int
+mod_tree_comp(void *key, struct latch_tree_node *n)
+{
+ unsigned long val = (unsigned long)key;
+ unsigned long start, end;
+
+ start = __mod_tree_val(n);
+ if (val < start)
+ return -1;
+
+ end = start + __mod_tree_size(n);
+ if (val >= end)
+ return 1;
- if (test)
- sig_enforce = true;
return 0;
}
-static const struct kernel_param_ops param_ops_bool_enable_only = {
- .flags = KERNEL_PARAM_OPS_FL_NOARG,
- .set = param_set_bool_enable_only,
- .get = param_get_bool,
+static const struct latch_tree_ops mod_tree_ops = {
+ .less = mod_tree_less,
+ .comp = mod_tree_comp,
+};
+
+static struct mod_tree_root {
+ struct latch_tree_root root;
+ unsigned long addr_min;
+ unsigned long addr_max;
+} mod_tree __cacheline_aligned = {
+ .addr_min = -1UL,
};
-#define param_check_bool_enable_only param_check_bool
+#define module_addr_min mod_tree.addr_min
+#define module_addr_max mod_tree.addr_max
+
+static noinline void __mod_tree_insert(struct mod_tree_node *node)
+{
+ latch_tree_insert(&node->node, &mod_tree.root, &mod_tree_ops);
+}
+
+static void __mod_tree_remove(struct mod_tree_node *node)
+{
+ latch_tree_erase(&node->node, &mod_tree.root, &mod_tree_ops);
+}
+
+/*
+ * These modifications: insert, remove_init and remove; are serialized by the
+ * module_mutex.
+ */
+static void mod_tree_insert(struct module *mod)
+{
+ mod->mtn_core.mod = mod;
+ mod->mtn_init.mod = mod;
+
+ __mod_tree_insert(&mod->mtn_core);
+ if (mod->init_size)
+ __mod_tree_insert(&mod->mtn_init);
+}
+
+static void mod_tree_remove_init(struct module *mod)
+{
+ if (mod->init_size)
+ __mod_tree_remove(&mod->mtn_init);
+}
+
+static void mod_tree_remove(struct module *mod)
+{
+ __mod_tree_remove(&mod->mtn_core);
+ mod_tree_remove_init(mod);
+}
+
+static struct module *mod_find(unsigned long addr)
+{
+ struct latch_tree_node *ltn;
+
+ ltn = latch_tree_find((void *)addr, &mod_tree.root, &mod_tree_ops);
+ if (!ltn)
+ return NULL;
+
+ return container_of(ltn, struct mod_tree_node, node)->mod;
+}
+
+#else /* MODULES_TREE_LOOKUP */
+
+static unsigned long module_addr_min = -1UL, module_addr_max = 0;
+
+static void mod_tree_insert(struct module *mod) { }
+static void mod_tree_remove_init(struct module *mod) { }
+static void mod_tree_remove(struct module *mod) { }
+
+static struct module *mod_find(unsigned long addr)
+{
+ struct module *mod;
+
+ list_for_each_entry_rcu(mod, &modules, list) {
+ if (within_module(addr, mod))
+ return mod;
+ }
+
+ return NULL;
+}
+
+#endif /* MODULES_TREE_LOOKUP */
+
+/*
+ * Bounds of module text, for speeding up __module_address.
+ * Protected by module_mutex.
+ */
+static void __mod_update_bounds(void *base, unsigned int size)
+{
+ unsigned long min = (unsigned long)base;
+ unsigned long max = min + size;
+
+ if (min < module_addr_min)
+ module_addr_min = min;
+ if (max > module_addr_max)
+ module_addr_max = max;
+}
+
+static void mod_update_bounds(struct module *mod)
+{
+ __mod_update_bounds(mod->module_core, mod->core_size);
+ if (mod->init_size)
+ __mod_update_bounds(mod->module_init, mod->init_size);
+}
+
+#ifdef CONFIG_KGDB_KDB
+struct list_head *kdb_modules = &modules; /* kdb needs the list of modules */
+#endif /* CONFIG_KGDB_KDB */
+
+static void module_assert_mutex(void)
+{
+ lockdep_assert_held(&module_mutex);
+}
+
+static void module_assert_mutex_or_preempt(void)
+{
+#ifdef CONFIG_LOCKDEP
+ if (unlikely(!debug_locks))
+ return;
+
+ WARN_ON(!rcu_read_lock_sched_held() &&
+ !lockdep_is_held(&module_mutex));
+#endif
+}
+
+static bool sig_enforce = IS_ENABLED(CONFIG_MODULE_SIG_FORCE);
+#ifndef CONFIG_MODULE_SIG_FORCE
module_param(sig_enforce, bool_enable_only, 0644);
#endif /* !CONFIG_MODULE_SIG_FORCE */
-#endif /* CONFIG_MODULE_SIG */
/* Block module loading/unloading? */
int modules_disabled = 0;
@@ -153,10 +306,6 @@ static DECLARE_WAIT_QUEUE_HEAD(module_wq);
static BLOCKING_NOTIFIER_HEAD(module_notify_list);
-/* Bounds of module allocation, for speeding __module_address.
- * Protected by module_mutex. */
-static unsigned long module_addr_min = -1UL, module_addr_max = 0;
-
int register_module_notifier(struct notifier_block *nb)
{
return blocking_notifier_chain_register(&module_notify_list, nb);
@@ -318,6 +467,8 @@ bool each_symbol_section(bool (*fn)(const struct symsearch *arr,
#endif
};
+ module_assert_mutex_or_preempt();
+
if (each_symbol_in_section(arr, ARRAY_SIZE(arr), NULL, fn, data))
return true;
@@ -387,9 +538,9 @@ static bool check_symbol(const struct symsearch *syms,
pr_warn("Symbol %s is marked as UNUSED, however this module is "
"using it.\n", fsa->name);
pr_warn("This symbol will go away in the future.\n");
- pr_warn("Please evalute if this is the right api to use and if "
- "it really is, submit a report the linux kernel "
- "mailinglist together with submitting your code for "
+ pr_warn("Please evaluate if this is the right api to use and "
+ "if it really is, submit a report to the linux kernel "
+ "mailing list together with submitting your code for "
"inclusion.\n");
}
#endif
@@ -451,12 +602,17 @@ const struct kernel_symbol *find_symbol(const char *name,
}
EXPORT_SYMBOL_GPL(find_symbol);
-/* Search for module by name: must hold module_mutex. */
+/*
+ * Search for module by name: must hold module_mutex (or preempt disabled
+ * for read-only access).
+ */
static struct module *find_module_all(const char *name, size_t len,
bool even_unformed)
{
struct module *mod;
+ module_assert_mutex_or_preempt();
+
list_for_each_entry(mod, &modules, list) {
if (!even_unformed && mod->state == MODULE_STATE_UNFORMED)
continue;
@@ -468,6 +624,7 @@ static struct module *find_module_all(const char *name, size_t len,
struct module *find_module(const char *name)
{
+ module_assert_mutex();
return find_module_all(name, strlen(name), false);
}
EXPORT_SYMBOL_GPL(find_module);
@@ -1169,11 +1326,17 @@ static inline int check_modstruct_version(Elf_Shdr *sechdrs,
{
const unsigned long *crc;
- /* Since this should be found in kernel (which can't be removed),
- * no locking is necessary. */
+ /*
+ * Since this should be found in kernel (which can't be removed), no
+ * locking is necessary -- use preempt_disable() to placate lockdep.
+ */
+ preempt_disable();
if (!find_symbol(VMLINUX_SYMBOL_STR(module_layout), NULL,
- &crc, true, false))
+ &crc, true, false)) {
+ preempt_enable();
BUG();
+ }
+ preempt_enable();
return check_version(sechdrs, versindex,
VMLINUX_SYMBOL_STR(module_layout), mod, crc,
NULL);
@@ -1661,6 +1824,10 @@ static void mod_sysfs_fini(struct module *mod)
mod_kobject_put(mod);
}
+static void init_param_lock(struct module *mod)
+{
+ mutex_init(&mod->param_lock);
+}
#else /* !CONFIG_SYSFS */
static int mod_sysfs_setup(struct module *mod,
@@ -1683,6 +1850,9 @@ static void del_usage_links(struct module *mod)
{
}
+static void init_param_lock(struct module *mod)
+{
+}
#endif /* CONFIG_SYSFS */
static void mod_sysfs_teardown(struct module *mod)
@@ -1852,10 +2022,11 @@ static void free_module(struct module *mod)
mutex_lock(&module_mutex);
/* Unlink carefully: kallsyms could be walking list. */
list_del_rcu(&mod->list);
+ mod_tree_remove(mod);
/* Remove this module from bug list, this uses list_del_rcu */
module_bug_cleanup(mod);
- /* Wait for RCU synchronizing before releasing mod->list and buglist. */
- synchronize_rcu();
+ /* Wait for RCU-sched synchronizing before releasing mod->list and buglist. */
+ synchronize_sched();
mutex_unlock(&module_mutex);
/* This may be NULL, but that's OK */
@@ -2384,22 +2555,6 @@ void * __weak module_alloc(unsigned long size)
return vmalloc_exec(size);
}
-static void *module_alloc_update_bounds(unsigned long size)
-{
- void *ret = module_alloc(size);
-
- if (ret) {
- mutex_lock(&module_mutex);
- /* Update module bounds. */
- if ((unsigned long)ret < module_addr_min)
- module_addr_min = (unsigned long)ret;
- if ((unsigned long)ret + size > module_addr_max)
- module_addr_max = (unsigned long)ret + size;
- mutex_unlock(&module_mutex);
- }
- return ret;
-}
-
#ifdef CONFIG_DEBUG_KMEMLEAK
static void kmemleak_load_module(const struct module *mod,
const struct load_info *info)
@@ -2511,7 +2666,8 @@ static int copy_module_from_user(const void __user *umod, unsigned long len,
return err;
/* Suck in entire file: we'll want most of it. */
- info->hdr = vmalloc(info->len);
+ info->hdr = __vmalloc(info->len,
+ GFP_KERNEL | __GFP_HIGHMEM | __GFP_NOWARN, PAGE_KERNEL);
if (!info->hdr)
return -ENOMEM;
@@ -2770,6 +2926,9 @@ static int find_module_sections(struct module *mod, struct load_info *info)
mod->trace_events = section_objs(info, "_ftrace_events",
sizeof(*mod->trace_events),
&mod->num_trace_events);
+ mod->trace_enums = section_objs(info, "_ftrace_enum_map",
+ sizeof(*mod->trace_enums),
+ &mod->num_trace_enums);
#endif
#ifdef CONFIG_TRACING
mod->trace_bprintk_fmt_start = section_objs(info, "__trace_printk_fmt",
@@ -2801,7 +2960,7 @@ static int move_module(struct module *mod, struct load_info *info)
void *ptr;
/* Do the allocs. */
- ptr = module_alloc_update_bounds(mod->core_size);
+ ptr = module_alloc(mod->core_size);
/*
* The pointer to this block is stored in the module structure
* which is inside the block. Just mark it as not being a
@@ -2815,7 +2974,7 @@ static int move_module(struct module *mod, struct load_info *info)
mod->module_core = ptr;
if (mod->init_size) {
- ptr = module_alloc_update_bounds(mod->init_size);
+ ptr = module_alloc(mod->init_size);
/*
* The pointer to this block is stored in the module structure
* which is inside the block. This block doesn't need to be
@@ -3103,7 +3262,7 @@ static noinline int do_init_module(struct module *mod)
*
* http://thread.gmane.org/gmane.linux.kernel/1420814
*/
- if (current->flags & PF_USED_ASYNC)
+ if (!mod->async_probe_requested && (current->flags & PF_USED_ASYNC))
async_synchronize_full();
mutex_lock(&module_mutex);
@@ -3115,6 +3274,7 @@ static noinline int do_init_module(struct module *mod)
mod->symtab = mod->core_symtab;
mod->strtab = mod->core_strtab;
#endif
+ mod_tree_remove_init(mod);
unset_module_init_ro_nx(mod);
module_arch_freeing_init(mod);
mod->module_init = NULL;
@@ -3123,11 +3283,11 @@ static noinline int do_init_module(struct module *mod)
mod->init_text_size = 0;
/*
* We want to free module_init, but be aware that kallsyms may be
- * walking this with preempt disabled. In all the failure paths,
- * we call synchronize_rcu/synchronize_sched, but we don't want
- * to slow down the success path, so use actual RCU here.
+ * walking this with preempt disabled. In all the failure paths, we
+ * call synchronize_sched(), but we don't want to slow down the success
+ * path, so use actual RCU here.
*/
- call_rcu(&freeinit->rcu, do_free_init);
+ call_rcu_sched(&freeinit->rcu, do_free_init);
mutex_unlock(&module_mutex);
wake_up_all(&module_wq);
@@ -3184,7 +3344,9 @@ again:
err = -EEXIST;
goto out;
}
+ mod_update_bounds(mod);
list_add_rcu(&mod->list, &modules);
+ mod_tree_insert(mod);
err = 0;
out:
@@ -3233,10 +3395,19 @@ out:
return err;
}
-static int unknown_module_param_cb(char *param, char *val, const char *modname)
+static int unknown_module_param_cb(char *param, char *val, const char *modname,
+ void *arg)
{
+ struct module *mod = arg;
+ int ret;
+
+ if (strcmp(param, "async_probe") == 0) {
+ mod->async_probe_requested = true;
+ return 0;
+ }
+
/* Check for magic 'dyndbg' arg */
- int ret = ddebug_dyndbg_module_param_cb(param, val, modname);
+ ret = ddebug_dyndbg_module_param_cb(param, val, modname);
if (ret != 0)
pr_warn("%s: unknown parameter '%s' ignored\n", modname, param);
return 0;
@@ -3291,6 +3462,8 @@ static int load_module(struct load_info *info, const char __user *uargs,
if (err)
goto unlink_mod;
+ init_param_lock(mod);
+
/* Now we've got everything in the final locations, we can
* find optional sections. */
err = find_module_sections(mod, info);
@@ -3338,7 +3511,8 @@ static int load_module(struct load_info *info, const char __user *uargs,
/* Module is ready to execute: parsing args may do that. */
after_dashes = parse_args(mod->name, mod->args, mod->kp, mod->num_kp,
- -32768, 32767, unknown_module_param_cb);
+ -32768, 32767, NULL,
+ unknown_module_param_cb);
if (IS_ERR(after_dashes)) {
err = PTR_ERR(after_dashes);
goto bug_cleanup;
@@ -3366,6 +3540,9 @@ static int load_module(struct load_info *info, const char __user *uargs,
module_bug_cleanup(mod);
mutex_unlock(&module_mutex);
+ blocking_notifier_call_chain(&module_notify_list,
+ MODULE_STATE_GOING, mod);
+
/* we can't deallocate the module until we clear memory protection */
unset_module_init_ro_nx(mod);
unset_module_core_ro_nx(mod);
@@ -3384,9 +3561,10 @@ static int load_module(struct load_info *info, const char __user *uargs,
mutex_lock(&module_mutex);
/* Unlink carefully: kallsyms could be walking list. */
list_del_rcu(&mod->list);
+ mod_tree_remove(mod);
wake_up_all(&module_wq);
- /* Wait for RCU synchronizing before releasing mod->list. */
- synchronize_rcu();
+ /* Wait for RCU-sched synchronizing before releasing mod->list. */
+ synchronize_sched();
mutex_unlock(&module_mutex);
free_module:
/* Free lock-classes; relies on the preceding sync_rcu() */
@@ -3510,19 +3688,15 @@ const char *module_address_lookup(unsigned long addr,
char **modname,
char *namebuf)
{
- struct module *mod;
const char *ret = NULL;
+ struct module *mod;
preempt_disable();
- list_for_each_entry_rcu(mod, &modules, list) {
- if (mod->state == MODULE_STATE_UNFORMED)
- continue;
- if (within_module(addr, mod)) {
- if (modname)
- *modname = mod->name;
- ret = get_ksymbol(mod, addr, size, offset);
- break;
- }
+ mod = __module_address(addr);
+ if (mod) {
+ if (modname)
+ *modname = mod->name;
+ ret = get_ksymbol(mod, addr, size, offset);
}
/* Make a copy in here where it's safe */
if (ret) {
@@ -3530,6 +3704,7 @@ const char *module_address_lookup(unsigned long addr,
ret = namebuf;
}
preempt_enable();
+
return ret;
}
@@ -3653,6 +3828,8 @@ int module_kallsyms_on_each_symbol(int (*fn)(void *, const char *,
unsigned int i;
int ret;
+ module_assert_mutex();
+
list_for_each_entry(mod, &modules, list) {
if (mod->state == MODULE_STATE_UNFORMED)
continue;
@@ -3827,13 +4004,15 @@ struct module *__module_address(unsigned long addr)
if (addr < module_addr_min || addr > module_addr_max)
return NULL;
- list_for_each_entry_rcu(mod, &modules, list) {
+ module_assert_mutex_or_preempt();
+
+ mod = mod_find(addr);
+ if (mod) {
+ BUG_ON(!within_module(addr, mod));
if (mod->state == MODULE_STATE_UNFORMED)
- continue;
- if (within_module(addr, mod))
- return mod;
+ mod = NULL;
}
- return NULL;
+ return mod;
}
EXPORT_SYMBOL_GPL(__module_address);
diff --git a/kernel/module_signing.c b/kernel/module_signing.c
index be5b8fac4bd0..bd62f5cda746 100644
--- a/kernel/module_signing.c
+++ b/kernel/module_signing.c
@@ -10,11 +10,8 @@
*/
#include <linux/kernel.h>
-#include <linux/err.h>
-#include <crypto/public_key.h>
-#include <crypto/hash.h>
-#include <keys/asymmetric-type.h>
#include <keys/system_keyring.h>
+#include <crypto/public_key.h>
#include "module-internal.h"
/*
@@ -28,170 +25,22 @@
* - Information block
*/
struct module_signature {
- u8 algo; /* Public-key crypto algorithm [enum pkey_algo] */
- u8 hash; /* Digest algorithm [enum hash_algo] */
- u8 id_type; /* Key identifier type [enum pkey_id_type] */
- u8 signer_len; /* Length of signer's name */
- u8 key_id_len; /* Length of key identifier */
+ u8 algo; /* Public-key crypto algorithm [0] */
+ u8 hash; /* Digest algorithm [0] */
+ u8 id_type; /* Key identifier type [PKEY_ID_PKCS7] */
+ u8 signer_len; /* Length of signer's name [0] */
+ u8 key_id_len; /* Length of key identifier [0] */
u8 __pad[3];
__be32 sig_len; /* Length of signature data */
};
/*
- * Digest the module contents.
- */
-static struct public_key_signature *mod_make_digest(enum hash_algo hash,
- const void *mod,
- unsigned long modlen)
-{
- struct public_key_signature *pks;
- struct crypto_shash *tfm;
- struct shash_desc *desc;
- size_t digest_size, desc_size;
- int ret;
-
- pr_devel("==>%s()\n", __func__);
-
- /* Allocate the hashing algorithm we're going to need and find out how
- * big the hash operational data will be.
- */
- tfm = crypto_alloc_shash(hash_algo_name[hash], 0, 0);
- if (IS_ERR(tfm))
- return (PTR_ERR(tfm) == -ENOENT) ? ERR_PTR(-ENOPKG) : ERR_CAST(tfm);
-
- desc_size = crypto_shash_descsize(tfm) + sizeof(*desc);
- digest_size = crypto_shash_digestsize(tfm);
-
- /* We allocate the hash operational data storage on the end of our
- * context data and the digest output buffer on the end of that.
- */
- ret = -ENOMEM;
- pks = kzalloc(digest_size + sizeof(*pks) + desc_size, GFP_KERNEL);
- if (!pks)
- goto error_no_pks;
-
- pks->pkey_hash_algo = hash;
- pks->digest = (u8 *)pks + sizeof(*pks) + desc_size;
- pks->digest_size = digest_size;
-
- desc = (void *)pks + sizeof(*pks);
- desc->tfm = tfm;
- desc->flags = CRYPTO_TFM_REQ_MAY_SLEEP;
-
- ret = crypto_shash_init(desc);
- if (ret < 0)
- goto error;
-
- ret = crypto_shash_finup(desc, mod, modlen, pks->digest);
- if (ret < 0)
- goto error;
-
- crypto_free_shash(tfm);
- pr_devel("<==%s() = ok\n", __func__);
- return pks;
-
-error:
- kfree(pks);
-error_no_pks:
- crypto_free_shash(tfm);
- pr_devel("<==%s() = %d\n", __func__, ret);
- return ERR_PTR(ret);
-}
-
-/*
- * Extract an MPI array from the signature data. This represents the actual
- * signature. Each raw MPI is prefaced by a BE 2-byte value indicating the
- * size of the MPI in bytes.
- *
- * RSA signatures only have one MPI, so currently we only read one.
- */
-static int mod_extract_mpi_array(struct public_key_signature *pks,
- const void *data, size_t len)
-{
- size_t nbytes;
- MPI mpi;
-
- if (len < 3)
- return -EBADMSG;
- nbytes = ((const u8 *)data)[0] << 8 | ((const u8 *)data)[1];
- data += 2;
- len -= 2;
- if (len != nbytes)
- return -EBADMSG;
-
- mpi = mpi_read_raw_data(data, nbytes);
- if (!mpi)
- return -ENOMEM;
- pks->mpi[0] = mpi;
- pks->nr_mpi = 1;
- return 0;
-}
-
-/*
- * Request an asymmetric key.
- */
-static struct key *request_asymmetric_key(const char *signer, size_t signer_len,
- const u8 *key_id, size_t key_id_len)
-{
- key_ref_t key;
- size_t i;
- char *id, *q;
-
- pr_devel("==>%s(,%zu,,%zu)\n", __func__, signer_len, key_id_len);
-
- /* Construct an identifier. */
- id = kmalloc(signer_len + 2 + key_id_len * 2 + 1, GFP_KERNEL);
- if (!id)
- return ERR_PTR(-ENOKEY);
-
- memcpy(id, signer, signer_len);
-
- q = id + signer_len;
- *q++ = ':';
- *q++ = ' ';
- for (i = 0; i < key_id_len; i++) {
- *q++ = hex_asc[*key_id >> 4];
- *q++ = hex_asc[*key_id++ & 0x0f];
- }
-
- *q = 0;
-
- pr_debug("Look up: \"%s\"\n", id);
-
- key = keyring_search(make_key_ref(system_trusted_keyring, 1),
- &key_type_asymmetric, id);
- if (IS_ERR(key))
- pr_warn("Request for unknown module key '%s' err %ld\n",
- id, PTR_ERR(key));
- kfree(id);
-
- if (IS_ERR(key)) {
- switch (PTR_ERR(key)) {
- /* Hide some search errors */
- case -EACCES:
- case -ENOTDIR:
- case -EAGAIN:
- return ERR_PTR(-ENOKEY);
- default:
- return ERR_CAST(key);
- }
- }
-
- pr_devel("<==%s() = 0 [%x]\n", __func__, key_serial(key_ref_to_ptr(key)));
- return key_ref_to_ptr(key);
-}
-
-/*
* Verify the signature on a module.
*/
int mod_verify_sig(const void *mod, unsigned long *_modlen)
{
- struct public_key_signature *pks;
struct module_signature ms;
- struct key *key;
- const void *sig;
size_t modlen = *_modlen, sig_len;
- int ret;
pr_devel("==>%s(,%zu)\n", __func__, modlen);
@@ -205,46 +54,24 @@ int mod_verify_sig(const void *mod, unsigned long *_modlen)
if (sig_len >= modlen)
return -EBADMSG;
modlen -= sig_len;
- if ((size_t)ms.signer_len + ms.key_id_len >= modlen)
- return -EBADMSG;
- modlen -= (size_t)ms.signer_len + ms.key_id_len;
-
*_modlen = modlen;
- sig = mod + modlen;
-
- /* For the moment, only support RSA and X.509 identifiers */
- if (ms.algo != PKEY_ALGO_RSA ||
- ms.id_type != PKEY_ID_X509)
- return -ENOPKG;
- if (ms.hash >= PKEY_HASH__LAST ||
- !hash_algo_name[ms.hash])
+ if (ms.id_type != PKEY_ID_PKCS7) {
+ pr_err("Module is not signed with expected PKCS#7 message\n");
return -ENOPKG;
-
- key = request_asymmetric_key(sig, ms.signer_len,
- sig + ms.signer_len, ms.key_id_len);
- if (IS_ERR(key))
- return PTR_ERR(key);
-
- pks = mod_make_digest(ms.hash, mod, modlen);
- if (IS_ERR(pks)) {
- ret = PTR_ERR(pks);
- goto error_put_key;
}
- ret = mod_extract_mpi_array(pks, sig + ms.signer_len + ms.key_id_len,
- sig_len);
- if (ret < 0)
- goto error_free_pks;
-
- ret = verify_signature(key, pks);
- pr_devel("verify_signature() = %d\n", ret);
+ if (ms.algo != 0 ||
+ ms.hash != 0 ||
+ ms.signer_len != 0 ||
+ ms.key_id_len != 0 ||
+ ms.__pad[0] != 0 ||
+ ms.__pad[1] != 0 ||
+ ms.__pad[2] != 0) {
+ pr_err("PKCS#7 signature info has unexpected non-zero params\n");
+ return -EBADMSG;
+ }
-error_free_pks:
- mpi_free(pks->rsa.s);
- kfree(pks);
-error_put_key:
- key_put(key);
- pr_devel("<==%s() = %d\n", __func__, ret);
- return ret;
+ return system_verify_data(mod, modlen, mod + modlen, sig_len,
+ VERIFYING_MODULE_SIGNATURE);
}
diff --git a/kernel/notifier.c b/kernel/notifier.c
index ae9fc7cc360e..fd2c9acbcc19 100644
--- a/kernel/notifier.c
+++ b/kernel/notifier.c
@@ -544,6 +544,8 @@ int notrace notify_die(enum die_val val, const char *str,
.signr = sig,
};
+ RCU_LOCKDEP_WARN(!rcu_is_watching(),
+ "notify_die called but RCU thinks we're quiescent");
return atomic_notifier_call_chain(&die_chain, val, &args);
}
NOKPROBE_SYMBOL(notify_die);
diff --git a/kernel/panic.c b/kernel/panic.c
index 8136ad76e5fd..04e91ff7560b 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -32,7 +32,7 @@ static unsigned long tainted_mask;
static int pause_on_oops;
static int pause_on_oops_flag;
static DEFINE_SPINLOCK(pause_on_oops_lock);
-static bool crash_kexec_post_notifiers;
+bool crash_kexec_post_notifiers;
int panic_on_warn __read_mostly;
int panic_timeout = CONFIG_PANIC_TIMEOUT;
@@ -142,7 +142,8 @@ void panic(const char *fmt, ...)
* Note: since some panic_notifiers can make crashed kernel
* more unstable, it can increase risks of the kdump failure too.
*/
- crash_kexec(NULL);
+ if (crash_kexec_post_notifiers)
+ crash_kexec(NULL);
bust_spinlocks(0);
diff --git a/kernel/params.c b/kernel/params.c
index 728e05b167de..b6554aa71094 100644
--- a/kernel/params.c
+++ b/kernel/params.c
@@ -25,15 +25,34 @@
#include <linux/slab.h>
#include <linux/ctype.h>
-/* Protects all parameters, and incidentally kmalloced_param list. */
+#ifdef CONFIG_SYSFS
+/* Protects all built-in parameters, modules use their own param_lock */
static DEFINE_MUTEX(param_lock);
+/* Use the module's mutex, or if built-in use the built-in mutex */
+#ifdef CONFIG_MODULES
+#define KPARAM_MUTEX(mod) ((mod) ? &(mod)->param_lock : &param_lock)
+#else
+#define KPARAM_MUTEX(mod) (&param_lock)
+#endif
+
+static inline void check_kparam_locked(struct module *mod)
+{
+ BUG_ON(!mutex_is_locked(KPARAM_MUTEX(mod)));
+}
+#else
+static inline void check_kparam_locked(struct module *mod)
+{
+}
+#endif /* !CONFIG_SYSFS */
+
/* This just allows us to keep track of which parameters are kmalloced. */
struct kmalloced_param {
struct list_head list;
char val[];
};
static LIST_HEAD(kmalloced_params);
+static DEFINE_SPINLOCK(kmalloced_params_lock);
static void *kmalloc_parameter(unsigned int size)
{
@@ -43,7 +62,10 @@ static void *kmalloc_parameter(unsigned int size)
if (!p)
return NULL;
+ spin_lock(&kmalloced_params_lock);
list_add(&p->list, &kmalloced_params);
+ spin_unlock(&kmalloced_params_lock);
+
return p->val;
}
@@ -52,6 +74,7 @@ static void maybe_kfree_parameter(void *param)
{
struct kmalloced_param *p;
+ spin_lock(&kmalloced_params_lock);
list_for_each_entry(p, &kmalloced_params, list) {
if (p->val == param) {
list_del(&p->list);
@@ -59,6 +82,7 @@ static void maybe_kfree_parameter(void *param)
break;
}
}
+ spin_unlock(&kmalloced_params_lock);
}
static char dash2underscore(char c)
@@ -100,8 +124,9 @@ static int parse_one(char *param,
unsigned num_params,
s16 min_level,
s16 max_level,
+ void *arg,
int (*handle_unknown)(char *param, char *val,
- const char *doing))
+ const char *doing, void *arg))
{
unsigned int i;
int err;
@@ -118,17 +143,17 @@ static int parse_one(char *param,
return -EINVAL;
pr_debug("handling %s with %p\n", param,
params[i].ops->set);
- mutex_lock(&param_lock);
+ kernel_param_lock(params[i].mod);
param_check_unsafe(&params[i]);
err = params[i].ops->set(val, &params[i]);
- mutex_unlock(&param_lock);
+ kernel_param_unlock(params[i].mod);
return err;
}
}
if (handle_unknown) {
pr_debug("doing %s: %s='%s'\n", doing, param, val);
- return handle_unknown(param, val, doing);
+ return handle_unknown(param, val, doing, arg);
}
pr_debug("Unknown argument '%s'\n", param);
@@ -173,9 +198,9 @@ static char *next_arg(char *args, char **param, char **val)
if (args[i-1] == '"')
args[i-1] = '\0';
}
- if (quoted && args[i-1] == '"')
- args[i-1] = '\0';
}
+ if (quoted && args[i-1] == '"')
+ args[i-1] = '\0';
if (args[i]) {
args[i] = '\0';
@@ -194,7 +219,9 @@ char *parse_args(const char *doing,
unsigned num,
s16 min_level,
s16 max_level,
- int (*unknown)(char *param, char *val, const char *doing))
+ void *arg,
+ int (*unknown)(char *param, char *val,
+ const char *doing, void *arg))
{
char *param, *val;
@@ -214,7 +241,7 @@ char *parse_args(const char *doing,
return args;
irq_was_disabled = irqs_disabled();
ret = parse_one(param, val, doing, params, num,
- min_level, max_level, unknown);
+ min_level, max_level, arg, unknown);
if (irq_was_disabled && !irqs_disabled())
pr_warn("%s: option '%s' enabled irq's!\n",
doing, param);
@@ -251,7 +278,7 @@ char *parse_args(const char *doing,
return scnprintf(buffer, PAGE_SIZE, format, \
*((type *)kp->arg)); \
} \
- struct kernel_param_ops param_ops_##name = { \
+ const struct kernel_param_ops param_ops_##name = { \
.set = param_set_##name, \
.get = param_get_##name, \
}; \
@@ -303,7 +330,7 @@ static void param_free_charp(void *arg)
maybe_kfree_parameter(*((char **)arg));
}
-struct kernel_param_ops param_ops_charp = {
+const struct kernel_param_ops param_ops_charp = {
.set = param_set_charp,
.get = param_get_charp,
.free = param_free_charp,
@@ -328,13 +355,44 @@ int param_get_bool(char *buffer, const struct kernel_param *kp)
}
EXPORT_SYMBOL(param_get_bool);
-struct kernel_param_ops param_ops_bool = {
+const struct kernel_param_ops param_ops_bool = {
.flags = KERNEL_PARAM_OPS_FL_NOARG,
.set = param_set_bool,
.get = param_get_bool,
};
EXPORT_SYMBOL(param_ops_bool);
+int param_set_bool_enable_only(const char *val, const struct kernel_param *kp)
+{
+ int err = 0;
+ bool new_value;
+ bool orig_value = *(bool *)kp->arg;
+ struct kernel_param dummy_kp = *kp;
+
+ dummy_kp.arg = &new_value;
+
+ err = param_set_bool(val, &dummy_kp);
+ if (err)
+ return err;
+
+ /* Don't let them unset it once it's set! */
+ if (!new_value && orig_value)
+ return -EROFS;
+
+ if (new_value)
+ err = param_set_bool(val, kp);
+
+ return err;
+}
+EXPORT_SYMBOL_GPL(param_set_bool_enable_only);
+
+const struct kernel_param_ops param_ops_bool_enable_only = {
+ .flags = KERNEL_PARAM_OPS_FL_NOARG,
+ .set = param_set_bool_enable_only,
+ .get = param_get_bool,
+};
+EXPORT_SYMBOL_GPL(param_ops_bool_enable_only);
+
/* This one must be bool. */
int param_set_invbool(const char *val, const struct kernel_param *kp)
{
@@ -356,7 +414,7 @@ int param_get_invbool(char *buffer, const struct kernel_param *kp)
}
EXPORT_SYMBOL(param_get_invbool);
-struct kernel_param_ops param_ops_invbool = {
+const struct kernel_param_ops param_ops_invbool = {
.set = param_set_invbool,
.get = param_get_invbool,
};
@@ -364,12 +422,11 @@ EXPORT_SYMBOL(param_ops_invbool);
int param_set_bint(const char *val, const struct kernel_param *kp)
{
- struct kernel_param boolkp;
+ /* Match bool exactly, by re-using it. */
+ struct kernel_param boolkp = *kp;
bool v;
int ret;
- /* Match bool exactly, by re-using it. */
- boolkp = *kp;
boolkp.arg = &v;
ret = param_set_bool(val, &boolkp);
@@ -379,7 +436,7 @@ int param_set_bint(const char *val, const struct kernel_param *kp)
}
EXPORT_SYMBOL(param_set_bint);
-struct kernel_param_ops param_ops_bint = {
+const struct kernel_param_ops param_ops_bint = {
.flags = KERNEL_PARAM_OPS_FL_NOARG,
.set = param_set_bint,
.get = param_get_int,
@@ -387,7 +444,8 @@ struct kernel_param_ops param_ops_bint = {
EXPORT_SYMBOL(param_ops_bint);
/* We break the rule and mangle the string. */
-static int param_array(const char *name,
+static int param_array(struct module *mod,
+ const char *name,
const char *val,
unsigned int min, unsigned int max,
void *elem, int elemsize,
@@ -418,7 +476,7 @@ static int param_array(const char *name,
/* nul-terminate and parse */
save = val[len];
((char *)val)[len] = '\0';
- BUG_ON(!mutex_is_locked(&param_lock));
+ check_kparam_locked(mod);
ret = set(val, &kp);
if (ret != 0)
@@ -440,7 +498,7 @@ static int param_array_set(const char *val, const struct kernel_param *kp)
const struct kparam_array *arr = kp->arr;
unsigned int temp_num;
- return param_array(kp->name, val, 1, arr->max, arr->elem,
+ return param_array(kp->mod, kp->name, val, 1, arr->max, arr->elem,
arr->elemsize, arr->ops->set, kp->level,
arr->num ?: &temp_num);
}
@@ -449,14 +507,13 @@ static int param_array_get(char *buffer, const struct kernel_param *kp)
{
int i, off, ret;
const struct kparam_array *arr = kp->arr;
- struct kernel_param p;
+ struct kernel_param p = *kp;
- p = *kp;
for (i = off = 0; i < (arr->num ? *arr->num : arr->max); i++) {
if (i)
buffer[off++] = ',';
p.arg = arr->elem + arr->elemsize * i;
- BUG_ON(!mutex_is_locked(&param_lock));
+ check_kparam_locked(p.mod);
ret = arr->ops->get(buffer + off, &p);
if (ret < 0)
return ret;
@@ -476,7 +533,7 @@ static void param_array_free(void *arg)
arr->ops->free(arr->elem + arr->elemsize * i);
}
-struct kernel_param_ops param_array_ops = {
+const struct kernel_param_ops param_array_ops = {
.set = param_array_set,
.get = param_array_get,
.free = param_array_free,
@@ -504,7 +561,7 @@ int param_get_string(char *buffer, const struct kernel_param *kp)
}
EXPORT_SYMBOL(param_get_string);
-struct kernel_param_ops param_ops_string = {
+const struct kernel_param_ops param_ops_string = {
.set = param_set_copystring,
.get = param_get_string,
};
@@ -539,9 +596,9 @@ static ssize_t param_attr_show(struct module_attribute *mattr,
if (!attribute->param->ops->get)
return -EPERM;
- mutex_lock(&param_lock);
+ kernel_param_lock(mk->mod);
count = attribute->param->ops->get(buf, attribute->param);
- mutex_unlock(&param_lock);
+ kernel_param_unlock(mk->mod);
if (count > 0) {
strcat(buf, "\n");
++count;
@@ -551,7 +608,7 @@ static ssize_t param_attr_show(struct module_attribute *mattr,
/* sysfs always hands a nul-terminated string in buf. We rely on that. */
static ssize_t param_attr_store(struct module_attribute *mattr,
- struct module_kobject *km,
+ struct module_kobject *mk,
const char *buf, size_t len)
{
int err;
@@ -560,10 +617,10 @@ static ssize_t param_attr_store(struct module_attribute *mattr,
if (!attribute->param->ops->set)
return -EPERM;
- mutex_lock(&param_lock);
+ kernel_param_lock(mk->mod);
param_check_unsafe(attribute->param);
err = attribute->param->ops->set(buf, attribute->param);
- mutex_unlock(&param_lock);
+ kernel_param_unlock(mk->mod);
if (!err)
return len;
return err;
@@ -577,17 +634,18 @@ static ssize_t param_attr_store(struct module_attribute *mattr,
#endif
#ifdef CONFIG_SYSFS
-void __kernel_param_lock(void)
+void kernel_param_lock(struct module *mod)
{
- mutex_lock(&param_lock);
+ mutex_lock(KPARAM_MUTEX(mod));
}
-EXPORT_SYMBOL(__kernel_param_lock);
-void __kernel_param_unlock(void)
+void kernel_param_unlock(struct module *mod)
{
- mutex_unlock(&param_lock);
+ mutex_unlock(KPARAM_MUTEX(mod));
}
-EXPORT_SYMBOL(__kernel_param_unlock);
+
+EXPORT_SYMBOL(kernel_param_lock);
+EXPORT_SYMBOL(kernel_param_unlock);
/*
* add_sysfs_param - add a parameter to sysfs
@@ -853,6 +911,7 @@ static void __init version_sysfs_builtin(void)
mk = locate_module_kobject(vattr->module_name);
if (mk) {
err = sysfs_create_file(&mk->kobj, &vattr->mattr.attr);
+ WARN_ON_ONCE(err);
kobject_uevent(&mk->kobj, KOBJ_ADD);
kobject_put(&mk->kobj);
}
diff --git a/kernel/pid.c b/kernel/pid.c
index cd36a5e0d173..ca368793808e 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -182,7 +182,7 @@ static int alloc_pidmap(struct pid_namespace *pid_ns)
spin_unlock_irq(&pidmap_lock);
kfree(page);
if (unlikely(!map->page))
- break;
+ return -ENOMEM;
}
if (likely(atomic_read(&map->nr_free))) {
for ( ; ; ) {
@@ -210,7 +210,7 @@ static int alloc_pidmap(struct pid_namespace *pid_ns)
}
pid = mk_pid(pid_ns, map, offset);
}
- return -1;
+ return -EAGAIN;
}
int next_pidmap(struct pid_namespace *pid_ns, unsigned int last)
@@ -301,17 +301,20 @@ struct pid *alloc_pid(struct pid_namespace *ns)
int i, nr;
struct pid_namespace *tmp;
struct upid *upid;
+ int retval = -ENOMEM;
pid = kmem_cache_alloc(ns->pid_cachep, GFP_KERNEL);
if (!pid)
- goto out;
+ return ERR_PTR(retval);
tmp = ns;
pid->level = ns->level;
for (i = ns->level; i >= 0; i--) {
nr = alloc_pidmap(tmp);
- if (nr < 0)
+ if (IS_ERR_VALUE(nr)) {
+ retval = nr;
goto out_free;
+ }
pid->numbers[i].nr = nr;
pid->numbers[i].ns = tmp;
@@ -339,7 +342,6 @@ struct pid *alloc_pid(struct pid_namespace *ns)
}
spin_unlock_irq(&pidmap_lock);
-out:
return pid;
out_unlock:
@@ -351,8 +353,7 @@ out_free:
free_pidmap(pid->numbers + i);
kmem_cache_free(ns->pid_cachep, pid);
- pid = NULL;
- goto out;
+ return ERR_PTR(retval);
}
void disable_pid_allocation(struct pid_namespace *ns)
@@ -450,9 +451,8 @@ EXPORT_SYMBOL(pid_task);
*/
struct task_struct *find_task_by_pid_ns(pid_t nr, struct pid_namespace *ns)
{
- rcu_lockdep_assert(rcu_read_lock_held(),
- "find_task_by_pid_ns() needs rcu_read_lock()"
- " protection");
+ RCU_LOCKDEP_WARN(!rcu_read_lock_held(),
+ "find_task_by_pid_ns() needs rcu_read_lock() protection");
return pid_task(find_pid_ns(nr, ns), PIDTYPE_PID);
}
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig
index 7e01f78f0417..02e8dfaa1ce2 100644
--- a/kernel/power/Kconfig
+++ b/kernel/power/Kconfig
@@ -18,6 +18,16 @@ config SUSPEND_FREEZER
Turning OFF this setting is NOT recommended! If in doubt, say Y.
+config SUSPEND_SKIP_SYNC
+ bool "Skip kernel's sys_sync() on suspend to RAM/standby"
+ depends on SUSPEND
+ depends on EXPERT
+ help
+ Skip the kernel sys_sync() before freezing user processes.
+ Some systems prefer not to pay this cost on every invocation
+ of suspend, or they are content with invoking sync() from
+ user-space before invoking suspend. Say Y if that's your case.
+
config HIBERNATE_CALLBACKS
bool
@@ -187,7 +197,7 @@ config DPM_WATCHDOG
config DPM_WATCHDOG_TIMEOUT
int "Watchdog timeout in seconds"
range 1 120
- default 12
+ default 60
depends on DPM_WATCHDOG
config PM_TRACE
diff --git a/kernel/power/Makefile b/kernel/power/Makefile
index 29472bff11ef..cb880a14cc39 100644
--- a/kernel/power/Makefile
+++ b/kernel/power/Makefile
@@ -7,8 +7,7 @@ obj-$(CONFIG_VT_CONSOLE_SLEEP) += console.o
obj-$(CONFIG_FREEZER) += process.o
obj-$(CONFIG_SUSPEND) += suspend.o
obj-$(CONFIG_PM_TEST_SUSPEND) += suspend_test.o
-obj-$(CONFIG_HIBERNATION) += hibernate.o snapshot.o swap.o user.o \
- block_io.o
+obj-$(CONFIG_HIBERNATION) += hibernate.o snapshot.o swap.o user.o
obj-$(CONFIG_PM_AUTOSLEEP) += autosleep.o
obj-$(CONFIG_PM_WAKELOCKS) += wakelock.o
diff --git a/kernel/power/block_io.c b/kernel/power/block_io.c
deleted file mode 100644
index 9a58bc258810..000000000000
--- a/kernel/power/block_io.c
+++ /dev/null
@@ -1,103 +0,0 @@
-/*
- * This file provides functions for block I/O operations on swap/file.
- *
- * Copyright (C) 1998,2001-2005 Pavel Machek <pavel@ucw.cz>
- * Copyright (C) 2006 Rafael J. Wysocki <rjw@sisk.pl>
- *
- * This file is released under the GPLv2.
- */
-
-#include <linux/bio.h>
-#include <linux/kernel.h>
-#include <linux/pagemap.h>
-#include <linux/swap.h>
-
-#include "power.h"
-
-/**
- * submit - submit BIO request.
- * @rw: READ or WRITE.
- * @off physical offset of page.
- * @page: page we're reading or writing.
- * @bio_chain: list of pending biod (for async reading)
- *
- * Straight from the textbook - allocate and initialize the bio.
- * If we're reading, make sure the page is marked as dirty.
- * Then submit it and, if @bio_chain == NULL, wait.
- */
-static int submit(int rw, struct block_device *bdev, sector_t sector,
- struct page *page, struct bio **bio_chain)
-{
- const int bio_rw = rw | REQ_SYNC;
- struct bio *bio;
-
- bio = bio_alloc(__GFP_WAIT | __GFP_HIGH, 1);
- bio->bi_iter.bi_sector = sector;
- bio->bi_bdev = bdev;
- bio->bi_end_io = end_swap_bio_read;
-
- if (bio_add_page(bio, page, PAGE_SIZE, 0) < PAGE_SIZE) {
- printk(KERN_ERR "PM: Adding page to bio failed at %llu\n",
- (unsigned long long)sector);
- bio_put(bio);
- return -EFAULT;
- }
-
- lock_page(page);
- bio_get(bio);
-
- if (bio_chain == NULL) {
- submit_bio(bio_rw, bio);
- wait_on_page_locked(page);
- if (rw == READ)
- bio_set_pages_dirty(bio);
- bio_put(bio);
- } else {
- if (rw == READ)
- get_page(page); /* These pages are freed later */
- bio->bi_private = *bio_chain;
- *bio_chain = bio;
- submit_bio(bio_rw, bio);
- }
- return 0;
-}
-
-int hib_bio_read_page(pgoff_t page_off, void *addr, struct bio **bio_chain)
-{
- return submit(READ, hib_resume_bdev, page_off * (PAGE_SIZE >> 9),
- virt_to_page(addr), bio_chain);
-}
-
-int hib_bio_write_page(pgoff_t page_off, void *addr, struct bio **bio_chain)
-{
- return submit(WRITE, hib_resume_bdev, page_off * (PAGE_SIZE >> 9),
- virt_to_page(addr), bio_chain);
-}
-
-int hib_wait_on_bio_chain(struct bio **bio_chain)
-{
- struct bio *bio;
- struct bio *next_bio;
- int ret = 0;
-
- if (bio_chain == NULL)
- return 0;
-
- bio = *bio_chain;
- if (bio == NULL)
- return 0;
- while (bio) {
- struct page *page;
-
- next_bio = bio->bi_private;
- page = bio->bi_io_vec[0].bv_page;
- wait_on_page_locked(page);
- if (!PageUptodate(page) || PageError(page))
- ret = -EIO;
- put_page(page);
- bio_put(bio);
- bio = next_bio;
- }
- *bio_chain = NULL;
- return ret;
-}
diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c
index 2329daae5255..690f78f210f2 100644
--- a/kernel/power/hibernate.c
+++ b/kernel/power/hibernate.c
@@ -552,7 +552,7 @@ int hibernation_platform_enter(void)
error = disable_nonboot_cpus();
if (error)
- goto Platform_finish;
+ goto Enable_cpus;
local_irq_disable();
syscore_suspend();
@@ -568,6 +568,8 @@ int hibernation_platform_enter(void)
Power_up:
syscore_resume();
local_irq_enable();
+
+ Enable_cpus:
enable_nonboot_cpus();
Platform_finish:
diff --git a/kernel/power/main.c b/kernel/power/main.c
index 9a59d042ea84..63d395b5df93 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -11,7 +11,7 @@
#include <linux/export.h>
#include <linux/kobject.h>
#include <linux/string.h>
-#include <linux/resume-trace.h>
+#include <linux/pm-trace.h>
#include <linux/workqueue.h>
#include <linux/debugfs.h>
#include <linux/seq_file.h>
@@ -272,7 +272,7 @@ static inline void pm_print_times_init(void)
{
pm_print_times_enabled = !!initcall_debug;
}
-#else /* !CONFIG_PP_SLEEP_DEBUG */
+#else /* !CONFIG_PM_SLEEP_DEBUG */
static inline void pm_print_times_init(void) {}
#endif /* CONFIG_PM_SLEEP_DEBUG */
diff --git a/kernel/power/power.h b/kernel/power/power.h
index ce9b8328a689..caadb566e82b 100644
--- a/kernel/power/power.h
+++ b/kernel/power/power.h
@@ -163,15 +163,6 @@ extern void swsusp_close(fmode_t);
extern int swsusp_unmark(void);
#endif
-/* kernel/power/block_io.c */
-extern struct block_device *hib_resume_bdev;
-
-extern int hib_bio_read_page(pgoff_t page_off, void *addr,
- struct bio **bio_chain);
-extern int hib_bio_write_page(pgoff_t page_off, void *addr,
- struct bio **bio_chain);
-extern int hib_wait_on_bio_chain(struct bio **bio_chain);
-
struct timeval;
/* kernel/power/swsusp.c */
extern void swsusp_show_speed(ktime_t, ktime_t, unsigned int, char *);
diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c
index b7d6b3a721b1..7e4cda4a8dd9 100644
--- a/kernel/power/suspend.c
+++ b/kernel/power/suspend.c
@@ -28,6 +28,7 @@
#include <linux/ftrace.h>
#include <trace/events/power.h>
#include <linux/compiler.h>
+#include <linux/moduleparam.h>
#include "power.h"
@@ -233,12 +234,20 @@ static bool platform_suspend_again(suspend_state_t state)
suspend_ops->suspend_again() : false;
}
+#ifdef CONFIG_PM_DEBUG
+static unsigned int pm_test_delay = 5;
+module_param(pm_test_delay, uint, 0644);
+MODULE_PARM_DESC(pm_test_delay,
+ "Number of seconds to wait before resuming from suspend test");
+#endif
+
static int suspend_test(int level)
{
#ifdef CONFIG_PM_DEBUG
if (pm_test_level == level) {
- printk(KERN_INFO "suspend debug: Waiting for 5 seconds.\n");
- mdelay(5000);
+ printk(KERN_INFO "suspend debug: Waiting for %d second(s).\n",
+ pm_test_delay);
+ mdelay(pm_test_delay * 1000);
return 1;
}
#endif /* !CONFIG_PM_DEBUG */
@@ -357,6 +366,8 @@ static int suspend_enter(suspend_state_t state, bool *wakeup)
trace_suspend_resume(TPS("machine_suspend"),
state, false);
events_check_enabled = false;
+ } else if (*wakeup) {
+ error = -EBUSY;
}
syscore_resume();
}
@@ -459,7 +470,7 @@ static int enter_state(suspend_state_t state)
if (state == PM_SUSPEND_FREEZE) {
#ifdef CONFIG_PM_DEBUG
if (pm_test_level != TEST_NONE && pm_test_level <= TEST_CPUS) {
- pr_warning("PM: Unsupported test mode for freeze state,"
+ pr_warning("PM: Unsupported test mode for suspend to idle,"
"please choose none/freezer/devices/platform.\n");
return -EAGAIN;
}
@@ -473,13 +484,15 @@ static int enter_state(suspend_state_t state)
if (state == PM_SUSPEND_FREEZE)
freeze_begin();
+#ifndef CONFIG_SUSPEND_SKIP_SYNC
trace_suspend_resume(TPS("sync_filesystems"), 0, true);
printk(KERN_INFO "PM: Syncing filesystems ... ");
sys_sync();
printk("done.\n");
trace_suspend_resume(TPS("sync_filesystems"), 0, false);
+#endif
- pr_debug("PM: Preparing system for %s sleep\n", pm_states[state]);
+ pr_debug("PM: Preparing system for sleep (%s)\n", pm_states[state]);
error = suspend_prepare(state);
if (error)
goto Unlock;
@@ -488,7 +501,7 @@ static int enter_state(suspend_state_t state)
goto Finish;
trace_suspend_resume(TPS("suspend_enter"), state, false);
- pr_debug("PM: Entering %s sleep\n", pm_states[state]);
+ pr_debug("PM: Suspending system (%s)\n", pm_states[state]);
pm_restrict_gfp_mask();
error = suspend_devices_and_enter(state);
pm_restore_gfp_mask();
diff --git a/kernel/power/swap.c b/kernel/power/swap.c
index 570aff817543..b2066fb5b10f 100644
--- a/kernel/power/swap.c
+++ b/kernel/power/swap.c
@@ -212,7 +212,80 @@ int swsusp_swap_in_use(void)
*/
static unsigned short root_swap = 0xffff;
-struct block_device *hib_resume_bdev;
+static struct block_device *hib_resume_bdev;
+
+struct hib_bio_batch {
+ atomic_t count;
+ wait_queue_head_t wait;
+ int error;
+};
+
+static void hib_init_batch(struct hib_bio_batch *hb)
+{
+ atomic_set(&hb->count, 0);
+ init_waitqueue_head(&hb->wait);
+ hb->error = 0;
+}
+
+static void hib_end_io(struct bio *bio)
+{
+ struct hib_bio_batch *hb = bio->bi_private;
+ struct page *page = bio->bi_io_vec[0].bv_page;
+
+ if (bio->bi_error) {
+ printk(KERN_ALERT "Read-error on swap-device (%u:%u:%Lu)\n",
+ imajor(bio->bi_bdev->bd_inode),
+ iminor(bio->bi_bdev->bd_inode),
+ (unsigned long long)bio->bi_iter.bi_sector);
+ }
+
+ if (bio_data_dir(bio) == WRITE)
+ put_page(page);
+
+ if (bio->bi_error && !hb->error)
+ hb->error = bio->bi_error;
+ if (atomic_dec_and_test(&hb->count))
+ wake_up(&hb->wait);
+
+ bio_put(bio);
+}
+
+static int hib_submit_io(int rw, pgoff_t page_off, void *addr,
+ struct hib_bio_batch *hb)
+{
+ struct page *page = virt_to_page(addr);
+ struct bio *bio;
+ int error = 0;
+
+ bio = bio_alloc(__GFP_WAIT | __GFP_HIGH, 1);
+ bio->bi_iter.bi_sector = page_off * (PAGE_SIZE >> 9);
+ bio->bi_bdev = hib_resume_bdev;
+
+ if (bio_add_page(bio, page, PAGE_SIZE, 0) < PAGE_SIZE) {
+ printk(KERN_ERR "PM: Adding page to bio failed at %llu\n",
+ (unsigned long long)bio->bi_iter.bi_sector);
+ bio_put(bio);
+ return -EFAULT;
+ }
+
+ if (hb) {
+ bio->bi_end_io = hib_end_io;
+ bio->bi_private = hb;
+ atomic_inc(&hb->count);
+ submit_bio(rw, bio);
+ } else {
+ error = submit_bio_wait(rw, bio);
+ bio_put(bio);
+ }
+
+ return error;
+}
+
+static int hib_wait_io(struct hib_bio_batch *hb)
+{
+ wait_event(hb->wait, atomic_read(&hb->count) == 0);
+ return hb->error;
+}
/*
* Saving part
@@ -222,7 +295,7 @@ static int mark_swapfiles(struct swap_map_handle *handle, unsigned int flags)
{
int error;
- hib_bio_read_page(swsusp_resume_block, swsusp_header, NULL);
+ hib_submit_io(READ_SYNC, swsusp_resume_block, swsusp_header, NULL);
if (!memcmp("SWAP-SPACE",swsusp_header->sig, 10) ||
!memcmp("SWAPSPACE2",swsusp_header->sig, 10)) {
memcpy(swsusp_header->orig_sig,swsusp_header->sig, 10);
@@ -231,7 +304,7 @@ static int mark_swapfiles(struct swap_map_handle *handle, unsigned int flags)
swsusp_header->flags = flags;
if (flags & SF_CRC32_MODE)
swsusp_header->crc32 = handle->crc32;
- error = hib_bio_write_page(swsusp_resume_block,
+ error = hib_submit_io(WRITE_SYNC, swsusp_resume_block,
swsusp_header, NULL);
} else {
printk(KERN_ERR "PM: Swap header not found!\n");
@@ -271,10 +344,10 @@ static int swsusp_swap_check(void)
* write_page - Write one page to given swap location.
* @buf: Address we're writing.
* @offset: Offset of the swap page we're writing to.
- * @bio_chain: Link the next write BIO here
+ * @hb: bio completion batch
*/
-static int write_page(void *buf, sector_t offset, struct bio **bio_chain)
+static int write_page(void *buf, sector_t offset, struct hib_bio_batch *hb)
{
void *src;
int ret;
@@ -282,13 +355,13 @@ static int write_page(void *buf, sector_t offset, struct bio **bio_chain)
if (!offset)
return -ENOSPC;
- if (bio_chain) {
+ if (hb) {
src = (void *)__get_free_page(__GFP_WAIT | __GFP_NOWARN |
__GFP_NORETRY);
if (src) {
copy_page(src, buf);
} else {
- ret = hib_wait_on_bio_chain(bio_chain); /* Free pages */
+ ret = hib_wait_io(hb); /* Free pages */
if (ret)
return ret;
src = (void *)__get_free_page(__GFP_WAIT |
@@ -298,14 +371,14 @@ static int write_page(void *buf, sector_t offset, struct bio **bio_chain)
copy_page(src, buf);
} else {
WARN_ON_ONCE(1);
- bio_chain = NULL; /* Go synchronous */
+ hb = NULL; /* Go synchronous */
src = buf;
}
}
} else {
src = buf;
}
- return hib_bio_write_page(offset, src, bio_chain);
+ return hib_submit_io(WRITE_SYNC, offset, src, hb);
}
static void release_swap_writer(struct swap_map_handle *handle)
@@ -348,7 +421,7 @@ err_close:
}
static int swap_write_page(struct swap_map_handle *handle, void *buf,
- struct bio **bio_chain)
+ struct hib_bio_batch *hb)
{
int error = 0;
sector_t offset;
@@ -356,7 +429,7 @@ static int swap_write_page(struct swap_map_handle *handle, void *buf,
if (!handle->cur)
return -EINVAL;
offset = alloc_swapdev_block(root_swap);
- error = write_page(buf, offset, bio_chain);
+ error = write_page(buf, offset, hb);
if (error)
return error;
handle->cur->entries[handle->k++] = offset;
@@ -365,15 +438,15 @@ static int swap_write_page(struct swap_map_handle *handle, void *buf,
if (!offset)
return -ENOSPC;
handle->cur->next_swap = offset;
- error = write_page(handle->cur, handle->cur_swap, bio_chain);
+ error = write_page(handle->cur, handle->cur_swap, hb);
if (error)
goto out;
clear_page(handle->cur);
handle->cur_swap = offset;
handle->k = 0;
- if (bio_chain && low_free_pages() <= handle->reqd_free_pages) {
- error = hib_wait_on_bio_chain(bio_chain);
+ if (hb && low_free_pages() <= handle->reqd_free_pages) {
+ error = hib_wait_io(hb);
if (error)
goto out;
/*
@@ -445,23 +518,24 @@ static int save_image(struct swap_map_handle *handle,
int ret;
int nr_pages;
int err2;
- struct bio *bio;
+ struct hib_bio_batch hb;
ktime_t start;
ktime_t stop;
+ hib_init_batch(&hb);
+
printk(KERN_INFO "PM: Saving image data pages (%u pages)...\n",
nr_to_write);
m = nr_to_write / 10;
if (!m)
m = 1;
nr_pages = 0;
- bio = NULL;
start = ktime_get();
while (1) {
ret = snapshot_read_next(snapshot);
if (ret <= 0)
break;
- ret = swap_write_page(handle, data_of(*snapshot), &bio);
+ ret = swap_write_page(handle, data_of(*snapshot), &hb);
if (ret)
break;
if (!(nr_pages % m))
@@ -469,7 +543,7 @@ static int save_image(struct swap_map_handle *handle,
nr_pages / m * 10);
nr_pages++;
}
- err2 = hib_wait_on_bio_chain(&bio);
+ err2 = hib_wait_io(&hb);
stop = ktime_get();
if (!ret)
ret = err2;
@@ -580,7 +654,7 @@ static int save_image_lzo(struct swap_map_handle *handle,
int ret = 0;
int nr_pages;
int err2;
- struct bio *bio;
+ struct hib_bio_batch hb;
ktime_t start;
ktime_t stop;
size_t off;
@@ -589,6 +663,8 @@ static int save_image_lzo(struct swap_map_handle *handle,
struct cmp_data *data = NULL;
struct crc_data *crc = NULL;
+ hib_init_batch(&hb);
+
/*
* We'll limit the number of threads for compression to limit memory
* footprint.
@@ -674,7 +750,6 @@ static int save_image_lzo(struct swap_map_handle *handle,
if (!m)
m = 1;
nr_pages = 0;
- bio = NULL;
start = ktime_get();
for (;;) {
for (thr = 0; thr < nr_threads; thr++) {
@@ -748,7 +823,7 @@ static int save_image_lzo(struct swap_map_handle *handle,
off += PAGE_SIZE) {
memcpy(page, data[thr].cmp + off, PAGE_SIZE);
- ret = swap_write_page(handle, page, &bio);
+ ret = swap_write_page(handle, page, &hb);
if (ret)
goto out_finish;
}
@@ -759,7 +834,7 @@ static int save_image_lzo(struct swap_map_handle *handle,
}
out_finish:
- err2 = hib_wait_on_bio_chain(&bio);
+ err2 = hib_wait_io(&hb);
stop = ktime_get();
if (!ret)
ret = err2;
@@ -906,7 +981,7 @@ static int get_swap_reader(struct swap_map_handle *handle,
return -ENOMEM;
}
- error = hib_bio_read_page(offset, tmp->map, NULL);
+ error = hib_submit_io(READ_SYNC, offset, tmp->map, NULL);
if (error) {
release_swap_reader(handle);
return error;
@@ -919,7 +994,7 @@ static int get_swap_reader(struct swap_map_handle *handle,
}
static int swap_read_page(struct swap_map_handle *handle, void *buf,
- struct bio **bio_chain)
+ struct hib_bio_batch *hb)
{
sector_t offset;
int error;
@@ -930,7 +1005,7 @@ static int swap_read_page(struct swap_map_handle *handle, void *buf,
offset = handle->cur->entries[handle->k];
if (!offset)
return -EFAULT;
- error = hib_bio_read_page(offset, buf, bio_chain);
+ error = hib_submit_io(READ_SYNC, offset, buf, hb);
if (error)
return error;
if (++handle->k >= MAP_PAGE_ENTRIES) {
@@ -968,27 +1043,28 @@ static int load_image(struct swap_map_handle *handle,
int ret = 0;
ktime_t start;
ktime_t stop;
- struct bio *bio;
+ struct hib_bio_batch hb;
int err2;
unsigned nr_pages;
+ hib_init_batch(&hb);
+
printk(KERN_INFO "PM: Loading image data pages (%u pages)...\n",
nr_to_read);
m = nr_to_read / 10;
if (!m)
m = 1;
nr_pages = 0;
- bio = NULL;
start = ktime_get();
for ( ; ; ) {
ret = snapshot_write_next(snapshot);
if (ret <= 0)
break;
- ret = swap_read_page(handle, data_of(*snapshot), &bio);
+ ret = swap_read_page(handle, data_of(*snapshot), &hb);
if (ret)
break;
if (snapshot->sync_read)
- ret = hib_wait_on_bio_chain(&bio);
+ ret = hib_wait_io(&hb);
if (ret)
break;
if (!(nr_pages % m))
@@ -996,7 +1072,7 @@ static int load_image(struct swap_map_handle *handle,
nr_pages / m * 10);
nr_pages++;
}
- err2 = hib_wait_on_bio_chain(&bio);
+ err2 = hib_wait_io(&hb);
stop = ktime_get();
if (!ret)
ret = err2;
@@ -1067,7 +1143,7 @@ static int load_image_lzo(struct swap_map_handle *handle,
unsigned int m;
int ret = 0;
int eof = 0;
- struct bio *bio;
+ struct hib_bio_batch hb;
ktime_t start;
ktime_t stop;
unsigned nr_pages;
@@ -1080,6 +1156,8 @@ static int load_image_lzo(struct swap_map_handle *handle,
struct dec_data *data = NULL;
struct crc_data *crc = NULL;
+ hib_init_batch(&hb);
+
/*
* We'll limit the number of threads for decompression to limit memory
* footprint.
@@ -1190,7 +1268,6 @@ static int load_image_lzo(struct swap_map_handle *handle,
if (!m)
m = 1;
nr_pages = 0;
- bio = NULL;
start = ktime_get();
ret = snapshot_write_next(snapshot);
@@ -1199,7 +1276,7 @@ static int load_image_lzo(struct swap_map_handle *handle,
for(;;) {
for (i = 0; !eof && i < want; i++) {
- ret = swap_read_page(handle, page[ring], &bio);
+ ret = swap_read_page(handle, page[ring], &hb);
if (ret) {
/*
* On real read error, finish. On end of data,
@@ -1226,7 +1303,7 @@ static int load_image_lzo(struct swap_map_handle *handle,
if (!asked)
break;
- ret = hib_wait_on_bio_chain(&bio);
+ ret = hib_wait_io(&hb);
if (ret)
goto out_finish;
have += asked;
@@ -1281,7 +1358,7 @@ static int load_image_lzo(struct swap_map_handle *handle,
* Wait for more data while we are decompressing.
*/
if (have < LZO_CMP_PAGES && asked) {
- ret = hib_wait_on_bio_chain(&bio);
+ ret = hib_wait_io(&hb);
if (ret)
goto out_finish;
have += asked;
@@ -1430,7 +1507,7 @@ int swsusp_check(void)
if (!IS_ERR(hib_resume_bdev)) {
set_blocksize(hib_resume_bdev, PAGE_SIZE);
clear_page(swsusp_header);
- error = hib_bio_read_page(swsusp_resume_block,
+ error = hib_submit_io(READ_SYNC, swsusp_resume_block,
swsusp_header, NULL);
if (error)
goto put;
@@ -1438,7 +1515,7 @@ int swsusp_check(void)
if (!memcmp(HIBERNATE_SIG, swsusp_header->sig, 10)) {
memcpy(swsusp_header->sig, swsusp_header->orig_sig, 10);
/* Reset swap signature now */
- error = hib_bio_write_page(swsusp_resume_block,
+ error = hib_submit_io(WRITE_SYNC, swsusp_resume_block,
swsusp_header, NULL);
} else {
error = -EINVAL;
@@ -1482,10 +1559,10 @@ int swsusp_unmark(void)
{
int error;
- hib_bio_read_page(swsusp_resume_block, swsusp_header, NULL);
+ hib_submit_io(READ_SYNC, swsusp_resume_block, swsusp_header, NULL);
if (!memcmp(HIBERNATE_SIG,swsusp_header->sig, 10)) {
memcpy(swsusp_header->sig,swsusp_header->orig_sig, 10);
- error = hib_bio_write_page(swsusp_resume_block,
+ error = hib_submit_io(WRITE_SYNC, swsusp_resume_block,
swsusp_header, NULL);
} else {
printk(KERN_ERR "PM: Cannot find swsusp signature!\n");
diff --git a/kernel/power/wakelock.c b/kernel/power/wakelock.c
index 019069c84ff6..1896386e16bb 100644
--- a/kernel/power/wakelock.c
+++ b/kernel/power/wakelock.c
@@ -17,6 +17,7 @@
#include <linux/list.h>
#include <linux/rbtree.h>
#include <linux/slab.h>
+#include <linux/workqueue.h>
#include "power.h"
@@ -83,7 +84,9 @@ static inline void decrement_wakelocks_number(void) {}
#define WL_GC_COUNT_MAX 100
#define WL_GC_TIME_SEC 300
+static void __wakelocks_gc(struct work_struct *work);
static LIST_HEAD(wakelocks_lru_list);
+static DECLARE_WORK(wakelock_work, __wakelocks_gc);
static unsigned int wakelocks_gc_count;
static inline void wakelocks_lru_add(struct wakelock *wl)
@@ -96,13 +99,12 @@ static inline void wakelocks_lru_most_recent(struct wakelock *wl)
list_move(&wl->lru, &wakelocks_lru_list);
}
-static void wakelocks_gc(void)
+static void __wakelocks_gc(struct work_struct *work)
{
struct wakelock *wl, *aux;
ktime_t now;
- if (++wakelocks_gc_count <= WL_GC_COUNT_MAX)
- return;
+ mutex_lock(&wakelocks_lock);
now = ktime_get();
list_for_each_entry_safe_reverse(wl, aux, &wakelocks_lru_list, lru) {
@@ -127,6 +129,16 @@ static void wakelocks_gc(void)
}
}
wakelocks_gc_count = 0;
+
+ mutex_unlock(&wakelocks_lock);
+}
+
+static void wakelocks_gc(void)
+{
+ if (++wakelocks_gc_count <= WL_GC_COUNT_MAX)
+ return;
+
+ schedule_work(&wakelock_work);
}
#else /* !CONFIG_PM_WAKELOCKS_GC */
static inline void wakelocks_lru_add(struct wakelock *wl) {}
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index bb0635bd74f2..cf8c24203368 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -32,7 +32,6 @@
#include <linux/security.h>
#include <linux/bootmem.h>
#include <linux/memblock.h>
-#include <linux/aio.h>
#include <linux/syscalls.h>
#include <linux/kexec.h>
#include <linux/kdb.h>
@@ -46,6 +45,7 @@
#include <linux/irq_work.h>
#include <linux/utsname.h>
#include <linux/ctype.h>
+#include <linux/uio.h>
#include <asm/uaccess.h>
@@ -85,6 +85,18 @@ static struct lockdep_map console_lock_dep_map = {
#endif
/*
+ * Number of registered extended console drivers.
+ *
+ * If extended consoles are present, in-kernel cont reassembly is disabled
+ * and each fragment is stored as a separate log entry with proper
+ * continuation flag so that every emitted message has full metadata. This
+ * doesn't change the result for regular consoles or /proc/kmsg. For
+ * /dev/kmsg, as long as the reader concatenates messages according to
+ * consecutive continuation flags, the end result should be the same too.
+ */
+static int nr_ext_console_drivers;
+
+/*
* Helper macros to handle lockdep when locking/unlocking console_sem. We use
* macros instead of functions so that _RET_IP_ contains useful information.
*/
@@ -195,14 +207,14 @@ static int console_may_schedule;
* need to be changed in the future, when the requirements change.
*
* /dev/kmsg exports the structured data in the following line format:
- * "level,sequnum,timestamp;<message text>\n"
+ * "<level>,<sequnum>,<timestamp>,<contflag>[,additional_values, ... ];<message text>\n"
+ *
+ * Users of the export format should ignore possible additional values
+ * separated by ',', and find the message after the ';' character.
*
* The optional key/value pairs are attached as continuation lines starting
* with a space character and terminated by a newline. All possible
* non-prinatable characters are escaped in the "\xff" notation.
- *
- * Users of the export format should ignore possible additional values
- * separated by ',', and find the message after the ';' character.
*/
enum log_flags {
@@ -477,18 +489,18 @@ static int syslog_action_restricted(int type)
type != SYSLOG_ACTION_SIZE_BUFFER;
}
-int check_syslog_permissions(int type, bool from_file)
+int check_syslog_permissions(int type, int source)
{
/*
* If this is from /proc/kmsg and we've already opened it, then we've
* already done the capabilities checks at open time.
*/
- if (from_file && type != SYSLOG_ACTION_OPEN)
- return 0;
+ if (source == SYSLOG_FROM_PROC && type != SYSLOG_ACTION_OPEN)
+ goto ok;
if (syslog_action_restricted(type)) {
if (capable(CAP_SYSLOG))
- return 0;
+ goto ok;
/*
* For historical reasons, accept CAP_SYS_ADMIN too, with
* a warning.
@@ -498,13 +510,94 @@ int check_syslog_permissions(int type, bool from_file)
"CAP_SYS_ADMIN but no CAP_SYSLOG "
"(deprecated).\n",
current->comm, task_pid_nr(current));
- return 0;
+ goto ok;
}
return -EPERM;
}
+ok:
return security_syslog(type);
}
+static void append_char(char **pp, char *e, char c)
+{
+ if (*pp < e)
+ *(*pp)++ = c;
+}
+
+static ssize_t msg_print_ext_header(char *buf, size_t size,
+ struct printk_log *msg, u64 seq,
+ enum log_flags prev_flags)
+{
+ u64 ts_usec = msg->ts_nsec;
+ char cont = '-';
+
+ do_div(ts_usec, 1000);
+
+ /*
+ * If we couldn't merge continuation line fragments during the print,
+ * export the stored flags to allow an optional external merge of the
+ * records. Merging the records isn't always neccessarily correct, like
+ * when we hit a race during printing. In most cases though, it produces
+ * better readable output. 'c' in the record flags mark the first
+ * fragment of a line, '+' the following.
+ */
+ if (msg->flags & LOG_CONT && !(prev_flags & LOG_CONT))
+ cont = 'c';
+ else if ((msg->flags & LOG_CONT) ||
+ ((prev_flags & LOG_CONT) && !(msg->flags & LOG_PREFIX)))
+ cont = '+';
+
+ return scnprintf(buf, size, "%u,%llu,%llu,%c;",
+ (msg->facility << 3) | msg->level, seq, ts_usec, cont);
+}
+
+static ssize_t msg_print_ext_body(char *buf, size_t size,
+ char *dict, size_t dict_len,
+ char *text, size_t text_len)
+{
+ char *p = buf, *e = buf + size;
+ size_t i;
+
+ /* escape non-printable characters */
+ for (i = 0; i < text_len; i++) {
+ unsigned char c = text[i];
+
+ if (c < ' ' || c >= 127 || c == '\\')
+ p += scnprintf(p, e - p, "\\x%02x", c);
+ else
+ append_char(&p, e, c);
+ }
+ append_char(&p, e, '\n');
+
+ if (dict_len) {
+ bool line = true;
+
+ for (i = 0; i < dict_len; i++) {
+ unsigned char c = dict[i];
+
+ if (line) {
+ append_char(&p, e, ' ');
+ line = false;
+ }
+
+ if (c == '\0') {
+ append_char(&p, e, '\n');
+ line = true;
+ continue;
+ }
+
+ if (c < ' ' || c >= 127 || c == '\\') {
+ p += scnprintf(p, e - p, "\\x%02x", c);
+ continue;
+ }
+
+ append_char(&p, e, c);
+ }
+ append_char(&p, e, '\n');
+ }
+
+ return p - buf;
+}
/* /dev/kmsg - userspace message inject/listen interface */
struct devkmsg_user {
@@ -512,7 +605,7 @@ struct devkmsg_user {
u32 idx;
enum log_flags prev;
struct mutex lock;
- char buf[8192];
+ char buf[CONSOLE_EXT_LOG_MAX];
};
static ssize_t devkmsg_write(struct kiocb *iocb, struct iov_iter *from)
@@ -521,7 +614,7 @@ static ssize_t devkmsg_write(struct kiocb *iocb, struct iov_iter *from)
int i;
int level = default_message_loglevel;
int facility = 1; /* LOG_USER */
- size_t len = iocb->ki_nbytes;
+ size_t len = iov_iter_count(from);
ssize_t ret = len;
if (len > LOG_LINE_MAX)
@@ -570,9 +663,6 @@ static ssize_t devkmsg_read(struct file *file, char __user *buf,
{
struct devkmsg_user *user = file->private_data;
struct printk_log *msg;
- u64 ts_usec;
- size_t i;
- char cont = '-';
size_t len;
ssize_t ret;
@@ -608,66 +698,13 @@ static ssize_t devkmsg_read(struct file *file, char __user *buf,
}
msg = log_from_idx(user->idx);
- ts_usec = msg->ts_nsec;
- do_div(ts_usec, 1000);
-
- /*
- * If we couldn't merge continuation line fragments during the print,
- * export the stored flags to allow an optional external merge of the
- * records. Merging the records isn't always neccessarily correct, like
- * when we hit a race during printing. In most cases though, it produces
- * better readable output. 'c' in the record flags mark the first
- * fragment of a line, '+' the following.
- */
- if (msg->flags & LOG_CONT && !(user->prev & LOG_CONT))
- cont = 'c';
- else if ((msg->flags & LOG_CONT) ||
- ((user->prev & LOG_CONT) && !(msg->flags & LOG_PREFIX)))
- cont = '+';
+ len = msg_print_ext_header(user->buf, sizeof(user->buf),
+ msg, user->seq, user->prev);
+ len += msg_print_ext_body(user->buf + len, sizeof(user->buf) - len,
+ log_dict(msg), msg->dict_len,
+ log_text(msg), msg->text_len);
- len = sprintf(user->buf, "%u,%llu,%llu,%c;",
- (msg->facility << 3) | msg->level,
- user->seq, ts_usec, cont);
user->prev = msg->flags;
-
- /* escape non-printable characters */
- for (i = 0; i < msg->text_len; i++) {
- unsigned char c = log_text(msg)[i];
-
- if (c < ' ' || c >= 127 || c == '\\')
- len += sprintf(user->buf + len, "\\x%02x", c);
- else
- user->buf[len++] = c;
- }
- user->buf[len++] = '\n';
-
- if (msg->dict_len) {
- bool line = true;
-
- for (i = 0; i < msg->dict_len; i++) {
- unsigned char c = log_dict(msg)[i];
-
- if (line) {
- user->buf[len++] = ' ';
- line = false;
- }
-
- if (c == '\0') {
- user->buf[len++] = '\n';
- line = true;
- continue;
- }
-
- if (c < ' ' || c >= 127 || c == '\\') {
- len += sprintf(user->buf + len, "\\x%02x", c);
- continue;
- }
-
- user->buf[len++] = c;
- }
- user->buf[len++] = '\n';
- }
-
user->idx = log_next(user->idx);
user->seq++;
raw_spin_unlock_irq(&logbuf_lock);
@@ -1253,20 +1290,16 @@ static int syslog_print_all(char __user *buf, int size, bool clear)
return len;
}
-int do_syslog(int type, char __user *buf, int len, bool from_file)
+int do_syslog(int type, char __user *buf, int len, int source)
{
bool clear = false;
static int saved_console_loglevel = LOGLEVEL_DEFAULT;
int error;
- error = check_syslog_permissions(type, from_file);
+ error = check_syslog_permissions(type, source);
if (error)
goto out;
- error = security_syslog(type);
- if (error)
- return error;
-
switch (type) {
case SYSLOG_ACTION_CLOSE: /* Close log */
break;
@@ -1346,7 +1379,7 @@ int do_syslog(int type, char __user *buf, int len, bool from_file)
syslog_prev = 0;
syslog_partial = 0;
}
- if (from_file) {
+ if (source == SYSLOG_FROM_PROC) {
/*
* Short-cut for poll(/"proc/kmsg") which simply checks
* for pending data, not the size; return the count of
@@ -1393,7 +1426,9 @@ SYSCALL_DEFINE3(syslog, int, type, char __user *, buf, int, len)
* log_buf[start] to log_buf[end - 1].
* The console_lock must be held.
*/
-static void call_console_drivers(int level, const char *text, size_t len)
+static void call_console_drivers(int level,
+ const char *ext_text, size_t ext_len,
+ const char *text, size_t len)
{
struct console *con;
@@ -1414,7 +1449,10 @@ static void call_console_drivers(int level, const char *text, size_t len)
if (!cpu_online(smp_processor_id()) &&
!(con->flags & CON_ANYTIME))
continue;
- con->write(con, text, len);
+ if (con->flags & CON_EXTENDED)
+ con->write(con, ext_text, ext_len);
+ else
+ con->write(con, text, len);
}
}
@@ -1557,8 +1595,12 @@ static bool cont_add(int facility, int level, const char *text, size_t len)
if (cont.len && cont.flushed)
return false;
- if (cont.len + len > sizeof(cont.buf)) {
- /* the line gets too long, split it up in separate records */
+ /*
+ * If ext consoles are present, flush and skip in-kernel
+ * continuation. See nr_ext_console_drivers definition. Also, if
+ * the line gets too long, split it up in separate records.
+ */
+ if (nr_ext_console_drivers || cont.len + len > sizeof(cont.buf)) {
cont_flush(LOG_CONT);
return false;
}
@@ -1893,9 +1935,19 @@ static struct cont {
u8 level;
bool flushed:1;
} cont;
+static char *log_text(const struct printk_log *msg) { return NULL; }
+static char *log_dict(const struct printk_log *msg) { return NULL; }
static struct printk_log *log_from_idx(u32 idx) { return NULL; }
static u32 log_next(u32 idx) { return 0; }
-static void call_console_drivers(int level, const char *text, size_t len) {}
+static ssize_t msg_print_ext_header(char *buf, size_t size,
+ struct printk_log *msg, u64 seq,
+ enum log_flags prev_flags) { return 0; }
+static ssize_t msg_print_ext_body(char *buf, size_t size,
+ char *dict, size_t dict_len,
+ char *text, size_t text_len) { return 0; }
+static void call_console_drivers(int level,
+ const char *ext_text, size_t ext_len,
+ const char *text, size_t len) {}
static size_t msg_print_text(const struct printk_log *msg, enum log_flags prev,
bool syslog, char *buf, size_t size) { return 0; }
static size_t cont_print_text(char *text, size_t size) { return 0; }
@@ -2017,24 +2069,6 @@ int add_preferred_console(char *name, int idx, char *options)
return __add_preferred_console(name, idx, options, NULL);
}
-int update_console_cmdline(char *name, int idx, char *name_new, int idx_new, char *options)
-{
- struct console_cmdline *c;
- int i;
-
- for (i = 0, c = console_cmdline;
- i < MAX_CMDLINECONSOLES && c->name[0];
- i++, c++)
- if (strcmp(c->name, name) == 0 && c->index == idx) {
- strlcpy(c->name, name_new, sizeof(c->name));
- c->options = options;
- c->index = idx_new;
- return i;
- }
- /* not found */
- return -1;
-}
-
bool console_suspend_enabled = true;
EXPORT_SYMBOL(console_suspend_enabled);
@@ -2166,7 +2200,7 @@ static void console_cont_flush(char *text, size_t size)
len = cont_print_text(text, size);
raw_spin_unlock(&logbuf_lock);
stop_critical_timings();
- call_console_drivers(cont.level, text, len);
+ call_console_drivers(cont.level, NULL, 0, text, len);
start_critical_timings();
local_irq_restore(flags);
return;
@@ -2190,6 +2224,7 @@ out:
*/
void console_unlock(void)
{
+ static char ext_text[CONSOLE_EXT_LOG_MAX];
static char text[LOG_LINE_MAX + PREFIX_MAX];
static u64 seen_seq;
unsigned long flags;
@@ -2208,6 +2243,7 @@ void console_unlock(void)
again:
for (;;) {
struct printk_log *msg;
+ size_t ext_len = 0;
size_t len;
int level;
@@ -2253,13 +2289,22 @@ skip:
level = msg->level;
len += msg_print_text(msg, console_prev, false,
text + len, sizeof(text) - len);
+ if (nr_ext_console_drivers) {
+ ext_len = msg_print_ext_header(ext_text,
+ sizeof(ext_text),
+ msg, console_seq, console_prev);
+ ext_len += msg_print_ext_body(ext_text + ext_len,
+ sizeof(ext_text) - ext_len,
+ log_dict(msg), msg->dict_len,
+ log_text(msg), msg->text_len);
+ }
console_idx = log_next(console_idx);
console_seq++;
console_prev = msg->flags;
raw_spin_unlock(&logbuf_lock);
stop_critical_timings(); /* don't trace print latency */
- call_console_drivers(level, text, len);
+ call_console_drivers(level, ext_text, ext_len, text, len);
start_critical_timings();
local_irq_restore(flags);
}
@@ -2436,9 +2481,6 @@ void register_console(struct console *newcon)
if (preferred_console < 0 || bcon || !console_drivers)
preferred_console = selected_console;
- if (newcon->early_setup)
- newcon->early_setup();
-
/*
* See if we want to use this console driver. If we
* didn't select a console we take the first one
@@ -2464,23 +2506,27 @@ void register_console(struct console *newcon)
for (i = 0, c = console_cmdline;
i < MAX_CMDLINECONSOLES && c->name[0];
i++, c++) {
- BUILD_BUG_ON(sizeof(c->name) != sizeof(newcon->name));
- if (strcmp(c->name, newcon->name) != 0)
- continue;
- if (newcon->index >= 0 &&
- newcon->index != c->index)
- continue;
- if (newcon->index < 0)
- newcon->index = c->index;
+ if (!newcon->match ||
+ newcon->match(newcon, c->name, c->index, c->options) != 0) {
+ /* default matching */
+ BUILD_BUG_ON(sizeof(c->name) != sizeof(newcon->name));
+ if (strcmp(c->name, newcon->name) != 0)
+ continue;
+ if (newcon->index >= 0 &&
+ newcon->index != c->index)
+ continue;
+ if (newcon->index < 0)
+ newcon->index = c->index;
- if (_braille_register_console(newcon, c))
- return;
+ if (_braille_register_console(newcon, c))
+ return;
+
+ if (newcon->setup &&
+ newcon->setup(newcon, c->options) != 0)
+ break;
+ }
- if (newcon->setup &&
- newcon->setup(newcon, console_cmdline[i].options) != 0)
- break;
newcon->flags |= CON_ENABLED;
- newcon->index = c->index;
if (i == selected_console) {
newcon->flags |= CON_CONSDEV;
preferred_console = selected_console;
@@ -2514,6 +2560,11 @@ void register_console(struct console *newcon)
newcon->next = console_drivers->next;
console_drivers->next = newcon;
}
+
+ if (newcon->flags & CON_EXTENDED)
+ if (!nr_ext_console_drivers++)
+ pr_info("printk: continuation disabled due to ext consoles, expect more fragments in /dev/kmsg\n");
+
if (newcon->flags & CON_PRINTBUFFER) {
/*
* console_unlock(); will print out the buffered messages
@@ -2586,6 +2637,9 @@ int unregister_console(struct console *console)
}
}
+ if (!res && (console->flags & CON_EXTENDED))
+ nr_ext_console_drivers--;
+
/*
* If this isn't the last console and it has CON_CONSDEV set, we
* need to set it on the next preferred console.
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 227fec36b12a..787320de68e0 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -456,8 +456,6 @@ static bool __ptrace_detach(struct task_struct *tracer, struct task_struct *p)
static int ptrace_detach(struct task_struct *child, unsigned int data)
{
- bool dead = false;
-
if (!valid_signal(data))
return -EIO;
@@ -467,18 +465,19 @@ static int ptrace_detach(struct task_struct *child, unsigned int data)
write_lock_irq(&tasklist_lock);
/*
- * This child can be already killed. Make sure de_thread() or
- * our sub-thread doing do_wait() didn't do release_task() yet.
+ * We rely on ptrace_freeze_traced(). It can't be killed and
+ * untraced by another thread, it can't be a zombie.
*/
- if (child->ptrace) {
- child->exit_code = data;
- dead = __ptrace_detach(current, child);
- }
+ WARN_ON(!child->ptrace || child->exit_state);
+ /*
+ * tasklist_lock avoids the race with wait_task_stopped(), see
+ * the comment in ptrace_resume().
+ */
+ child->exit_code = data;
+ __ptrace_detach(current, child);
write_unlock_irq(&tasklist_lock);
proc_ptrace_connector(child, PTRACE_DETACH);
- if (unlikely(dead))
- release_task(child);
return 0;
}
@@ -557,6 +556,19 @@ static int ptrace_setoptions(struct task_struct *child, unsigned long data)
if (data & ~(unsigned long)PTRACE_O_MASK)
return -EINVAL;
+ if (unlikely(data & PTRACE_O_SUSPEND_SECCOMP)) {
+ if (!config_enabled(CONFIG_CHECKPOINT_RESTORE) ||
+ !config_enabled(CONFIG_SECCOMP))
+ return -EINVAL;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ if (seccomp_mode(&current->seccomp) != SECCOMP_MODE_DISABLED ||
+ current->ptrace & PT_SUSPEND_SECCOMP)
+ return -EPERM;
+ }
+
/* Avoid intermediate state when all opts are cleared */
flags = child->ptrace;
flags &= ~(PTRACE_O_MASK << PT_OPT_FLAG_SHIFT);
@@ -697,6 +709,8 @@ static int ptrace_peek_siginfo(struct task_struct *child,
static int ptrace_resume(struct task_struct *child, long request,
unsigned long data)
{
+ bool need_siglock;
+
if (!valid_signal(data))
return -EIO;
@@ -724,8 +738,26 @@ static int ptrace_resume(struct task_struct *child, long request,
user_disable_single_step(child);
}
+ /*
+ * Change ->exit_code and ->state under siglock to avoid the race
+ * with wait_task_stopped() in between; a non-zero ->exit_code will
+ * wrongly look like another report from tracee.
+ *
+ * Note that we need siglock even if ->exit_code == data and/or this
+ * status was not reported yet, the new status must not be cleared by
+ * wait_task_stopped() after resume.
+ *
+ * If data == 0 we do not care if wait_task_stopped() reports the old
+ * status and clears the code too; this can't race with the tracee, it
+ * takes siglock after resume.
+ */
+ need_siglock = data && !thread_group_empty(current);
+ if (need_siglock)
+ spin_lock_irq(&child->sighand->siglock);
child->exit_code = data;
wake_up_state(child, __TASK_TRACED);
+ if (need_siglock)
+ spin_unlock_irq(&child->sighand->siglock);
return 0;
}
diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c
index 30d42aa55d83..77192953dee5 100644
--- a/kernel/rcu/rcutorture.c
+++ b/kernel/rcu/rcutorture.c
@@ -241,6 +241,7 @@ rcu_torture_free(struct rcu_torture *p)
struct rcu_torture_ops {
int ttype;
void (*init)(void);
+ void (*cleanup)(void);
int (*readlock)(void);
void (*read_delay)(struct torture_random_state *rrsp);
void (*readunlock)(int idx);
@@ -477,10 +478,12 @@ static struct rcu_torture_ops rcu_busted_ops = {
*/
DEFINE_STATIC_SRCU(srcu_ctl);
+static struct srcu_struct srcu_ctld;
+static struct srcu_struct *srcu_ctlp = &srcu_ctl;
-static int srcu_torture_read_lock(void) __acquires(&srcu_ctl)
+static int srcu_torture_read_lock(void) __acquires(srcu_ctlp)
{
- return srcu_read_lock(&srcu_ctl);
+ return srcu_read_lock(srcu_ctlp);
}
static void srcu_read_delay(struct torture_random_state *rrsp)
@@ -499,49 +502,49 @@ static void srcu_read_delay(struct torture_random_state *rrsp)
rcu_read_delay(rrsp);
}
-static void srcu_torture_read_unlock(int idx) __releases(&srcu_ctl)
+static void srcu_torture_read_unlock(int idx) __releases(srcu_ctlp)
{
- srcu_read_unlock(&srcu_ctl, idx);
+ srcu_read_unlock(srcu_ctlp, idx);
}
static unsigned long srcu_torture_completed(void)
{
- return srcu_batches_completed(&srcu_ctl);
+ return srcu_batches_completed(srcu_ctlp);
}
static void srcu_torture_deferred_free(struct rcu_torture *rp)
{
- call_srcu(&srcu_ctl, &rp->rtort_rcu, rcu_torture_cb);
+ call_srcu(srcu_ctlp, &rp->rtort_rcu, rcu_torture_cb);
}
static void srcu_torture_synchronize(void)
{
- synchronize_srcu(&srcu_ctl);
+ synchronize_srcu(srcu_ctlp);
}
static void srcu_torture_call(struct rcu_head *head,
void (*func)(struct rcu_head *head))
{
- call_srcu(&srcu_ctl, head, func);
+ call_srcu(srcu_ctlp, head, func);
}
static void srcu_torture_barrier(void)
{
- srcu_barrier(&srcu_ctl);
+ srcu_barrier(srcu_ctlp);
}
static void srcu_torture_stats(void)
{
int cpu;
- int idx = srcu_ctl.completed & 0x1;
+ int idx = srcu_ctlp->completed & 0x1;
pr_alert("%s%s per-CPU(idx=%d):",
torture_type, TORTURE_FLAG, idx);
for_each_possible_cpu(cpu) {
long c0, c1;
- c0 = (long)per_cpu_ptr(srcu_ctl.per_cpu_ref, cpu)->c[!idx];
- c1 = (long)per_cpu_ptr(srcu_ctl.per_cpu_ref, cpu)->c[idx];
+ c0 = (long)per_cpu_ptr(srcu_ctlp->per_cpu_ref, cpu)->c[!idx];
+ c1 = (long)per_cpu_ptr(srcu_ctlp->per_cpu_ref, cpu)->c[idx];
pr_cont(" %d(%ld,%ld)", cpu, c0, c1);
}
pr_cont("\n");
@@ -549,7 +552,7 @@ static void srcu_torture_stats(void)
static void srcu_torture_synchronize_expedited(void)
{
- synchronize_srcu_expedited(&srcu_ctl);
+ synchronize_srcu_expedited(srcu_ctlp);
}
static struct rcu_torture_ops srcu_ops = {
@@ -569,6 +572,38 @@ static struct rcu_torture_ops srcu_ops = {
.name = "srcu"
};
+static void srcu_torture_init(void)
+{
+ rcu_sync_torture_init();
+ WARN_ON(init_srcu_struct(&srcu_ctld));
+ srcu_ctlp = &srcu_ctld;
+}
+
+static void srcu_torture_cleanup(void)
+{
+ cleanup_srcu_struct(&srcu_ctld);
+ srcu_ctlp = &srcu_ctl; /* In case of a later rcutorture run. */
+}
+
+/* As above, but dynamically allocated. */
+static struct rcu_torture_ops srcud_ops = {
+ .ttype = SRCU_FLAVOR,
+ .init = srcu_torture_init,
+ .cleanup = srcu_torture_cleanup,
+ .readlock = srcu_torture_read_lock,
+ .read_delay = srcu_read_delay,
+ .readunlock = srcu_torture_read_unlock,
+ .started = NULL,
+ .completed = srcu_torture_completed,
+ .deferred_free = srcu_torture_deferred_free,
+ .sync = srcu_torture_synchronize,
+ .exp_sync = srcu_torture_synchronize_expedited,
+ .call = srcu_torture_call,
+ .cb_barrier = srcu_torture_barrier,
+ .stats = srcu_torture_stats,
+ .name = "srcud"
+};
+
/*
* Definitions for sched torture testing.
*/
@@ -600,6 +635,8 @@ static struct rcu_torture_ops sched_ops = {
.deferred_free = rcu_sched_torture_deferred_free,
.sync = synchronize_sched,
.exp_sync = synchronize_sched_expedited,
+ .get_state = get_state_synchronize_sched,
+ .cond_sync = cond_synchronize_sched,
.call = call_rcu_sched,
.cb_barrier = rcu_barrier_sched,
.fqs = rcu_sched_force_quiescent_state,
@@ -649,10 +686,20 @@ static struct rcu_torture_ops tasks_ops = {
#define RCUTORTURE_TASKS_OPS &tasks_ops,
+static bool __maybe_unused torturing_tasks(void)
+{
+ return cur_ops == &tasks_ops;
+}
+
#else /* #ifdef CONFIG_TASKS_RCU */
#define RCUTORTURE_TASKS_OPS
+static bool torturing_tasks(void)
+{
+ return false;
+}
+
#endif /* #else #ifdef CONFIG_TASKS_RCU */
/*
@@ -672,8 +719,8 @@ static void rcu_torture_boost_cb(struct rcu_head *head)
struct rcu_boost_inflight *rbip =
container_of(head, struct rcu_boost_inflight, rcu);
- smp_mb(); /* Ensure RCU-core accesses precede clearing ->inflight */
- rbip->inflight = 0;
+ /* Ensure RCU-core accesses precede clearing ->inflight */
+ smp_store_release(&rbip->inflight, 0);
}
static int rcu_torture_boost(void *arg)
@@ -710,9 +757,9 @@ static int rcu_torture_boost(void *arg)
call_rcu_time = jiffies;
while (ULONG_CMP_LT(jiffies, endtime)) {
/* If we don't have a callback in flight, post one. */
- if (!rbi.inflight) {
- smp_mb(); /* RCU core before ->inflight = 1. */
- rbi.inflight = 1;
+ if (!smp_load_acquire(&rbi.inflight)) {
+ /* RCU core before ->inflight = 1. */
+ smp_store_release(&rbi.inflight, 1);
call_rcu(&rbi.rcu, rcu_torture_boost_cb);
if (jiffies - call_rcu_time >
test_boost_duration * HZ - HZ / 2) {
@@ -751,11 +798,10 @@ checkwait: stutter_wait("rcu_torture_boost");
} while (!torture_must_stop());
/* Clean up and exit. */
- while (!kthread_should_stop() || rbi.inflight) {
+ while (!kthread_should_stop() || smp_load_acquire(&rbi.inflight)) {
torture_shutdown_absorb("rcu_torture_boost");
schedule_timeout_uninterruptible(1);
}
- smp_mb(); /* order accesses to ->inflight before stack-frame death. */
destroy_rcu_head_on_stack(&rbi.rcu);
torture_kthread_stopping("rcu_torture_boost");
return 0;
@@ -789,9 +835,7 @@ rcu_torture_cbflood(void *arg)
}
if (err) {
VERBOSE_TOROUT_STRING("rcu_torture_cbflood disabled: Bad args or OOM");
- while (!torture_must_stop())
- schedule_timeout_interruptible(HZ);
- return 0;
+ goto wait_for_stop;
}
VERBOSE_TOROUT_STRING("rcu_torture_cbflood task started");
do {
@@ -810,6 +854,7 @@ rcu_torture_cbflood(void *arg)
stutter_wait("rcu_torture_cbflood");
} while (!torture_must_stop());
vfree(rhp);
+wait_for_stop:
torture_kthread_stopping("rcu_torture_cbflood");
return 0;
}
@@ -853,6 +898,8 @@ rcu_torture_fqs(void *arg)
static int
rcu_torture_writer(void *arg)
{
+ bool can_expedite = !rcu_gp_is_expedited();
+ int expediting = 0;
unsigned long gp_snap;
bool gp_cond1 = gp_cond, gp_exp1 = gp_exp, gp_normal1 = gp_normal;
bool gp_sync1 = gp_sync;
@@ -865,9 +912,15 @@ rcu_torture_writer(void *arg)
int nsynctypes = 0;
VERBOSE_TOROUT_STRING("rcu_torture_writer task started");
+ pr_alert("%s" TORTURE_FLAG
+ " Grace periods expedited from boot/sysfs for %s,\n",
+ torture_type, cur_ops->name);
+ pr_alert("%s" TORTURE_FLAG
+ " Testing of dynamic grace-period expediting diabled.\n",
+ torture_type);
/* Initialize synctype[] array. If none set, take default. */
- if (!gp_cond1 && !gp_exp1 && !gp_normal1 && !gp_sync)
+ if (!gp_cond1 && !gp_exp1 && !gp_normal1 && !gp_sync1)
gp_cond1 = gp_exp1 = gp_normal1 = gp_sync1 = true;
if (gp_cond1 && cur_ops->get_state && cur_ops->cond_sync)
synctype[nsynctypes++] = RTWS_COND_GET;
@@ -949,9 +1002,26 @@ rcu_torture_writer(void *arg)
}
}
rcutorture_record_progress(++rcu_torture_current_version);
+ /* Cycle through nesting levels of rcu_expedite_gp() calls. */
+ if (can_expedite &&
+ !(torture_random(&rand) & 0xff & (!!expediting - 1))) {
+ WARN_ON_ONCE(expediting == 0 && rcu_gp_is_expedited());
+ if (expediting >= 0)
+ rcu_expedite_gp();
+ else
+ rcu_unexpedite_gp();
+ if (++expediting > 3)
+ expediting = -expediting;
+ }
rcu_torture_writer_state = RTWS_STUTTER;
stutter_wait("rcu_torture_writer");
} while (!torture_must_stop());
+ /* Reset expediting back to unexpedited. */
+ if (expediting > 0)
+ expediting = -expediting;
+ while (can_expedite && expediting++ < 0)
+ rcu_unexpedite_gp();
+ WARN_ON_ONCE(can_expedite && rcu_gp_is_expedited());
rcu_torture_writer_state = RTWS_STOPPING;
torture_kthread_stopping("rcu_torture_writer");
return 0;
@@ -1029,7 +1099,8 @@ static void rcu_torture_timer(unsigned long unused)
p = rcu_dereference_check(rcu_torture_current,
rcu_read_lock_bh_held() ||
rcu_read_lock_sched_held() ||
- srcu_read_lock_held(&srcu_ctl));
+ srcu_read_lock_held(srcu_ctlp) ||
+ torturing_tasks());
if (p == NULL) {
/* Leave because rcu_torture_writer is not yet underway */
cur_ops->readunlock(idx);
@@ -1103,7 +1174,8 @@ rcu_torture_reader(void *arg)
p = rcu_dereference_check(rcu_torture_current,
rcu_read_lock_bh_held() ||
rcu_read_lock_sched_held() ||
- srcu_read_lock_held(&srcu_ctl));
+ srcu_read_lock_held(srcu_ctlp) ||
+ torturing_tasks());
if (p == NULL) {
/* Wait for rcu_torture_writer to get underway */
cur_ops->readunlock(idx);
@@ -1388,12 +1460,15 @@ static int rcu_torture_barrier_cbs(void *arg)
do {
wait_event(barrier_cbs_wq[myid],
(newphase =
- ACCESS_ONCE(barrier_phase)) != lastphase ||
+ smp_load_acquire(&barrier_phase)) != lastphase ||
torture_must_stop());
lastphase = newphase;
- smp_mb(); /* ensure barrier_phase load before ->call(). */
if (torture_must_stop())
break;
+ /*
+ * The above smp_load_acquire() ensures barrier_phase load
+ * is ordered before the folloiwng ->call().
+ */
cur_ops->call(&rcu, rcu_torture_barrier_cbf);
if (atomic_dec_and_test(&barrier_cbs_count))
wake_up(&barrier_wq);
@@ -1414,8 +1489,8 @@ static int rcu_torture_barrier(void *arg)
do {
atomic_set(&barrier_cbs_invoked, 0);
atomic_set(&barrier_cbs_count, n_barrier_cbs);
- smp_mb(); /* Ensure barrier_phase after prior assignments. */
- barrier_phase = !barrier_phase;
+ /* Ensure barrier_phase ordered after prior assignments. */
+ smp_store_release(&barrier_phase, !barrier_phase);
for (i = 0; i < n_barrier_cbs; i++)
wake_up(&barrier_cbs_wq[i]);
wait_event(barrier_wq,
@@ -1445,7 +1520,7 @@ static int rcu_torture_barrier_init(void)
int i;
int ret;
- if (n_barrier_cbs == 0)
+ if (n_barrier_cbs <= 0)
return 0;
if (cur_ops->call == NULL || cur_ops->cb_barrier == NULL) {
pr_alert("%s" TORTURE_FLAG
@@ -1563,10 +1638,14 @@ rcu_torture_cleanup(void)
rcutorture_booster_cleanup(i);
}
- /* Wait for all RCU callbacks to fire. */
-
+ /*
+ * Wait for all RCU callbacks to fire, then do flavor-specific
+ * cleanup operations.
+ */
if (cur_ops->cb_barrier != NULL)
cur_ops->cb_barrier();
+ if (cur_ops->cleanup != NULL)
+ cur_ops->cleanup();
rcu_torture_stats_print(); /* -After- the stats thread is stopped! */
@@ -1643,8 +1722,8 @@ rcu_torture_init(void)
int cpu;
int firsterr = 0;
static struct rcu_torture_ops *torture_ops[] = {
- &rcu_ops, &rcu_bh_ops, &rcu_busted_ops, &srcu_ops, &sched_ops,
- RCUTORTURE_TASKS_OPS
+ &rcu_ops, &rcu_bh_ops, &rcu_busted_ops, &srcu_ops, &srcud_ops,
+ &sched_ops, RCUTORTURE_TASKS_OPS
};
if (!torture_init_begin(torture_type, verbose, &torture_runnable))
@@ -1676,7 +1755,7 @@ rcu_torture_init(void)
if (nreaders >= 0) {
nrealreaders = nreaders;
} else {
- nrealreaders = num_online_cpus() - 1;
+ nrealreaders = num_online_cpus() - 2 - nreaders;
if (nrealreaders <= 0)
nrealreaders = 1;
}
@@ -1720,12 +1799,15 @@ rcu_torture_init(void)
writer_task);
if (firsterr)
goto unwind;
- fakewriter_tasks = kzalloc(nfakewriters * sizeof(fakewriter_tasks[0]),
- GFP_KERNEL);
- if (fakewriter_tasks == NULL) {
- VERBOSE_TOROUT_ERRSTRING("out of memory");
- firsterr = -ENOMEM;
- goto unwind;
+ if (nfakewriters > 0) {
+ fakewriter_tasks = kzalloc(nfakewriters *
+ sizeof(fakewriter_tasks[0]),
+ GFP_KERNEL);
+ if (fakewriter_tasks == NULL) {
+ VERBOSE_TOROUT_ERRSTRING("out of memory");
+ firsterr = -ENOMEM;
+ goto unwind;
+ }
}
for (i = 0; i < nfakewriters; i++) {
firsterr = torture_create_kthread(rcu_torture_fakewriter,
@@ -1752,7 +1834,7 @@ rcu_torture_init(void)
if (firsterr)
goto unwind;
}
- if (test_no_idle_hz) {
+ if (test_no_idle_hz && shuffle_interval > 0) {
firsterr = torture_shuffle_init(shuffle_interval * HZ);
if (firsterr)
goto unwind;
diff --git a/kernel/rcu/srcu.c b/kernel/rcu/srcu.c
index 445bf8ffe3fb..d3fcb2ec8536 100644
--- a/kernel/rcu/srcu.c
+++ b/kernel/rcu/srcu.c
@@ -151,7 +151,7 @@ static unsigned long srcu_readers_seq_idx(struct srcu_struct *sp, int idx)
unsigned long t;
for_each_possible_cpu(cpu) {
- t = ACCESS_ONCE(per_cpu_ptr(sp->per_cpu_ref, cpu)->seq[idx]);
+ t = READ_ONCE(per_cpu_ptr(sp->per_cpu_ref, cpu)->seq[idx]);
sum += t;
}
return sum;
@@ -168,7 +168,7 @@ static unsigned long srcu_readers_active_idx(struct srcu_struct *sp, int idx)
unsigned long t;
for_each_possible_cpu(cpu) {
- t = ACCESS_ONCE(per_cpu_ptr(sp->per_cpu_ref, cpu)->c[idx]);
+ t = READ_ONCE(per_cpu_ptr(sp->per_cpu_ref, cpu)->c[idx]);
sum += t;
}
return sum;
@@ -252,21 +252,22 @@ static bool srcu_readers_active_idx_check(struct srcu_struct *sp, int idx)
}
/**
- * srcu_readers_active - returns approximate number of readers.
+ * srcu_readers_active - returns true if there are readers. and false
+ * otherwise
* @sp: which srcu_struct to count active readers (holding srcu_read_lock).
*
* Note that this is not an atomic primitive, and can therefore suffer
* severe errors when invoked on an active srcu_struct. That said, it
* can be useful as an error check at cleanup time.
*/
-static int srcu_readers_active(struct srcu_struct *sp)
+static bool srcu_readers_active(struct srcu_struct *sp)
{
int cpu;
unsigned long sum = 0;
for_each_possible_cpu(cpu) {
- sum += ACCESS_ONCE(per_cpu_ptr(sp->per_cpu_ref, cpu)->c[0]);
- sum += ACCESS_ONCE(per_cpu_ptr(sp->per_cpu_ref, cpu)->c[1]);
+ sum += READ_ONCE(per_cpu_ptr(sp->per_cpu_ref, cpu)->c[0]);
+ sum += READ_ONCE(per_cpu_ptr(sp->per_cpu_ref, cpu)->c[1]);
}
return sum;
}
@@ -296,7 +297,7 @@ int __srcu_read_lock(struct srcu_struct *sp)
{
int idx;
- idx = ACCESS_ONCE(sp->completed) & 0x1;
+ idx = READ_ONCE(sp->completed) & 0x1;
preempt_disable();
__this_cpu_inc(sp->per_cpu_ref->c[idx]);
smp_mb(); /* B */ /* Avoid leaking the critical section. */
@@ -402,23 +403,6 @@ void call_srcu(struct srcu_struct *sp, struct rcu_head *head,
}
EXPORT_SYMBOL_GPL(call_srcu);
-struct rcu_synchronize {
- struct rcu_head head;
- struct completion completion;
-};
-
-/*
- * Awaken the corresponding synchronize_srcu() instance now that a
- * grace period has elapsed.
- */
-static void wakeme_after_rcu(struct rcu_head *head)
-{
- struct rcu_synchronize *rcu;
-
- rcu = container_of(head, struct rcu_synchronize, head);
- complete(&rcu->completion);
-}
-
static void srcu_advance_batches(struct srcu_struct *sp, int trycount);
static void srcu_reschedule(struct srcu_struct *sp);
@@ -431,11 +415,11 @@ static void __synchronize_srcu(struct srcu_struct *sp, int trycount)
struct rcu_head *head = &rcu.head;
bool done = false;
- rcu_lockdep_assert(!lock_is_held(&sp->dep_map) &&
- !lock_is_held(&rcu_bh_lock_map) &&
- !lock_is_held(&rcu_lock_map) &&
- !lock_is_held(&rcu_sched_lock_map),
- "Illegal synchronize_srcu() in same-type SRCU (or RCU) read-side critical section");
+ RCU_LOCKDEP_WARN(lock_is_held(&sp->dep_map) ||
+ lock_is_held(&rcu_bh_lock_map) ||
+ lock_is_held(&rcu_lock_map) ||
+ lock_is_held(&rcu_sched_lock_map),
+ "Illegal synchronize_srcu() in same-type SRCU (or in RCU) read-side critical section");
might_sleep();
init_completion(&rcu.completion);
@@ -507,7 +491,7 @@ static void __synchronize_srcu(struct srcu_struct *sp, int trycount)
*/
void synchronize_srcu(struct srcu_struct *sp)
{
- __synchronize_srcu(sp, rcu_expedited
+ __synchronize_srcu(sp, rcu_gp_is_expedited()
? SYNCHRONIZE_SRCU_EXP_TRYCOUNT
: SYNCHRONIZE_SRCU_TRYCOUNT);
}
diff --git a/kernel/rcu/tiny.c b/kernel/rcu/tiny.c
index cc9ceca7bde1..d0471056d0af 100644
--- a/kernel/rcu/tiny.c
+++ b/kernel/rcu/tiny.c
@@ -35,7 +35,7 @@
#include <linux/time.h>
#include <linux/cpu.h>
#include <linux/prefetch.h>
-#include <linux/ftrace_event.h>
+#include <linux/trace_events.h>
#include "rcu.h"
@@ -49,39 +49,6 @@ static void __call_rcu(struct rcu_head *head,
#include "tiny_plugin.h"
-/*
- * Enter idle, which is an extended quiescent state if we have fully
- * entered that mode.
- */
-void rcu_idle_enter(void)
-{
-}
-EXPORT_SYMBOL_GPL(rcu_idle_enter);
-
-/*
- * Exit an interrupt handler towards idle.
- */
-void rcu_irq_exit(void)
-{
-}
-EXPORT_SYMBOL_GPL(rcu_irq_exit);
-
-/*
- * Exit idle, so that we are no longer in an extended quiescent state.
- */
-void rcu_idle_exit(void)
-{
-}
-EXPORT_SYMBOL_GPL(rcu_idle_exit);
-
-/*
- * Enter an interrupt handler, moving away from idle.
- */
-void rcu_irq_enter(void)
-{
-}
-EXPORT_SYMBOL_GPL(rcu_irq_enter);
-
#if defined(CONFIG_DEBUG_LOCK_ALLOC) || defined(CONFIG_RCU_TRACE)
/*
@@ -103,8 +70,7 @@ EXPORT_SYMBOL(__rcu_is_watching);
static int rcu_qsctr_help(struct rcu_ctrlblk *rcp)
{
RCU_TRACE(reset_cpu_stall_ticks(rcp));
- if (rcp->rcucblist != NULL &&
- rcp->donetail != rcp->curtail) {
+ if (rcp->donetail != rcp->curtail) {
rcp->donetail = rcp->curtail;
return 1;
}
@@ -169,19 +135,13 @@ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp)
unsigned long flags;
RCU_TRACE(int cb_count = 0);
- /* If no RCU callbacks ready to invoke, just return. */
- if (&rcp->rcucblist == rcp->donetail) {
- RCU_TRACE(trace_rcu_batch_start(rcp->name, 0, 0, -1));
- RCU_TRACE(trace_rcu_batch_end(rcp->name, 0,
- !!ACCESS_ONCE(rcp->rcucblist),
- need_resched(),
- is_idle_task(current),
- false));
- return;
- }
-
/* Move the ready-to-invoke callbacks to a local list. */
local_irq_save(flags);
+ if (rcp->donetail == &rcp->rcucblist) {
+ /* No callbacks ready, so just leave. */
+ local_irq_restore(flags);
+ return;
+ }
RCU_TRACE(trace_rcu_batch_start(rcp->name, 0, rcp->qlen, -1));
list = rcp->rcucblist;
rcp->rcucblist = *rcp->donetail;
@@ -231,10 +191,10 @@ static void rcu_process_callbacks(struct softirq_action *unused)
*/
void synchronize_sched(void)
{
- rcu_lockdep_assert(!lock_is_held(&rcu_bh_lock_map) &&
- !lock_is_held(&rcu_lock_map) &&
- !lock_is_held(&rcu_sched_lock_map),
- "Illegal synchronize_sched() in RCU read-side critical section");
+ RCU_LOCKDEP_WARN(lock_is_held(&rcu_bh_lock_map) ||
+ lock_is_held(&rcu_lock_map) ||
+ lock_is_held(&rcu_sched_lock_map),
+ "Illegal synchronize_sched() in RCU read-side critical section");
cond_resched();
}
EXPORT_SYMBOL_GPL(synchronize_sched);
diff --git a/kernel/rcu/tiny_plugin.h b/kernel/rcu/tiny_plugin.h
index f94e209a10d6..e492a5253e0f 100644
--- a/kernel/rcu/tiny_plugin.h
+++ b/kernel/rcu/tiny_plugin.h
@@ -144,16 +144,17 @@ static void check_cpu_stall(struct rcu_ctrlblk *rcp)
return;
rcp->ticks_this_gp++;
j = jiffies;
- js = ACCESS_ONCE(rcp->jiffies_stall);
+ js = READ_ONCE(rcp->jiffies_stall);
if (rcp->rcucblist && ULONG_CMP_GE(j, js)) {
pr_err("INFO: %s stall on CPU (%lu ticks this GP) idle=%llx (t=%lu jiffies q=%ld)\n",
rcp->name, rcp->ticks_this_gp, DYNTICK_TASK_EXIT_IDLE,
jiffies - rcp->gp_start, rcp->qlen);
dump_stack();
- ACCESS_ONCE(rcp->jiffies_stall) = jiffies +
- 3 * rcu_jiffies_till_stall_check() + 3;
+ WRITE_ONCE(rcp->jiffies_stall,
+ jiffies + 3 * rcu_jiffies_till_stall_check() + 3);
} else if (ULONG_CMP_GE(j, js)) {
- ACCESS_ONCE(rcp->jiffies_stall) = jiffies + rcu_jiffies_till_stall_check();
+ WRITE_ONCE(rcp->jiffies_stall,
+ jiffies + rcu_jiffies_till_stall_check());
}
}
@@ -161,7 +162,8 @@ static void reset_cpu_stall_ticks(struct rcu_ctrlblk *rcp)
{
rcp->ticks_this_gp = 0;
rcp->gp_start = jiffies;
- ACCESS_ONCE(rcp->jiffies_stall) = jiffies + rcu_jiffies_till_stall_check();
+ WRITE_ONCE(rcp->jiffies_stall,
+ jiffies + rcu_jiffies_till_stall_check());
}
static void check_cpu_stalls(void)
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 48d640ca1a05..9f75f25cc5d9 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -54,7 +54,7 @@
#include <linux/delay.h>
#include <linux/stop_machine.h>
#include <linux/random.h>
-#include <linux/ftrace_event.h>
+#include <linux/trace_events.h>
#include <linux/suspend.h>
#include "tree.h"
@@ -70,6 +70,8 @@ MODULE_ALIAS("rcutree");
static struct lock_class_key rcu_node_class[RCU_NUM_LVLS];
static struct lock_class_key rcu_fqs_class[RCU_NUM_LVLS];
+static struct lock_class_key rcu_exp_class[RCU_NUM_LVLS];
+static struct lock_class_key rcu_exp_sched_class[RCU_NUM_LVLS];
/*
* In order to export the rcu_state name to the tracing tools, it
@@ -91,8 +93,10 @@ static const char *tp_##sname##_varname __used __tracepoint_string = sname##_var
#define RCU_STATE_INITIALIZER(sname, sabbr, cr) \
DEFINE_RCU_TPS(sname) \
+static DEFINE_PER_CPU_SHARED_ALIGNED(struct rcu_data, sname##_data); \
struct rcu_state sname##_state = { \
.level = { &sname##_state.node[0] }, \
+ .rda = &sname##_data, \
.call = cr, \
.fqs_state = RCU_GP_IDLE, \
.gpnum = 0UL - 300UL, \
@@ -101,29 +105,29 @@ struct rcu_state sname##_state = { \
.orphan_nxttail = &sname##_state.orphan_nxtlist, \
.orphan_donetail = &sname##_state.orphan_donelist, \
.barrier_mutex = __MUTEX_INITIALIZER(sname##_state.barrier_mutex), \
- .onoff_mutex = __MUTEX_INITIALIZER(sname##_state.onoff_mutex), \
.name = RCU_STATE_NAME(sname), \
.abbr = sabbr, \
-}; \
-DEFINE_PER_CPU_SHARED_ALIGNED(struct rcu_data, sname##_data)
+}
RCU_STATE_INITIALIZER(rcu_sched, 's', call_rcu_sched);
RCU_STATE_INITIALIZER(rcu_bh, 'b', call_rcu_bh);
-static struct rcu_state *rcu_state_p;
+static struct rcu_state *const rcu_state_p;
+static struct rcu_data __percpu *const rcu_data_p;
LIST_HEAD(rcu_struct_flavors);
-/* Increase (but not decrease) the CONFIG_RCU_FANOUT_LEAF at boot time. */
-static int rcu_fanout_leaf = CONFIG_RCU_FANOUT_LEAF;
+/* Dump rcu_node combining tree at boot to verify correct setup. */
+static bool dump_tree;
+module_param(dump_tree, bool, 0444);
+/* Control rcu_node-tree auto-balancing at boot time. */
+static bool rcu_fanout_exact;
+module_param(rcu_fanout_exact, bool, 0444);
+/* Increase (but not decrease) the RCU_FANOUT_LEAF at boot time. */
+static int rcu_fanout_leaf = RCU_FANOUT_LEAF;
module_param(rcu_fanout_leaf, int, 0444);
int rcu_num_lvls __read_mostly = RCU_NUM_LVLS;
-static int num_rcu_lvl[] = { /* Number of rcu_nodes at specified level. */
- NUM_RCU_LVL_0,
- NUM_RCU_LVL_1,
- NUM_RCU_LVL_2,
- NUM_RCU_LVL_3,
- NUM_RCU_LVL_4,
-};
+/* Number of rcu_nodes at specified level. */
+static int num_rcu_lvl[] = NUM_RCU_LVL_INIT;
int rcu_num_nodes __read_mostly = NUM_RCU_NODES; /* Total # rcu_nodes in use. */
/*
@@ -152,14 +156,54 @@ EXPORT_SYMBOL_GPL(rcu_scheduler_active);
*/
static int rcu_scheduler_fully_active __read_mostly;
+static void rcu_init_new_rnp(struct rcu_node *rnp_leaf);
+static void rcu_cleanup_dead_rnp(struct rcu_node *rnp_leaf);
static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu);
static void invoke_rcu_core(void);
static void invoke_rcu_callbacks(struct rcu_state *rsp, struct rcu_data *rdp);
/* rcuc/rcub kthread realtime priority */
+#ifdef CONFIG_RCU_KTHREAD_PRIO
static int kthread_prio = CONFIG_RCU_KTHREAD_PRIO;
+#else /* #ifdef CONFIG_RCU_KTHREAD_PRIO */
+static int kthread_prio = IS_ENABLED(CONFIG_RCU_BOOST) ? 1 : 0;
+#endif /* #else #ifdef CONFIG_RCU_KTHREAD_PRIO */
module_param(kthread_prio, int, 0644);
+/* Delay in jiffies for grace-period initialization delays, debug only. */
+
+#ifdef CONFIG_RCU_TORTURE_TEST_SLOW_PREINIT
+static int gp_preinit_delay = CONFIG_RCU_TORTURE_TEST_SLOW_PREINIT_DELAY;
+module_param(gp_preinit_delay, int, 0644);
+#else /* #ifdef CONFIG_RCU_TORTURE_TEST_SLOW_PREINIT */
+static const int gp_preinit_delay;
+#endif /* #else #ifdef CONFIG_RCU_TORTURE_TEST_SLOW_PREINIT */
+
+#ifdef CONFIG_RCU_TORTURE_TEST_SLOW_INIT
+static int gp_init_delay = CONFIG_RCU_TORTURE_TEST_SLOW_INIT_DELAY;
+module_param(gp_init_delay, int, 0644);
+#else /* #ifdef CONFIG_RCU_TORTURE_TEST_SLOW_INIT */
+static const int gp_init_delay;
+#endif /* #else #ifdef CONFIG_RCU_TORTURE_TEST_SLOW_INIT */
+
+#ifdef CONFIG_RCU_TORTURE_TEST_SLOW_CLEANUP
+static int gp_cleanup_delay = CONFIG_RCU_TORTURE_TEST_SLOW_CLEANUP_DELAY;
+module_param(gp_cleanup_delay, int, 0644);
+#else /* #ifdef CONFIG_RCU_TORTURE_TEST_SLOW_CLEANUP */
+static const int gp_cleanup_delay;
+#endif /* #else #ifdef CONFIG_RCU_TORTURE_TEST_SLOW_CLEANUP */
+
+/*
+ * Number of grace periods between delays, normalized by the duration of
+ * the delay. The longer the the delay, the more the grace periods between
+ * each delay. The reason for this normalization is that it means that,
+ * for non-zero delays, the overall slowdown of grace periods is constant
+ * regardless of the duration of the delay. This arrangement balances
+ * the need for long delays to increase some race probabilities with the
+ * need for fast grace periods to increase other race probabilities.
+ */
+#define PER_RCU_NODE_PERIOD 3 /* Number of grace periods between delays. */
+
/*
* Track the rcutorture test sequence number and the update version
* number within a given test. The rcutorture_testseq is incremented
@@ -173,13 +217,24 @@ unsigned long rcutorture_testseq;
unsigned long rcutorture_vernum;
/*
- * Return true if an RCU grace period is in progress. The ACCESS_ONCE()s
+ * Compute the mask of online CPUs for the specified rcu_node structure.
+ * This will not be stable unless the rcu_node structure's ->lock is
+ * held, but the bit corresponding to the current CPU will be stable
+ * in most contexts.
+ */
+unsigned long rcu_rnp_online_cpus(struct rcu_node *rnp)
+{
+ return READ_ONCE(rnp->qsmaskinitnext);
+}
+
+/*
+ * Return true if an RCU grace period is in progress. The READ_ONCE()s
* permit this function to be invoked without holding the root rcu_node
* structure's ->lock, but of course results can be subject to change.
*/
static int rcu_gp_in_progress(struct rcu_state *rsp)
{
- return ACCESS_ONCE(rsp->completed) != ACCESS_ONCE(rsp->gpnum);
+ return READ_ONCE(rsp->completed) != READ_ONCE(rsp->gpnum);
}
/*
@@ -256,8 +311,8 @@ static void rcu_momentary_dyntick_idle(void)
if (!(resched_mask & rsp->flavor_mask))
continue;
smp_mb(); /* rcu_sched_qs_mask before cond_resched_completed. */
- if (ACCESS_ONCE(rdp->mynode->completed) !=
- ACCESS_ONCE(rdp->cond_resched_completed))
+ if (READ_ONCE(rdp->mynode->completed) !=
+ READ_ONCE(rdp->cond_resched_completed))
continue;
/*
@@ -292,10 +347,10 @@ void rcu_note_context_switch(void)
EXPORT_SYMBOL_GPL(rcu_note_context_switch);
/*
- * Register a quiesecent state for all RCU flavors. If there is an
+ * Register a quiescent state for all RCU flavors. If there is an
* emergency, invoke rcu_momentary_dyntick_idle() to do a heavy-weight
* dyntick-idle quiescent state visible to other CPUs (but only for those
- * RCU flavors in desparate need of a quiescent state, which will normally
+ * RCU flavors in desperate need of a quiescent state, which will normally
* be none of them). Either way, do a lightweight quiescent state for
* all RCU flavors.
*/
@@ -410,6 +465,15 @@ void rcu_bh_force_quiescent_state(void)
EXPORT_SYMBOL_GPL(rcu_bh_force_quiescent_state);
/*
+ * Force a quiescent state for RCU-sched.
+ */
+void rcu_sched_force_quiescent_state(void)
+{
+ force_quiescent_state(&rcu_sched_state);
+}
+EXPORT_SYMBOL_GPL(rcu_sched_force_quiescent_state);
+
+/*
* Show the state of the grace-period kthreads.
*/
void show_rcu_gp_kthreads(void)
@@ -460,9 +524,9 @@ void rcutorture_get_gp_data(enum rcutorture_type test_type, int *flags,
break;
}
if (rsp != NULL) {
- *flags = ACCESS_ONCE(rsp->gp_flags);
- *gpnum = ACCESS_ONCE(rsp->gpnum);
- *completed = ACCESS_ONCE(rsp->completed);
+ *flags = READ_ONCE(rsp->gp_flags);
+ *gpnum = READ_ONCE(rsp->gpnum);
+ *completed = READ_ONCE(rsp->completed);
return;
}
*flags = 0;
@@ -483,15 +547,6 @@ void rcutorture_record_progress(unsigned long vernum)
EXPORT_SYMBOL_GPL(rcutorture_record_progress);
/*
- * Force a quiescent state for RCU-sched.
- */
-void rcu_sched_force_quiescent_state(void)
-{
- force_quiescent_state(&rcu_sched_state);
-}
-EXPORT_SYMBOL_GPL(rcu_sched_force_quiescent_state);
-
-/*
* Does the CPU have callbacks ready to be invoked?
*/
static int
@@ -517,10 +572,10 @@ static struct rcu_node *rcu_get_root(struct rcu_state *rsp)
static int rcu_future_needs_gp(struct rcu_state *rsp)
{
struct rcu_node *rnp = rcu_get_root(rsp);
- int idx = (ACCESS_ONCE(rnp->completed) + 1) & 0x1;
+ int idx = (READ_ONCE(rnp->completed) + 1) & 0x1;
int *fp = &rnp->need_future_gp[idx];
- return ACCESS_ONCE(*fp);
+ return READ_ONCE(*fp);
}
/*
@@ -543,7 +598,7 @@ cpu_needs_another_gp(struct rcu_state *rsp, struct rcu_data *rdp)
return 1; /* Yes, this CPU has newly registered callbacks. */
for (i = RCU_WAIT_TAIL; i < RCU_NEXT_TAIL; i++)
if (rdp->nxttail[i - 1] != rdp->nxttail[i] &&
- ULONG_CMP_LT(ACCESS_ONCE(rsp->completed),
+ ULONG_CMP_LT(READ_ONCE(rsp->completed),
rdp->nxtcompleted[i]))
return 1; /* Yes, CBs for future grace period. */
return 0; /* No grace period needed. */
@@ -563,7 +618,8 @@ static void rcu_eqs_enter_common(long long oldval, bool user)
struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks);
trace_rcu_dyntick(TPS("Start"), oldval, rdtp->dynticks_nesting);
- if (!user && !is_idle_task(current)) {
+ if (IS_ENABLED(CONFIG_RCU_EQS_DEBUG) &&
+ !user && !is_idle_task(current)) {
struct task_struct *idle __maybe_unused =
idle_task(smp_processor_id());
@@ -582,19 +638,20 @@ static void rcu_eqs_enter_common(long long oldval, bool user)
smp_mb__before_atomic(); /* See above. */
atomic_inc(&rdtp->dynticks);
smp_mb__after_atomic(); /* Force ordering with next sojourn. */
- WARN_ON_ONCE(atomic_read(&rdtp->dynticks) & 0x1);
+ WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) &&
+ atomic_read(&rdtp->dynticks) & 0x1);
rcu_dynticks_task_enter();
/*
* It is illegal to enter an extended quiescent state while
* in an RCU read-side critical section.
*/
- rcu_lockdep_assert(!lock_is_held(&rcu_lock_map),
- "Illegal idle entry in RCU read-side critical section.");
- rcu_lockdep_assert(!lock_is_held(&rcu_bh_lock_map),
- "Illegal idle entry in RCU-bh read-side critical section.");
- rcu_lockdep_assert(!lock_is_held(&rcu_sched_lock_map),
- "Illegal idle entry in RCU-sched read-side critical section.");
+ RCU_LOCKDEP_WARN(lock_is_held(&rcu_lock_map),
+ "Illegal idle entry in RCU read-side critical section.");
+ RCU_LOCKDEP_WARN(lock_is_held(&rcu_bh_lock_map),
+ "Illegal idle entry in RCU-bh read-side critical section.");
+ RCU_LOCKDEP_WARN(lock_is_held(&rcu_sched_lock_map),
+ "Illegal idle entry in RCU-sched read-side critical section.");
}
/*
@@ -608,7 +665,8 @@ static void rcu_eqs_enter(bool user)
rdtp = this_cpu_ptr(&rcu_dynticks);
oldval = rdtp->dynticks_nesting;
- WARN_ON_ONCE((oldval & DYNTICK_TASK_NEST_MASK) == 0);
+ WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) &&
+ (oldval & DYNTICK_TASK_NEST_MASK) == 0);
if ((oldval & DYNTICK_TASK_NEST_MASK) == DYNTICK_TASK_NEST_VALUE) {
rdtp->dynticks_nesting = 0;
rcu_eqs_enter_common(oldval, user);
@@ -640,7 +698,7 @@ void rcu_idle_enter(void)
}
EXPORT_SYMBOL_GPL(rcu_idle_enter);
-#ifdef CONFIG_RCU_USER_QS
+#ifdef CONFIG_NO_HZ_FULL
/**
* rcu_user_enter - inform RCU that we are resuming userspace.
*
@@ -653,7 +711,7 @@ void rcu_user_enter(void)
{
rcu_eqs_enter(1);
}
-#endif /* CONFIG_RCU_USER_QS */
+#endif /* CONFIG_NO_HZ_FULL */
/**
* rcu_irq_exit - inform RCU that current CPU is exiting irq towards idle
@@ -681,7 +739,8 @@ void rcu_irq_exit(void)
rdtp = this_cpu_ptr(&rcu_dynticks);
oldval = rdtp->dynticks_nesting;
rdtp->dynticks_nesting--;
- WARN_ON_ONCE(rdtp->dynticks_nesting < 0);
+ WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) &&
+ rdtp->dynticks_nesting < 0);
if (rdtp->dynticks_nesting)
trace_rcu_dyntick(TPS("--="), oldval, rdtp->dynticks_nesting);
else
@@ -706,10 +765,12 @@ static void rcu_eqs_exit_common(long long oldval, int user)
atomic_inc(&rdtp->dynticks);
/* CPUs seeing atomic_inc() must see later RCU read-side crit sects */
smp_mb__after_atomic(); /* See above. */
- WARN_ON_ONCE(!(atomic_read(&rdtp->dynticks) & 0x1));
+ WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) &&
+ !(atomic_read(&rdtp->dynticks) & 0x1));
rcu_cleanup_after_idle();
trace_rcu_dyntick(TPS("End"), oldval, rdtp->dynticks_nesting);
- if (!user && !is_idle_task(current)) {
+ if (IS_ENABLED(CONFIG_RCU_EQS_DEBUG) &&
+ !user && !is_idle_task(current)) {
struct task_struct *idle __maybe_unused =
idle_task(smp_processor_id());
@@ -733,7 +794,7 @@ static void rcu_eqs_exit(bool user)
rdtp = this_cpu_ptr(&rcu_dynticks);
oldval = rdtp->dynticks_nesting;
- WARN_ON_ONCE(oldval < 0);
+ WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && oldval < 0);
if (oldval & DYNTICK_TASK_NEST_MASK) {
rdtp->dynticks_nesting += DYNTICK_TASK_NEST_VALUE;
} else {
@@ -764,7 +825,7 @@ void rcu_idle_exit(void)
}
EXPORT_SYMBOL_GPL(rcu_idle_exit);
-#ifdef CONFIG_RCU_USER_QS
+#ifdef CONFIG_NO_HZ_FULL
/**
* rcu_user_exit - inform RCU that we are exiting userspace.
*
@@ -775,7 +836,7 @@ void rcu_user_exit(void)
{
rcu_eqs_exit(1);
}
-#endif /* CONFIG_RCU_USER_QS */
+#endif /* CONFIG_NO_HZ_FULL */
/**
* rcu_irq_enter - inform RCU that current CPU is entering irq away from idle
@@ -806,7 +867,8 @@ void rcu_irq_enter(void)
rdtp = this_cpu_ptr(&rcu_dynticks);
oldval = rdtp->dynticks_nesting;
rdtp->dynticks_nesting++;
- WARN_ON_ONCE(rdtp->dynticks_nesting == 0);
+ WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) &&
+ rdtp->dynticks_nesting == 0);
if (oldval)
trace_rcu_dyntick(TPS("++="), oldval, rdtp->dynticks_nesting);
else
@@ -913,9 +975,9 @@ bool notrace rcu_is_watching(void)
{
bool ret;
- preempt_disable();
+ preempt_disable_notrace();
ret = __rcu_is_watching();
- preempt_enable();
+ preempt_enable_notrace();
return ret;
}
EXPORT_SYMBOL_GPL(rcu_is_watching);
@@ -954,7 +1016,7 @@ bool rcu_lockdep_current_cpu_online(void)
preempt_disable();
rdp = this_cpu_ptr(&rcu_sched_data);
rnp = rdp->mynode;
- ret = (rdp->grpmask & rnp->qsmaskinit) ||
+ ret = (rdp->grpmask & rcu_rnp_online_cpus(rnp)) ||
!rcu_scheduler_fully_active;
preempt_enable();
return ret;
@@ -989,9 +1051,9 @@ static int dyntick_save_progress_counter(struct rcu_data *rdp,
trace_rcu_fqs(rdp->rsp->name, rdp->gpnum, rdp->cpu, TPS("dti"));
return 1;
} else {
- if (ULONG_CMP_LT(ACCESS_ONCE(rdp->gpnum) + ULONG_MAX / 4,
+ if (ULONG_CMP_LT(READ_ONCE(rdp->gpnum) + ULONG_MAX / 4,
rdp->mynode->gpnum))
- ACCESS_ONCE(rdp->gpwrap) = true;
+ WRITE_ONCE(rdp->gpwrap, true);
return 0;
}
}
@@ -1071,12 +1133,12 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp,
if (ULONG_CMP_GE(jiffies,
rdp->rsp->gp_start + jiffies_till_sched_qs) ||
ULONG_CMP_GE(jiffies, rdp->rsp->jiffies_resched)) {
- if (!(ACCESS_ONCE(*rcrmp) & rdp->rsp->flavor_mask)) {
- ACCESS_ONCE(rdp->cond_resched_completed) =
- ACCESS_ONCE(rdp->mynode->completed);
+ if (!(READ_ONCE(*rcrmp) & rdp->rsp->flavor_mask)) {
+ WRITE_ONCE(rdp->cond_resched_completed,
+ READ_ONCE(rdp->mynode->completed));
smp_mb(); /* ->cond_resched_completed before *rcrmp. */
- ACCESS_ONCE(*rcrmp) =
- ACCESS_ONCE(*rcrmp) + rdp->rsp->flavor_mask;
+ WRITE_ONCE(*rcrmp,
+ READ_ONCE(*rcrmp) + rdp->rsp->flavor_mask);
resched_cpu(rdp->cpu); /* Force CPU into scheduler. */
rdp->rsp->jiffies_resched += 5; /* Enable beating. */
} else if (ULONG_CMP_GE(jiffies, rdp->rsp->jiffies_resched)) {
@@ -1097,9 +1159,9 @@ static void record_gp_stall_check_time(struct rcu_state *rsp)
rsp->gp_start = j;
smp_wmb(); /* Record start time before stall time. */
j1 = rcu_jiffies_till_stall_check();
- ACCESS_ONCE(rsp->jiffies_stall) = j + j1;
+ WRITE_ONCE(rsp->jiffies_stall, j + j1);
rsp->jiffies_resched = j + j1 / 2;
- rsp->n_force_qs_gpstart = ACCESS_ONCE(rsp->n_force_qs);
+ rsp->n_force_qs_gpstart = READ_ONCE(rsp->n_force_qs);
}
/*
@@ -1111,10 +1173,13 @@ static void rcu_check_gp_kthread_starvation(struct rcu_state *rsp)
unsigned long j;
j = jiffies;
- gpa = ACCESS_ONCE(rsp->gp_activity);
+ gpa = READ_ONCE(rsp->gp_activity);
if (j - gpa > 2 * HZ)
- pr_err("%s kthread starved for %ld jiffies!\n",
- rsp->name, j - gpa);
+ pr_err("%s kthread starved for %ld jiffies! g%lu c%lu f%#x s%d ->state=%#lx\n",
+ rsp->name, j - gpa,
+ rsp->gpnum, rsp->completed,
+ rsp->gp_flags, rsp->gp_state,
+ rsp->gp_kthread ? rsp->gp_kthread->state : 0);
}
/*
@@ -1151,12 +1216,13 @@ static void print_other_cpu_stall(struct rcu_state *rsp, unsigned long gpnum)
/* Only let one CPU complain about others per time interval. */
raw_spin_lock_irqsave(&rnp->lock, flags);
- delta = jiffies - ACCESS_ONCE(rsp->jiffies_stall);
+ delta = jiffies - READ_ONCE(rsp->jiffies_stall);
if (delta < RCU_STALL_RAT_DELAY || !rcu_gp_in_progress(rsp)) {
raw_spin_unlock_irqrestore(&rnp->lock, flags);
return;
}
- ACCESS_ONCE(rsp->jiffies_stall) = jiffies + 3 * rcu_jiffies_till_stall_check() + 3;
+ WRITE_ONCE(rsp->jiffies_stall,
+ jiffies + 3 * rcu_jiffies_till_stall_check() + 3);
raw_spin_unlock_irqrestore(&rnp->lock, flags);
/*
@@ -1190,15 +1256,16 @@ static void print_other_cpu_stall(struct rcu_state *rsp, unsigned long gpnum)
if (ndetected) {
rcu_dump_cpu_stacks(rsp);
} else {
- if (ACCESS_ONCE(rsp->gpnum) != gpnum ||
- ACCESS_ONCE(rsp->completed) == gpnum) {
+ if (READ_ONCE(rsp->gpnum) != gpnum ||
+ READ_ONCE(rsp->completed) == gpnum) {
pr_err("INFO: Stall ended before state dump start\n");
} else {
j = jiffies;
- gpa = ACCESS_ONCE(rsp->gp_activity);
- pr_err("All QSes seen, last %s kthread activity %ld (%ld-%ld), jiffies_till_next_fqs=%ld\n",
+ gpa = READ_ONCE(rsp->gp_activity);
+ pr_err("All QSes seen, last %s kthread activity %ld (%ld-%ld), jiffies_till_next_fqs=%ld, root ->qsmask %#lx\n",
rsp->name, j - gpa, j, gpa,
- jiffies_till_next_fqs);
+ jiffies_till_next_fqs,
+ rcu_get_root(rsp)->qsmask);
/* In this case, the current CPU might be at fault. */
sched_show_task(current);
}
@@ -1239,9 +1306,9 @@ static void print_cpu_stall(struct rcu_state *rsp)
rcu_dump_cpu_stacks(rsp);
raw_spin_lock_irqsave(&rnp->lock, flags);
- if (ULONG_CMP_GE(jiffies, ACCESS_ONCE(rsp->jiffies_stall)))
- ACCESS_ONCE(rsp->jiffies_stall) = jiffies +
- 3 * rcu_jiffies_till_stall_check() + 3;
+ if (ULONG_CMP_GE(jiffies, READ_ONCE(rsp->jiffies_stall)))
+ WRITE_ONCE(rsp->jiffies_stall,
+ jiffies + 3 * rcu_jiffies_till_stall_check() + 3);
raw_spin_unlock_irqrestore(&rnp->lock, flags);
/*
@@ -1284,20 +1351,20 @@ static void check_cpu_stall(struct rcu_state *rsp, struct rcu_data *rdp)
* Given this check, comparisons of jiffies, rsp->jiffies_stall,
* and rsp->gp_start suffice to forestall false positives.
*/
- gpnum = ACCESS_ONCE(rsp->gpnum);
+ gpnum = READ_ONCE(rsp->gpnum);
smp_rmb(); /* Pick up ->gpnum first... */
- js = ACCESS_ONCE(rsp->jiffies_stall);
+ js = READ_ONCE(rsp->jiffies_stall);
smp_rmb(); /* ...then ->jiffies_stall before the rest... */
- gps = ACCESS_ONCE(rsp->gp_start);
+ gps = READ_ONCE(rsp->gp_start);
smp_rmb(); /* ...and finally ->gp_start before ->completed. */
- completed = ACCESS_ONCE(rsp->completed);
+ completed = READ_ONCE(rsp->completed);
if (ULONG_CMP_GE(completed, gpnum) ||
ULONG_CMP_LT(j, js) ||
ULONG_CMP_GE(gps, js))
return; /* No stall or GP completed since entering function. */
rnp = rdp->mynode;
if (rcu_gp_in_progress(rsp) &&
- (ACCESS_ONCE(rnp->qsmask) & rdp->grpmask)) {
+ (READ_ONCE(rnp->qsmask) & rdp->grpmask)) {
/* We haven't checked in, so go dump stack. */
print_cpu_stall(rsp);
@@ -1324,24 +1391,34 @@ void rcu_cpu_stall_reset(void)
struct rcu_state *rsp;
for_each_rcu_flavor(rsp)
- ACCESS_ONCE(rsp->jiffies_stall) = jiffies + ULONG_MAX / 2;
+ WRITE_ONCE(rsp->jiffies_stall, jiffies + ULONG_MAX / 2);
}
/*
- * Initialize the specified rcu_data structure's callback list to empty.
+ * Initialize the specified rcu_data structure's default callback list
+ * to empty. The default callback list is the one that is not used by
+ * no-callbacks CPUs.
*/
-static void init_callback_list(struct rcu_data *rdp)
+static void init_default_callback_list(struct rcu_data *rdp)
{
int i;
- if (init_nocb_callback_list(rdp))
- return;
rdp->nxtlist = NULL;
for (i = 0; i < RCU_NEXT_SIZE; i++)
rdp->nxttail[i] = &rdp->nxtlist;
}
/*
+ * Initialize the specified rcu_data structure's callback list to empty.
+ */
+static void init_callback_list(struct rcu_data *rdp)
+{
+ if (init_nocb_callback_list(rdp))
+ return;
+ init_default_callback_list(rdp);
+}
+
+/*
* Determine the value that ->completed will have at the end of the
* next subsequent grace period. This is used to tag callbacks so that
* a CPU can invoke callbacks in a timely fashion even if that CPU has
@@ -1424,7 +1501,7 @@ rcu_start_future_gp(struct rcu_node *rnp, struct rcu_data *rdp,
* doing some extra useless work.
*/
if (rnp->gpnum != rnp->completed ||
- ACCESS_ONCE(rnp_root->gpnum) != ACCESS_ONCE(rnp_root->completed)) {
+ READ_ONCE(rnp_root->gpnum) != READ_ONCE(rnp_root->completed)) {
rnp->need_future_gp[c & 0x1]++;
trace_rcu_future_gp(rnp, rdp, c, TPS("Startedleaf"));
goto out;
@@ -1509,7 +1586,7 @@ static int rcu_future_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp)
static void rcu_gp_kthread_wake(struct rcu_state *rsp)
{
if (current == rsp->gp_kthread ||
- !ACCESS_ONCE(rsp->gp_flags) ||
+ !READ_ONCE(rsp->gp_flags) ||
!rsp->gp_kthread)
return;
wake_up(&rsp->gp_wq);
@@ -1644,7 +1721,7 @@ static bool __note_gp_changes(struct rcu_state *rsp, struct rcu_node *rnp,
/* Handle the ends of any preceding grace periods first. */
if (rdp->completed == rnp->completed &&
- !unlikely(ACCESS_ONCE(rdp->gpwrap))) {
+ !unlikely(READ_ONCE(rdp->gpwrap))) {
/* No grace period end, so just accelerate recent callbacks. */
ret = rcu_accelerate_cbs(rsp, rnp, rdp);
@@ -1659,7 +1736,7 @@ static bool __note_gp_changes(struct rcu_state *rsp, struct rcu_node *rnp,
trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("cpuend"));
}
- if (rdp->gpnum != rnp->gpnum || unlikely(ACCESS_ONCE(rdp->gpwrap))) {
+ if (rdp->gpnum != rnp->gpnum || unlikely(READ_ONCE(rdp->gpwrap))) {
/*
* If the current grace period is waiting for this CPU,
* set up to detect a quiescent state, otherwise don't
@@ -1671,7 +1748,7 @@ static bool __note_gp_changes(struct rcu_state *rsp, struct rcu_node *rnp,
rdp->rcu_qs_ctr_snap = __this_cpu_read(rcu_qs_ctr);
rdp->qs_pending = !!(rnp->qsmask & rdp->grpmask);
zero_cpu_stall_ticks(rdp);
- ACCESS_ONCE(rdp->gpwrap) = false;
+ WRITE_ONCE(rdp->gpwrap, false);
}
return ret;
}
@@ -1684,9 +1761,9 @@ static void note_gp_changes(struct rcu_state *rsp, struct rcu_data *rdp)
local_irq_save(flags);
rnp = rdp->mynode;
- if ((rdp->gpnum == ACCESS_ONCE(rnp->gpnum) &&
- rdp->completed == ACCESS_ONCE(rnp->completed) &&
- !unlikely(ACCESS_ONCE(rdp->gpwrap))) || /* w/out lock. */
+ if ((rdp->gpnum == READ_ONCE(rnp->gpnum) &&
+ rdp->completed == READ_ONCE(rnp->completed) &&
+ !unlikely(READ_ONCE(rdp->gpwrap))) || /* w/out lock. */
!raw_spin_trylock(&rnp->lock)) { /* irqs already off, so later. */
local_irq_restore(flags);
return;
@@ -1698,24 +1775,31 @@ static void note_gp_changes(struct rcu_state *rsp, struct rcu_data *rdp)
rcu_gp_kthread_wake(rsp);
}
+static void rcu_gp_slow(struct rcu_state *rsp, int delay)
+{
+ if (delay > 0 &&
+ !(rsp->gpnum % (rcu_num_nodes * PER_RCU_NODE_PERIOD * delay)))
+ schedule_timeout_uninterruptible(delay);
+}
+
/*
* Initialize a new grace period. Return 0 if no grace period required.
*/
static int rcu_gp_init(struct rcu_state *rsp)
{
+ unsigned long oldmask;
struct rcu_data *rdp;
struct rcu_node *rnp = rcu_get_root(rsp);
- ACCESS_ONCE(rsp->gp_activity) = jiffies;
- rcu_bind_gp_kthread();
+ WRITE_ONCE(rsp->gp_activity, jiffies);
raw_spin_lock_irq(&rnp->lock);
smp_mb__after_unlock_lock();
- if (!ACCESS_ONCE(rsp->gp_flags)) {
+ if (!READ_ONCE(rsp->gp_flags)) {
/* Spurious wakeup, tell caller to go back to sleep. */
raw_spin_unlock_irq(&rnp->lock);
return 0;
}
- ACCESS_ONCE(rsp->gp_flags) = 0; /* Clear all flags: New grace period. */
+ WRITE_ONCE(rsp->gp_flags, 0); /* Clear all flags: New grace period. */
if (WARN_ON_ONCE(rcu_gp_in_progress(rsp))) {
/*
@@ -1733,9 +1817,55 @@ static int rcu_gp_init(struct rcu_state *rsp)
trace_rcu_grace_period(rsp->name, rsp->gpnum, TPS("start"));
raw_spin_unlock_irq(&rnp->lock);
- /* Exclude any concurrent CPU-hotplug operations. */
- mutex_lock(&rsp->onoff_mutex);
- smp_mb__after_unlock_lock(); /* ->gpnum increment before GP! */
+ /*
+ * Apply per-leaf buffered online and offline operations to the
+ * rcu_node tree. Note that this new grace period need not wait
+ * for subsequent online CPUs, and that quiescent-state forcing
+ * will handle subsequent offline CPUs.
+ */
+ rcu_for_each_leaf_node(rsp, rnp) {
+ rcu_gp_slow(rsp, gp_preinit_delay);
+ raw_spin_lock_irq(&rnp->lock);
+ smp_mb__after_unlock_lock();
+ if (rnp->qsmaskinit == rnp->qsmaskinitnext &&
+ !rnp->wait_blkd_tasks) {
+ /* Nothing to do on this leaf rcu_node structure. */
+ raw_spin_unlock_irq(&rnp->lock);
+ continue;
+ }
+
+ /* Record old state, apply changes to ->qsmaskinit field. */
+ oldmask = rnp->qsmaskinit;
+ rnp->qsmaskinit = rnp->qsmaskinitnext;
+
+ /* If zero-ness of ->qsmaskinit changed, propagate up tree. */
+ if (!oldmask != !rnp->qsmaskinit) {
+ if (!oldmask) /* First online CPU for this rcu_node. */
+ rcu_init_new_rnp(rnp);
+ else if (rcu_preempt_has_tasks(rnp)) /* blocked tasks */
+ rnp->wait_blkd_tasks = true;
+ else /* Last offline CPU and can propagate. */
+ rcu_cleanup_dead_rnp(rnp);
+ }
+
+ /*
+ * If all waited-on tasks from prior grace period are
+ * done, and if all this rcu_node structure's CPUs are
+ * still offline, propagate up the rcu_node tree and
+ * clear ->wait_blkd_tasks. Otherwise, if one of this
+ * rcu_node structure's CPUs has since come back online,
+ * simply clear ->wait_blkd_tasks (but rcu_cleanup_dead_rnp()
+ * checks for this, so just call it unconditionally).
+ */
+ if (rnp->wait_blkd_tasks &&
+ (!rcu_preempt_has_tasks(rnp) ||
+ rnp->qsmaskinit)) {
+ rnp->wait_blkd_tasks = false;
+ rcu_cleanup_dead_rnp(rnp);
+ }
+
+ raw_spin_unlock_irq(&rnp->lock);
+ }
/*
* Set the quiescent-state-needed bits in all the rcu_node
@@ -1751,14 +1881,15 @@ static int rcu_gp_init(struct rcu_state *rsp)
* process finishes, because this kthread handles both.
*/
rcu_for_each_node_breadth_first(rsp, rnp) {
+ rcu_gp_slow(rsp, gp_init_delay);
raw_spin_lock_irq(&rnp->lock);
smp_mb__after_unlock_lock();
rdp = this_cpu_ptr(rsp->rda);
rcu_preempt_check_blocked_tasks(rnp);
rnp->qsmask = rnp->qsmaskinit;
- ACCESS_ONCE(rnp->gpnum) = rsp->gpnum;
- WARN_ON_ONCE(rnp->completed != rsp->completed);
- ACCESS_ONCE(rnp->completed) = rsp->completed;
+ WRITE_ONCE(rnp->gpnum, rsp->gpnum);
+ if (WARN_ON_ONCE(rnp->completed != rsp->completed))
+ WRITE_ONCE(rnp->completed, rsp->completed);
if (rnp == rdp->mynode)
(void)__note_gp_changes(rsp, rnp, rdp);
rcu_preempt_boost_start_gp(rnp);
@@ -1767,14 +1898,33 @@ static int rcu_gp_init(struct rcu_state *rsp)
rnp->grphi, rnp->qsmask);
raw_spin_unlock_irq(&rnp->lock);
cond_resched_rcu_qs();
- ACCESS_ONCE(rsp->gp_activity) = jiffies;
+ WRITE_ONCE(rsp->gp_activity, jiffies);
}
- mutex_unlock(&rsp->onoff_mutex);
return 1;
}
/*
+ * Helper function for wait_event_interruptible_timeout() wakeup
+ * at force-quiescent-state time.
+ */
+static bool rcu_gp_fqs_check_wake(struct rcu_state *rsp, int *gfp)
+{
+ struct rcu_node *rnp = rcu_get_root(rsp);
+
+ /* Someone like call_rcu() requested a force-quiescent-state scan. */
+ *gfp = READ_ONCE(rsp->gp_flags);
+ if (*gfp & RCU_GP_FLAG_FQS)
+ return true;
+
+ /* The current grace period has completed. */
+ if (!READ_ONCE(rnp->qsmask) && !rcu_preempt_blocked_readers_cgp(rnp))
+ return true;
+
+ return false;
+}
+
+/*
* Do one round of quiescent-state forcing.
*/
static int rcu_gp_fqs(struct rcu_state *rsp, int fqs_state_in)
@@ -1784,7 +1934,7 @@ static int rcu_gp_fqs(struct rcu_state *rsp, int fqs_state_in)
unsigned long maxj;
struct rcu_node *rnp = rcu_get_root(rsp);
- ACCESS_ONCE(rsp->gp_activity) = jiffies;
+ WRITE_ONCE(rsp->gp_activity, jiffies);
rsp->n_force_qs++;
if (fqs_state == RCU_SAVE_DYNTICK) {
/* Collect dyntick-idle snapshots. */
@@ -1798,15 +1948,15 @@ static int rcu_gp_fqs(struct rcu_state *rsp, int fqs_state_in)
fqs_state = RCU_FORCE_QS;
} else {
/* Handle dyntick-idle and offline CPUs. */
- isidle = false;
+ isidle = true;
force_qs_rnp(rsp, rcu_implicit_dynticks_qs, &isidle, &maxj);
}
/* Clear flag to prevent immediate re-entry. */
- if (ACCESS_ONCE(rsp->gp_flags) & RCU_GP_FLAG_FQS) {
+ if (READ_ONCE(rsp->gp_flags) & RCU_GP_FLAG_FQS) {
raw_spin_lock_irq(&rnp->lock);
smp_mb__after_unlock_lock();
- ACCESS_ONCE(rsp->gp_flags) =
- ACCESS_ONCE(rsp->gp_flags) & ~RCU_GP_FLAG_FQS;
+ WRITE_ONCE(rsp->gp_flags,
+ READ_ONCE(rsp->gp_flags) & ~RCU_GP_FLAG_FQS);
raw_spin_unlock_irq(&rnp->lock);
}
return fqs_state;
@@ -1823,7 +1973,7 @@ static void rcu_gp_cleanup(struct rcu_state *rsp)
struct rcu_data *rdp;
struct rcu_node *rnp = rcu_get_root(rsp);
- ACCESS_ONCE(rsp->gp_activity) = jiffies;
+ WRITE_ONCE(rsp->gp_activity, jiffies);
raw_spin_lock_irq(&rnp->lock);
smp_mb__after_unlock_lock();
gp_duration = jiffies - rsp->gp_start;
@@ -1852,7 +2002,9 @@ static void rcu_gp_cleanup(struct rcu_state *rsp)
rcu_for_each_node_breadth_first(rsp, rnp) {
raw_spin_lock_irq(&rnp->lock);
smp_mb__after_unlock_lock();
- ACCESS_ONCE(rnp->completed) = rsp->gpnum;
+ WARN_ON_ONCE(rcu_preempt_blocked_readers_cgp(rnp));
+ WARN_ON_ONCE(rnp->qsmask);
+ WRITE_ONCE(rnp->completed, rsp->gpnum);
rdp = this_cpu_ptr(rsp->rda);
if (rnp == rdp->mynode)
needgp = __note_gp_changes(rsp, rnp, rdp) || needgp;
@@ -1860,7 +2012,8 @@ static void rcu_gp_cleanup(struct rcu_state *rsp)
nocb += rcu_future_gp_cleanup(rsp, rnp);
raw_spin_unlock_irq(&rnp->lock);
cond_resched_rcu_qs();
- ACCESS_ONCE(rsp->gp_activity) = jiffies;
+ WRITE_ONCE(rsp->gp_activity, jiffies);
+ rcu_gp_slow(rsp, gp_cleanup_delay);
}
rnp = rcu_get_root(rsp);
raw_spin_lock_irq(&rnp->lock);
@@ -1868,16 +2021,16 @@ static void rcu_gp_cleanup(struct rcu_state *rsp)
rcu_nocb_gp_set(rnp, nocb);
/* Declare grace period done. */
- ACCESS_ONCE(rsp->completed) = rsp->gpnum;
+ WRITE_ONCE(rsp->completed, rsp->gpnum);
trace_rcu_grace_period(rsp->name, rsp->completed, TPS("end"));
rsp->fqs_state = RCU_GP_IDLE;
rdp = this_cpu_ptr(rsp->rda);
/* Advance CBs to reduce false positives below. */
needgp = rcu_advance_cbs(rsp, rnp, rdp) || needgp;
if (needgp || cpu_needs_another_gp(rsp, rdp)) {
- ACCESS_ONCE(rsp->gp_flags) = RCU_GP_FLAG_INIT;
+ WRITE_ONCE(rsp->gp_flags, RCU_GP_FLAG_INIT);
trace_rcu_grace_period(rsp->name,
- ACCESS_ONCE(rsp->gpnum),
+ READ_ONCE(rsp->gpnum),
TPS("newreq"));
}
raw_spin_unlock_irq(&rnp->lock);
@@ -1895,25 +2048,27 @@ static int __noreturn rcu_gp_kthread(void *arg)
struct rcu_state *rsp = arg;
struct rcu_node *rnp = rcu_get_root(rsp);
+ rcu_bind_gp_kthread();
for (;;) {
/* Handle grace-period start. */
for (;;) {
trace_rcu_grace_period(rsp->name,
- ACCESS_ONCE(rsp->gpnum),
+ READ_ONCE(rsp->gpnum),
TPS("reqwait"));
rsp->gp_state = RCU_GP_WAIT_GPS;
wait_event_interruptible(rsp->gp_wq,
- ACCESS_ONCE(rsp->gp_flags) &
+ READ_ONCE(rsp->gp_flags) &
RCU_GP_FLAG_INIT);
+ rsp->gp_state = RCU_GP_DONE_GPS;
/* Locking provides needed memory barrier. */
if (rcu_gp_init(rsp))
break;
cond_resched_rcu_qs();
- ACCESS_ONCE(rsp->gp_activity) = jiffies;
+ WRITE_ONCE(rsp->gp_activity, jiffies);
WARN_ON(signal_pending(current));
trace_rcu_grace_period(rsp->name,
- ACCESS_ONCE(rsp->gpnum),
+ READ_ONCE(rsp->gpnum),
TPS("reqwaitsig"));
}
@@ -1929,39 +2084,36 @@ static int __noreturn rcu_gp_kthread(void *arg)
if (!ret)
rsp->jiffies_force_qs = jiffies + j;
trace_rcu_grace_period(rsp->name,
- ACCESS_ONCE(rsp->gpnum),
+ READ_ONCE(rsp->gpnum),
TPS("fqswait"));
rsp->gp_state = RCU_GP_WAIT_FQS;
ret = wait_event_interruptible_timeout(rsp->gp_wq,
- ((gf = ACCESS_ONCE(rsp->gp_flags)) &
- RCU_GP_FLAG_FQS) ||
- (!ACCESS_ONCE(rnp->qsmask) &&
- !rcu_preempt_blocked_readers_cgp(rnp)),
- j);
+ rcu_gp_fqs_check_wake(rsp, &gf), j);
+ rsp->gp_state = RCU_GP_DOING_FQS;
/* Locking provides needed memory barriers. */
/* If grace period done, leave loop. */
- if (!ACCESS_ONCE(rnp->qsmask) &&
+ if (!READ_ONCE(rnp->qsmask) &&
!rcu_preempt_blocked_readers_cgp(rnp))
break;
/* If time for quiescent-state forcing, do it. */
if (ULONG_CMP_GE(jiffies, rsp->jiffies_force_qs) ||
(gf & RCU_GP_FLAG_FQS)) {
trace_rcu_grace_period(rsp->name,
- ACCESS_ONCE(rsp->gpnum),
+ READ_ONCE(rsp->gpnum),
TPS("fqsstart"));
fqs_state = rcu_gp_fqs(rsp, fqs_state);
trace_rcu_grace_period(rsp->name,
- ACCESS_ONCE(rsp->gpnum),
+ READ_ONCE(rsp->gpnum),
TPS("fqsend"));
cond_resched_rcu_qs();
- ACCESS_ONCE(rsp->gp_activity) = jiffies;
+ WRITE_ONCE(rsp->gp_activity, jiffies);
} else {
/* Deal with stray signal. */
cond_resched_rcu_qs();
- ACCESS_ONCE(rsp->gp_activity) = jiffies;
+ WRITE_ONCE(rsp->gp_activity, jiffies);
WARN_ON(signal_pending(current));
trace_rcu_grace_period(rsp->name,
- ACCESS_ONCE(rsp->gpnum),
+ READ_ONCE(rsp->gpnum),
TPS("fqswaitsig"));
}
j = jiffies_till_next_fqs;
@@ -1975,7 +2127,9 @@ static int __noreturn rcu_gp_kthread(void *arg)
}
/* Handle grace-period end. */
+ rsp->gp_state = RCU_GP_CLEANUP;
rcu_gp_cleanup(rsp);
+ rsp->gp_state = RCU_GP_CLEANED;
}
}
@@ -2003,8 +2157,8 @@ rcu_start_gp_advanced(struct rcu_state *rsp, struct rcu_node *rnp,
*/
return false;
}
- ACCESS_ONCE(rsp->gp_flags) = RCU_GP_FLAG_INIT;
- trace_rcu_grace_period(rsp->name, ACCESS_ONCE(rsp->gpnum),
+ WRITE_ONCE(rsp->gp_flags, RCU_GP_FLAG_INIT);
+ trace_rcu_grace_period(rsp->name, READ_ONCE(rsp->gpnum),
TPS("newreq"));
/*
@@ -2054,6 +2208,7 @@ static void rcu_report_qs_rsp(struct rcu_state *rsp, unsigned long flags)
__releases(rcu_get_root(rsp)->lock)
{
WARN_ON_ONCE(!rcu_gp_in_progress(rsp));
+ WRITE_ONCE(rsp->gp_flags, READ_ONCE(rsp->gp_flags) | RCU_GP_FLAG_FQS);
raw_spin_unlock_irqrestore(&rcu_get_root(rsp)->lock, flags);
rcu_gp_kthread_wake(rsp);
}
@@ -2062,25 +2217,32 @@ static void rcu_report_qs_rsp(struct rcu_state *rsp, unsigned long flags)
* Similar to rcu_report_qs_rdp(), for which it is a helper function.
* Allows quiescent states for a group of CPUs to be reported at one go
* to the specified rcu_node structure, though all the CPUs in the group
- * must be represented by the same rcu_node structure (which need not be
- * a leaf rcu_node structure, though it often will be). That structure's
- * lock must be held upon entry, and it is released before return.
+ * must be represented by the same rcu_node structure (which need not be a
+ * leaf rcu_node structure, though it often will be). The gps parameter
+ * is the grace-period snapshot, which means that the quiescent states
+ * are valid only if rnp->gpnum is equal to gps. That structure's lock
+ * must be held upon entry, and it is released before return.
*/
static void
rcu_report_qs_rnp(unsigned long mask, struct rcu_state *rsp,
- struct rcu_node *rnp, unsigned long flags)
+ struct rcu_node *rnp, unsigned long gps, unsigned long flags)
__releases(rnp->lock)
{
+ unsigned long oldmask = 0;
struct rcu_node *rnp_c;
/* Walk up the rcu_node hierarchy. */
for (;;) {
- if (!(rnp->qsmask & mask)) {
+ if (!(rnp->qsmask & mask) || rnp->gpnum != gps) {
- /* Our bit has already been cleared, so done. */
+ /*
+ * Our bit has already been cleared, or the
+ * relevant grace period is already over, so done.
+ */
raw_spin_unlock_irqrestore(&rnp->lock, flags);
return;
}
+ WARN_ON_ONCE(oldmask); /* Any child must be all zeroed! */
rnp->qsmask &= ~mask;
trace_rcu_quiescent_state_report(rsp->name, rnp->gpnum,
mask, rnp->qsmask, rnp->level,
@@ -2104,7 +2266,7 @@ rcu_report_qs_rnp(unsigned long mask, struct rcu_state *rsp,
rnp = rnp->parent;
raw_spin_lock_irqsave(&rnp->lock, flags);
smp_mb__after_unlock_lock();
- WARN_ON_ONCE(rnp_c->qsmask);
+ oldmask = rnp_c->qsmask;
}
/*
@@ -2116,6 +2278,46 @@ rcu_report_qs_rnp(unsigned long mask, struct rcu_state *rsp,
}
/*
+ * Record a quiescent state for all tasks that were previously queued
+ * on the specified rcu_node structure and that were blocking the current
+ * RCU grace period. The caller must hold the specified rnp->lock with
+ * irqs disabled, and this lock is released upon return, but irqs remain
+ * disabled.
+ */
+static void rcu_report_unblock_qs_rnp(struct rcu_state *rsp,
+ struct rcu_node *rnp, unsigned long flags)
+ __releases(rnp->lock)
+{
+ unsigned long gps;
+ unsigned long mask;
+ struct rcu_node *rnp_p;
+
+ if (rcu_state_p == &rcu_sched_state || rsp != rcu_state_p ||
+ rnp->qsmask != 0 || rcu_preempt_blocked_readers_cgp(rnp)) {
+ raw_spin_unlock_irqrestore(&rnp->lock, flags);
+ return; /* Still need more quiescent states! */
+ }
+
+ rnp_p = rnp->parent;
+ if (rnp_p == NULL) {
+ /*
+ * Only one rcu_node structure in the tree, so don't
+ * try to report up to its nonexistent parent!
+ */
+ rcu_report_qs_rsp(rsp, flags);
+ return;
+ }
+
+ /* Report up the rest of the hierarchy, tracking current ->gpnum. */
+ gps = rnp->gpnum;
+ mask = rnp->grpmask;
+ raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
+ raw_spin_lock(&rnp_p->lock); /* irqs already disabled. */
+ smp_mb__after_unlock_lock();
+ rcu_report_qs_rnp(mask, rsp, rnp_p, gps, flags);
+}
+
+/*
* Record a quiescent state for the specified CPU to that CPU's rcu_data
* structure. This must be either called from the specified CPU, or
* called when the specified CPU is known to be offline (and when it is
@@ -2163,7 +2365,8 @@ rcu_report_qs_rdp(int cpu, struct rcu_state *rsp, struct rcu_data *rdp)
*/
needwake = rcu_accelerate_cbs(rsp, rnp, rdp);
- rcu_report_qs_rnp(mask, rsp, rnp, flags); /* rlses rnp->lock */
+ rcu_report_qs_rnp(mask, rsp, rnp, rnp->gpnum, flags);
+ /* ^^^ Released rnp->lock */
if (needwake)
rcu_gp_kthread_wake(rsp);
}
@@ -2203,8 +2406,6 @@ rcu_check_quiescent_state(struct rcu_state *rsp, struct rcu_data *rdp)
rcu_report_qs_rdp(rdp->cpu, rsp, rdp);
}
-#ifdef CONFIG_HOTPLUG_CPU
-
/*
* Send the specified CPU's RCU callbacks to the orphanage. The
* specified CPU must be offline, and the caller must hold the
@@ -2215,7 +2416,7 @@ rcu_send_cbs_to_orphanage(int cpu, struct rcu_state *rsp,
struct rcu_node *rnp, struct rcu_data *rdp)
{
/* No-CBs CPUs do not have orphanable callbacks. */
- if (rcu_is_nocb_cpu(rdp->cpu))
+ if (!IS_ENABLED(CONFIG_HOTPLUG_CPU) || rcu_is_nocb_cpu(rdp->cpu))
return;
/*
@@ -2228,7 +2429,7 @@ rcu_send_cbs_to_orphanage(int cpu, struct rcu_state *rsp,
rsp->qlen += rdp->qlen;
rdp->n_cbs_orphaned += rdp->qlen;
rdp->qlen_lazy = 0;
- ACCESS_ONCE(rdp->qlen) = 0;
+ WRITE_ONCE(rdp->qlen, 0);
}
/*
@@ -2256,8 +2457,12 @@ rcu_send_cbs_to_orphanage(int cpu, struct rcu_state *rsp,
rsp->orphan_donetail = rdp->nxttail[RCU_DONE_TAIL];
}
- /* Finally, initialize the rcu_data structure's list to empty. */
+ /*
+ * Finally, initialize the rcu_data structure's list to empty and
+ * disallow further callbacks on this CPU.
+ */
init_callback_list(rdp);
+ rdp->nxttail[RCU_NEXT_TAIL] = NULL;
}
/*
@@ -2270,7 +2475,8 @@ static void rcu_adopt_orphan_cbs(struct rcu_state *rsp, unsigned long flags)
struct rcu_data *rdp = raw_cpu_ptr(rsp->rda);
/* No-CBs CPUs are handled specially. */
- if (rcu_nocb_adopt_orphan_cbs(rsp, rdp, flags))
+ if (!IS_ENABLED(CONFIG_HOTPLUG_CPU) ||
+ rcu_nocb_adopt_orphan_cbs(rsp, rdp, flags))
return;
/* Do the accounting first. */
@@ -2317,6 +2523,9 @@ static void rcu_cleanup_dying_cpu(struct rcu_state *rsp)
RCU_TRACE(struct rcu_data *rdp = this_cpu_ptr(rsp->rda));
RCU_TRACE(struct rcu_node *rnp = rdp->mynode);
+ if (!IS_ENABLED(CONFIG_HOTPLUG_CPU))
+ return;
+
RCU_TRACE(mask = rdp->grpmask);
trace_rcu_grace_period(rsp->name,
rnp->gpnum + 1 - !!(rnp->qsmask & mask),
@@ -2345,7 +2554,8 @@ static void rcu_cleanup_dead_rnp(struct rcu_node *rnp_leaf)
long mask;
struct rcu_node *rnp = rnp_leaf;
- if (rnp->qsmaskinit || rcu_preempt_has_tasks(rnp))
+ if (!IS_ENABLED(CONFIG_HOTPLUG_CPU) ||
+ rnp->qsmaskinit || rcu_preempt_has_tasks(rnp))
return;
for (;;) {
mask = rnp->grpmask;
@@ -2355,6 +2565,7 @@ static void rcu_cleanup_dead_rnp(struct rcu_node *rnp_leaf)
raw_spin_lock(&rnp->lock); /* irqs already disabled. */
smp_mb__after_unlock_lock(); /* GP memory ordering. */
rnp->qsmaskinit &= ~mask;
+ rnp->qsmask &= ~mask;
if (rnp->qsmaskinit) {
raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
return;
@@ -2364,6 +2575,29 @@ static void rcu_cleanup_dead_rnp(struct rcu_node *rnp_leaf)
}
/*
+ * The CPU is exiting the idle loop into the arch_cpu_idle_dead()
+ * function. We now remove it from the rcu_node tree's ->qsmaskinit
+ * bit masks.
+ */
+static void rcu_cleanup_dying_idle_cpu(int cpu, struct rcu_state *rsp)
+{
+ unsigned long flags;
+ unsigned long mask;
+ struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu);
+ struct rcu_node *rnp = rdp->mynode; /* Outgoing CPU's rdp & rnp. */
+
+ if (!IS_ENABLED(CONFIG_HOTPLUG_CPU))
+ return;
+
+ /* Remove outgoing CPU from mask in the leaf rcu_node structure. */
+ mask = rdp->grpmask;
+ raw_spin_lock_irqsave(&rnp->lock, flags);
+ smp_mb__after_unlock_lock(); /* Enforce GP memory-order guarantee. */
+ rnp->qsmaskinitnext &= ~mask;
+ raw_spin_unlock_irqrestore(&rnp->lock, flags);
+}
+
+/*
* The CPU has been completely removed, and some other CPU is reporting
* this fact from process context. Do the remainder of the cleanup,
* including orphaning the outgoing CPU's RCU callbacks, and also
@@ -2376,50 +2610,23 @@ static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp)
struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu);
struct rcu_node *rnp = rdp->mynode; /* Outgoing CPU's rdp & rnp. */
+ if (!IS_ENABLED(CONFIG_HOTPLUG_CPU))
+ return;
+
/* Adjust any no-longer-needed kthreads. */
rcu_boost_kthread_setaffinity(rnp, -1);
- /* Exclude any attempts to start a new grace period. */
- mutex_lock(&rsp->onoff_mutex);
- raw_spin_lock_irqsave(&rsp->orphan_lock, flags);
-
/* Orphan the dead CPU's callbacks, and adopt them if appropriate. */
+ raw_spin_lock_irqsave(&rsp->orphan_lock, flags);
rcu_send_cbs_to_orphanage(cpu, rsp, rnp, rdp);
rcu_adopt_orphan_cbs(rsp, flags);
raw_spin_unlock_irqrestore(&rsp->orphan_lock, flags);
- /* Remove outgoing CPU from mask in the leaf rcu_node structure. */
- raw_spin_lock_irqsave(&rnp->lock, flags);
- smp_mb__after_unlock_lock(); /* Enforce GP memory-order guarantee. */
- rnp->qsmaskinit &= ~rdp->grpmask;
- if (rnp->qsmaskinit == 0 && !rcu_preempt_has_tasks(rnp))
- rcu_cleanup_dead_rnp(rnp);
- rcu_report_qs_rnp(rdp->grpmask, rsp, rnp, flags); /* Rlses rnp->lock. */
WARN_ONCE(rdp->qlen != 0 || rdp->nxtlist != NULL,
"rcu_cleanup_dead_cpu: Callbacks on offline CPU %d: qlen=%lu, nxtlist=%p\n",
cpu, rdp->qlen, rdp->nxtlist);
- init_callback_list(rdp);
- /* Disallow further callbacks on this CPU. */
- rdp->nxttail[RCU_NEXT_TAIL] = NULL;
- mutex_unlock(&rsp->onoff_mutex);
-}
-
-#else /* #ifdef CONFIG_HOTPLUG_CPU */
-
-static void rcu_cleanup_dying_cpu(struct rcu_state *rsp)
-{
-}
-
-static void __maybe_unused rcu_cleanup_dead_rnp(struct rcu_node *rnp_leaf)
-{
-}
-
-static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp)
-{
}
-#endif /* #else #ifdef CONFIG_HOTPLUG_CPU */
-
/*
* Invoke any RCU callbacks that have made it to the end of their grace
* period. Thottle as specified by rdp->blimit.
@@ -2434,7 +2641,7 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp)
/* If no callbacks are ready, just return. */
if (!cpu_has_callbacks_ready_to_invoke(rdp)) {
trace_rcu_batch_start(rsp->name, rdp->qlen_lazy, rdp->qlen, 0);
- trace_rcu_batch_end(rsp->name, 0, !!ACCESS_ONCE(rdp->nxtlist),
+ trace_rcu_batch_end(rsp->name, 0, !!READ_ONCE(rdp->nxtlist),
need_resched(), is_idle_task(current),
rcu_is_callbacks_kthread());
return;
@@ -2490,7 +2697,7 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp)
}
smp_mb(); /* List handling before counting for rcu_barrier(). */
rdp->qlen_lazy -= count_lazy;
- ACCESS_ONCE(rdp->qlen) = rdp->qlen - count;
+ WRITE_ONCE(rdp->qlen, rdp->qlen - count);
rdp->n_cbs_invoked += count;
/* Reinstate batch limit if we have worked down the excess. */
@@ -2584,31 +2791,46 @@ static void force_qs_rnp(struct rcu_state *rsp,
mask = 0;
raw_spin_lock_irqsave(&rnp->lock, flags);
smp_mb__after_unlock_lock();
- if (!rcu_gp_in_progress(rsp)) {
- raw_spin_unlock_irqrestore(&rnp->lock, flags);
- return;
- }
if (rnp->qsmask == 0) {
- rcu_initiate_boost(rnp, flags); /* releases rnp->lock */
- continue;
+ if (rcu_state_p == &rcu_sched_state ||
+ rsp != rcu_state_p ||
+ rcu_preempt_blocked_readers_cgp(rnp)) {
+ /*
+ * No point in scanning bits because they
+ * are all zero. But we might need to
+ * priority-boost blocked readers.
+ */
+ rcu_initiate_boost(rnp, flags);
+ /* rcu_initiate_boost() releases rnp->lock */
+ continue;
+ }
+ if (rnp->parent &&
+ (rnp->parent->qsmask & rnp->grpmask)) {
+ /*
+ * Race between grace-period
+ * initialization and task exiting RCU
+ * read-side critical section: Report.
+ */
+ rcu_report_unblock_qs_rnp(rsp, rnp, flags);
+ /* rcu_report_unblock_qs_rnp() rlses ->lock */
+ continue;
+ }
}
cpu = rnp->grplo;
bit = 1;
for (; cpu <= rnp->grphi; cpu++, bit <<= 1) {
if ((rnp->qsmask & bit) != 0) {
- if ((rnp->qsmaskinit & bit) != 0)
- *isidle = false;
if (f(per_cpu_ptr(rsp->rda, cpu), isidle, maxj))
mask |= bit;
}
}
if (mask != 0) {
-
- /* rcu_report_qs_rnp() releases rnp->lock. */
- rcu_report_qs_rnp(mask, rsp, rnp, flags);
- continue;
+ /* Idle/offline CPUs, report (releases rnp->lock. */
+ rcu_report_qs_rnp(mask, rsp, rnp, rnp->gpnum, flags);
+ } else {
+ /* Nothing to do here, so just drop the lock. */
+ raw_spin_unlock_irqrestore(&rnp->lock, flags);
}
- raw_spin_unlock_irqrestore(&rnp->lock, flags);
}
}
@@ -2626,7 +2848,7 @@ static void force_quiescent_state(struct rcu_state *rsp)
/* Funnel through hierarchy to reduce memory contention. */
rnp = __this_cpu_read(rsp->rda->mynode);
for (; rnp != NULL; rnp = rnp->parent) {
- ret = (ACCESS_ONCE(rsp->gp_flags) & RCU_GP_FLAG_FQS) ||
+ ret = (READ_ONCE(rsp->gp_flags) & RCU_GP_FLAG_FQS) ||
!raw_spin_trylock(&rnp->fqslock);
if (rnp_old != NULL)
raw_spin_unlock(&rnp_old->fqslock);
@@ -2642,13 +2864,12 @@ static void force_quiescent_state(struct rcu_state *rsp)
raw_spin_lock_irqsave(&rnp_old->lock, flags);
smp_mb__after_unlock_lock();
raw_spin_unlock(&rnp_old->fqslock);
- if (ACCESS_ONCE(rsp->gp_flags) & RCU_GP_FLAG_FQS) {
+ if (READ_ONCE(rsp->gp_flags) & RCU_GP_FLAG_FQS) {
rsp->n_force_qs_lh++;
raw_spin_unlock_irqrestore(&rnp_old->lock, flags);
return; /* Someone beat us to it. */
}
- ACCESS_ONCE(rsp->gp_flags) =
- ACCESS_ONCE(rsp->gp_flags) | RCU_GP_FLAG_FQS;
+ WRITE_ONCE(rsp->gp_flags, READ_ONCE(rsp->gp_flags) | RCU_GP_FLAG_FQS);
raw_spin_unlock_irqrestore(&rnp_old->lock, flags);
rcu_gp_kthread_wake(rsp);
}
@@ -2714,7 +2935,7 @@ static void rcu_process_callbacks(struct softirq_action *unused)
*/
static void invoke_rcu_callbacks(struct rcu_state *rsp, struct rcu_data *rdp)
{
- if (unlikely(!ACCESS_ONCE(rcu_scheduler_fully_active)))
+ if (unlikely(!READ_ONCE(rcu_scheduler_fully_active)))
return;
if (likely(!rsp->boost)) {
rcu_do_batch(rsp, rdp);
@@ -2741,7 +2962,7 @@ static void __call_rcu_core(struct rcu_state *rsp, struct rcu_data *rdp,
* If called from an extended quiescent state, invoke the RCU
* core in order to force a re-evaluation of RCU's idleness.
*/
- if (!rcu_is_watching() && cpu_online(smp_processor_id()))
+ if (!rcu_is_watching())
invoke_rcu_core();
/* If interrupts were disabled or CPU offline, don't invoke RCU core. */
@@ -2805,7 +3026,7 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu),
WARN_ON_ONCE((unsigned long)head & 0x1); /* Misaligned rcu_head! */
if (debug_rcu_head_queue(head)) {
/* Probable double call_rcu(), so leak the callback. */
- ACCESS_ONCE(head->func) = rcu_leak_callback;
+ WRITE_ONCE(head->func, rcu_leak_callback);
WARN_ONCE(1, "__call_rcu(): Leaked duplicate callback\n");
return;
}
@@ -2827,13 +3048,24 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu),
if (cpu != -1)
rdp = per_cpu_ptr(rsp->rda, cpu);
- offline = !__call_rcu_nocb(rdp, head, lazy, flags);
- WARN_ON_ONCE(offline);
- /* _call_rcu() is illegal on offline CPU; leak the callback. */
- local_irq_restore(flags);
- return;
+ if (likely(rdp->mynode)) {
+ /* Post-boot, so this should be for a no-CBs CPU. */
+ offline = !__call_rcu_nocb(rdp, head, lazy, flags);
+ WARN_ON_ONCE(offline);
+ /* Offline CPU, _call_rcu() illegal, leak callback. */
+ local_irq_restore(flags);
+ return;
+ }
+ /*
+ * Very early boot, before rcu_init(). Initialize if needed
+ * and then drop through to queue the callback.
+ */
+ BUG_ON(cpu != -1);
+ WARN_ON_ONCE(!rcu_is_watching());
+ if (!likely(rdp->nxtlist))
+ init_default_callback_list(rdp);
}
- ACCESS_ONCE(rdp->qlen) = rdp->qlen + 1;
+ WRITE_ONCE(rdp->qlen, rdp->qlen + 1);
if (lazy)
rdp->qlen_lazy++;
else
@@ -2948,13 +3180,13 @@ static inline int rcu_blocking_is_gp(void)
*/
void synchronize_sched(void)
{
- rcu_lockdep_assert(!lock_is_held(&rcu_bh_lock_map) &&
- !lock_is_held(&rcu_lock_map) &&
- !lock_is_held(&rcu_sched_lock_map),
- "Illegal synchronize_sched() in RCU-sched read-side critical section");
+ RCU_LOCKDEP_WARN(lock_is_held(&rcu_bh_lock_map) ||
+ lock_is_held(&rcu_lock_map) ||
+ lock_is_held(&rcu_sched_lock_map),
+ "Illegal synchronize_sched() in RCU-sched read-side critical section");
if (rcu_blocking_is_gp())
return;
- if (rcu_expedited)
+ if (rcu_gp_is_expedited())
synchronize_sched_expedited();
else
wait_rcu_gp(call_rcu_sched);
@@ -2975,13 +3207,13 @@ EXPORT_SYMBOL_GPL(synchronize_sched);
*/
void synchronize_rcu_bh(void)
{
- rcu_lockdep_assert(!lock_is_held(&rcu_bh_lock_map) &&
- !lock_is_held(&rcu_lock_map) &&
- !lock_is_held(&rcu_sched_lock_map),
- "Illegal synchronize_rcu_bh() in RCU-bh read-side critical section");
+ RCU_LOCKDEP_WARN(lock_is_held(&rcu_bh_lock_map) ||
+ lock_is_held(&rcu_lock_map) ||
+ lock_is_held(&rcu_sched_lock_map),
+ "Illegal synchronize_rcu_bh() in RCU-bh read-side critical section");
if (rcu_blocking_is_gp())
return;
- if (rcu_expedited)
+ if (rcu_gp_is_expedited())
synchronize_rcu_bh_expedited();
else
wait_rcu_gp(call_rcu_bh);
@@ -3040,23 +3272,247 @@ void cond_synchronize_rcu(unsigned long oldstate)
}
EXPORT_SYMBOL_GPL(cond_synchronize_rcu);
-static int synchronize_sched_expedited_cpu_stop(void *data)
+/**
+ * get_state_synchronize_sched - Snapshot current RCU-sched state
+ *
+ * Returns a cookie that is used by a later call to cond_synchronize_sched()
+ * to determine whether or not a full grace period has elapsed in the
+ * meantime.
+ */
+unsigned long get_state_synchronize_sched(void)
{
/*
- * There must be a full memory barrier on each affected CPU
- * between the time that try_stop_cpus() is called and the
- * time that it returns.
- *
- * In the current initial implementation of cpu_stop, the
- * above condition is already met when the control reaches
- * this point and the following smp_mb() is not strictly
- * necessary. Do smp_mb() anyway for documentation and
- * robustness against future implementation changes.
+ * Any prior manipulation of RCU-protected data must happen
+ * before the load from ->gpnum.
+ */
+ smp_mb(); /* ^^^ */
+
+ /*
+ * Make sure this load happens before the purportedly
+ * time-consuming work between get_state_synchronize_sched()
+ * and cond_synchronize_sched().
+ */
+ return smp_load_acquire(&rcu_sched_state.gpnum);
+}
+EXPORT_SYMBOL_GPL(get_state_synchronize_sched);
+
+/**
+ * cond_synchronize_sched - Conditionally wait for an RCU-sched grace period
+ *
+ * @oldstate: return value from earlier call to get_state_synchronize_sched()
+ *
+ * If a full RCU-sched grace period has elapsed since the earlier call to
+ * get_state_synchronize_sched(), just return. Otherwise, invoke
+ * synchronize_sched() to wait for a full grace period.
+ *
+ * Yes, this function does not take counter wrap into account. But
+ * counter wrap is harmless. If the counter wraps, we have waited for
+ * more than 2 billion grace periods (and way more on a 64-bit system!),
+ * so waiting for one additional grace period should be just fine.
+ */
+void cond_synchronize_sched(unsigned long oldstate)
+{
+ unsigned long newstate;
+
+ /*
+ * Ensure that this load happens before any RCU-destructive
+ * actions the caller might carry out after we return.
+ */
+ newstate = smp_load_acquire(&rcu_sched_state.completed);
+ if (ULONG_CMP_GE(oldstate, newstate))
+ synchronize_sched();
+}
+EXPORT_SYMBOL_GPL(cond_synchronize_sched);
+
+/* Adjust sequence number for start of update-side operation. */
+static void rcu_seq_start(unsigned long *sp)
+{
+ WRITE_ONCE(*sp, *sp + 1);
+ smp_mb(); /* Ensure update-side operation after counter increment. */
+ WARN_ON_ONCE(!(*sp & 0x1));
+}
+
+/* Adjust sequence number for end of update-side operation. */
+static void rcu_seq_end(unsigned long *sp)
+{
+ smp_mb(); /* Ensure update-side operation before counter increment. */
+ WRITE_ONCE(*sp, *sp + 1);
+ WARN_ON_ONCE(*sp & 0x1);
+}
+
+/* Take a snapshot of the update side's sequence number. */
+static unsigned long rcu_seq_snap(unsigned long *sp)
+{
+ unsigned long s;
+
+ smp_mb(); /* Caller's modifications seen first by other CPUs. */
+ s = (READ_ONCE(*sp) + 3) & ~0x1;
+ smp_mb(); /* Above access must not bleed into critical section. */
+ return s;
+}
+
+/*
+ * Given a snapshot from rcu_seq_snap(), determine whether or not a
+ * full update-side operation has occurred.
+ */
+static bool rcu_seq_done(unsigned long *sp, unsigned long s)
+{
+ return ULONG_CMP_GE(READ_ONCE(*sp), s);
+}
+
+/* Wrapper functions for expedited grace periods. */
+static void rcu_exp_gp_seq_start(struct rcu_state *rsp)
+{
+ rcu_seq_start(&rsp->expedited_sequence);
+}
+static void rcu_exp_gp_seq_end(struct rcu_state *rsp)
+{
+ rcu_seq_end(&rsp->expedited_sequence);
+ smp_mb(); /* Ensure that consecutive grace periods serialize. */
+}
+static unsigned long rcu_exp_gp_seq_snap(struct rcu_state *rsp)
+{
+ return rcu_seq_snap(&rsp->expedited_sequence);
+}
+static bool rcu_exp_gp_seq_done(struct rcu_state *rsp, unsigned long s)
+{
+ return rcu_seq_done(&rsp->expedited_sequence, s);
+}
+
+/* Common code for synchronize_{rcu,sched}_expedited() work-done checking. */
+static bool sync_exp_work_done(struct rcu_state *rsp, struct rcu_node *rnp,
+ struct rcu_data *rdp,
+ atomic_long_t *stat, unsigned long s)
+{
+ if (rcu_exp_gp_seq_done(rsp, s)) {
+ if (rnp)
+ mutex_unlock(&rnp->exp_funnel_mutex);
+ else if (rdp)
+ mutex_unlock(&rdp->exp_funnel_mutex);
+ /* Ensure test happens before caller kfree(). */
+ smp_mb__before_atomic(); /* ^^^ */
+ atomic_long_inc(stat);
+ return true;
+ }
+ return false;
+}
+
+/*
+ * Funnel-lock acquisition for expedited grace periods. Returns a
+ * pointer to the root rcu_node structure, or NULL if some other
+ * task did the expedited grace period for us.
+ */
+static struct rcu_node *exp_funnel_lock(struct rcu_state *rsp, unsigned long s)
+{
+ struct rcu_data *rdp;
+ struct rcu_node *rnp0;
+ struct rcu_node *rnp1 = NULL;
+
+ /*
+ * First try directly acquiring the root lock in order to reduce
+ * latency in the common case where expedited grace periods are
+ * rare. We check mutex_is_locked() to avoid pathological levels of
+ * memory contention on ->exp_funnel_mutex in the heavy-load case.
+ */
+ rnp0 = rcu_get_root(rsp);
+ if (!mutex_is_locked(&rnp0->exp_funnel_mutex)) {
+ if (mutex_trylock(&rnp0->exp_funnel_mutex)) {
+ if (sync_exp_work_done(rsp, rnp0, NULL,
+ &rsp->expedited_workdone0, s))
+ return NULL;
+ return rnp0;
+ }
+ }
+
+ /*
+ * Each pass through the following loop works its way
+ * up the rcu_node tree, returning if others have done the
+ * work or otherwise falls through holding the root rnp's
+ * ->exp_funnel_mutex. The mapping from CPU to rcu_node structure
+ * can be inexact, as it is just promoting locality and is not
+ * strictly needed for correctness.
*/
- smp_mb(); /* See above comment block. */
+ rdp = per_cpu_ptr(rsp->rda, raw_smp_processor_id());
+ if (sync_exp_work_done(rsp, NULL, NULL, &rsp->expedited_workdone1, s))
+ return NULL;
+ mutex_lock(&rdp->exp_funnel_mutex);
+ rnp0 = rdp->mynode;
+ for (; rnp0 != NULL; rnp0 = rnp0->parent) {
+ if (sync_exp_work_done(rsp, rnp1, rdp,
+ &rsp->expedited_workdone2, s))
+ return NULL;
+ mutex_lock(&rnp0->exp_funnel_mutex);
+ if (rnp1)
+ mutex_unlock(&rnp1->exp_funnel_mutex);
+ else
+ mutex_unlock(&rdp->exp_funnel_mutex);
+ rnp1 = rnp0;
+ }
+ if (sync_exp_work_done(rsp, rnp1, rdp,
+ &rsp->expedited_workdone3, s))
+ return NULL;
+ return rnp1;
+}
+
+/* Invoked on each online non-idle CPU for expedited quiescent state. */
+static int synchronize_sched_expedited_cpu_stop(void *data)
+{
+ struct rcu_data *rdp = data;
+ struct rcu_state *rsp = rdp->rsp;
+
+ /* We are here: If we are last, do the wakeup. */
+ rdp->exp_done = true;
+ if (atomic_dec_and_test(&rsp->expedited_need_qs))
+ wake_up(&rsp->expedited_wq);
return 0;
}
+static void synchronize_sched_expedited_wait(struct rcu_state *rsp)
+{
+ int cpu;
+ unsigned long jiffies_stall;
+ unsigned long jiffies_start;
+ struct rcu_data *rdp;
+ int ret;
+
+ jiffies_stall = rcu_jiffies_till_stall_check();
+ jiffies_start = jiffies;
+
+ for (;;) {
+ ret = wait_event_interruptible_timeout(
+ rsp->expedited_wq,
+ !atomic_read(&rsp->expedited_need_qs),
+ jiffies_stall);
+ if (ret > 0)
+ return;
+ if (ret < 0) {
+ /* Hit a signal, disable CPU stall warnings. */
+ wait_event(rsp->expedited_wq,
+ !atomic_read(&rsp->expedited_need_qs));
+ return;
+ }
+ pr_err("INFO: %s detected expedited stalls on CPUs: {",
+ rsp->name);
+ for_each_online_cpu(cpu) {
+ rdp = per_cpu_ptr(rsp->rda, cpu);
+
+ if (rdp->exp_done)
+ continue;
+ pr_cont(" %d", cpu);
+ }
+ pr_cont(" } %lu jiffies s: %lu\n",
+ jiffies - jiffies_start, rsp->expedited_sequence);
+ for_each_online_cpu(cpu) {
+ rdp = per_cpu_ptr(rsp->rda, cpu);
+
+ if (rdp->exp_done)
+ continue;
+ dump_cpu_task(cpu);
+ }
+ jiffies_stall = 3 * rcu_jiffies_till_stall_check() + 3;
+ }
+}
+
/**
* synchronize_sched_expedited - Brute-force RCU-sched grace period
*
@@ -3068,58 +3524,21 @@ static int synchronize_sched_expedited_cpu_stop(void *data)
* restructure your code to batch your updates, and then use a single
* synchronize_sched() instead.
*
- * This implementation can be thought of as an application of ticket
- * locking to RCU, with sync_sched_expedited_started and
- * sync_sched_expedited_done taking on the roles of the halves
- * of the ticket-lock word. Each task atomically increments
- * sync_sched_expedited_started upon entry, snapshotting the old value,
- * then attempts to stop all the CPUs. If this succeeds, then each
- * CPU will have executed a context switch, resulting in an RCU-sched
- * grace period. We are then done, so we use atomic_cmpxchg() to
- * update sync_sched_expedited_done to match our snapshot -- but
- * only if someone else has not already advanced past our snapshot.
- *
- * On the other hand, if try_stop_cpus() fails, we check the value
- * of sync_sched_expedited_done. If it has advanced past our
- * initial snapshot, then someone else must have forced a grace period
- * some time after we took our snapshot. In this case, our work is
- * done for us, and we can simply return. Otherwise, we try again,
- * but keep our initial snapshot for purposes of checking for someone
- * doing our work for us.
- *
- * If we fail too many times in a row, we fall back to synchronize_sched().
+ * This implementation can be thought of as an application of sequence
+ * locking to expedited grace periods, but using the sequence counter to
+ * determine when someone else has already done the work instead of for
+ * retrying readers.
*/
void synchronize_sched_expedited(void)
{
- cpumask_var_t cm;
- bool cma = false;
int cpu;
- long firstsnap, s, snap;
- int trycount = 0;
+ unsigned long s;
+ struct rcu_node *rnp;
struct rcu_state *rsp = &rcu_sched_state;
- /*
- * If we are in danger of counter wrap, just do synchronize_sched().
- * By allowing sync_sched_expedited_started to advance no more than
- * ULONG_MAX/8 ahead of sync_sched_expedited_done, we are ensuring
- * that more than 3.5 billion CPUs would be required to force a
- * counter wrap on a 32-bit system. Quite a few more CPUs would of
- * course be required on a 64-bit system.
- */
- if (ULONG_CMP_GE((ulong)atomic_long_read(&rsp->expedited_start),
- (ulong)atomic_long_read(&rsp->expedited_done) +
- ULONG_MAX / 8)) {
- synchronize_sched();
- atomic_long_inc(&rsp->expedited_wrap);
- return;
- }
+ /* Take a snapshot of the sequence number. */
+ s = rcu_exp_gp_seq_snap(rsp);
- /*
- * Take a ticket. Note that atomic_inc_return() implies a
- * full memory barrier.
- */
- snap = atomic_long_inc_return(&rsp->expedited_start);
- firstsnap = snap;
if (!try_get_online_cpus()) {
/* CPU hotplug operation in flight, fall back to normal GP. */
wait_rcu_gp(call_rcu_sched);
@@ -3128,100 +3547,38 @@ void synchronize_sched_expedited(void)
}
WARN_ON_ONCE(cpu_is_offline(raw_smp_processor_id()));
- /* Offline CPUs, idle CPUs, and any CPU we run on are quiescent. */
- cma = zalloc_cpumask_var(&cm, GFP_KERNEL);
- if (cma) {
- cpumask_copy(cm, cpu_online_mask);
- cpumask_clear_cpu(raw_smp_processor_id(), cm);
- for_each_cpu(cpu, cm) {
- struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu);
-
- if (!(atomic_add_return(0, &rdtp->dynticks) & 0x1))
- cpumask_clear_cpu(cpu, cm);
- }
- if (cpumask_weight(cm) == 0)
- goto all_cpus_idle;
+ rnp = exp_funnel_lock(rsp, s);
+ if (rnp == NULL) {
+ put_online_cpus();
+ return; /* Someone else did our work for us. */
}
- /*
- * Each pass through the following loop attempts to force a
- * context switch on each CPU.
- */
- while (try_stop_cpus(cma ? cm : cpu_online_mask,
- synchronize_sched_expedited_cpu_stop,
- NULL) == -EAGAIN) {
- put_online_cpus();
- atomic_long_inc(&rsp->expedited_tryfail);
-
- /* Check to see if someone else did our work for us. */
- s = atomic_long_read(&rsp->expedited_done);
- if (ULONG_CMP_GE((ulong)s, (ulong)firstsnap)) {
- /* ensure test happens before caller kfree */
- smp_mb__before_atomic(); /* ^^^ */
- atomic_long_inc(&rsp->expedited_workdone1);
- free_cpumask_var(cm);
- return;
- }
+ rcu_exp_gp_seq_start(rsp);
- /* No joy, try again later. Or just synchronize_sched(). */
- if (trycount++ < 10) {
- udelay(trycount * num_online_cpus());
- } else {
- wait_rcu_gp(call_rcu_sched);
- atomic_long_inc(&rsp->expedited_normal);
- free_cpumask_var(cm);
- return;
- }
+ /* Stop each CPU that is online, non-idle, and not us. */
+ init_waitqueue_head(&rsp->expedited_wq);
+ atomic_set(&rsp->expedited_need_qs, 1); /* Extra count avoids race. */
+ for_each_online_cpu(cpu) {
+ struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu);
+ struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu);
- /* Recheck to see if someone else did our work for us. */
- s = atomic_long_read(&rsp->expedited_done);
- if (ULONG_CMP_GE((ulong)s, (ulong)firstsnap)) {
- /* ensure test happens before caller kfree */
- smp_mb__before_atomic(); /* ^^^ */
- atomic_long_inc(&rsp->expedited_workdone2);
- free_cpumask_var(cm);
- return;
- }
+ rdp->exp_done = false;
- /*
- * Refetching sync_sched_expedited_started allows later
- * callers to piggyback on our grace period. We retry
- * after they started, so our grace period works for them,
- * and they started after our first try, so their grace
- * period works for us.
- */
- if (!try_get_online_cpus()) {
- /* CPU hotplug operation in flight, use normal GP. */
- wait_rcu_gp(call_rcu_sched);
- atomic_long_inc(&rsp->expedited_normal);
- free_cpumask_var(cm);
- return;
- }
- snap = atomic_long_read(&rsp->expedited_start);
- smp_mb(); /* ensure read is before try_stop_cpus(). */
+ /* Skip our CPU and any idle CPUs. */
+ if (raw_smp_processor_id() == cpu ||
+ !(atomic_add_return(0, &rdtp->dynticks) & 0x1))
+ continue;
+ atomic_inc(&rsp->expedited_need_qs);
+ stop_one_cpu_nowait(cpu, synchronize_sched_expedited_cpu_stop,
+ rdp, &rdp->exp_stop_work);
}
- atomic_long_inc(&rsp->expedited_stoppedcpus);
-all_cpus_idle:
- free_cpumask_var(cm);
+ /* Remove extra count and, if necessary, wait for CPUs to stop. */
+ if (!atomic_dec_and_test(&rsp->expedited_need_qs))
+ synchronize_sched_expedited_wait(rsp);
- /*
- * Everyone up to our most recent fetch is covered by our grace
- * period. Update the counter, but only if our work is still
- * relevant -- which it won't be if someone who started later
- * than we did already did their update.
- */
- do {
- atomic_long_inc(&rsp->expedited_done_tries);
- s = atomic_long_read(&rsp->expedited_done);
- if (ULONG_CMP_GE((ulong)s, (ulong)snap)) {
- /* ensure test happens before caller kfree */
- smp_mb__before_atomic(); /* ^^^ */
- atomic_long_inc(&rsp->expedited_done_lost);
- break;
- }
- } while (atomic_long_cmpxchg(&rsp->expedited_done, s, snap) != s);
- atomic_long_inc(&rsp->expedited_done_exit);
+ rcu_exp_gp_seq_end(rsp);
+ mutex_unlock(&rnp->exp_funnel_mutex);
put_online_cpus();
}
@@ -3272,14 +3629,14 @@ static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp)
}
/* Has another RCU grace period completed? */
- if (ACCESS_ONCE(rnp->completed) != rdp->completed) { /* outside lock */
+ if (READ_ONCE(rnp->completed) != rdp->completed) { /* outside lock */
rdp->n_rp_gp_completed++;
return 1;
}
/* Has a new RCU grace period started? */
- if (ACCESS_ONCE(rnp->gpnum) != rdp->gpnum ||
- unlikely(ACCESS_ONCE(rdp->gpwrap))) { /* outside lock */
+ if (READ_ONCE(rnp->gpnum) != rdp->gpnum ||
+ unlikely(READ_ONCE(rdp->gpwrap))) { /* outside lock */
rdp->n_rp_gp_started++;
return 1;
}
@@ -3315,7 +3672,7 @@ static int rcu_pending(void)
* non-NULL, store an indication of whether all callbacks are lazy.
* (If there are no callbacks, all of them are deemed to be lazy.)
*/
-static int __maybe_unused rcu_cpu_has_callbacks(bool *all_lazy)
+static bool __maybe_unused rcu_cpu_has_callbacks(bool *all_lazy)
{
bool al = true;
bool hc = false;
@@ -3358,10 +3715,10 @@ static void rcu_barrier_callback(struct rcu_head *rhp)
struct rcu_state *rsp = rdp->rsp;
if (atomic_dec_and_test(&rsp->barrier_cpu_count)) {
- _rcu_barrier_trace(rsp, "LastCB", -1, rsp->n_barrier_done);
+ _rcu_barrier_trace(rsp, "LastCB", -1, rsp->barrier_sequence);
complete(&rsp->barrier_completion);
} else {
- _rcu_barrier_trace(rsp, "CB", -1, rsp->n_barrier_done);
+ _rcu_barrier_trace(rsp, "CB", -1, rsp->barrier_sequence);
}
}
@@ -3373,7 +3730,7 @@ static void rcu_barrier_func(void *type)
struct rcu_state *rsp = type;
struct rcu_data *rdp = raw_cpu_ptr(rsp->rda);
- _rcu_barrier_trace(rsp, "IRQ", -1, rsp->n_barrier_done);
+ _rcu_barrier_trace(rsp, "IRQ", -1, rsp->barrier_sequence);
atomic_inc(&rsp->barrier_cpu_count);
rsp->call(&rdp->barrier_head, rcu_barrier_callback);
}
@@ -3386,55 +3743,24 @@ static void _rcu_barrier(struct rcu_state *rsp)
{
int cpu;
struct rcu_data *rdp;
- unsigned long snap = ACCESS_ONCE(rsp->n_barrier_done);
- unsigned long snap_done;
+ unsigned long s = rcu_seq_snap(&rsp->barrier_sequence);
- _rcu_barrier_trace(rsp, "Begin", -1, snap);
+ _rcu_barrier_trace(rsp, "Begin", -1, s);
/* Take mutex to serialize concurrent rcu_barrier() requests. */
mutex_lock(&rsp->barrier_mutex);
- /*
- * Ensure that all prior references, including to ->n_barrier_done,
- * are ordered before the _rcu_barrier() machinery.
- */
- smp_mb(); /* See above block comment. */
-
- /*
- * Recheck ->n_barrier_done to see if others did our work for us.
- * This means checking ->n_barrier_done for an even-to-odd-to-even
- * transition. The "if" expression below therefore rounds the old
- * value up to the next even number and adds two before comparing.
- */
- snap_done = rsp->n_barrier_done;
- _rcu_barrier_trace(rsp, "Check", -1, snap_done);
-
- /*
- * If the value in snap is odd, we needed to wait for the current
- * rcu_barrier() to complete, then wait for the next one, in other
- * words, we need the value of snap_done to be three larger than
- * the value of snap. On the other hand, if the value in snap is
- * even, we only had to wait for the next rcu_barrier() to complete,
- * in other words, we need the value of snap_done to be only two
- * greater than the value of snap. The "(snap + 3) & ~0x1" computes
- * this for us (thank you, Linus!).
- */
- if (ULONG_CMP_GE(snap_done, (snap + 3) & ~0x1)) {
- _rcu_barrier_trace(rsp, "EarlyExit", -1, snap_done);
+ /* Did someone else do our work for us? */
+ if (rcu_seq_done(&rsp->barrier_sequence, s)) {
+ _rcu_barrier_trace(rsp, "EarlyExit", -1, rsp->barrier_sequence);
smp_mb(); /* caller's subsequent code after above check. */
mutex_unlock(&rsp->barrier_mutex);
return;
}
- /*
- * Increment ->n_barrier_done to avoid duplicate work. Use
- * ACCESS_ONCE() to prevent the compiler from speculating
- * the increment to precede the early-exit check.
- */
- ACCESS_ONCE(rsp->n_barrier_done) = rsp->n_barrier_done + 1;
- WARN_ON_ONCE((rsp->n_barrier_done & 0x1) != 1);
- _rcu_barrier_trace(rsp, "Inc1", -1, rsp->n_barrier_done);
- smp_mb(); /* Order ->n_barrier_done increment with below mechanism. */
+ /* Mark the start of the barrier operation. */
+ rcu_seq_start(&rsp->barrier_sequence);
+ _rcu_barrier_trace(rsp, "Inc1", -1, rsp->barrier_sequence);
/*
* Initialize the count to one rather than to zero in order to
@@ -3458,22 +3784,22 @@ static void _rcu_barrier(struct rcu_state *rsp)
if (rcu_is_nocb_cpu(cpu)) {
if (!rcu_nocb_cpu_needs_barrier(rsp, cpu)) {
_rcu_barrier_trace(rsp, "OfflineNoCB", cpu,
- rsp->n_barrier_done);
+ rsp->barrier_sequence);
} else {
_rcu_barrier_trace(rsp, "OnlineNoCB", cpu,
- rsp->n_barrier_done);
+ rsp->barrier_sequence);
smp_mb__before_atomic();
atomic_inc(&rsp->barrier_cpu_count);
__call_rcu(&rdp->barrier_head,
rcu_barrier_callback, rsp, cpu, 0);
}
- } else if (ACCESS_ONCE(rdp->qlen)) {
+ } else if (READ_ONCE(rdp->qlen)) {
_rcu_barrier_trace(rsp, "OnlineQ", cpu,
- rsp->n_barrier_done);
+ rsp->barrier_sequence);
smp_call_function_single(cpu, rcu_barrier_func, rsp, 1);
} else {
_rcu_barrier_trace(rsp, "OnlineNQ", cpu,
- rsp->n_barrier_done);
+ rsp->barrier_sequence);
}
}
put_online_cpus();
@@ -3485,16 +3811,13 @@ static void _rcu_barrier(struct rcu_state *rsp)
if (atomic_dec_and_test(&rsp->barrier_cpu_count))
complete(&rsp->barrier_completion);
- /* Increment ->n_barrier_done to prevent duplicate work. */
- smp_mb(); /* Keep increment after above mechanism. */
- ACCESS_ONCE(rsp->n_barrier_done) = rsp->n_barrier_done + 1;
- WARN_ON_ONCE((rsp->n_barrier_done & 0x1) != 0);
- _rcu_barrier_trace(rsp, "Inc2", -1, rsp->n_barrier_done);
- smp_mb(); /* Keep increment before caller's subsequent code. */
-
/* Wait for all rcu_barrier_callback() callbacks to be invoked. */
wait_for_completion(&rsp->barrier_completion);
+ /* Mark the end of the barrier operation. */
+ _rcu_barrier_trace(rsp, "Inc2", -1, rsp->barrier_sequence);
+ rcu_seq_end(&rsp->barrier_sequence);
+
/* Other rcu_barrier() invocations can now safely proceed. */
mutex_unlock(&rsp->barrier_mutex);
}
@@ -3518,6 +3841,28 @@ void rcu_barrier_sched(void)
EXPORT_SYMBOL_GPL(rcu_barrier_sched);
/*
+ * Propagate ->qsinitmask bits up the rcu_node tree to account for the
+ * first CPU in a given leaf rcu_node structure coming online. The caller
+ * must hold the corresponding leaf rcu_node ->lock with interrrupts
+ * disabled.
+ */
+static void rcu_init_new_rnp(struct rcu_node *rnp_leaf)
+{
+ long mask;
+ struct rcu_node *rnp = rnp_leaf;
+
+ for (;;) {
+ mask = rnp->grpmask;
+ rnp = rnp->parent;
+ if (rnp == NULL)
+ return;
+ raw_spin_lock(&rnp->lock); /* Interrupts already disabled. */
+ rnp->qsmaskinit |= mask;
+ raw_spin_unlock(&rnp->lock); /* Interrupts remain disabled. */
+ }
+}
+
+/*
* Do boot-time initialization of a CPU's per-CPU RCU data.
*/
static void __init
@@ -3535,6 +3880,7 @@ rcu_boot_init_percpu_data(int cpu, struct rcu_state *rsp)
WARN_ON_ONCE(atomic_read(&rdp->dynticks->dynticks) != 1);
rdp->cpu = cpu;
rdp->rsp = rsp;
+ mutex_init(&rdp->exp_funnel_mutex);
rcu_boot_init_nocb_percpu_data(rdp);
raw_spin_unlock_irqrestore(&rnp->lock, flags);
}
@@ -3553,49 +3899,37 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp)
struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu);
struct rcu_node *rnp = rcu_get_root(rsp);
- /* Exclude new grace periods. */
- mutex_lock(&rsp->onoff_mutex);
-
/* Set up local state, ensuring consistent view of global state. */
raw_spin_lock_irqsave(&rnp->lock, flags);
rdp->beenonline = 1; /* We have now been online. */
rdp->qlen_last_fqs_check = 0;
rdp->n_force_qs_snap = rsp->n_force_qs;
rdp->blimit = blimit;
- init_callback_list(rdp); /* Re-enable callbacks on this CPU. */
+ if (!rdp->nxtlist)
+ init_callback_list(rdp); /* Re-enable callbacks on this CPU. */
rdp->dynticks->dynticks_nesting = DYNTICK_TASK_EXIT_IDLE;
rcu_sysidle_init_percpu_data(rdp->dynticks);
atomic_set(&rdp->dynticks->dynticks,
(atomic_read(&rdp->dynticks->dynticks) & ~0x1) + 1);
raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
- /* Add CPU to rcu_node bitmasks. */
+ /*
+ * Add CPU to leaf rcu_node pending-online bitmask. Any needed
+ * propagation up the rcu_node tree will happen at the beginning
+ * of the next grace period.
+ */
rnp = rdp->mynode;
mask = rdp->grpmask;
- do {
- /* Exclude any attempts to start a new GP on small systems. */
- raw_spin_lock(&rnp->lock); /* irqs already disabled. */
- rnp->qsmaskinit |= mask;
- mask = rnp->grpmask;
- if (rnp == rdp->mynode) {
- /*
- * If there is a grace period in progress, we will
- * set up to wait for it next time we run the
- * RCU core code.
- */
- rdp->gpnum = rnp->completed;
- rdp->completed = rnp->completed;
- rdp->passed_quiesce = 0;
- rdp->rcu_qs_ctr_snap = __this_cpu_read(rcu_qs_ctr);
- rdp->qs_pending = 0;
- trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("cpuonl"));
- }
- raw_spin_unlock(&rnp->lock); /* irqs already disabled. */
- rnp = rnp->parent;
- } while (rnp != NULL && !(rnp->qsmaskinit & mask));
- local_irq_restore(flags);
-
- mutex_unlock(&rsp->onoff_mutex);
+ raw_spin_lock(&rnp->lock); /* irqs already disabled. */
+ smp_mb__after_unlock_lock();
+ rnp->qsmaskinitnext |= mask;
+ rdp->gpnum = rnp->completed; /* Make CPU later note any new GP. */
+ rdp->completed = rnp->completed;
+ rdp->passed_quiesce = false;
+ rdp->rcu_qs_ctr_snap = per_cpu(rcu_qs_ctr, cpu);
+ rdp->qs_pending = false;
+ trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("cpuonl"));
+ raw_spin_unlock_irqrestore(&rnp->lock, flags);
}
static void rcu_prepare_cpu(int cpu)
@@ -3609,15 +3943,14 @@ static void rcu_prepare_cpu(int cpu)
/*
* Handle CPU online/offline notification events.
*/
-static int rcu_cpu_notify(struct notifier_block *self,
- unsigned long action, void *hcpu)
+int rcu_cpu_notify(struct notifier_block *self,
+ unsigned long action, void *hcpu)
{
long cpu = (long)hcpu;
struct rcu_data *rdp = per_cpu_ptr(rcu_state_p->rda, cpu);
struct rcu_node *rnp = rdp->mynode;
struct rcu_state *rsp;
- trace_rcu_utilization(TPS("Start CPU hotplug"));
switch (action) {
case CPU_UP_PREPARE:
case CPU_UP_PREPARE_FROZEN:
@@ -3637,6 +3970,11 @@ static int rcu_cpu_notify(struct notifier_block *self,
for_each_rcu_flavor(rsp)
rcu_cleanup_dying_cpu(rsp);
break;
+ case CPU_DYING_IDLE:
+ for_each_rcu_flavor(rsp) {
+ rcu_cleanup_dying_idle_cpu(cpu, rsp);
+ }
+ break;
case CPU_DEAD:
case CPU_DEAD_FROZEN:
case CPU_UP_CANCELED:
@@ -3649,7 +3987,6 @@ static int rcu_cpu_notify(struct notifier_block *self,
default:
break;
}
- trace_rcu_utilization(TPS("End CPU hotplug"));
return NOTIFY_OK;
}
@@ -3660,11 +3997,12 @@ static int rcu_pm_notify(struct notifier_block *self,
case PM_HIBERNATION_PREPARE:
case PM_SUSPEND_PREPARE:
if (nr_cpu_ids <= 256) /* Expediting bad for large systems. */
- rcu_expedited = 1;
+ rcu_expedite_gp();
break;
case PM_POST_HIBERNATION:
case PM_POST_SUSPEND:
- rcu_expedited = 0;
+ if (nr_cpu_ids <= 256) /* Expediting bad for large systems. */
+ rcu_unexpedite_gp();
break;
default:
break;
@@ -3732,32 +4070,28 @@ void rcu_scheduler_starting(void)
/*
* Compute the per-level fanout, either using the exact fanout specified
- * or balancing the tree, depending on CONFIG_RCU_FANOUT_EXACT.
+ * or balancing the tree, depending on the rcu_fanout_exact boot parameter.
*/
-#ifdef CONFIG_RCU_FANOUT_EXACT
-static void __init rcu_init_levelspread(struct rcu_state *rsp)
+static void __init rcu_init_levelspread(int *levelspread, const int *levelcnt)
{
int i;
- rsp->levelspread[rcu_num_lvls - 1] = rcu_fanout_leaf;
- for (i = rcu_num_lvls - 2; i >= 0; i--)
- rsp->levelspread[i] = CONFIG_RCU_FANOUT;
-}
-#else /* #ifdef CONFIG_RCU_FANOUT_EXACT */
-static void __init rcu_init_levelspread(struct rcu_state *rsp)
-{
- int ccur;
- int cprv;
- int i;
-
- cprv = nr_cpu_ids;
- for (i = rcu_num_lvls - 1; i >= 0; i--) {
- ccur = rsp->levelcnt[i];
- rsp->levelspread[i] = (cprv + ccur - 1) / ccur;
- cprv = ccur;
+ if (rcu_fanout_exact) {
+ levelspread[rcu_num_lvls - 1] = rcu_fanout_leaf;
+ for (i = rcu_num_lvls - 2; i >= 0; i--)
+ levelspread[i] = RCU_FANOUT;
+ } else {
+ int ccur;
+ int cprv;
+
+ cprv = nr_cpu_ids;
+ for (i = rcu_num_lvls - 1; i >= 0; i--) {
+ ccur = levelcnt[i];
+ levelspread[i] = (cprv + ccur - 1) / ccur;
+ cprv = ccur;
+ }
}
}
-#endif /* #else #ifdef CONFIG_RCU_FANOUT_EXACT */
/*
* Helper function for rcu_init() that initializes one rcu_state structure.
@@ -3765,44 +4099,41 @@ static void __init rcu_init_levelspread(struct rcu_state *rsp)
static void __init rcu_init_one(struct rcu_state *rsp,
struct rcu_data __percpu *rda)
{
- static const char * const buf[] = {
- "rcu_node_0",
- "rcu_node_1",
- "rcu_node_2",
- "rcu_node_3" }; /* Match MAX_RCU_LVLS */
- static const char * const fqs[] = {
- "rcu_node_fqs_0",
- "rcu_node_fqs_1",
- "rcu_node_fqs_2",
- "rcu_node_fqs_3" }; /* Match MAX_RCU_LVLS */
+ static const char * const buf[] = RCU_NODE_NAME_INIT;
+ static const char * const fqs[] = RCU_FQS_NAME_INIT;
+ static const char * const exp[] = RCU_EXP_NAME_INIT;
+ static const char * const exp_sched[] = RCU_EXP_SCHED_NAME_INIT;
static u8 fl_mask = 0x1;
+
+ int levelcnt[RCU_NUM_LVLS]; /* # nodes in each level. */
+ int levelspread[RCU_NUM_LVLS]; /* kids/node in each level. */
int cpustride = 1;
int i;
int j;
struct rcu_node *rnp;
- BUILD_BUG_ON(MAX_RCU_LVLS > ARRAY_SIZE(buf)); /* Fix buf[] init! */
+ BUILD_BUG_ON(RCU_NUM_LVLS > ARRAY_SIZE(buf)); /* Fix buf[] init! */
- /* Silence gcc 4.8 warning about array index out of range. */
- if (rcu_num_lvls > RCU_NUM_LVLS)
- panic("rcu_init_one: rcu_num_lvls overflow");
+ /* Silence gcc 4.8 false positive about array index out of range. */
+ if (rcu_num_lvls <= 0 || rcu_num_lvls > RCU_NUM_LVLS)
+ panic("rcu_init_one: rcu_num_lvls out of range");
/* Initialize the level-tracking arrays. */
for (i = 0; i < rcu_num_lvls; i++)
- rsp->levelcnt[i] = num_rcu_lvl[i];
+ levelcnt[i] = num_rcu_lvl[i];
for (i = 1; i < rcu_num_lvls; i++)
- rsp->level[i] = rsp->level[i - 1] + rsp->levelcnt[i - 1];
- rcu_init_levelspread(rsp);
+ rsp->level[i] = rsp->level[i - 1] + levelcnt[i - 1];
+ rcu_init_levelspread(levelspread, levelcnt);
rsp->flavor_mask = fl_mask;
fl_mask <<= 1;
/* Initialize the elements themselves, starting from the leaves. */
for (i = rcu_num_lvls - 1; i >= 0; i--) {
- cpustride *= rsp->levelspread[i];
+ cpustride *= levelspread[i];
rnp = rsp->level[i];
- for (j = 0; j < rsp->levelcnt[i]; j++, rnp++) {
+ for (j = 0; j < levelcnt[i]; j++, rnp++) {
raw_spin_lock_init(&rnp->lock);
lockdep_set_class_and_name(&rnp->lock,
&rcu_node_class[i], buf[i]);
@@ -3822,18 +4153,26 @@ static void __init rcu_init_one(struct rcu_state *rsp,
rnp->grpmask = 0;
rnp->parent = NULL;
} else {
- rnp->grpnum = j % rsp->levelspread[i - 1];
+ rnp->grpnum = j % levelspread[i - 1];
rnp->grpmask = 1UL << rnp->grpnum;
rnp->parent = rsp->level[i - 1] +
- j / rsp->levelspread[i - 1];
+ j / levelspread[i - 1];
}
rnp->level = i;
INIT_LIST_HEAD(&rnp->blkd_tasks);
rcu_init_one_nocb(rnp);
+ mutex_init(&rnp->exp_funnel_mutex);
+ if (rsp == &rcu_sched_state)
+ lockdep_set_class_and_name(
+ &rnp->exp_funnel_mutex,
+ &rcu_exp_sched_class[i], exp_sched[i]);
+ else
+ lockdep_set_class_and_name(
+ &rnp->exp_funnel_mutex,
+ &rcu_exp_class[i], exp[i]);
}
}
- rsp->rda = rda;
init_waitqueue_head(&rsp->gp_wq);
rnp = rsp->level[rcu_num_lvls - 1];
for_each_possible_cpu(i) {
@@ -3854,9 +4193,7 @@ static void __init rcu_init_geometry(void)
{
ulong d;
int i;
- int j;
- int n = nr_cpu_ids;
- int rcu_capacity[MAX_RCU_LVLS + 1];
+ int rcu_capacity[RCU_NUM_LVLS];
/*
* Initialize any unspecified boot parameters.
@@ -3872,64 +4209,92 @@ static void __init rcu_init_geometry(void)
jiffies_till_next_fqs = d;
/* If the compile-time values are accurate, just leave. */
- if (rcu_fanout_leaf == CONFIG_RCU_FANOUT_LEAF &&
+ if (rcu_fanout_leaf == RCU_FANOUT_LEAF &&
nr_cpu_ids == NR_CPUS)
return;
pr_info("RCU: Adjusting geometry for rcu_fanout_leaf=%d, nr_cpu_ids=%d\n",
rcu_fanout_leaf, nr_cpu_ids);
/*
- * Compute number of nodes that can be handled an rcu_node tree
- * with the given number of levels. Setting rcu_capacity[0] makes
- * some of the arithmetic easier.
- */
- rcu_capacity[0] = 1;
- rcu_capacity[1] = rcu_fanout_leaf;
- for (i = 2; i <= MAX_RCU_LVLS; i++)
- rcu_capacity[i] = rcu_capacity[i - 1] * CONFIG_RCU_FANOUT;
-
- /*
* The boot-time rcu_fanout_leaf parameter is only permitted
* to increase the leaf-level fanout, not decrease it. Of course,
* the leaf-level fanout cannot exceed the number of bits in
- * the rcu_node masks. Finally, the tree must be able to accommodate
- * the configured number of CPUs. Complain and fall back to the
- * compile-time values if these limits are exceeded.
+ * the rcu_node masks. Complain and fall back to the compile-
+ * time values if these limits are exceeded.
*/
- if (rcu_fanout_leaf < CONFIG_RCU_FANOUT_LEAF ||
- rcu_fanout_leaf > sizeof(unsigned long) * 8 ||
- n > rcu_capacity[MAX_RCU_LVLS]) {
+ if (rcu_fanout_leaf < RCU_FANOUT_LEAF ||
+ rcu_fanout_leaf > sizeof(unsigned long) * 8) {
+ rcu_fanout_leaf = RCU_FANOUT_LEAF;
WARN_ON(1);
return;
}
+ /*
+ * Compute number of nodes that can be handled an rcu_node tree
+ * with the given number of levels.
+ */
+ rcu_capacity[0] = rcu_fanout_leaf;
+ for (i = 1; i < RCU_NUM_LVLS; i++)
+ rcu_capacity[i] = rcu_capacity[i - 1] * RCU_FANOUT;
+
+ /*
+ * The tree must be able to accommodate the configured number of CPUs.
+ * If this limit is exceeded than we have a serious problem elsewhere.
+ */
+ if (nr_cpu_ids > rcu_capacity[RCU_NUM_LVLS - 1])
+ panic("rcu_init_geometry: rcu_capacity[] is too small");
+
+ /* Calculate the number of levels in the tree. */
+ for (i = 0; nr_cpu_ids > rcu_capacity[i]; i++) {
+ }
+ rcu_num_lvls = i + 1;
+
/* Calculate the number of rcu_nodes at each level of the tree. */
- for (i = 1; i <= MAX_RCU_LVLS; i++)
- if (n <= rcu_capacity[i]) {
- for (j = 0; j <= i; j++)
- num_rcu_lvl[j] =
- DIV_ROUND_UP(n, rcu_capacity[i - j]);
- rcu_num_lvls = i;
- for (j = i + 1; j <= MAX_RCU_LVLS; j++)
- num_rcu_lvl[j] = 0;
- break;
- }
+ for (i = 0; i < rcu_num_lvls; i++) {
+ int cap = rcu_capacity[(rcu_num_lvls - 1) - i];
+ num_rcu_lvl[i] = DIV_ROUND_UP(nr_cpu_ids, cap);
+ }
/* Calculate the total number of rcu_node structures. */
rcu_num_nodes = 0;
- for (i = 0; i <= MAX_RCU_LVLS; i++)
+ for (i = 0; i < rcu_num_lvls; i++)
rcu_num_nodes += num_rcu_lvl[i];
- rcu_num_nodes -= n;
+}
+
+/*
+ * Dump out the structure of the rcu_node combining tree associated
+ * with the rcu_state structure referenced by rsp.
+ */
+static void __init rcu_dump_rcu_node_tree(struct rcu_state *rsp)
+{
+ int level = 0;
+ struct rcu_node *rnp;
+
+ pr_info("rcu_node tree layout dump\n");
+ pr_info(" ");
+ rcu_for_each_node_breadth_first(rsp, rnp) {
+ if (rnp->level != level) {
+ pr_cont("\n");
+ pr_info(" ");
+ level = rnp->level;
+ }
+ pr_cont("%d:%d ^%d ", rnp->grplo, rnp->grphi, rnp->grpnum);
+ }
+ pr_cont("\n");
}
void __init rcu_init(void)
{
int cpu;
+ rcu_early_boot_tests();
+
rcu_bootup_announce();
rcu_init_geometry();
rcu_init_one(&rcu_bh_state, &rcu_bh_data);
rcu_init_one(&rcu_sched_state, &rcu_sched_data);
+ if (dump_tree)
+ rcu_dump_rcu_node_tree(&rcu_sched_state);
__rcu_init_preempt();
open_softirq(RCU_SOFTIRQ, rcu_process_callbacks);
@@ -3942,8 +4307,6 @@ void __init rcu_init(void)
pm_notifier(rcu_pm_notify, 0);
for_each_online_cpu(cpu)
rcu_cpu_notify(NULL, CPU_UP_PREPARE, (void *)(long)cpu);
-
- rcu_early_boot_tests();
}
#include "tree_plugin.h"
diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h
index 119de399eb2f..2e991f8361e4 100644
--- a/kernel/rcu/tree.h
+++ b/kernel/rcu/tree.h
@@ -27,6 +27,7 @@
#include <linux/threads.h>
#include <linux/cpumask.h>
#include <linux/seqlock.h>
+#include <linux/stop_machine.h>
/*
* Define shape of hierarchy based on NR_CPUS, CONFIG_RCU_FANOUT, and
@@ -35,47 +36,82 @@
* In practice, this did work well going from three levels to four.
* Of course, your mileage may vary.
*/
-#define MAX_RCU_LVLS 4
-#define RCU_FANOUT_1 (CONFIG_RCU_FANOUT_LEAF)
-#define RCU_FANOUT_2 (RCU_FANOUT_1 * CONFIG_RCU_FANOUT)
-#define RCU_FANOUT_3 (RCU_FANOUT_2 * CONFIG_RCU_FANOUT)
-#define RCU_FANOUT_4 (RCU_FANOUT_3 * CONFIG_RCU_FANOUT)
+
+#ifdef CONFIG_RCU_FANOUT
+#define RCU_FANOUT CONFIG_RCU_FANOUT
+#else /* #ifdef CONFIG_RCU_FANOUT */
+# ifdef CONFIG_64BIT
+# define RCU_FANOUT 64
+# else
+# define RCU_FANOUT 32
+# endif
+#endif /* #else #ifdef CONFIG_RCU_FANOUT */
+
+#ifdef CONFIG_RCU_FANOUT_LEAF
+#define RCU_FANOUT_LEAF CONFIG_RCU_FANOUT_LEAF
+#else /* #ifdef CONFIG_RCU_FANOUT_LEAF */
+# ifdef CONFIG_64BIT
+# define RCU_FANOUT_LEAF 64
+# else
+# define RCU_FANOUT_LEAF 32
+# endif
+#endif /* #else #ifdef CONFIG_RCU_FANOUT_LEAF */
+
+#define RCU_FANOUT_1 (RCU_FANOUT_LEAF)
+#define RCU_FANOUT_2 (RCU_FANOUT_1 * RCU_FANOUT)
+#define RCU_FANOUT_3 (RCU_FANOUT_2 * RCU_FANOUT)
+#define RCU_FANOUT_4 (RCU_FANOUT_3 * RCU_FANOUT)
#if NR_CPUS <= RCU_FANOUT_1
# define RCU_NUM_LVLS 1
# define NUM_RCU_LVL_0 1
-# define NUM_RCU_LVL_1 (NR_CPUS)
-# define NUM_RCU_LVL_2 0
-# define NUM_RCU_LVL_3 0
-# define NUM_RCU_LVL_4 0
+# define NUM_RCU_NODES NUM_RCU_LVL_0
+# define NUM_RCU_LVL_INIT { NUM_RCU_LVL_0 }
+# define RCU_NODE_NAME_INIT { "rcu_node_0" }
+# define RCU_FQS_NAME_INIT { "rcu_node_fqs_0" }
+# define RCU_EXP_NAME_INIT { "rcu_node_exp_0" }
+# define RCU_EXP_SCHED_NAME_INIT \
+ { "rcu_node_exp_sched_0" }
#elif NR_CPUS <= RCU_FANOUT_2
# define RCU_NUM_LVLS 2
# define NUM_RCU_LVL_0 1
# define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_1)
-# define NUM_RCU_LVL_2 (NR_CPUS)
-# define NUM_RCU_LVL_3 0
-# define NUM_RCU_LVL_4 0
+# define NUM_RCU_NODES (NUM_RCU_LVL_0 + NUM_RCU_LVL_1)
+# define NUM_RCU_LVL_INIT { NUM_RCU_LVL_0, NUM_RCU_LVL_1 }
+# define RCU_NODE_NAME_INIT { "rcu_node_0", "rcu_node_1" }
+# define RCU_FQS_NAME_INIT { "rcu_node_fqs_0", "rcu_node_fqs_1" }
+# define RCU_EXP_NAME_INIT { "rcu_node_exp_0", "rcu_node_exp_1" }
+# define RCU_EXP_SCHED_NAME_INIT \
+ { "rcu_node_exp_sched_0", "rcu_node_exp_sched_1" }
#elif NR_CPUS <= RCU_FANOUT_3
# define RCU_NUM_LVLS 3
# define NUM_RCU_LVL_0 1
# define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_2)
# define NUM_RCU_LVL_2 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_1)
-# define NUM_RCU_LVL_3 (NR_CPUS)
-# define NUM_RCU_LVL_4 0
+# define NUM_RCU_NODES (NUM_RCU_LVL_0 + NUM_RCU_LVL_1 + NUM_RCU_LVL_2)
+# define NUM_RCU_LVL_INIT { NUM_RCU_LVL_0, NUM_RCU_LVL_1, NUM_RCU_LVL_2 }
+# define RCU_NODE_NAME_INIT { "rcu_node_0", "rcu_node_1", "rcu_node_2" }
+# define RCU_FQS_NAME_INIT { "rcu_node_fqs_0", "rcu_node_fqs_1", "rcu_node_fqs_2" }
+# define RCU_EXP_NAME_INIT { "rcu_node_exp_0", "rcu_node_exp_1", "rcu_node_exp_2" }
+# define RCU_EXP_SCHED_NAME_INIT \
+ { "rcu_node_exp_sched_0", "rcu_node_exp_sched_1", "rcu_node_exp_sched_2" }
#elif NR_CPUS <= RCU_FANOUT_4
# define RCU_NUM_LVLS 4
# define NUM_RCU_LVL_0 1
# define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_3)
# define NUM_RCU_LVL_2 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_2)
# define NUM_RCU_LVL_3 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_1)
-# define NUM_RCU_LVL_4 (NR_CPUS)
+# define NUM_RCU_NODES (NUM_RCU_LVL_0 + NUM_RCU_LVL_1 + NUM_RCU_LVL_2 + NUM_RCU_LVL_3)
+# define NUM_RCU_LVL_INIT { NUM_RCU_LVL_0, NUM_RCU_LVL_1, NUM_RCU_LVL_2, NUM_RCU_LVL_3 }
+# define RCU_NODE_NAME_INIT { "rcu_node_0", "rcu_node_1", "rcu_node_2", "rcu_node_3" }
+# define RCU_FQS_NAME_INIT { "rcu_node_fqs_0", "rcu_node_fqs_1", "rcu_node_fqs_2", "rcu_node_fqs_3" }
+# define RCU_EXP_NAME_INIT { "rcu_node_exp_0", "rcu_node_exp_1", "rcu_node_exp_2", "rcu_node_exp_3" }
+# define RCU_EXP_SCHED_NAME_INIT \
+ { "rcu_node_exp_sched_0", "rcu_node_exp_sched_1", "rcu_node_exp_sched_2", "rcu_node_exp_sched_3" }
#else
# error "CONFIG_RCU_FANOUT insufficient for NR_CPUS"
#endif /* #if (NR_CPUS) <= RCU_FANOUT_1 */
-#define RCU_SUM (NUM_RCU_LVL_0 + NUM_RCU_LVL_1 + NUM_RCU_LVL_2 + NUM_RCU_LVL_3 + NUM_RCU_LVL_4)
-#define NUM_RCU_NODES (RCU_SUM - NR_CPUS)
-
extern int rcu_num_lvls;
extern int rcu_num_nodes;
@@ -141,12 +177,20 @@ struct rcu_node {
/* complete (only for PREEMPT_RCU). */
unsigned long qsmaskinit;
/* Per-GP initial value for qsmask & expmask. */
+ /* Initialized from ->qsmaskinitnext at the */
+ /* beginning of each grace period. */
+ unsigned long qsmaskinitnext;
+ /* Online CPUs for next grace period. */
unsigned long grpmask; /* Mask to apply to parent qsmask. */
/* Only one bit will be set in this mask. */
int grplo; /* lowest-numbered CPU or group here. */
int grphi; /* highest-numbered CPU or group here. */
u8 grpnum; /* CPU/group number for next level up. */
u8 level; /* root is at level 0. */
+ bool wait_blkd_tasks;/* Necessary to wait for blocked tasks to */
+ /* exit RCU read-side critical sections */
+ /* before propagating offline up the */
+ /* rcu_node tree? */
struct rcu_node *parent;
struct list_head blkd_tasks;
/* Tasks blocked in RCU read-side critical */
@@ -162,7 +206,6 @@ struct rcu_node {
/* if there is no such task. If there */
/* is no current expedited grace period, */
/* then there can cannot be any such task. */
-#ifdef CONFIG_RCU_BOOST
struct list_head *boost_tasks;
/* Pointer to first task that needs to be */
/* priority boosted, or NULL if no priority */
@@ -200,7 +243,6 @@ struct rcu_node {
unsigned long n_balk_nos;
/* Refused to boost: not sure why, though. */
/* This can happen due to race conditions. */
-#endif /* #ifdef CONFIG_RCU_BOOST */
#ifdef CONFIG_RCU_NOCB_CPU
wait_queue_head_t nocb_gp_wq[2];
/* Place for rcu_nocb_kthread() to wait GP. */
@@ -208,6 +250,8 @@ struct rcu_node {
int need_future_gp[2];
/* Counts of upcoming no-CB GP requests. */
raw_spinlock_t fqslock ____cacheline_internodealigned_in_smp;
+
+ struct mutex exp_funnel_mutex ____cacheline_internodealigned_in_smp;
} ____cacheline_internodealigned_in_smp;
/*
@@ -259,12 +303,13 @@ struct rcu_data {
bool gpwrap; /* Possible gpnum/completed wrap. */
struct rcu_node *mynode; /* This CPU's leaf of hierarchy */
unsigned long grpmask; /* Mask to apply to leaf qsmask. */
-#ifdef CONFIG_RCU_CPU_STALL_INFO
unsigned long ticks_this_gp; /* The number of scheduling-clock */
/* ticks this CPU has handled */
/* during and after the last grace */
/* period it is aware of. */
-#endif /* #ifdef CONFIG_RCU_CPU_STALL_INFO */
+ struct cpu_stop_work exp_stop_work;
+ /* Expedited grace-period control */
+ /* for CPU stopping. */
/* 2) batch handling */
/*
@@ -327,11 +372,13 @@ struct rcu_data {
unsigned long n_rp_nocb_defer_wakeup;
unsigned long n_rp_need_nothing;
- /* 6) _rcu_barrier() and OOM callbacks. */
+ /* 6) _rcu_barrier(), OOM callbacks, and expediting. */
struct rcu_head barrier_head;
#ifdef CONFIG_RCU_FAST_NO_HZ
struct rcu_head oom_head;
#endif /* #ifdef CONFIG_RCU_FAST_NO_HZ */
+ struct mutex exp_funnel_mutex;
+ bool exp_done; /* Expedited QS for this CPU? */
/* 7) Callback offloading. */
#ifdef CONFIG_RCU_NOCB_CPU
@@ -359,9 +406,7 @@ struct rcu_data {
#endif /* #ifdef CONFIG_RCU_NOCB_CPU */
/* 8) RCU CPU stall data. */
-#ifdef CONFIG_RCU_CPU_STALL_INFO
unsigned int softirq_snap; /* Snapshot of softirq activity. */
-#endif /* #ifdef CONFIG_RCU_CPU_STALL_INFO */
int cpu;
struct rcu_state *rsp;
@@ -414,9 +459,9 @@ do { \
*/
struct rcu_state {
struct rcu_node node[NUM_RCU_NODES]; /* Hierarchy. */
- struct rcu_node *level[RCU_NUM_LVLS]; /* Hierarchy levels. */
- u32 levelcnt[MAX_RCU_LVLS + 1]; /* # nodes in each level. */
- u8 levelspread[RCU_NUM_LVLS]; /* kids/node in each level. */
+ struct rcu_node *level[RCU_NUM_LVLS + 1];
+ /* Hierarchy levels (+1 to */
+ /* shut bogus gcc warning) */
u8 flavor_mask; /* bit in flavor mask. */
struct rcu_data __percpu *rda; /* pointer of percu rcu_data. */
void (*call)(struct rcu_head *head, /* call_rcu() flavor. */
@@ -448,26 +493,21 @@ struct rcu_state {
long qlen; /* Total number of callbacks. */
/* End of fields guarded by orphan_lock. */
- struct mutex onoff_mutex; /* Coordinate hotplug & GPs. */
-
struct mutex barrier_mutex; /* Guards barrier fields. */
atomic_t barrier_cpu_count; /* # CPUs waiting on. */
struct completion barrier_completion; /* Wake at barrier end. */
- unsigned long n_barrier_done; /* ++ at start and end of */
+ unsigned long barrier_sequence; /* ++ at start and end of */
/* _rcu_barrier(). */
/* End of fields guarded by barrier_mutex. */
- atomic_long_t expedited_start; /* Starting ticket. */
- atomic_long_t expedited_done; /* Done ticket. */
- atomic_long_t expedited_wrap; /* # near-wrap incidents. */
- atomic_long_t expedited_tryfail; /* # acquisition failures. */
+ unsigned long expedited_sequence; /* Take a ticket. */
+ atomic_long_t expedited_workdone0; /* # done by others #0. */
atomic_long_t expedited_workdone1; /* # done by others #1. */
atomic_long_t expedited_workdone2; /* # done by others #2. */
+ atomic_long_t expedited_workdone3; /* # done by others #3. */
atomic_long_t expedited_normal; /* # fallbacks to normal. */
- atomic_long_t expedited_stoppedcpus; /* # successful stop_cpus. */
- atomic_long_t expedited_done_tries; /* # tries to update _done. */
- atomic_long_t expedited_done_lost; /* # times beaten to _done. */
- atomic_long_t expedited_done_exit; /* # times exited _done loop. */
+ atomic_t expedited_need_qs; /* # CPUs left to check in. */
+ wait_queue_head_t expedited_wq; /* Wait for check-ins. */
unsigned long jiffies_force_qs; /* Time at which to invoke */
/* force_quiescent_state(). */
@@ -501,7 +541,11 @@ struct rcu_state {
/* Values for rcu_state structure's gp_flags field. */
#define RCU_GP_WAIT_INIT 0 /* Initial state. */
#define RCU_GP_WAIT_GPS 1 /* Wait for grace-period start. */
-#define RCU_GP_WAIT_FQS 2 /* Wait for force-quiescent-state time. */
+#define RCU_GP_DONE_GPS 2 /* Wait done for grace-period start. */
+#define RCU_GP_WAIT_FQS 3 /* Wait for force-quiescent-state time. */
+#define RCU_GP_DOING_FQS 4 /* Wait done for force-quiescent-state time. */
+#define RCU_GP_CLEANUP 5 /* Grace-period cleanup started. */
+#define RCU_GP_CLEANED 6 /* Grace-period cleanup complete. */
extern struct list_head rcu_struct_flavors;
@@ -513,14 +557,11 @@ extern struct list_head rcu_struct_flavors;
* RCU implementation internal declarations:
*/
extern struct rcu_state rcu_sched_state;
-DECLARE_PER_CPU(struct rcu_data, rcu_sched_data);
extern struct rcu_state rcu_bh_state;
-DECLARE_PER_CPU(struct rcu_data, rcu_bh_data);
#ifdef CONFIG_PREEMPT_RCU
extern struct rcu_state rcu_preempt_state;
-DECLARE_PER_CPU(struct rcu_data, rcu_preempt_data);
#endif /* #ifdef CONFIG_PREEMPT_RCU */
#ifdef CONFIG_RCU_BOOST
@@ -559,6 +600,7 @@ static void rcu_prepare_kthreads(int cpu);
static void rcu_cleanup_after_idle(void);
static void rcu_prepare_for_idle(void);
static void rcu_idle_count_callbacks_posted(void);
+static bool rcu_preempt_has_tasks(struct rcu_node *rnp);
static void print_cpu_stall_info_begin(void);
static void print_cpu_stall_info(struct rcu_state *rsp, int cpu);
static void print_cpu_stall_info_end(void);
@@ -611,3 +653,15 @@ static inline void rcu_nocb_q_lengths(struct rcu_data *rdp, long *ql, long *qll)
#endif /* #else #ifdef CONFIG_RCU_NOCB_CPU */
}
#endif /* #ifdef CONFIG_RCU_TRACE */
+
+/*
+ * Place this after a lock-acquisition primitive to guarantee that
+ * an UNLOCK+LOCK pair act as a full barrier. This guarantee applies
+ * if the UNLOCK and LOCK are executed by the same CPU or if the
+ * UNLOCK and LOCK operate on the same lock variable.
+ */
+#ifdef CONFIG_PPC
+#define smp_mb__after_unlock_lock() smp_mb() /* Full ordering for lock. */
+#else /* #ifdef CONFIG_PPC */
+#define smp_mb__after_unlock_lock() do { } while (0)
+#endif /* #else #ifdef CONFIG_PPC */
diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index 0a571e9a0f1d..b2bf3963a0ae 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -43,7 +43,17 @@ DEFINE_PER_CPU(unsigned int, rcu_cpu_kthread_status);
DEFINE_PER_CPU(unsigned int, rcu_cpu_kthread_loops);
DEFINE_PER_CPU(char, rcu_cpu_has_work);
-#endif /* #ifdef CONFIG_RCU_BOOST */
+#else /* #ifdef CONFIG_RCU_BOOST */
+
+/*
+ * Some architectures do not define rt_mutexes, but if !CONFIG_RCU_BOOST,
+ * all uses are in dead code. Provide a definition to keep the compiler
+ * happy, but add WARN_ON_ONCE() to complain if used in the wrong place.
+ * This probably needs to be excluded from -rt builds.
+ */
+#define rt_mutex_owner(a) ({ WARN_ON_ONCE(1); NULL; })
+
+#endif /* #else #ifdef CONFIG_RCU_BOOST */
#ifdef CONFIG_RCU_NOCB_CPU
static cpumask_var_t rcu_nocb_mask; /* CPUs to have callbacks offloaded. */
@@ -58,44 +68,38 @@ static bool __read_mostly rcu_nocb_poll; /* Offload kthread are to poll. */
*/
static void __init rcu_bootup_announce_oddness(void)
{
-#ifdef CONFIG_RCU_TRACE
- pr_info("\tRCU debugfs-based tracing is enabled.\n");
-#endif
-#if (defined(CONFIG_64BIT) && CONFIG_RCU_FANOUT != 64) || (!defined(CONFIG_64BIT) && CONFIG_RCU_FANOUT != 32)
- pr_info("\tCONFIG_RCU_FANOUT set to non-default value of %d\n",
- CONFIG_RCU_FANOUT);
-#endif
-#ifdef CONFIG_RCU_FANOUT_EXACT
- pr_info("\tHierarchical RCU autobalancing is disabled.\n");
-#endif
-#ifdef CONFIG_RCU_FAST_NO_HZ
- pr_info("\tRCU dyntick-idle grace-period acceleration is enabled.\n");
-#endif
-#ifdef CONFIG_PROVE_RCU
- pr_info("\tRCU lockdep checking is enabled.\n");
-#endif
-#ifdef CONFIG_RCU_TORTURE_TEST_RUNNABLE
- pr_info("\tRCU torture testing starts during boot.\n");
-#endif
-#if defined(CONFIG_RCU_CPU_STALL_INFO)
- pr_info("\tAdditional per-CPU info printed with stalls.\n");
-#endif
-#if NUM_RCU_LVL_4 != 0
- pr_info("\tFour-level hierarchy is enabled.\n");
-#endif
- if (rcu_fanout_leaf != CONFIG_RCU_FANOUT_LEAF)
+ if (IS_ENABLED(CONFIG_RCU_TRACE))
+ pr_info("\tRCU debugfs-based tracing is enabled.\n");
+ if ((IS_ENABLED(CONFIG_64BIT) && RCU_FANOUT != 64) ||
+ (!IS_ENABLED(CONFIG_64BIT) && RCU_FANOUT != 32))
+ pr_info("\tCONFIG_RCU_FANOUT set to non-default value of %d\n",
+ RCU_FANOUT);
+ if (rcu_fanout_exact)
+ pr_info("\tHierarchical RCU autobalancing is disabled.\n");
+ if (IS_ENABLED(CONFIG_RCU_FAST_NO_HZ))
+ pr_info("\tRCU dyntick-idle grace-period acceleration is enabled.\n");
+ if (IS_ENABLED(CONFIG_PROVE_RCU))
+ pr_info("\tRCU lockdep checking is enabled.\n");
+ if (IS_ENABLED(CONFIG_RCU_TORTURE_TEST_RUNNABLE))
+ pr_info("\tRCU torture testing starts during boot.\n");
+ if (RCU_NUM_LVLS >= 4)
+ pr_info("\tFour(or more)-level hierarchy is enabled.\n");
+ if (RCU_FANOUT_LEAF != 16)
+ pr_info("\tBuild-time adjustment of leaf fanout to %d.\n",
+ RCU_FANOUT_LEAF);
+ if (rcu_fanout_leaf != RCU_FANOUT_LEAF)
pr_info("\tBoot-time adjustment of leaf fanout to %d.\n", rcu_fanout_leaf);
if (nr_cpu_ids != NR_CPUS)
pr_info("\tRCU restricting CPUs from NR_CPUS=%d to nr_cpu_ids=%d.\n", NR_CPUS, nr_cpu_ids);
-#ifdef CONFIG_RCU_BOOST
- pr_info("\tRCU kthread priority: %d.\n", kthread_prio);
-#endif
+ if (IS_ENABLED(CONFIG_RCU_BOOST))
+ pr_info("\tRCU kthread priority: %d.\n", kthread_prio);
}
#ifdef CONFIG_PREEMPT_RCU
RCU_STATE_INITIALIZER(rcu_preempt, 'p', call_rcu);
-static struct rcu_state *rcu_state_p = &rcu_preempt_state;
+static struct rcu_state *const rcu_state_p = &rcu_preempt_state;
+static struct rcu_data __percpu *const rcu_data_p = &rcu_preempt_data;
static int rcu_preempted_readers_exp(struct rcu_node *rnp);
static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp,
@@ -121,11 +125,11 @@ static void __init rcu_bootup_announce(void)
*/
static void rcu_preempt_qs(void)
{
- if (!__this_cpu_read(rcu_preempt_data.passed_quiesce)) {
+ if (!__this_cpu_read(rcu_data_p->passed_quiesce)) {
trace_rcu_grace_period(TPS("rcu_preempt"),
- __this_cpu_read(rcu_preempt_data.gpnum),
+ __this_cpu_read(rcu_data_p->gpnum),
TPS("cpuqs"));
- __this_cpu_write(rcu_preempt_data.passed_quiesce, 1);
+ __this_cpu_write(rcu_data_p->passed_quiesce, 1);
barrier(); /* Coordinate with rcu_preempt_check_callbacks(). */
current->rcu_read_unlock_special.b.need_qs = false;
}
@@ -155,7 +159,7 @@ static void rcu_preempt_note_context_switch(void)
!t->rcu_read_unlock_special.b.blocked) {
/* Possibly blocking in an RCU read-side critical section. */
- rdp = this_cpu_ptr(rcu_preempt_state.rda);
+ rdp = this_cpu_ptr(rcu_state_p->rda);
rnp = rdp->mynode;
raw_spin_lock_irqsave(&rnp->lock, flags);
smp_mb__after_unlock_lock();
@@ -180,15 +184,14 @@ static void rcu_preempt_note_context_switch(void)
* But first, note that the current CPU must still be
* on line!
*/
- WARN_ON_ONCE((rdp->grpmask & rnp->qsmaskinit) == 0);
+ WARN_ON_ONCE((rdp->grpmask & rcu_rnp_online_cpus(rnp)) == 0);
WARN_ON_ONCE(!list_empty(&t->rcu_node_entry));
if ((rnp->qsmask & rdp->grpmask) && rnp->gp_tasks != NULL) {
list_add(&t->rcu_node_entry, rnp->gp_tasks->prev);
rnp->gp_tasks = &t->rcu_node_entry;
-#ifdef CONFIG_RCU_BOOST
- if (rnp->boost_tasks != NULL)
+ if (IS_ENABLED(CONFIG_RCU_BOOST) &&
+ rnp->boost_tasks != NULL)
rnp->boost_tasks = rnp->gp_tasks;
-#endif /* #ifdef CONFIG_RCU_BOOST */
} else {
list_add(&t->rcu_node_entry, &rnp->blkd_tasks);
if (rnp->qsmask & rdp->grpmask)
@@ -233,43 +236,6 @@ static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp)
}
/*
- * Record a quiescent state for all tasks that were previously queued
- * on the specified rcu_node structure and that were blocking the current
- * RCU grace period. The caller must hold the specified rnp->lock with
- * irqs disabled, and this lock is released upon return, but irqs remain
- * disabled.
- */
-static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp, unsigned long flags)
- __releases(rnp->lock)
-{
- unsigned long mask;
- struct rcu_node *rnp_p;
-
- if (rnp->qsmask != 0 || rcu_preempt_blocked_readers_cgp(rnp)) {
- raw_spin_unlock_irqrestore(&rnp->lock, flags);
- return; /* Still need more quiescent states! */
- }
-
- rnp_p = rnp->parent;
- if (rnp_p == NULL) {
- /*
- * Either there is only one rcu_node in the tree,
- * or tasks were kicked up to root rcu_node due to
- * CPUs going offline.
- */
- rcu_report_qs_rsp(&rcu_preempt_state, flags);
- return;
- }
-
- /* Report up the rest of the hierarchy. */
- mask = rnp->grpmask;
- raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
- raw_spin_lock(&rnp_p->lock); /* irqs already disabled. */
- smp_mb__after_unlock_lock();
- rcu_report_qs_rnp(mask, &rcu_preempt_state, rnp_p, flags);
-}
-
-/*
* Advance a ->blkd_tasks-list pointer to the next entry, instead
* returning NULL if at the end of the list.
*/
@@ -300,15 +266,12 @@ static bool rcu_preempt_has_tasks(struct rcu_node *rnp)
*/
void rcu_read_unlock_special(struct task_struct *t)
{
- bool empty;
bool empty_exp;
bool empty_norm;
bool empty_exp_now;
unsigned long flags;
struct list_head *np;
-#ifdef CONFIG_RCU_BOOST
bool drop_boost_mutex = false;
-#endif /* #ifdef CONFIG_RCU_BOOST */
struct rcu_node *rnp;
union rcu_special special;
@@ -334,7 +297,13 @@ void rcu_read_unlock_special(struct task_struct *t)
}
/* Hardware IRQ handlers cannot block, complain if they get here. */
- if (WARN_ON_ONCE(in_irq() || in_serving_softirq())) {
+ if (in_irq() || in_serving_softirq()) {
+ lockdep_rcu_suspicious(__FILE__, __LINE__,
+ "rcu_read_unlock() from irq or softirq with blocking in critical section!!!\n");
+ pr_alert("->rcu_read_unlock_special: %#x (b: %d, nq: %d)\n",
+ t->rcu_read_unlock_special.s,
+ t->rcu_read_unlock_special.b.blocked,
+ t->rcu_read_unlock_special.b.need_qs);
local_irq_restore(flags);
return;
}
@@ -344,9 +313,11 @@ void rcu_read_unlock_special(struct task_struct *t)
t->rcu_read_unlock_special.b.blocked = false;
/*
- * Remove this task from the list it blocked on. The
- * task can migrate while we acquire the lock, but at
- * most one time. So at most two passes through loop.
+ * Remove this task from the list it blocked on. The task
+ * now remains queued on the rcu_node corresponding to
+ * the CPU it first blocked on, so the first attempt to
+ * acquire the task's rcu_node's ->lock will succeed.
+ * Keep the loop and add a WARN_ON() out of sheer paranoia.
*/
for (;;) {
rnp = t->rcu_blocked_node;
@@ -354,9 +325,9 @@ void rcu_read_unlock_special(struct task_struct *t)
smp_mb__after_unlock_lock();
if (rnp == t->rcu_blocked_node)
break;
+ WARN_ON_ONCE(1);
raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
}
- empty = !rcu_preempt_has_tasks(rnp);
empty_norm = !rcu_preempt_blocked_readers_cgp(rnp);
empty_exp = !rcu_preempted_readers_exp(rnp);
smp_mb(); /* ensure expedited fastpath sees end of RCU c-s. */
@@ -369,20 +340,12 @@ void rcu_read_unlock_special(struct task_struct *t)
rnp->gp_tasks = np;
if (&t->rcu_node_entry == rnp->exp_tasks)
rnp->exp_tasks = np;
-#ifdef CONFIG_RCU_BOOST
- if (&t->rcu_node_entry == rnp->boost_tasks)
- rnp->boost_tasks = np;
- /* Snapshot ->boost_mtx ownership with rcu_node lock held. */
- drop_boost_mutex = rt_mutex_owner(&rnp->boost_mtx) == t;
-#endif /* #ifdef CONFIG_RCU_BOOST */
-
- /*
- * If this was the last task on the list, go see if we
- * need to propagate ->qsmaskinit bit clearing up the
- * rcu_node tree.
- */
- if (!empty && !rcu_preempt_has_tasks(rnp))
- rcu_cleanup_dead_rnp(rnp);
+ if (IS_ENABLED(CONFIG_RCU_BOOST)) {
+ if (&t->rcu_node_entry == rnp->boost_tasks)
+ rnp->boost_tasks = np;
+ /* Snapshot ->boost_mtx ownership w/rnp->lock held. */
+ drop_boost_mutex = rt_mutex_owner(&rnp->boost_mtx) == t;
+ }
/*
* If this was the last task on the current list, and if
@@ -399,23 +362,21 @@ void rcu_read_unlock_special(struct task_struct *t)
rnp->grplo,
rnp->grphi,
!!rnp->gp_tasks);
- rcu_report_unblock_qs_rnp(rnp, flags);
+ rcu_report_unblock_qs_rnp(rcu_state_p, rnp, flags);
} else {
raw_spin_unlock_irqrestore(&rnp->lock, flags);
}
-#ifdef CONFIG_RCU_BOOST
/* Unboost if we were boosted. */
- if (drop_boost_mutex)
+ if (IS_ENABLED(CONFIG_RCU_BOOST) && drop_boost_mutex)
rt_mutex_unlock(&rnp->boost_mtx);
-#endif /* #ifdef CONFIG_RCU_BOOST */
/*
* If this was the last task on the expedited lists,
* then we need to report up the rcu_node hierarchy.
*/
if (!empty_exp && empty_exp_now)
- rcu_report_exp_rnp(&rcu_preempt_state, rnp, true);
+ rcu_report_exp_rnp(rcu_state_p, rnp, true);
} else {
local_irq_restore(flags);
}
@@ -435,7 +396,7 @@ static void rcu_print_detail_task_stall_rnp(struct rcu_node *rnp)
raw_spin_unlock_irqrestore(&rnp->lock, flags);
return;
}
- t = list_entry(rnp->gp_tasks,
+ t = list_entry(rnp->gp_tasks->prev,
struct task_struct, rcu_node_entry);
list_for_each_entry_continue(t, &rnp->blkd_tasks, rcu_node_entry)
sched_show_task(t);
@@ -455,8 +416,6 @@ static void rcu_print_detail_task_stall(struct rcu_state *rsp)
rcu_print_detail_task_stall_rnp(rnp);
}
-#ifdef CONFIG_RCU_CPU_STALL_INFO
-
static void rcu_print_task_stall_begin(struct rcu_node *rnp)
{
pr_err("\tTasks blocked on level-%d rcu_node (CPUs %d-%d):",
@@ -468,18 +427,6 @@ static void rcu_print_task_stall_end(void)
pr_cont("\n");
}
-#else /* #ifdef CONFIG_RCU_CPU_STALL_INFO */
-
-static void rcu_print_task_stall_begin(struct rcu_node *rnp)
-{
-}
-
-static void rcu_print_task_stall_end(void)
-{
-}
-
-#endif /* #else #ifdef CONFIG_RCU_CPU_STALL_INFO */
-
/*
* Scan the current list of tasks blocked within RCU read-side critical
* sections, printing out the tid of each.
@@ -492,7 +439,7 @@ static int rcu_print_task_stall(struct rcu_node *rnp)
if (!rcu_preempt_blocked_readers_cgp(rnp))
return 0;
rcu_print_task_stall_begin(rnp);
- t = list_entry(rnp->gp_tasks,
+ t = list_entry(rnp->gp_tasks->prev,
struct task_struct, rcu_node_entry);
list_for_each_entry_continue(t, &rnp->blkd_tasks, rcu_node_entry) {
pr_cont(" P%d", t->pid);
@@ -520,10 +467,6 @@ static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp)
WARN_ON_ONCE(rnp->qsmask);
}
-#ifdef CONFIG_HOTPLUG_CPU
-
-#endif /* #ifdef CONFIG_HOTPLUG_CPU */
-
/*
* Check for a quiescent state from the current CPU. When a task blocks,
* the task is recorded in the corresponding CPU's rcu_node structure,
@@ -540,8 +483,8 @@ static void rcu_preempt_check_callbacks(void)
return;
}
if (t->rcu_read_lock_nesting > 0 &&
- __this_cpu_read(rcu_preempt_data.qs_pending) &&
- !__this_cpu_read(rcu_preempt_data.passed_quiesce))
+ __this_cpu_read(rcu_data_p->qs_pending) &&
+ !__this_cpu_read(rcu_data_p->passed_quiesce))
t->rcu_read_unlock_special.b.need_qs = true;
}
@@ -549,7 +492,7 @@ static void rcu_preempt_check_callbacks(void)
static void rcu_preempt_do_callbacks(void)
{
- rcu_do_batch(&rcu_preempt_state, this_cpu_ptr(&rcu_preempt_data));
+ rcu_do_batch(rcu_state_p, this_cpu_ptr(rcu_data_p));
}
#endif /* #ifdef CONFIG_RCU_BOOST */
@@ -559,7 +502,7 @@ static void rcu_preempt_do_callbacks(void)
*/
void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
{
- __call_rcu(head, func, &rcu_preempt_state, -1, 0);
+ __call_rcu(head, func, rcu_state_p, -1, 0);
}
EXPORT_SYMBOL_GPL(call_rcu);
@@ -579,13 +522,13 @@ EXPORT_SYMBOL_GPL(call_rcu);
*/
void synchronize_rcu(void)
{
- rcu_lockdep_assert(!lock_is_held(&rcu_bh_lock_map) &&
- !lock_is_held(&rcu_lock_map) &&
- !lock_is_held(&rcu_sched_lock_map),
- "Illegal synchronize_rcu() in RCU read-side critical section");
+ RCU_LOCKDEP_WARN(lock_is_held(&rcu_bh_lock_map) ||
+ lock_is_held(&rcu_lock_map) ||
+ lock_is_held(&rcu_sched_lock_map),
+ "Illegal synchronize_rcu() in RCU read-side critical section");
if (!rcu_scheduler_active)
return;
- if (rcu_expedited)
+ if (rcu_gp_is_expedited())
synchronize_rcu_expedited();
else
wait_rcu_gp(call_rcu);
@@ -593,8 +536,6 @@ void synchronize_rcu(void)
EXPORT_SYMBOL_GPL(synchronize_rcu);
static DECLARE_WAIT_QUEUE_HEAD(sync_rcu_preempt_exp_wq);
-static unsigned long sync_rcu_preempt_exp_count;
-static DEFINE_MUTEX(sync_rcu_preempt_exp_mutex);
/*
* Return non-zero if there are any tasks in RCU read-side critical
@@ -614,12 +555,12 @@ static int rcu_preempted_readers_exp(struct rcu_node *rnp)
* for the current expedited grace period. Works only for preemptible
* RCU -- other RCU implementation use other means.
*
- * Caller must hold sync_rcu_preempt_exp_mutex.
+ * Caller must hold the root rcu_node's exp_funnel_mutex.
*/
static int sync_rcu_preempt_exp_done(struct rcu_node *rnp)
{
return !rcu_preempted_readers_exp(rnp) &&
- ACCESS_ONCE(rnp->expmask) == 0;
+ READ_ONCE(rnp->expmask) == 0;
}
/*
@@ -630,10 +571,7 @@ static int sync_rcu_preempt_exp_done(struct rcu_node *rnp)
* recursively up the tree. (Calm down, calm down, we do the recursion
* iteratively!)
*
- * Most callers will set the "wake" flag, but the task initiating the
- * expedited grace period need not wake itself.
- *
- * Caller must hold sync_rcu_preempt_exp_mutex.
+ * Caller must hold the root rcu_node's exp_funnel_mutex.
*/
static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp,
bool wake)
@@ -667,29 +605,85 @@ static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp,
/*
* Snapshot the tasks blocking the newly started preemptible-RCU expedited
- * grace period for the specified rcu_node structure. If there are no such
- * tasks, report it up the rcu_node hierarchy.
+ * grace period for the specified rcu_node structure, phase 1. If there
+ * are such tasks, set the ->expmask bits up the rcu_node tree and also
+ * set the ->expmask bits on the leaf rcu_node structures to tell phase 2
+ * that work is needed here.
*
- * Caller must hold sync_rcu_preempt_exp_mutex and must exclude
- * CPU hotplug operations.
+ * Caller must hold the root rcu_node's exp_funnel_mutex.
*/
static void
-sync_rcu_preempt_exp_init(struct rcu_state *rsp, struct rcu_node *rnp)
+sync_rcu_preempt_exp_init1(struct rcu_state *rsp, struct rcu_node *rnp)
{
unsigned long flags;
- int must_wait = 0;
+ unsigned long mask;
+ struct rcu_node *rnp_up;
raw_spin_lock_irqsave(&rnp->lock, flags);
smp_mb__after_unlock_lock();
+ WARN_ON_ONCE(rnp->expmask);
+ WARN_ON_ONCE(rnp->exp_tasks);
if (!rcu_preempt_has_tasks(rnp)) {
+ /* No blocked tasks, nothing to do. */
raw_spin_unlock_irqrestore(&rnp->lock, flags);
- } else {
+ return;
+ }
+ /* Call for Phase 2 and propagate ->expmask bits up the tree. */
+ rnp->expmask = 1;
+ rnp_up = rnp;
+ while (rnp_up->parent) {
+ mask = rnp_up->grpmask;
+ rnp_up = rnp_up->parent;
+ if (rnp_up->expmask & mask)
+ break;
+ raw_spin_lock(&rnp_up->lock); /* irqs already off */
+ smp_mb__after_unlock_lock();
+ rnp_up->expmask |= mask;
+ raw_spin_unlock(&rnp_up->lock); /* irqs still off */
+ }
+ raw_spin_unlock_irqrestore(&rnp->lock, flags);
+}
+
+/*
+ * Snapshot the tasks blocking the newly started preemptible-RCU expedited
+ * grace period for the specified rcu_node structure, phase 2. If the
+ * leaf rcu_node structure has its ->expmask field set, check for tasks.
+ * If there are some, clear ->expmask and set ->exp_tasks accordingly,
+ * then initiate RCU priority boosting. Otherwise, clear ->expmask and
+ * invoke rcu_report_exp_rnp() to clear out the upper-level ->expmask bits,
+ * enabling rcu_read_unlock_special() to do the bit-clearing.
+ *
+ * Caller must hold the root rcu_node's exp_funnel_mutex.
+ */
+static void
+sync_rcu_preempt_exp_init2(struct rcu_state *rsp, struct rcu_node *rnp)
+{
+ unsigned long flags;
+
+ raw_spin_lock_irqsave(&rnp->lock, flags);
+ smp_mb__after_unlock_lock();
+ if (!rnp->expmask) {
+ /* Phase 1 didn't do anything, so Phase 2 doesn't either. */
+ raw_spin_unlock_irqrestore(&rnp->lock, flags);
+ return;
+ }
+
+ /* Phase 1 is over. */
+ rnp->expmask = 0;
+
+ /*
+ * If there are still blocked tasks, set up ->exp_tasks so that
+ * rcu_read_unlock_special() will wake us and then boost them.
+ */
+ if (rcu_preempt_has_tasks(rnp)) {
rnp->exp_tasks = rnp->blkd_tasks.next;
rcu_initiate_boost(rnp, flags); /* releases rnp->lock */
- must_wait = 1;
+ return;
}
- if (!must_wait)
- rcu_report_exp_rnp(rsp, rnp, false); /* Don't wake self. */
+
+ /* No longer any blocked tasks, so undo bit setting. */
+ raw_spin_unlock_irqrestore(&rnp->lock, flags);
+ rcu_report_exp_rnp(rsp, rnp, false);
}
/**
@@ -706,72 +700,32 @@ sync_rcu_preempt_exp_init(struct rcu_state *rsp, struct rcu_node *rnp)
*/
void synchronize_rcu_expedited(void)
{
- unsigned long flags;
struct rcu_node *rnp;
- struct rcu_state *rsp = &rcu_preempt_state;
- unsigned long snap;
- int trycount = 0;
+ struct rcu_node *rnp_unlock;
+ struct rcu_state *rsp = rcu_state_p;
+ unsigned long s;
- smp_mb(); /* Caller's modifications seen first by other CPUs. */
- snap = ACCESS_ONCE(sync_rcu_preempt_exp_count) + 1;
- smp_mb(); /* Above access cannot bleed into critical section. */
+ s = rcu_exp_gp_seq_snap(rsp);
- /*
- * Block CPU-hotplug operations. This means that any CPU-hotplug
- * operation that finds an rcu_node structure with tasks in the
- * process of being boosted will know that all tasks blocking
- * this expedited grace period will already be in the process of
- * being boosted. This simplifies the process of moving tasks
- * from leaf to root rcu_node structures.
- */
- if (!try_get_online_cpus()) {
- /* CPU-hotplug operation in flight, fall back to normal GP. */
- wait_rcu_gp(call_rcu);
- return;
- }
+ rnp_unlock = exp_funnel_lock(rsp, s);
+ if (rnp_unlock == NULL)
+ return; /* Someone else did our work for us. */
- /*
- * Acquire lock, falling back to synchronize_rcu() if too many
- * lock-acquisition failures. Of course, if someone does the
- * expedited grace period for us, just leave.
- */
- while (!mutex_trylock(&sync_rcu_preempt_exp_mutex)) {
- if (ULONG_CMP_LT(snap,
- ACCESS_ONCE(sync_rcu_preempt_exp_count))) {
- put_online_cpus();
- goto mb_ret; /* Others did our work for us. */
- }
- if (trycount++ < 10) {
- udelay(trycount * num_online_cpus());
- } else {
- put_online_cpus();
- wait_rcu_gp(call_rcu);
- return;
- }
- }
- if (ULONG_CMP_LT(snap, ACCESS_ONCE(sync_rcu_preempt_exp_count))) {
- put_online_cpus();
- goto unlock_mb_ret; /* Others did our work for us. */
- }
+ rcu_exp_gp_seq_start(rsp);
/* force all RCU readers onto ->blkd_tasks lists. */
synchronize_sched_expedited();
- /* Initialize ->expmask for all non-leaf rcu_node structures. */
- rcu_for_each_nonleaf_node_breadth_first(rsp, rnp) {
- raw_spin_lock_irqsave(&rnp->lock, flags);
- smp_mb__after_unlock_lock();
- rnp->expmask = rnp->qsmaskinit;
- raw_spin_unlock_irqrestore(&rnp->lock, flags);
- }
-
- /* Snapshot current state of ->blkd_tasks lists. */
+ /*
+ * Snapshot current state of ->blkd_tasks lists into ->expmask.
+ * Phase 1 sets bits and phase 2 permits rcu_read_unlock_special()
+ * to start clearing them. Doing this in one phase leads to
+ * strange races between setting and clearing bits, so just say "no"!
+ */
rcu_for_each_leaf_node(rsp, rnp)
- sync_rcu_preempt_exp_init(rsp, rnp);
- if (NUM_RCU_NODES > 1)
- sync_rcu_preempt_exp_init(rsp, rcu_get_root(rsp));
-
- put_online_cpus();
+ sync_rcu_preempt_exp_init1(rsp, rnp);
+ rcu_for_each_leaf_node(rsp, rnp)
+ sync_rcu_preempt_exp_init2(rsp, rnp);
/* Wait for snapshotted ->blkd_tasks lists to drain. */
rnp = rcu_get_root(rsp);
@@ -779,13 +733,8 @@ void synchronize_rcu_expedited(void)
sync_rcu_preempt_exp_done(rnp));
/* Clean up and exit. */
- smp_mb(); /* ensure expedited GP seen before counter increment. */
- ACCESS_ONCE(sync_rcu_preempt_exp_count) =
- sync_rcu_preempt_exp_count + 1;
-unlock_mb_ret:
- mutex_unlock(&sync_rcu_preempt_exp_mutex);
-mb_ret:
- smp_mb(); /* ensure subsequent action seen after grace period. */
+ rcu_exp_gp_seq_end(rsp);
+ mutex_unlock(&rnp_unlock->exp_funnel_mutex);
}
EXPORT_SYMBOL_GPL(synchronize_rcu_expedited);
@@ -799,7 +748,7 @@ EXPORT_SYMBOL_GPL(synchronize_rcu_expedited);
*/
void rcu_barrier(void)
{
- _rcu_barrier(&rcu_preempt_state);
+ _rcu_barrier(rcu_state_p);
}
EXPORT_SYMBOL_GPL(rcu_barrier);
@@ -808,7 +757,7 @@ EXPORT_SYMBOL_GPL(rcu_barrier);
*/
static void __init __rcu_init_preempt(void)
{
- rcu_init_one(&rcu_preempt_state, &rcu_preempt_data);
+ rcu_init_one(rcu_state_p, rcu_data_p);
}
/*
@@ -831,7 +780,8 @@ void exit_rcu(void)
#else /* #ifdef CONFIG_PREEMPT_RCU */
-static struct rcu_state *rcu_state_p = &rcu_sched_state;
+static struct rcu_state *const rcu_state_p = &rcu_sched_state;
+static struct rcu_data __percpu *const rcu_data_p = &rcu_sched_data;
/*
* Tell them what RCU they are running.
@@ -859,8 +809,6 @@ static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp)
return 0;
}
-#ifdef CONFIG_HOTPLUG_CPU
-
/*
* Because there is no preemptible RCU, there can be no readers blocked.
*/
@@ -869,8 +817,6 @@ static bool rcu_preempt_has_tasks(struct rcu_node *rnp)
return false;
}
-#endif /* #ifdef CONFIG_HOTPLUG_CPU */
-
/*
* Because preemptible RCU does not exist, we never have to check for
* tasks blocked within RCU read-side critical sections.
@@ -998,8 +944,8 @@ static int rcu_boost(struct rcu_node *rnp)
struct task_struct *t;
struct list_head *tb;
- if (ACCESS_ONCE(rnp->exp_tasks) == NULL &&
- ACCESS_ONCE(rnp->boost_tasks) == NULL)
+ if (READ_ONCE(rnp->exp_tasks) == NULL &&
+ READ_ONCE(rnp->boost_tasks) == NULL)
return 0; /* Nothing left to boost. */
raw_spin_lock_irqsave(&rnp->lock, flags);
@@ -1052,13 +998,12 @@ static int rcu_boost(struct rcu_node *rnp)
rt_mutex_lock(&rnp->boost_mtx);
rt_mutex_unlock(&rnp->boost_mtx); /* Then keep lockdep happy. */
- return ACCESS_ONCE(rnp->exp_tasks) != NULL ||
- ACCESS_ONCE(rnp->boost_tasks) != NULL;
+ return READ_ONCE(rnp->exp_tasks) != NULL ||
+ READ_ONCE(rnp->boost_tasks) != NULL;
}
/*
- * Priority-boosting kthread. One per leaf rcu_node and one for the
- * root rcu_node.
+ * Priority-boosting kthread, one per leaf rcu_node.
*/
static int rcu_boost_kthread(void *arg)
{
@@ -1170,17 +1115,17 @@ static void rcu_preempt_boost_start_gp(struct rcu_node *rnp)
* Returns zero if all is well, a negated errno otherwise.
*/
static int rcu_spawn_one_boost_kthread(struct rcu_state *rsp,
- struct rcu_node *rnp)
+ struct rcu_node *rnp)
{
int rnp_index = rnp - &rsp->node[0];
unsigned long flags;
struct sched_param sp;
struct task_struct *t;
- if (&rcu_preempt_state != rsp)
+ if (rcu_state_p != rsp)
return 0;
- if (!rcu_scheduler_fully_active || rnp->qsmaskinit == 0)
+ if (!rcu_scheduler_fully_active || rcu_rnp_online_cpus(rnp) == 0)
return 0;
rsp->boost = 1;
@@ -1273,7 +1218,7 @@ static void rcu_cpu_kthread(unsigned int cpu)
static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu)
{
struct task_struct *t = rnp->boost_kthread_task;
- unsigned long mask = rnp->qsmaskinit;
+ unsigned long mask = rcu_rnp_online_cpus(rnp);
cpumask_var_t cm;
int cpu;
@@ -1371,13 +1316,12 @@ static void rcu_prepare_kthreads(int cpu)
* Because we not have RCU_FAST_NO_HZ, just check whether this CPU needs
* any flavor of RCU.
*/
-#ifndef CONFIG_RCU_NOCB_CPU_ALL
-int rcu_needs_cpu(unsigned long *delta_jiffies)
+int rcu_needs_cpu(u64 basemono, u64 *nextevt)
{
- *delta_jiffies = ULONG_MAX;
- return rcu_cpu_has_callbacks(NULL);
+ *nextevt = KTIME_MAX;
+ return IS_ENABLED(CONFIG_RCU_NOCB_CPU_ALL)
+ ? 0 : rcu_cpu_has_callbacks(NULL);
}
-#endif /* #ifndef CONFIG_RCU_NOCB_CPU_ALL */
/*
* Because we do not have RCU_FAST_NO_HZ, don't bother cleaning up
@@ -1436,8 +1380,6 @@ module_param(rcu_idle_gp_delay, int, 0644);
static int rcu_idle_lazy_gp_delay = RCU_IDLE_LAZY_GP_DELAY;
module_param(rcu_idle_lazy_gp_delay, int, 0644);
-extern int tick_nohz_active;
-
/*
* Try to advance callbacks for all flavors of RCU on the current CPU, but
* only if it has been awhile since the last time we did so. Afterwards,
@@ -1466,7 +1408,7 @@ static bool __maybe_unused rcu_try_advance_all_cbs(void)
* callbacks not yet ready to invoke.
*/
if ((rdp->completed != rnp->completed ||
- unlikely(ACCESS_ONCE(rdp->gpwrap))) &&
+ unlikely(READ_ONCE(rdp->gpwrap))) &&
rdp->nxttail[RCU_DONE_TAIL] != rdp->nxttail[RCU_NEXT_TAIL])
note_gp_changes(rsp, rdp);
@@ -1484,17 +1426,22 @@ static bool __maybe_unused rcu_try_advance_all_cbs(void)
*
* The caller must have disabled interrupts.
*/
-#ifndef CONFIG_RCU_NOCB_CPU_ALL
-int rcu_needs_cpu(unsigned long *dj)
+int rcu_needs_cpu(u64 basemono, u64 *nextevt)
{
struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks);
+ unsigned long dj;
+
+ if (IS_ENABLED(CONFIG_RCU_NOCB_CPU_ALL)) {
+ *nextevt = KTIME_MAX;
+ return 0;
+ }
/* Snapshot to detect later posting of non-lazy callback. */
rdtp->nonlazy_posted_snap = rdtp->nonlazy_posted;
/* If no callbacks, RCU doesn't need the CPU. */
if (!rcu_cpu_has_callbacks(&rdtp->all_lazy)) {
- *dj = ULONG_MAX;
+ *nextevt = KTIME_MAX;
return 0;
}
@@ -1508,14 +1455,14 @@ int rcu_needs_cpu(unsigned long *dj)
/* Request timer delay depending on laziness, and round. */
if (!rdtp->all_lazy) {
- *dj = round_up(rcu_idle_gp_delay + jiffies,
+ dj = round_up(rcu_idle_gp_delay + jiffies,
rcu_idle_gp_delay) - jiffies;
} else {
- *dj = round_jiffies(rcu_idle_lazy_gp_delay + jiffies) - jiffies;
+ dj = round_jiffies(rcu_idle_lazy_gp_delay + jiffies) - jiffies;
}
+ *nextevt = basemono + dj * TICK_NSEC;
return 0;
}
-#endif /* #ifndef CONFIG_RCU_NOCB_CPU_ALL */
/*
* Prepare a CPU for idle from an RCU perspective. The first major task
@@ -1529,7 +1476,6 @@ int rcu_needs_cpu(unsigned long *dj)
*/
static void rcu_prepare_for_idle(void)
{
-#ifndef CONFIG_RCU_NOCB_CPU_ALL
bool needwake;
struct rcu_data *rdp;
struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks);
@@ -1537,8 +1483,11 @@ static void rcu_prepare_for_idle(void)
struct rcu_state *rsp;
int tne;
+ if (IS_ENABLED(CONFIG_RCU_NOCB_CPU_ALL))
+ return;
+
/* Handle nohz enablement switches conservatively. */
- tne = ACCESS_ONCE(tick_nohz_active);
+ tne = READ_ONCE(tick_nohz_active);
if (tne != rdtp->tick_nohz_enabled_snap) {
if (rcu_cpu_has_callbacks(NULL))
invoke_rcu_core(); /* force nohz to see update. */
@@ -1584,7 +1533,6 @@ static void rcu_prepare_for_idle(void)
if (needwake)
rcu_gp_kthread_wake(rsp);
}
-#endif /* #ifndef CONFIG_RCU_NOCB_CPU_ALL */
}
/*
@@ -1594,12 +1542,11 @@ static void rcu_prepare_for_idle(void)
*/
static void rcu_cleanup_after_idle(void)
{
-#ifndef CONFIG_RCU_NOCB_CPU_ALL
- if (rcu_is_nocb_cpu(smp_processor_id()))
+ if (IS_ENABLED(CONFIG_RCU_NOCB_CPU_ALL) ||
+ rcu_is_nocb_cpu(smp_processor_id()))
return;
if (rcu_try_advance_all_cbs())
invoke_rcu_core();
-#endif /* #ifndef CONFIG_RCU_NOCB_CPU_ALL */
}
/*
@@ -1674,12 +1621,10 @@ static int rcu_oom_notify(struct notifier_block *self,
*/
atomic_set(&oom_callback_count, 1);
- get_online_cpus();
for_each_online_cpu(cpu) {
smp_call_function_single(cpu, rcu_oom_notify_cpu, NULL, 1);
cond_resched_rcu_qs();
}
- put_online_cpus();
/* Unconditionally decrement: no need to wake ourselves up. */
atomic_dec(&oom_callback_count);
@@ -1700,8 +1645,6 @@ early_initcall(rcu_register_oom_notifier);
#endif /* #else #if !defined(CONFIG_RCU_FAST_NO_HZ) */
-#ifdef CONFIG_RCU_CPU_STALL_INFO
-
#ifdef CONFIG_RCU_FAST_NO_HZ
static void print_cpu_stall_fast_no_hz(char *cp, int cpu)
@@ -1764,7 +1707,7 @@ static void print_cpu_stall_info(struct rcu_state *rsp, int cpu)
atomic_read(&rdtp->dynticks) & 0xfff,
rdtp->dynticks_nesting, rdtp->dynticks_nmi_nesting,
rdp->softirq_snap, kstat_softirqs_cpu(RCU_SOFTIRQ, cpu),
- ACCESS_ONCE(rsp->n_force_qs) - rsp->n_force_qs_gpstart,
+ READ_ONCE(rsp->n_force_qs) - rsp->n_force_qs_gpstart,
fast_no_hz);
}
@@ -1790,33 +1733,6 @@ static void increment_cpu_stall_ticks(void)
raw_cpu_inc(rsp->rda->ticks_this_gp);
}
-#else /* #ifdef CONFIG_RCU_CPU_STALL_INFO */
-
-static void print_cpu_stall_info_begin(void)
-{
- pr_cont(" {");
-}
-
-static void print_cpu_stall_info(struct rcu_state *rsp, int cpu)
-{
- pr_cont(" %d", cpu);
-}
-
-static void print_cpu_stall_info_end(void)
-{
- pr_cont("} ");
-}
-
-static void zero_cpu_stall_ticks(struct rcu_data *rdp)
-{
-}
-
-static void increment_cpu_stall_ticks(void)
-{
-}
-
-#endif /* #else #ifdef CONFIG_RCU_CPU_STALL_INFO */
-
#ifdef CONFIG_RCU_NOCB_CPU
/*
@@ -1902,11 +1818,11 @@ static void wake_nocb_leader(struct rcu_data *rdp, bool force)
{
struct rcu_data *rdp_leader = rdp->nocb_leader;
- if (!ACCESS_ONCE(rdp_leader->nocb_kthread))
+ if (!READ_ONCE(rdp_leader->nocb_kthread))
return;
- if (ACCESS_ONCE(rdp_leader->nocb_leader_sleep) || force) {
+ if (READ_ONCE(rdp_leader->nocb_leader_sleep) || force) {
/* Prior smp_mb__after_atomic() orders against prior enqueue. */
- ACCESS_ONCE(rdp_leader->nocb_leader_sleep) = false;
+ WRITE_ONCE(rdp_leader->nocb_leader_sleep, false);
wake_up(&rdp_leader->nocb_wq);
}
}
@@ -1938,14 +1854,15 @@ static bool rcu_nocb_cpu_needs_barrier(struct rcu_state *rsp, int cpu)
ret = atomic_long_read(&rdp->nocb_q_count);
#ifdef CONFIG_PROVE_RCU
- rhp = ACCESS_ONCE(rdp->nocb_head);
+ rhp = READ_ONCE(rdp->nocb_head);
if (!rhp)
- rhp = ACCESS_ONCE(rdp->nocb_gp_head);
+ rhp = READ_ONCE(rdp->nocb_gp_head);
if (!rhp)
- rhp = ACCESS_ONCE(rdp->nocb_follower_head);
+ rhp = READ_ONCE(rdp->nocb_follower_head);
/* Having no rcuo kthread but CBs after scheduler starts is bad! */
- if (!ACCESS_ONCE(rdp->nocb_kthread) && rhp) {
+ if (!READ_ONCE(rdp->nocb_kthread) && rhp &&
+ rcu_scheduler_fully_active) {
/* RCU callback enqueued before CPU first came online??? */
pr_err("RCU: Never-onlined no-CBs CPU %d has CB %p\n",
cpu, rhp->func);
@@ -1978,12 +1895,12 @@ static void __call_rcu_nocb_enqueue(struct rcu_data *rdp,
atomic_long_add(rhcount, &rdp->nocb_q_count);
/* rcu_barrier() relies on ->nocb_q_count add before xchg. */
old_rhpp = xchg(&rdp->nocb_tail, rhtp);
- ACCESS_ONCE(*old_rhpp) = rhp;
+ WRITE_ONCE(*old_rhpp, rhp);
atomic_long_add(rhcount_lazy, &rdp->nocb_q_count_lazy);
smp_mb__after_atomic(); /* Store *old_rhpp before _wake test. */
/* If we are not being polled and there is a kthread, awaken it ... */
- t = ACCESS_ONCE(rdp->nocb_kthread);
+ t = READ_ONCE(rdp->nocb_kthread);
if (rcu_nocb_poll || !t) {
trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu,
TPS("WakeNotPoll"));
@@ -2121,7 +2038,7 @@ static void rcu_nocb_wait_gp(struct rcu_data *rdp)
for (;;) {
wait_event_interruptible(
rnp->nocb_gp_wq[c & 0x1],
- (d = ULONG_CMP_GE(ACCESS_ONCE(rnp->completed), c)));
+ (d = ULONG_CMP_GE(READ_ONCE(rnp->completed), c)));
if (likely(d))
break;
WARN_ON(signal_pending(current));
@@ -2148,7 +2065,7 @@ wait_again:
if (!rcu_nocb_poll) {
trace_rcu_nocb_wake(my_rdp->rsp->name, my_rdp->cpu, "Sleep");
wait_event_interruptible(my_rdp->nocb_wq,
- !ACCESS_ONCE(my_rdp->nocb_leader_sleep));
+ !READ_ONCE(my_rdp->nocb_leader_sleep));
/* Memory barrier handled by smp_mb() calls below and repoll. */
} else if (firsttime) {
firsttime = false; /* Don't drown trace log with "Poll"! */
@@ -2162,12 +2079,12 @@ wait_again:
*/
gotcbs = false;
for (rdp = my_rdp; rdp; rdp = rdp->nocb_next_follower) {
- rdp->nocb_gp_head = ACCESS_ONCE(rdp->nocb_head);
+ rdp->nocb_gp_head = READ_ONCE(rdp->nocb_head);
if (!rdp->nocb_gp_head)
continue; /* No CBs here, try next follower. */
/* Move callbacks to wait-for-GP list, which is empty. */
- ACCESS_ONCE(rdp->nocb_head) = NULL;
+ WRITE_ONCE(rdp->nocb_head, NULL);
rdp->nocb_gp_tail = xchg(&rdp->nocb_tail, &rdp->nocb_head);
gotcbs = true;
}
@@ -2187,7 +2104,7 @@ wait_again:
my_rdp->nocb_leader_sleep = true;
smp_mb(); /* Ensure _sleep true before scan. */
for (rdp = my_rdp; rdp; rdp = rdp->nocb_next_follower)
- if (ACCESS_ONCE(rdp->nocb_head)) {
+ if (READ_ONCE(rdp->nocb_head)) {
/* Found CB, so short-circuit next wait. */
my_rdp->nocb_leader_sleep = false;
break;
@@ -2208,7 +2125,7 @@ wait_again:
/* Each pass through the following loop wakes a follower, if needed. */
for (rdp = my_rdp; rdp; rdp = rdp->nocb_next_follower) {
- if (ACCESS_ONCE(rdp->nocb_head))
+ if (READ_ONCE(rdp->nocb_head))
my_rdp->nocb_leader_sleep = false;/* No need to sleep.*/
if (!rdp->nocb_gp_head)
continue; /* No CBs, so no need to wake follower. */
@@ -2244,7 +2161,7 @@ static void nocb_follower_wait(struct rcu_data *rdp)
trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu,
"FollowerSleep");
wait_event_interruptible(rdp->nocb_wq,
- ACCESS_ONCE(rdp->nocb_follower_head));
+ READ_ONCE(rdp->nocb_follower_head));
} else if (firsttime) {
/* Don't drown trace log with "Poll"! */
firsttime = false;
@@ -2285,10 +2202,10 @@ static int rcu_nocb_kthread(void *arg)
nocb_follower_wait(rdp);
/* Pull the ready-to-invoke callbacks onto local list. */
- list = ACCESS_ONCE(rdp->nocb_follower_head);
+ list = READ_ONCE(rdp->nocb_follower_head);
BUG_ON(!list);
trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, "WokeNonEmpty");
- ACCESS_ONCE(rdp->nocb_follower_head) = NULL;
+ WRITE_ONCE(rdp->nocb_follower_head, NULL);
tail = xchg(&rdp->nocb_follower_tail, &rdp->nocb_follower_head);
/* Each pass through the following loop invokes a callback. */
@@ -2327,7 +2244,7 @@ static int rcu_nocb_kthread(void *arg)
/* Is a deferred wakeup of rcu_nocb_kthread() required? */
static int rcu_nocb_need_deferred_wakeup(struct rcu_data *rdp)
{
- return ACCESS_ONCE(rdp->nocb_defer_wakeup);
+ return READ_ONCE(rdp->nocb_defer_wakeup);
}
/* Do a deferred wakeup of rcu_nocb_kthread(). */
@@ -2337,8 +2254,8 @@ static void do_nocb_deferred_wakeup(struct rcu_data *rdp)
if (!rcu_nocb_need_deferred_wakeup(rdp))
return;
- ndw = ACCESS_ONCE(rdp->nocb_defer_wakeup);
- ACCESS_ONCE(rdp->nocb_defer_wakeup) = RCU_NOGP_WAKE_NOT;
+ ndw = READ_ONCE(rdp->nocb_defer_wakeup);
+ WRITE_ONCE(rdp->nocb_defer_wakeup, RCU_NOGP_WAKE_NOT);
wake_nocb_leader(rdp, ndw == RCU_NOGP_WAKE_FORCE);
trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, TPS("DeferredWake"));
}
@@ -2392,18 +2309,8 @@ void __init rcu_init_nohz(void)
pr_info("\tPoll for callbacks from no-CBs CPUs.\n");
for_each_rcu_flavor(rsp) {
- for_each_cpu(cpu, rcu_nocb_mask) {
- struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu);
-
- /*
- * If there are early callbacks, they will need
- * to be moved to the nocb lists.
- */
- WARN_ON_ONCE(rdp->nxttail[RCU_NEXT_TAIL] !=
- &rdp->nxtlist &&
- rdp->nxttail[RCU_NEXT_TAIL] != NULL);
- init_nocb_callback_list(rdp);
- }
+ for_each_cpu(cpu, rcu_nocb_mask)
+ init_nocb_callback_list(per_cpu_ptr(rsp->rda, cpu));
rcu_organize_nocb_kthreads(rsp);
}
}
@@ -2461,7 +2368,7 @@ static void rcu_spawn_one_nocb_kthread(struct rcu_state *rsp, int cpu)
t = kthread_run(rcu_nocb_kthread, rdp_spawn,
"rcuo%c/%d", rsp->abbr, cpu);
BUG_ON(IS_ERR(t));
- ACCESS_ONCE(rdp_spawn->nocb_kthread) = t;
+ WRITE_ONCE(rdp_spawn->nocb_kthread, t);
}
/*
@@ -2540,6 +2447,16 @@ static bool init_nocb_callback_list(struct rcu_data *rdp)
if (!rcu_is_nocb_cpu(rdp->cpu))
return false;
+ /* If there are early-boot callbacks, move them to nocb lists. */
+ if (rdp->nxtlist) {
+ rdp->nocb_head = rdp->nxtlist;
+ rdp->nocb_tail = rdp->nxttail[RCU_NEXT_TAIL];
+ atomic_long_set(&rdp->nocb_q_count, rdp->qlen);
+ atomic_long_set(&rdp->nocb_q_count_lazy, rdp->qlen_lazy);
+ rdp->nxtlist = NULL;
+ rdp->qlen = 0;
+ rdp->qlen_lazy = 0;
+ }
rdp->nxttail[RCU_NEXT_TAIL] = NULL;
return true;
}
@@ -2666,7 +2583,7 @@ static void rcu_sysidle_enter(int irq)
/* Record start of fully idle period. */
j = jiffies;
- ACCESS_ONCE(rdtp->dynticks_idle_jiffies) = j;
+ WRITE_ONCE(rdtp->dynticks_idle_jiffies, j);
smp_mb__before_atomic();
atomic_inc(&rdtp->dynticks_idle);
smp_mb__after_atomic();
@@ -2684,7 +2601,7 @@ static void rcu_sysidle_enter(int irq)
*/
void rcu_sysidle_force_exit(void)
{
- int oldstate = ACCESS_ONCE(full_sysidle_state);
+ int oldstate = READ_ONCE(full_sysidle_state);
int newoldstate;
/*
@@ -2763,7 +2680,8 @@ static void rcu_sysidle_exit(int irq)
/*
* Check to see if the current CPU is idle. Note that usermode execution
- * does not count as idle. The caller must have disabled interrupts.
+ * does not count as idle. The caller must have disabled interrupts,
+ * and must be running on tick_do_timer_cpu.
*/
static void rcu_sysidle_check_cpu(struct rcu_data *rdp, bool *isidle,
unsigned long *maxj)
@@ -2784,8 +2702,8 @@ static void rcu_sysidle_check_cpu(struct rcu_data *rdp, bool *isidle,
if (!*isidle || rdp->rsp != rcu_state_p ||
cpu_is_offline(rdp->cpu) || rdp->cpu == tick_do_timer_cpu)
return;
- if (rcu_gp_in_progress(rdp->rsp))
- WARN_ON_ONCE(smp_processor_id() != tick_do_timer_cpu);
+ /* Verify affinity of current kthread. */
+ WARN_ON_ONCE(smp_processor_id() != tick_do_timer_cpu);
/* Pick up current idle and NMI-nesting counter and check. */
cur = atomic_read(&rdtp->dynticks_idle);
@@ -2796,7 +2714,7 @@ static void rcu_sysidle_check_cpu(struct rcu_data *rdp, bool *isidle,
smp_mb(); /* Read counters before timestamps. */
/* Pick up timestamps. */
- j = ACCESS_ONCE(rdtp->dynticks_idle_jiffies);
+ j = READ_ONCE(rdtp->dynticks_idle_jiffies);
/* If this CPU entered idle more recently, update maxj timestamp. */
if (ULONG_CMP_LT(*maxj, j))
*maxj = j;
@@ -2833,11 +2751,11 @@ static unsigned long rcu_sysidle_delay(void)
static void rcu_sysidle(unsigned long j)
{
/* Check the current state. */
- switch (ACCESS_ONCE(full_sysidle_state)) {
+ switch (READ_ONCE(full_sysidle_state)) {
case RCU_SYSIDLE_NOT:
/* First time all are idle, so note a short idle period. */
- ACCESS_ONCE(full_sysidle_state) = RCU_SYSIDLE_SHORT;
+ WRITE_ONCE(full_sysidle_state, RCU_SYSIDLE_SHORT);
break;
case RCU_SYSIDLE_SHORT:
@@ -2875,7 +2793,7 @@ static void rcu_sysidle_cancel(void)
{
smp_mb();
if (full_sysidle_state > RCU_SYSIDLE_SHORT)
- ACCESS_ONCE(full_sysidle_state) = RCU_SYSIDLE_NOT;
+ WRITE_ONCE(full_sysidle_state, RCU_SYSIDLE_NOT);
}
/*
@@ -2927,7 +2845,7 @@ static void rcu_sysidle_cb(struct rcu_head *rhp)
smp_mb(); /* grace period precedes setting inuse. */
rshp = container_of(rhp, struct rcu_sysidle_head, rh);
- ACCESS_ONCE(rshp->inuse) = 0;
+ WRITE_ONCE(rshp->inuse, 0);
}
/*
@@ -2938,7 +2856,7 @@ static void rcu_sysidle_cb(struct rcu_head *rhp)
bool rcu_sys_is_idle(void)
{
static struct rcu_sysidle_head rsh;
- int rss = ACCESS_ONCE(full_sysidle_state);
+ int rss = READ_ONCE(full_sysidle_state);
if (WARN_ON_ONCE(smp_processor_id() != tick_do_timer_cpu))
return false;
@@ -2966,7 +2884,7 @@ bool rcu_sys_is_idle(void)
}
rcu_sysidle_report(rcu_state_p, isidle, maxj, false);
oldrss = rss;
- rss = ACCESS_ONCE(full_sysidle_state);
+ rss = READ_ONCE(full_sysidle_state);
}
}
@@ -3050,10 +2968,10 @@ static bool rcu_nohz_full_cpu(struct rcu_state *rsp)
#ifdef CONFIG_NO_HZ_FULL
if (tick_nohz_full_cpu(smp_processor_id()) &&
(!rcu_gp_in_progress(rsp) ||
- ULONG_CMP_LT(jiffies, ACCESS_ONCE(rsp->gp_start) + HZ)))
- return 1;
+ ULONG_CMP_LT(jiffies, READ_ONCE(rsp->gp_start) + HZ)))
+ return true;
#endif /* #ifdef CONFIG_NO_HZ_FULL */
- return 0;
+ return false;
}
/*
@@ -3068,11 +2986,10 @@ static void rcu_bind_gp_kthread(void)
return;
#ifdef CONFIG_NO_HZ_FULL_SYSIDLE
cpu = tick_do_timer_cpu;
- if (cpu >= 0 && cpu < nr_cpu_ids && raw_smp_processor_id() != cpu)
+ if (cpu >= 0 && cpu < nr_cpu_ids)
set_cpus_allowed_ptr(current, cpumask_of(cpu));
#else /* #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */
- if (!is_housekeeping_cpu(raw_smp_processor_id()))
- housekeeping_affine(current);
+ housekeeping_affine(current);
#endif /* #else #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */
}
@@ -3080,7 +2997,7 @@ static void rcu_bind_gp_kthread(void)
static void rcu_dynticks_task_enter(void)
{
#if defined(CONFIG_TASKS_RCU) && defined(CONFIG_NO_HZ_FULL)
- ACCESS_ONCE(current->rcu_tasks_idle_cpu) = smp_processor_id();
+ WRITE_ONCE(current->rcu_tasks_idle_cpu, smp_processor_id());
#endif /* #if defined(CONFIG_TASKS_RCU) && defined(CONFIG_NO_HZ_FULL) */
}
@@ -3088,6 +3005,6 @@ static void rcu_dynticks_task_enter(void)
static void rcu_dynticks_task_exit(void)
{
#if defined(CONFIG_TASKS_RCU) && defined(CONFIG_NO_HZ_FULL)
- ACCESS_ONCE(current->rcu_tasks_idle_cpu) = -1;
+ WRITE_ONCE(current->rcu_tasks_idle_cpu, -1);
#endif /* #if defined(CONFIG_TASKS_RCU) && defined(CONFIG_NO_HZ_FULL) */
}
diff --git a/kernel/rcu/tree_trace.c b/kernel/rcu/tree_trace.c
index fbb6240509ea..6fc4c5ff3bb5 100644
--- a/kernel/rcu/tree_trace.c
+++ b/kernel/rcu/tree_trace.c
@@ -81,9 +81,9 @@ static void r_stop(struct seq_file *m, void *v)
static int show_rcubarrier(struct seq_file *m, void *v)
{
struct rcu_state *rsp = (struct rcu_state *)m->private;
- seq_printf(m, "bcc: %d nbd: %lu\n",
+ seq_printf(m, "bcc: %d bseq: %lu\n",
atomic_read(&rsp->barrier_cpu_count),
- rsp->n_barrier_done);
+ rsp->barrier_sequence);
return 0;
}
@@ -185,18 +185,15 @@ static int show_rcuexp(struct seq_file *m, void *v)
{
struct rcu_state *rsp = (struct rcu_state *)m->private;
- seq_printf(m, "s=%lu d=%lu w=%lu tf=%lu wd1=%lu wd2=%lu n=%lu sc=%lu dt=%lu dl=%lu dx=%lu\n",
- atomic_long_read(&rsp->expedited_start),
- atomic_long_read(&rsp->expedited_done),
- atomic_long_read(&rsp->expedited_wrap),
- atomic_long_read(&rsp->expedited_tryfail),
+ seq_printf(m, "s=%lu wd0=%lu wd1=%lu wd2=%lu wd3=%lu n=%lu enq=%d sc=%lu\n",
+ rsp->expedited_sequence,
+ atomic_long_read(&rsp->expedited_workdone0),
atomic_long_read(&rsp->expedited_workdone1),
atomic_long_read(&rsp->expedited_workdone2),
+ atomic_long_read(&rsp->expedited_workdone3),
atomic_long_read(&rsp->expedited_normal),
- atomic_long_read(&rsp->expedited_stoppedcpus),
- atomic_long_read(&rsp->expedited_done_tries),
- atomic_long_read(&rsp->expedited_done_lost),
- atomic_long_read(&rsp->expedited_done_exit));
+ atomic_read(&rsp->expedited_need_qs),
+ rsp->expedited_sequence / 2);
return 0;
}
@@ -277,14 +274,14 @@ static void print_one_rcu_state(struct seq_file *m, struct rcu_state *rsp)
seq_printf(m, "nfqs=%lu/nfqsng=%lu(%lu) fqlh=%lu oqlen=%ld/%ld\n",
rsp->n_force_qs, rsp->n_force_qs_ngp,
rsp->n_force_qs - rsp->n_force_qs_ngp,
- ACCESS_ONCE(rsp->n_force_qs_lh), rsp->qlen_lazy, rsp->qlen);
+ READ_ONCE(rsp->n_force_qs_lh), rsp->qlen_lazy, rsp->qlen);
for (rnp = &rsp->node[0]; rnp - &rsp->node[0] < rcu_num_nodes; rnp++) {
if (rnp->level != level) {
seq_puts(m, "\n");
level = rnp->level;
}
- seq_printf(m, "%lx/%lx %c%c>%c %d:%d ^%d ",
- rnp->qsmask, rnp->qsmaskinit,
+ seq_printf(m, "%lx/%lx->%lx %c%c>%c %d:%d ^%d ",
+ rnp->qsmask, rnp->qsmaskinit, rnp->qsmaskinitnext,
".G"[rnp->gp_tasks != NULL],
".E"[rnp->exp_tasks != NULL],
".T"[!list_empty(&rnp->blkd_tasks)],
@@ -323,8 +320,8 @@ static void show_one_rcugp(struct seq_file *m, struct rcu_state *rsp)
struct rcu_node *rnp = &rsp->node[0];
raw_spin_lock_irqsave(&rnp->lock, flags);
- completed = ACCESS_ONCE(rsp->completed);
- gpnum = ACCESS_ONCE(rsp->gpnum);
+ completed = READ_ONCE(rsp->completed);
+ gpnum = READ_ONCE(rsp->gpnum);
if (completed == gpnum)
gpage = 0;
else
diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c
index e0d31a345ee6..7a0b3bc7c5ed 100644
--- a/kernel/rcu/update.c
+++ b/kernel/rcu/update.c
@@ -62,6 +62,112 @@ MODULE_ALIAS("rcupdate");
module_param(rcu_expedited, int, 0);
+#if defined(CONFIG_DEBUG_LOCK_ALLOC) && defined(CONFIG_PREEMPT_COUNT)
+/**
+ * rcu_read_lock_sched_held() - might we be in RCU-sched read-side critical section?
+ *
+ * If CONFIG_DEBUG_LOCK_ALLOC is selected, returns nonzero iff in an
+ * RCU-sched read-side critical section. In absence of
+ * CONFIG_DEBUG_LOCK_ALLOC, this assumes we are in an RCU-sched read-side
+ * critical section unless it can prove otherwise. Note that disabling
+ * of preemption (including disabling irqs) counts as an RCU-sched
+ * read-side critical section. This is useful for debug checks in functions
+ * that required that they be called within an RCU-sched read-side
+ * critical section.
+ *
+ * Check debug_lockdep_rcu_enabled() to prevent false positives during boot
+ * and while lockdep is disabled.
+ *
+ * Note that if the CPU is in the idle loop from an RCU point of
+ * view (ie: that we are in the section between rcu_idle_enter() and
+ * rcu_idle_exit()) then rcu_read_lock_held() returns false even if the CPU
+ * did an rcu_read_lock(). The reason for this is that RCU ignores CPUs
+ * that are in such a section, considering these as in extended quiescent
+ * state, so such a CPU is effectively never in an RCU read-side critical
+ * section regardless of what RCU primitives it invokes. This state of
+ * affairs is required --- we need to keep an RCU-free window in idle
+ * where the CPU may possibly enter into low power mode. This way we can
+ * notice an extended quiescent state to other CPUs that started a grace
+ * period. Otherwise we would delay any grace period as long as we run in
+ * the idle task.
+ *
+ * Similarly, we avoid claiming an SRCU read lock held if the current
+ * CPU is offline.
+ */
+int rcu_read_lock_sched_held(void)
+{
+ int lockdep_opinion = 0;
+
+ if (!debug_lockdep_rcu_enabled())
+ return 1;
+ if (!rcu_is_watching())
+ return 0;
+ if (!rcu_lockdep_current_cpu_online())
+ return 0;
+ if (debug_locks)
+ lockdep_opinion = lock_is_held(&rcu_sched_lock_map);
+ return lockdep_opinion || preempt_count() != 0 || irqs_disabled();
+}
+EXPORT_SYMBOL(rcu_read_lock_sched_held);
+#endif
+
+#ifndef CONFIG_TINY_RCU
+
+static atomic_t rcu_expedited_nesting =
+ ATOMIC_INIT(IS_ENABLED(CONFIG_RCU_EXPEDITE_BOOT) ? 1 : 0);
+
+/*
+ * Should normal grace-period primitives be expedited? Intended for
+ * use within RCU. Note that this function takes the rcu_expedited
+ * sysfs/boot variable into account as well as the rcu_expedite_gp()
+ * nesting. So looping on rcu_unexpedite_gp() until rcu_gp_is_expedited()
+ * returns false is a -really- bad idea.
+ */
+bool rcu_gp_is_expedited(void)
+{
+ return rcu_expedited || atomic_read(&rcu_expedited_nesting);
+}
+EXPORT_SYMBOL_GPL(rcu_gp_is_expedited);
+
+/**
+ * rcu_expedite_gp - Expedite future RCU grace periods
+ *
+ * After a call to this function, future calls to synchronize_rcu() and
+ * friends act as the corresponding synchronize_rcu_expedited() function
+ * had instead been called.
+ */
+void rcu_expedite_gp(void)
+{
+ atomic_inc(&rcu_expedited_nesting);
+}
+EXPORT_SYMBOL_GPL(rcu_expedite_gp);
+
+/**
+ * rcu_unexpedite_gp - Cancel prior rcu_expedite_gp() invocation
+ *
+ * Undo a prior call to rcu_expedite_gp(). If all prior calls to
+ * rcu_expedite_gp() are undone by a subsequent call to rcu_unexpedite_gp(),
+ * and if the rcu_expedited sysfs/boot parameter is not set, then all
+ * subsequent calls to synchronize_rcu() and friends will return to
+ * their normal non-expedited behavior.
+ */
+void rcu_unexpedite_gp(void)
+{
+ atomic_dec(&rcu_expedited_nesting);
+}
+EXPORT_SYMBOL_GPL(rcu_unexpedite_gp);
+
+#endif /* #ifndef CONFIG_TINY_RCU */
+
+/*
+ * Inform RCU of the end of the in-kernel boot sequence.
+ */
+void rcu_end_inkernel_boot(void)
+{
+ if (IS_ENABLED(CONFIG_RCU_EXPEDITE_BOOT))
+ rcu_unexpedite_gp();
+}
+
#ifdef CONFIG_PREEMPT_RCU
/*
@@ -93,14 +199,14 @@ void __rcu_read_unlock(void)
barrier(); /* critical section before exit code. */
t->rcu_read_lock_nesting = INT_MIN;
barrier(); /* assign before ->rcu_read_unlock_special load */
- if (unlikely(ACCESS_ONCE(t->rcu_read_unlock_special.s)))
+ if (unlikely(READ_ONCE(t->rcu_read_unlock_special.s)))
rcu_read_unlock_special(t);
barrier(); /* ->rcu_read_unlock_special load before assign */
t->rcu_read_lock_nesting = 0;
}
#ifdef CONFIG_PROVE_LOCKING
{
- int rrln = ACCESS_ONCE(t->rcu_read_lock_nesting);
+ int rrln = READ_ONCE(t->rcu_read_lock_nesting);
WARN_ON_ONCE(rrln < 0 && rrln > INT_MIN / 2);
}
@@ -199,36 +305,50 @@ EXPORT_SYMBOL_GPL(rcu_read_lock_bh_held);
#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
-struct rcu_synchronize {
- struct rcu_head head;
- struct completion completion;
-};
-
-/*
- * Awaken the corresponding synchronize_rcu() instance now that a
- * grace period has elapsed.
+/**
+ * wakeme_after_rcu() - Callback function to awaken a task after grace period
+ * @head: Pointer to rcu_head member within rcu_synchronize structure
+ *
+ * Awaken the corresponding task now that a grace period has elapsed.
*/
-static void wakeme_after_rcu(struct rcu_head *head)
+void wakeme_after_rcu(struct rcu_head *head)
{
struct rcu_synchronize *rcu;
rcu = container_of(head, struct rcu_synchronize, head);
complete(&rcu->completion);
}
+EXPORT_SYMBOL_GPL(wakeme_after_rcu);
-void wait_rcu_gp(call_rcu_func_t crf)
+void __wait_rcu_gp(bool checktiny, int n, call_rcu_func_t *crcu_array,
+ struct rcu_synchronize *rs_array)
{
- struct rcu_synchronize rcu;
+ int i;
+
+ /* Initialize and register callbacks for each flavor specified. */
+ for (i = 0; i < n; i++) {
+ if (checktiny &&
+ (crcu_array[i] == call_rcu ||
+ crcu_array[i] == call_rcu_bh)) {
+ might_sleep();
+ continue;
+ }
+ init_rcu_head_on_stack(&rs_array[i].head);
+ init_completion(&rs_array[i].completion);
+ (crcu_array[i])(&rs_array[i].head, wakeme_after_rcu);
+ }
- init_rcu_head_on_stack(&rcu.head);
- init_completion(&rcu.completion);
- /* Will wake me after RCU finished. */
- crf(&rcu.head, wakeme_after_rcu);
- /* Wait for it. */
- wait_for_completion(&rcu.completion);
- destroy_rcu_head_on_stack(&rcu.head);
+ /* Wait for all callbacks to be invoked. */
+ for (i = 0; i < n; i++) {
+ if (checktiny &&
+ (crcu_array[i] == call_rcu ||
+ crcu_array[i] == call_rcu_bh))
+ continue;
+ wait_for_completion(&rs_array[i].completion);
+ destroy_rcu_head_on_stack(&rs_array[i].head);
+ }
}
-EXPORT_SYMBOL_GPL(wait_rcu_gp);
+EXPORT_SYMBOL_GPL(__wait_rcu_gp);
#ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD
void init_rcu_head(struct rcu_head *head)
@@ -335,17 +455,17 @@ module_param(rcu_cpu_stall_timeout, int, 0644);
int rcu_jiffies_till_stall_check(void)
{
- int till_stall_check = ACCESS_ONCE(rcu_cpu_stall_timeout);
+ int till_stall_check = READ_ONCE(rcu_cpu_stall_timeout);
/*
* Limit check must be consistent with the Kconfig limits
* for CONFIG_RCU_CPU_STALL_TIMEOUT.
*/
if (till_stall_check < 3) {
- ACCESS_ONCE(rcu_cpu_stall_timeout) = 3;
+ WRITE_ONCE(rcu_cpu_stall_timeout, 3);
till_stall_check = 3;
} else if (till_stall_check > 300) {
- ACCESS_ONCE(rcu_cpu_stall_timeout) = 300;
+ WRITE_ONCE(rcu_cpu_stall_timeout, 300);
till_stall_check = 300;
}
return till_stall_check * HZ + RCU_STALL_DELAY_DELTA;
@@ -469,8 +589,8 @@ EXPORT_SYMBOL_GPL(call_rcu_tasks);
void synchronize_rcu_tasks(void)
{
/* Complain if the scheduler has not started. */
- rcu_lockdep_assert(!rcu_scheduler_active,
- "synchronize_rcu_tasks called too soon");
+ RCU_LOCKDEP_WARN(!rcu_scheduler_active,
+ "synchronize_rcu_tasks called too soon");
/* Wait for the grace period. */
wait_rcu_gp(call_rcu_tasks);
@@ -496,12 +616,12 @@ static void check_holdout_task(struct task_struct *t,
{
int cpu;
- if (!ACCESS_ONCE(t->rcu_tasks_holdout) ||
- t->rcu_tasks_nvcsw != ACCESS_ONCE(t->nvcsw) ||
- !ACCESS_ONCE(t->on_rq) ||
+ if (!READ_ONCE(t->rcu_tasks_holdout) ||
+ t->rcu_tasks_nvcsw != READ_ONCE(t->nvcsw) ||
+ !READ_ONCE(t->on_rq) ||
(IS_ENABLED(CONFIG_NO_HZ_FULL) &&
!is_idle_task(t) && t->rcu_tasks_idle_cpu >= 0)) {
- ACCESS_ONCE(t->rcu_tasks_holdout) = false;
+ WRITE_ONCE(t->rcu_tasks_holdout, false);
list_del_init(&t->rcu_tasks_holdout_list);
put_task_struct(t);
return;
@@ -585,11 +705,11 @@ static int __noreturn rcu_tasks_kthread(void *arg)
*/
rcu_read_lock();
for_each_process_thread(g, t) {
- if (t != current && ACCESS_ONCE(t->on_rq) &&
+ if (t != current && READ_ONCE(t->on_rq) &&
!is_idle_task(t)) {
get_task_struct(t);
- t->rcu_tasks_nvcsw = ACCESS_ONCE(t->nvcsw);
- ACCESS_ONCE(t->rcu_tasks_holdout) = true;
+ t->rcu_tasks_nvcsw = READ_ONCE(t->nvcsw);
+ WRITE_ONCE(t->rcu_tasks_holdout, true);
list_add(&t->rcu_tasks_holdout_list,
&rcu_tasks_holdouts);
}
@@ -618,7 +738,7 @@ static int __noreturn rcu_tasks_kthread(void *arg)
struct task_struct *t1;
schedule_timeout_interruptible(HZ);
- rtst = ACCESS_ONCE(rcu_task_stall_timeout);
+ rtst = READ_ONCE(rcu_task_stall_timeout);
needreport = rtst > 0 &&
time_after(jiffies, lastreport + rtst);
if (needreport)
@@ -674,7 +794,7 @@ static void rcu_spawn_tasks_kthread(void)
static struct task_struct *rcu_tasks_kthread_ptr;
struct task_struct *t;
- if (ACCESS_ONCE(rcu_tasks_kthread_ptr)) {
+ if (READ_ONCE(rcu_tasks_kthread_ptr)) {
smp_mb(); /* Ensure caller sees full kthread. */
return;
}
@@ -686,7 +806,7 @@ static void rcu_spawn_tasks_kthread(void)
t = kthread_run(rcu_tasks_kthread, NULL, "rcu_tasks_kthread");
BUG_ON(IS_ERR(t));
smp_mb(); /* Ensure others see full kthread. */
- ACCESS_ONCE(rcu_tasks_kthread_ptr) = t;
+ WRITE_ONCE(rcu_tasks_kthread_ptr, t);
mutex_unlock(&rcu_tasks_kthread_mutex);
}
diff --git a/kernel/reboot.c b/kernel/reboot.c
index 5925f5ae8dff..d20c85d9f8c0 100644
--- a/kernel/reboot.c
+++ b/kernel/reboot.c
@@ -387,8 +387,9 @@ void ctrl_alt_del(void)
}
char poweroff_cmd[POWEROFF_CMD_PATH_LEN] = "/sbin/poweroff";
+static const char reboot_cmd[] = "/sbin/reboot";
-static int __orderly_poweroff(bool force)
+static int run_cmd(const char *cmd)
{
char **argv;
static char *envp[] = {
@@ -397,8 +398,7 @@ static int __orderly_poweroff(bool force)
NULL
};
int ret;
-
- argv = argv_split(GFP_KERNEL, poweroff_cmd, NULL);
+ argv = argv_split(GFP_KERNEL, cmd, NULL);
if (argv) {
ret = call_usermodehelper(argv[0], argv, envp, UMH_WAIT_EXEC);
argv_free(argv);
@@ -406,8 +406,33 @@ static int __orderly_poweroff(bool force)
ret = -ENOMEM;
}
+ return ret;
+}
+
+static int __orderly_reboot(void)
+{
+ int ret;
+
+ ret = run_cmd(reboot_cmd);
+
+ if (ret) {
+ pr_warn("Failed to start orderly reboot: forcing the issue\n");
+ emergency_sync();
+ kernel_restart(NULL);
+ }
+
+ return ret;
+}
+
+static int __orderly_poweroff(bool force)
+{
+ int ret;
+
+ ret = run_cmd(poweroff_cmd);
+
if (ret && force) {
pr_warn("Failed to start orderly shutdown: forcing the issue\n");
+
/*
* I guess this should try to kick off some daemon to sync and
* poweroff asap. Or not even bother syncing if we're doing an
@@ -436,15 +461,33 @@ static DECLARE_WORK(poweroff_work, poweroff_work_func);
* This may be called from any context to trigger a system shutdown.
* If the orderly shutdown fails, it will force an immediate shutdown.
*/
-int orderly_poweroff(bool force)
+void orderly_poweroff(bool force)
{
if (force) /* do not override the pending "true" */
poweroff_force = true;
schedule_work(&poweroff_work);
- return 0;
}
EXPORT_SYMBOL_GPL(orderly_poweroff);
+static void reboot_work_func(struct work_struct *work)
+{
+ __orderly_reboot();
+}
+
+static DECLARE_WORK(reboot_work, reboot_work_func);
+
+/**
+ * orderly_reboot - Trigger an orderly system reboot
+ *
+ * This may be called from any context to trigger a system reboot.
+ * If the orderly reboot fails, it will force an immediate reboot.
+ */
+void orderly_reboot(void)
+{
+ schedule_work(&reboot_work);
+}
+EXPORT_SYMBOL_GPL(orderly_reboot);
+
static int __init reboot_setup(char *str)
{
for (;;) {
diff --git a/kernel/relay.c b/kernel/relay.c
index 5a56d3c8dc03..0b4570cfacae 100644
--- a/kernel/relay.c
+++ b/kernel/relay.c
@@ -81,10 +81,7 @@ static struct page **relay_alloc_page_array(unsigned int n_pages)
*/
static void relay_free_page_array(struct page **array)
{
- if (is_vmalloc_addr(array))
- vfree(array);
- else
- kfree(array);
+ kvfree(array);
}
/**
@@ -407,7 +404,7 @@ static inline void relay_set_buf_dentry(struct rchan_buf *buf,
struct dentry *dentry)
{
buf->dentry = dentry;
- buf->dentry->d_inode->i_size = buf->early_bytes;
+ d_inode(buf->dentry)->i_size = buf->early_bytes;
}
static struct dentry *relay_create_buf_file(struct rchan *chan,
@@ -733,7 +730,7 @@ size_t relay_switch_subbuf(struct rchan_buf *buf, size_t length)
buf->padding[old_subbuf] = buf->prev_padding;
buf->subbufs_produced++;
if (buf->dentry)
- buf->dentry->d_inode->i_size +=
+ d_inode(buf->dentry)->i_size +=
buf->chan->subbuf_size -
buf->padding[old_subbuf];
else
diff --git a/kernel/resource.c b/kernel/resource.c
index 19f2357dfda3..fed052a1bc9f 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -504,13 +504,13 @@ int region_is_ram(resource_size_t start, unsigned long size)
{
struct resource *p;
resource_size_t end = start + size - 1;
- int flags = IORESOURCE_MEM | IORESOURCE_BUSY;
+ unsigned long flags = IORESOURCE_MEM | IORESOURCE_BUSY;
const char *name = "System RAM";
int ret = -1;
read_lock(&resource_lock);
for (p = iomem_resource.child; p ; p = p->sibling) {
- if (end < p->start)
+ if (p->end < start)
continue;
if (p->start <= start && end <= p->end) {
@@ -521,7 +521,7 @@ int region_is_ram(resource_size_t start, unsigned long size)
ret = 1;
break;
}
- if (p->end < start)
+ if (end < p->start)
break; /* not found */
}
read_unlock(&resource_lock);
@@ -1034,8 +1034,6 @@ resource_size_t resource_alignment(struct resource *res)
*
* request_region creates a new busy region.
*
- * check_region returns non-zero if the area is already busy.
- *
* release_region releases a matching busy region.
*/
@@ -1098,36 +1096,6 @@ struct resource * __request_region(struct resource *parent,
EXPORT_SYMBOL(__request_region);
/**
- * __check_region - check if a resource region is busy or free
- * @parent: parent resource descriptor
- * @start: resource start address
- * @n: resource region size
- *
- * Returns 0 if the region is free at the moment it is checked,
- * returns %-EBUSY if the region is busy.
- *
- * NOTE:
- * This function is deprecated because its use is racy.
- * Even if it returns 0, a subsequent call to request_region()
- * may fail because another driver etc. just allocated the region.
- * Do NOT use it. It will be removed from the kernel.
- */
-int __check_region(struct resource *parent, resource_size_t start,
- resource_size_t n)
-{
- struct resource * res;
-
- res = __request_region(parent, start, n, "check-region", 0);
- if (!res)
- return -EBUSY;
-
- release_resource(res);
- free_resource(res);
- return 0;
-}
-EXPORT_SYMBOL(__check_region);
-
-/**
* __release_region - release a previously reserved resource region
* @parent: parent resource descriptor
* @start: resource start address
diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile
index 46be87024875..67687973ce80 100644
--- a/kernel/sched/Makefile
+++ b/kernel/sched/Makefile
@@ -11,7 +11,7 @@ ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y)
CFLAGS_core.o := $(PROFILING) -fno-omit-frame-pointer
endif
-obj-y += core.o proc.o clock.o cputime.o
+obj-y += core.o loadavg.o clock.o cputime.o
obj-y += idle_task.o fair.o rt.o deadline.o stop_task.o
obj-y += wait.o completion.o idle.o
obj-$(CONFIG_SMP) += cpupri.o cpudeadline.o
diff --git a/kernel/sched/auto_group.c b/kernel/sched/auto_group.c
index eae160dd669d..750ed601ddf7 100644
--- a/kernel/sched/auto_group.c
+++ b/kernel/sched/auto_group.c
@@ -1,5 +1,3 @@
-#ifdef CONFIG_SCHED_AUTOGROUP
-
#include "sched.h"
#include <linux/proc_fs.h>
@@ -141,7 +139,7 @@ autogroup_move_group(struct task_struct *p, struct autogroup *ag)
p->signal->autogroup = autogroup_kref_get(ag);
- if (!ACCESS_ONCE(sysctl_sched_autogroup_enabled))
+ if (!READ_ONCE(sysctl_sched_autogroup_enabled))
goto out;
for_each_thread(p, t)
@@ -249,5 +247,3 @@ int autogroup_path(struct task_group *tg, char *buf, int buflen)
return snprintf(buf, buflen, "%s-%ld", "/autogroup", tg->autogroup->id);
}
#endif /* CONFIG_SCHED_DEBUG */
-
-#endif /* CONFIG_SCHED_AUTOGROUP */
diff --git a/kernel/sched/auto_group.h b/kernel/sched/auto_group.h
index 8bd047142816..890c95f2587a 100644
--- a/kernel/sched/auto_group.h
+++ b/kernel/sched/auto_group.h
@@ -29,7 +29,7 @@ extern bool task_wants_autogroup(struct task_struct *p, struct task_group *tg);
static inline struct task_group *
autogroup_task_group(struct task_struct *p, struct task_group *tg)
{
- int enabled = ACCESS_ONCE(sysctl_sched_autogroup_enabled);
+ int enabled = READ_ONCE(sysctl_sched_autogroup_enabled);
if (enabled && task_wants_autogroup(p, tg))
return p->signal->autogroup->tg;
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 62671f53202a..3595403921bd 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -90,26 +90,6 @@
#define CREATE_TRACE_POINTS
#include <trace/events/sched.h>
-void start_bandwidth_timer(struct hrtimer *period_timer, ktime_t period)
-{
- unsigned long delta;
- ktime_t soft, hard, now;
-
- for (;;) {
- if (hrtimer_active(period_timer))
- break;
-
- now = hrtimer_cb_get_time(period_timer);
- hrtimer_forward(period_timer, now, period);
-
- soft = hrtimer_get_softexpires(period_timer);
- hard = hrtimer_get_expires(period_timer);
- delta = ktime_to_ns(ktime_sub(hard, soft));
- __hrtimer_start_range_ns(period_timer, soft, delta,
- HRTIMER_MODE_ABS_PINNED, 0);
- }
-}
-
DEFINE_MUTEX(sched_domains_mutex);
DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
@@ -184,14 +164,12 @@ struct static_key sched_feat_keys[__SCHED_FEAT_NR] = {
static void sched_feat_disable(int i)
{
- if (static_key_enabled(&sched_feat_keys[i]))
- static_key_slow_dec(&sched_feat_keys[i]);
+ static_key_disable(&sched_feat_keys[i]);
}
static void sched_feat_enable(int i)
{
- if (!static_key_enabled(&sched_feat_keys[i]))
- static_key_slow_inc(&sched_feat_keys[i]);
+ static_key_enable(&sched_feat_keys[i]);
}
#else
static void sched_feat_disable(int i) { };
@@ -306,6 +284,9 @@ __read_mostly int scheduler_running;
*/
int sysctl_sched_rt_runtime = 950000;
+/* cpus with isolated domains */
+cpumask_var_t cpu_isolated_map;
+
/*
* this_rq_lock - lock this runqueue and disable interrupts.
*/
@@ -352,12 +333,11 @@ static enum hrtimer_restart hrtick(struct hrtimer *timer)
#ifdef CONFIG_SMP
-static int __hrtick_restart(struct rq *rq)
+static void __hrtick_restart(struct rq *rq)
{
struct hrtimer *timer = &rq->hrtick_timer;
- ktime_t time = hrtimer_get_softexpires(timer);
- return __hrtimer_start_range_ns(timer, time, 0, HRTIMER_MODE_ABS_PINNED, 0);
+ hrtimer_start_expires(timer, HRTIMER_MODE_ABS_PINNED);
}
/*
@@ -437,8 +417,8 @@ void hrtick_start(struct rq *rq, u64 delay)
* doesn't make sense. Rely on vruntime for fairness.
*/
delay = max_t(u64, delay, 10000LL);
- __hrtimer_start_range_ns(&rq->hrtick_timer, ns_to_ktime(delay), 0,
- HRTIMER_MODE_REL_PINNED, 0);
+ hrtimer_start(&rq->hrtick_timer, ns_to_ktime(delay),
+ HRTIMER_MODE_REL_PINNED);
}
static inline void init_hrtick(void)
@@ -508,7 +488,7 @@ static bool set_nr_and_not_polling(struct task_struct *p)
static bool set_nr_if_polling(struct task_struct *p)
{
struct thread_info *ti = task_thread_info(p);
- typeof(ti->flags) old, val = ACCESS_ONCE(ti->flags);
+ typeof(ti->flags) old, val = READ_ONCE(ti->flags);
for (;;) {
if (!(val & _TIF_POLLING_NRFLAG))
@@ -538,6 +518,52 @@ static bool set_nr_if_polling(struct task_struct *p)
#endif
#endif
+void wake_q_add(struct wake_q_head *head, struct task_struct *task)
+{
+ struct wake_q_node *node = &task->wake_q;
+
+ /*
+ * Atomically grab the task, if ->wake_q is !nil already it means
+ * its already queued (either by us or someone else) and will get the
+ * wakeup due to that.
+ *
+ * This cmpxchg() implies a full barrier, which pairs with the write
+ * barrier implied by the wakeup in wake_up_list().
+ */
+ if (cmpxchg(&node->next, NULL, WAKE_Q_TAIL))
+ return;
+
+ get_task_struct(task);
+
+ /*
+ * The head is context local, there can be no concurrency.
+ */
+ *head->lastp = node;
+ head->lastp = &node->next;
+}
+
+void wake_up_q(struct wake_q_head *head)
+{
+ struct wake_q_node *node = head->first;
+
+ while (node != WAKE_Q_TAIL) {
+ struct task_struct *task;
+
+ task = container_of(node, struct task_struct, wake_q);
+ BUG_ON(!task);
+ /* task can safely be re-inserted now */
+ node = node->next;
+ task->wake_q.next = NULL;
+
+ /*
+ * wake_up_process() implies a wmb() to pair with the queueing
+ * in wake_q_add() so as not to miss wakeups.
+ */
+ wake_up_process(task);
+ put_task_struct(task);
+ }
+}
+
/*
* resched_curr - mark rq's current task 'to be rescheduled now'.
*
@@ -590,13 +616,12 @@ void resched_cpu(int cpu)
* selecting an idle cpu will add more delays to the timers than intended
* (as that cpu's timer base may not be uptodate wrt jiffies etc).
*/
-int get_nohz_timer_target(int pinned)
+int get_nohz_timer_target(void)
{
- int cpu = smp_processor_id();
- int i;
+ int i, cpu = smp_processor_id();
struct sched_domain *sd;
- if (pinned || !get_sysctl_timer_migration() || !idle_cpu(cpu))
+ if (!idle_cpu(cpu))
return cpu;
rcu_read_lock();
@@ -690,6 +715,23 @@ static inline bool got_nohz_idle_kick(void)
bool sched_can_stop_tick(void)
{
/*
+ * FIFO realtime policy runs the highest priority task. Other runnable
+ * tasks are of a lower priority. The scheduler tick does nothing.
+ */
+ if (current->policy == SCHED_FIFO)
+ return true;
+
+ /*
+ * Round-robin realtime tasks time slice with other tasks at the same
+ * realtime priority. Is this task the only one at this priority?
+ */
+ if (current->policy == SCHED_RR) {
+ struct sched_rt_entity *rt_se = &current->rt;
+
+ return rt_se->run_list.prev == rt_se->run_list.next;
+ }
+
+ /*
* More than one running task need preemption.
* nr_running update is assumed to be visible
* after IPI is sent from wakers.
@@ -956,7 +998,11 @@ inline int task_curr(const struct task_struct *p)
}
/*
- * Can drop rq->lock because from sched_class::switched_from() methods drop it.
+ * switched_from, switched_to and prio_changed must _NOT_ drop rq->lock,
+ * use the balance_callback list if you want balancing.
+ *
+ * this means any call to check_class_changed() must be followed by a call to
+ * balance_callback().
*/
static inline void check_class_changed(struct rq *rq, struct task_struct *p,
const struct sched_class *prev_class,
@@ -965,7 +1011,7 @@ static inline void check_class_changed(struct rq *rq, struct task_struct *p,
if (prev_class != p->sched_class) {
if (prev_class->switched_from)
prev_class->switched_from(rq, p);
- /* Possble rq->lock 'hole'. */
+
p->sched_class->switched_to(rq, p);
} else if (oldprio != p->prio || dl_task(p))
p->sched_class->prio_changed(rq, p, oldprio);
@@ -997,6 +1043,222 @@ void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
}
#ifdef CONFIG_SMP
+/*
+ * This is how migration works:
+ *
+ * 1) we invoke migration_cpu_stop() on the target CPU using
+ * stop_one_cpu().
+ * 2) stopper starts to run (implicitly forcing the migrated thread
+ * off the CPU)
+ * 3) it checks whether the migrated task is still in the wrong runqueue.
+ * 4) if it's in the wrong runqueue then the migration thread removes
+ * it and puts it into the right queue.
+ * 5) stopper completes and stop_one_cpu() returns and the migration
+ * is done.
+ */
+
+/*
+ * move_queued_task - move a queued task to new rq.
+ *
+ * Returns (locked) new rq. Old rq's lock is released.
+ */
+static struct rq *move_queued_task(struct rq *rq, struct task_struct *p, int new_cpu)
+{
+ lockdep_assert_held(&rq->lock);
+
+ dequeue_task(rq, p, 0);
+ p->on_rq = TASK_ON_RQ_MIGRATING;
+ set_task_cpu(p, new_cpu);
+ raw_spin_unlock(&rq->lock);
+
+ rq = cpu_rq(new_cpu);
+
+ raw_spin_lock(&rq->lock);
+ BUG_ON(task_cpu(p) != new_cpu);
+ p->on_rq = TASK_ON_RQ_QUEUED;
+ enqueue_task(rq, p, 0);
+ check_preempt_curr(rq, p, 0);
+
+ return rq;
+}
+
+struct migration_arg {
+ struct task_struct *task;
+ int dest_cpu;
+};
+
+/*
+ * Move (not current) task off this cpu, onto dest cpu. We're doing
+ * this because either it can't run here any more (set_cpus_allowed()
+ * away from this CPU, or CPU going down), or because we're
+ * attempting to rebalance this task on exec (sched_exec).
+ *
+ * So we race with normal scheduler movements, but that's OK, as long
+ * as the task is no longer on this CPU.
+ */
+static struct rq *__migrate_task(struct rq *rq, struct task_struct *p, int dest_cpu)
+{
+ if (unlikely(!cpu_active(dest_cpu)))
+ return rq;
+
+ /* Affinity changed (again). */
+ if (!cpumask_test_cpu(dest_cpu, tsk_cpus_allowed(p)))
+ return rq;
+
+ rq = move_queued_task(rq, p, dest_cpu);
+
+ return rq;
+}
+
+/*
+ * migration_cpu_stop - this will be executed by a highprio stopper thread
+ * and performs thread migration by bumping thread off CPU then
+ * 'pushing' onto another runqueue.
+ */
+static int migration_cpu_stop(void *data)
+{
+ struct migration_arg *arg = data;
+ struct task_struct *p = arg->task;
+ struct rq *rq = this_rq();
+
+ /*
+ * The original target cpu might have gone down and we might
+ * be on another cpu but it doesn't matter.
+ */
+ local_irq_disable();
+ /*
+ * We need to explicitly wake pending tasks before running
+ * __migrate_task() such that we will not miss enforcing cpus_allowed
+ * during wakeups, see set_cpus_allowed_ptr()'s TASK_WAKING test.
+ */
+ sched_ttwu_pending();
+
+ raw_spin_lock(&p->pi_lock);
+ raw_spin_lock(&rq->lock);
+ /*
+ * If task_rq(p) != rq, it cannot be migrated here, because we're
+ * holding rq->lock, if p->on_rq == 0 it cannot get enqueued because
+ * we're holding p->pi_lock.
+ */
+ if (task_rq(p) == rq && task_on_rq_queued(p))
+ rq = __migrate_task(rq, p, arg->dest_cpu);
+ raw_spin_unlock(&rq->lock);
+ raw_spin_unlock(&p->pi_lock);
+
+ local_irq_enable();
+ return 0;
+}
+
+/*
+ * sched_class::set_cpus_allowed must do the below, but is not required to
+ * actually call this function.
+ */
+void set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_mask)
+{
+ cpumask_copy(&p->cpus_allowed, new_mask);
+ p->nr_cpus_allowed = cpumask_weight(new_mask);
+}
+
+void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
+{
+ struct rq *rq = task_rq(p);
+ bool queued, running;
+
+ lockdep_assert_held(&p->pi_lock);
+
+ queued = task_on_rq_queued(p);
+ running = task_current(rq, p);
+
+ if (queued) {
+ /*
+ * Because __kthread_bind() calls this on blocked tasks without
+ * holding rq->lock.
+ */
+ lockdep_assert_held(&rq->lock);
+ dequeue_task(rq, p, 0);
+ }
+ if (running)
+ put_prev_task(rq, p);
+
+ p->sched_class->set_cpus_allowed(p, new_mask);
+
+ if (running)
+ p->sched_class->set_curr_task(rq);
+ if (queued)
+ enqueue_task(rq, p, 0);
+}
+
+/*
+ * Change a given task's CPU affinity. Migrate the thread to a
+ * proper CPU and schedule it away if the CPU it's executing on
+ * is removed from the allowed bitmask.
+ *
+ * NOTE: the caller must have a valid reference to the task, the
+ * task must not exit() & deallocate itself prematurely. The
+ * call is not atomic; no spinlocks may be held.
+ */
+static int __set_cpus_allowed_ptr(struct task_struct *p,
+ const struct cpumask *new_mask, bool check)
+{
+ unsigned long flags;
+ struct rq *rq;
+ unsigned int dest_cpu;
+ int ret = 0;
+
+ rq = task_rq_lock(p, &flags);
+
+ /*
+ * Must re-check here, to close a race against __kthread_bind(),
+ * sched_setaffinity() is not guaranteed to observe the flag.
+ */
+ if (check && (p->flags & PF_NO_SETAFFINITY)) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ if (cpumask_equal(&p->cpus_allowed, new_mask))
+ goto out;
+
+ if (!cpumask_intersects(new_mask, cpu_active_mask)) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ do_set_cpus_allowed(p, new_mask);
+
+ /* Can the task run on the task's current CPU? If so, we're done */
+ if (cpumask_test_cpu(task_cpu(p), new_mask))
+ goto out;
+
+ dest_cpu = cpumask_any_and(cpu_active_mask, new_mask);
+ if (task_running(rq, p) || p->state == TASK_WAKING) {
+ struct migration_arg arg = { p, dest_cpu };
+ /* Need help from migration thread: drop lock and wait. */
+ task_rq_unlock(rq, p, &flags);
+ stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg);
+ tlb_migrate_finish(p->mm);
+ return 0;
+ } else if (task_on_rq_queued(p)) {
+ /*
+ * OK, since we're going to drop the lock immediately
+ * afterwards anyway.
+ */
+ lockdep_unpin_lock(&rq->lock);
+ rq = move_queued_task(rq, p, dest_cpu);
+ lockdep_pin_lock(&rq->lock);
+ }
+out:
+ task_rq_unlock(rq, p, &flags);
+
+ return ret;
+}
+
+int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
+{
+ return __set_cpus_allowed_ptr(p, new_mask, false);
+}
+EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr);
+
void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
{
#ifdef CONFIG_SCHED_DEBUG
@@ -1029,7 +1291,7 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
if (p->sched_class->migrate_task_rq)
p->sched_class->migrate_task_rq(p, new_cpu);
p->se.nr_migrations++;
- perf_sw_event_sched(PERF_COUNT_SW_CPU_MIGRATIONS, 1, 0);
+ perf_event_task_migrate(p);
}
__set_task_cpu(p, new_cpu);
@@ -1137,13 +1399,6 @@ out:
return ret;
}
-struct migration_arg {
- struct task_struct *task;
- int dest_cpu;
-};
-
-static int migration_cpu_stop(void *data);
-
/*
* wait_task_inactive - wait for a thread to unschedule.
*
@@ -1276,9 +1531,7 @@ void kick_process(struct task_struct *p)
preempt_enable();
}
EXPORT_SYMBOL_GPL(kick_process);
-#endif /* CONFIG_SMP */
-#ifdef CONFIG_SMP
/*
* ->cpus_allowed is protected by both rq->lock and p->pi_lock
*/
@@ -1358,6 +1611,8 @@ out:
static inline
int select_task_rq(struct task_struct *p, int cpu, int sd_flags, int wake_flags)
{
+ lockdep_assert_held(&p->pi_lock);
+
if (p->nr_cpus_allowed > 1)
cpu = p->sched_class->select_task_rq(p, cpu, sd_flags, wake_flags);
@@ -1383,7 +1638,16 @@ static void update_avg(u64 *avg, u64 sample)
s64 diff = sample - *avg;
*avg += diff >> 3;
}
-#endif
+
+#else
+
+static inline int __set_cpus_allowed_ptr(struct task_struct *p,
+ const struct cpumask *new_mask, bool check)
+{
+ return set_cpus_allowed_ptr(p, new_mask);
+}
+
+#endif /* CONFIG_SMP */
static void
ttwu_stat(struct task_struct *p, int cpu, int wake_flags)
@@ -1442,12 +1706,19 @@ static void
ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags)
{
check_preempt_curr(rq, p, wake_flags);
- trace_sched_wakeup(p, true);
-
p->state = TASK_RUNNING;
+ trace_sched_wakeup(p);
+
#ifdef CONFIG_SMP
- if (p->sched_class->task_woken)
+ if (p->sched_class->task_woken) {
+ /*
+ * Our task @p is fully woken up and running; so its safe to
+ * drop the rq->lock, hereafter rq is only used for statistics.
+ */
+ lockdep_unpin_lock(&rq->lock);
p->sched_class->task_woken(rq, p);
+ lockdep_pin_lock(&rq->lock);
+ }
if (rq->idle_stamp) {
u64 delta = rq_clock(rq) - rq->idle_stamp;
@@ -1466,6 +1737,8 @@ ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags)
static void
ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags)
{
+ lockdep_assert_held(&rq->lock);
+
#ifdef CONFIG_SMP
if (p->sched_contributes_to_load)
rq->nr_uninterruptible--;
@@ -1510,6 +1783,7 @@ void sched_ttwu_pending(void)
return;
raw_spin_lock_irqsave(&rq->lock, flags);
+ lockdep_pin_lock(&rq->lock);
while (llist) {
p = llist_entry(llist, struct task_struct, wake_entry);
@@ -1517,6 +1791,7 @@ void sched_ttwu_pending(void)
ttwu_do_activate(rq, p, 0);
}
+ lockdep_unpin_lock(&rq->lock);
raw_spin_unlock_irqrestore(&rq->lock, flags);
}
@@ -1613,7 +1888,9 @@ static void ttwu_queue(struct task_struct *p, int cpu)
#endif
raw_spin_lock(&rq->lock);
+ lockdep_pin_lock(&rq->lock);
ttwu_do_activate(rq, p, 0);
+ lockdep_unpin_lock(&rq->lock);
raw_spin_unlock(&rq->lock);
}
@@ -1649,6 +1926,8 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
if (!(p->state & state))
goto out;
+ trace_sched_waking(p);
+
success = 1; /* we're going to change ->state */
cpu = task_cpu(p);
@@ -1708,14 +1987,24 @@ static void try_to_wake_up_local(struct task_struct *p)
lockdep_assert_held(&rq->lock);
if (!raw_spin_trylock(&p->pi_lock)) {
+ /*
+ * This is OK, because current is on_cpu, which avoids it being
+ * picked for load-balance and preemption/IRQs are still
+ * disabled avoiding further scheduler activity on it and we've
+ * not yet picked a replacement task.
+ */
+ lockdep_unpin_lock(&rq->lock);
raw_spin_unlock(&rq->lock);
raw_spin_lock(&p->pi_lock);
raw_spin_lock(&rq->lock);
+ lockdep_pin_lock(&rq->lock);
}
if (!(p->state & TASK_NORMAL))
goto out;
+ trace_sched_waking(p);
+
if (!task_on_rq_queued(p))
ttwu_activate(rq, p, ENQUEUE_WAKEUP);
@@ -1783,9 +2072,6 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
p->se.prev_sum_exec_runtime = 0;
p->se.nr_migrations = 0;
p->se.vruntime = 0;
-#ifdef CONFIG_SMP
- p->se.avg.decay_count = 0;
-#endif
INIT_LIST_HEAD(&p->se.group_node);
#ifdef CONFIG_SCHEDSTATS
@@ -1931,7 +2217,7 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p)
set_task_cpu(p, cpu);
raw_spin_unlock_irqrestore(&p->pi_lock, flags);
-#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
+#ifdef CONFIG_SCHED_INFO
if (likely(sched_info_on()))
memset(&p->sched_info, 0, sizeof(p->sched_info));
#endif
@@ -1967,8 +2253,8 @@ unsigned long to_ratio(u64 period, u64 runtime)
#ifdef CONFIG_SMP
inline struct dl_bw *dl_bw_of(int i)
{
- rcu_lockdep_assert(rcu_read_lock_sched_held(),
- "sched RCU must be held");
+ RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held(),
+ "sched RCU must be held");
return &cpu_rq(i)->rd->dl_bw;
}
@@ -1977,8 +2263,8 @@ static inline int dl_bw_cpus(int i)
struct root_domain *rd = cpu_rq(i)->rd;
int cpus = 0;
- rcu_lockdep_assert(rcu_read_lock_sched_held(),
- "sched RCU must be held");
+ RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held(),
+ "sched RCU must be held");
for_each_cpu_and(i, rd->span, cpu_active_mask)
cpus++;
@@ -2070,11 +2356,11 @@ void wake_up_new_task(struct task_struct *p)
#endif
/* Initialize new task's runnable average */
- init_task_runnable_average(p);
+ init_entity_runnable_average(&p->se);
rq = __task_rq_lock(p);
activate_task(rq, p, 0);
p->on_rq = TASK_ON_RQ_QUEUED;
- trace_sched_wakeup_new(p, true);
+ trace_sched_wakeup_new(p);
check_preempt_curr(rq, p, WF_FORK);
#ifdef CONFIG_SMP
if (p->sched_class->task_woken)
@@ -2085,12 +2371,29 @@ void wake_up_new_task(struct task_struct *p)
#ifdef CONFIG_PREEMPT_NOTIFIERS
+static struct static_key preempt_notifier_key = STATIC_KEY_INIT_FALSE;
+
+void preempt_notifier_inc(void)
+{
+ static_key_slow_inc(&preempt_notifier_key);
+}
+EXPORT_SYMBOL_GPL(preempt_notifier_inc);
+
+void preempt_notifier_dec(void)
+{
+ static_key_slow_dec(&preempt_notifier_key);
+}
+EXPORT_SYMBOL_GPL(preempt_notifier_dec);
+
/**
* preempt_notifier_register - tell me when current is being preempted & rescheduled
* @notifier: notifier struct to register
*/
void preempt_notifier_register(struct preempt_notifier *notifier)
{
+ if (!static_key_false(&preempt_notifier_key))
+ WARN(1, "registering preempt_notifier while notifiers disabled\n");
+
hlist_add_head(&notifier->link, &current->preempt_notifiers);
}
EXPORT_SYMBOL_GPL(preempt_notifier_register);
@@ -2099,7 +2402,7 @@ EXPORT_SYMBOL_GPL(preempt_notifier_register);
* preempt_notifier_unregister - no longer interested in preemption notifications
* @notifier: notifier struct to unregister
*
- * This is safe to call from within a preemption notifier.
+ * This is *not* safe to call from within a preemption notifier.
*/
void preempt_notifier_unregister(struct preempt_notifier *notifier)
{
@@ -2107,7 +2410,7 @@ void preempt_notifier_unregister(struct preempt_notifier *notifier)
}
EXPORT_SYMBOL_GPL(preempt_notifier_unregister);
-static void fire_sched_in_preempt_notifiers(struct task_struct *curr)
+static void __fire_sched_in_preempt_notifiers(struct task_struct *curr)
{
struct preempt_notifier *notifier;
@@ -2115,9 +2418,15 @@ static void fire_sched_in_preempt_notifiers(struct task_struct *curr)
notifier->ops->sched_in(notifier, raw_smp_processor_id());
}
+static __always_inline void fire_sched_in_preempt_notifiers(struct task_struct *curr)
+{
+ if (static_key_false(&preempt_notifier_key))
+ __fire_sched_in_preempt_notifiers(curr);
+}
+
static void
-fire_sched_out_preempt_notifiers(struct task_struct *curr,
- struct task_struct *next)
+__fire_sched_out_preempt_notifiers(struct task_struct *curr,
+ struct task_struct *next)
{
struct preempt_notifier *notifier;
@@ -2125,13 +2434,21 @@ fire_sched_out_preempt_notifiers(struct task_struct *curr,
notifier->ops->sched_out(notifier, next);
}
+static __always_inline void
+fire_sched_out_preempt_notifiers(struct task_struct *curr,
+ struct task_struct *next)
+{
+ if (static_key_false(&preempt_notifier_key))
+ __fire_sched_out_preempt_notifiers(curr, next);
+}
+
#else /* !CONFIG_PREEMPT_NOTIFIERS */
-static void fire_sched_in_preempt_notifiers(struct task_struct *curr)
+static inline void fire_sched_in_preempt_notifiers(struct task_struct *curr)
{
}
-static void
+static inline void
fire_sched_out_preempt_notifiers(struct task_struct *curr,
struct task_struct *next)
{
@@ -2205,7 +2522,6 @@ static struct rq *finish_task_switch(struct task_struct *prev)
*/
prev_state = prev->state;
vtime_task_switch(prev);
- finish_arch_switch(prev);
perf_event_task_sched_in(prev, current);
finish_lock_switch(rq, prev);
finish_arch_post_lock_switch();
@@ -2225,30 +2541,42 @@ static struct rq *finish_task_switch(struct task_struct *prev)
put_task_struct(prev);
}
- tick_nohz_task_switch(current);
+ tick_nohz_task_switch();
return rq;
}
#ifdef CONFIG_SMP
/* rq->lock is NOT held, but preemption is disabled */
-static inline void post_schedule(struct rq *rq)
+static void __balance_callback(struct rq *rq)
{
- if (rq->post_schedule) {
- unsigned long flags;
+ struct callback_head *head, *next;
+ void (*func)(struct rq *rq);
+ unsigned long flags;
- raw_spin_lock_irqsave(&rq->lock, flags);
- if (rq->curr->sched_class->post_schedule)
- rq->curr->sched_class->post_schedule(rq);
- raw_spin_unlock_irqrestore(&rq->lock, flags);
+ raw_spin_lock_irqsave(&rq->lock, flags);
+ head = rq->balance_callback;
+ rq->balance_callback = NULL;
+ while (head) {
+ func = (void (*)(struct rq *))head->func;
+ next = head->next;
+ head->next = NULL;
+ head = next;
- rq->post_schedule = 0;
+ func(rq);
}
+ raw_spin_unlock_irqrestore(&rq->lock, flags);
+}
+
+static inline void balance_callback(struct rq *rq)
+{
+ if (unlikely(rq->balance_callback))
+ __balance_callback(rq);
}
#else
-static inline void post_schedule(struct rq *rq)
+static inline void balance_callback(struct rq *rq)
{
}
@@ -2266,7 +2594,7 @@ asmlinkage __visible void schedule_tail(struct task_struct *prev)
/* finish_task_switch() drops rq->lock and enables preemtion */
preempt_disable();
rq = finish_task_switch(prev);
- post_schedule(rq);
+ balance_callback(rq);
preempt_enable();
if (current->set_child_tid)
@@ -2310,9 +2638,9 @@ context_switch(struct rq *rq, struct task_struct *prev,
* of the scheduler it's an obvious special-case), so we
* do an early lockdep release here:
*/
+ lockdep_unpin_lock(&rq->lock);
spin_release(&rq->lock.dep_map, 1, _THIS_IP_);
- context_tracking_task_switch(prev, next);
/* Here we just switch the register state and the stack. */
switch_to(prev, next, prev);
barrier();
@@ -2377,9 +2705,9 @@ unsigned long nr_iowait_cpu(int cpu)
void get_iowait_load(unsigned long *nr_waiters, unsigned long *load)
{
- struct rq *this = this_rq();
- *nr_waiters = atomic_read(&this->nr_iowait);
- *load = this->cpu_load[0];
+ struct rq *rq = this_rq();
+ *nr_waiters = atomic_read(&rq->nr_iowait);
+ *load = rq->load.weight;
}
#ifdef CONFIG_SMP
@@ -2477,6 +2805,7 @@ void scheduler_tick(void)
update_rq_clock(rq);
curr->sched_class->task_tick(rq, curr, 0);
update_cpu_load_active(rq);
+ calc_global_load_tick(rq);
raw_spin_unlock(&rq->lock);
perf_event_task_tick();
@@ -2505,7 +2834,7 @@ void scheduler_tick(void)
u64 scheduler_tick_max_deferment(void)
{
struct rq *rq = this_rq();
- unsigned long next, now = ACCESS_ONCE(jiffies);
+ unsigned long next, now = READ_ONCE(jiffies);
next = rq->last_sched_tick + HZ;
@@ -2706,9 +3035,7 @@ again:
* - return from syscall or exception to user-space
* - return from interrupt-handler to user-space
*
- * WARNING: all callers must re-check need_resched() afterward and reschedule
- * accordingly in case an event triggered the need for rescheduling (such as
- * an interrupt waking up a task) while preemption was disabled in __schedule().
+ * WARNING: must be called with preemption disabled!
*/
static void __sched __schedule(void)
{
@@ -2717,7 +3044,6 @@ static void __sched __schedule(void)
struct rq *rq;
int cpu;
- preempt_disable();
cpu = smp_processor_id();
rq = cpu_rq(cpu);
rcu_note_context_switch();
@@ -2735,6 +3061,7 @@ static void __sched __schedule(void)
*/
smp_mb__before_spinlock();
raw_spin_lock_irq(&rq->lock);
+ lockdep_pin_lock(&rq->lock);
rq->clock_skip_update <<= 1; /* promote REQ to ACT */
@@ -2777,12 +3104,12 @@ static void __sched __schedule(void)
rq = context_switch(rq, prev, next); /* unlocks the rq */
cpu = cpu_of(rq);
- } else
+ } else {
+ lockdep_unpin_lock(&rq->lock);
raw_spin_unlock_irq(&rq->lock);
+ }
- post_schedule(rq);
-
- sched_preempt_enable_no_resched();
+ balance_callback(rq);
}
static inline void sched_submit_work(struct task_struct *tsk)
@@ -2803,7 +3130,9 @@ asmlinkage __visible void __sched schedule(void)
sched_submit_work(tsk);
do {
+ preempt_disable();
__schedule();
+ sched_preempt_enable_no_resched();
} while (need_resched());
}
EXPORT_SYMBOL(schedule);
@@ -2818,7 +3147,7 @@ asmlinkage __visible void __sched schedule_user(void)
* we find a better solution.
*
* NB: There are buggy callers of this function. Ideally we
- * should warn if prev_state != IN_USER, but that will trigger
+ * should warn if prev_state != CONTEXT_USER, but that will trigger
* too frequently to make sense yet.
*/
enum ctx_state prev_state = exception_enter();
@@ -2842,15 +3171,14 @@ void __sched schedule_preempt_disabled(void)
static void __sched notrace preempt_schedule_common(void)
{
do {
- __preempt_count_add(PREEMPT_ACTIVE);
+ preempt_active_enter();
__schedule();
- __preempt_count_sub(PREEMPT_ACTIVE);
+ preempt_active_exit();
/*
* Check again in case we missed a preemption opportunity
* between schedule and now.
*/
- barrier();
} while (need_resched());
}
@@ -2874,9 +3202,8 @@ asmlinkage __visible void __sched notrace preempt_schedule(void)
NOKPROBE_SYMBOL(preempt_schedule);
EXPORT_SYMBOL(preempt_schedule);
-#ifdef CONFIG_CONTEXT_TRACKING
/**
- * preempt_schedule_context - preempt_schedule called by tracing
+ * preempt_schedule_notrace - preempt_schedule called by tracing
*
* The tracing infrastructure uses preempt_enable_notrace to prevent
* recursion and tracing preempt enabling caused by the tracing
@@ -2889,7 +3216,7 @@ EXPORT_SYMBOL(preempt_schedule);
* instead of preempt_schedule() to exit user context if needed before
* calling the scheduler.
*/
-asmlinkage __visible void __sched notrace preempt_schedule_context(void)
+asmlinkage __visible void __sched notrace preempt_schedule_notrace(void)
{
enum ctx_state prev_ctx;
@@ -2897,7 +3224,13 @@ asmlinkage __visible void __sched notrace preempt_schedule_context(void)
return;
do {
- __preempt_count_add(PREEMPT_ACTIVE);
+ /*
+ * Use raw __prempt_count() ops that don't call function.
+ * We can't call functions before disabling preemption which
+ * disarm preemption tracing recursions.
+ */
+ __preempt_count_add(PREEMPT_ACTIVE + PREEMPT_DISABLE_OFFSET);
+ barrier();
/*
* Needs preempt disabled in case user_exit() is traced
* and the tracer calls preempt_enable_notrace() causing
@@ -2907,12 +3240,11 @@ asmlinkage __visible void __sched notrace preempt_schedule_context(void)
__schedule();
exception_exit(prev_ctx);
- __preempt_count_sub(PREEMPT_ACTIVE);
barrier();
+ __preempt_count_sub(PREEMPT_ACTIVE + PREEMPT_DISABLE_OFFSET);
} while (need_resched());
}
-EXPORT_SYMBOL_GPL(preempt_schedule_context);
-#endif /* CONFIG_CONTEXT_TRACKING */
+EXPORT_SYMBOL_GPL(preempt_schedule_notrace);
#endif /* CONFIG_PREEMPT */
@@ -2932,17 +3264,11 @@ asmlinkage __visible void __sched preempt_schedule_irq(void)
prev_state = exception_enter();
do {
- __preempt_count_add(PREEMPT_ACTIVE);
+ preempt_active_enter();
local_irq_enable();
__schedule();
local_irq_disable();
- __preempt_count_sub(PREEMPT_ACTIVE);
-
- /*
- * Check again in case we missed a preemption opportunity
- * between schedule and now.
- */
- barrier();
+ preempt_active_exit();
} while (need_resched());
exception_exit(prev_state);
@@ -3020,7 +3346,6 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
if (!dl_prio(p->normal_prio) ||
(pi_task && dl_entity_preempt(&pi_task->dl, &p->dl))) {
p->dl.dl_boosted = 1;
- p->dl.dl_throttled = 0;
enqueue_flag = ENQUEUE_REPLENISH;
} else
p->dl.dl_boosted = 0;
@@ -3048,7 +3373,11 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
check_class_changed(rq, p, prev_class, oldprio);
out_unlock:
+ preempt_disable(); /* avoid rq from going away on us */
__task_rq_unlock(rq);
+
+ balance_callback(rq);
+ preempt_enable();
}
#endif
@@ -3280,15 +3609,18 @@ static void __setscheduler_params(struct task_struct *p,
/* Actually do priority change: must hold pi & rq lock. */
static void __setscheduler(struct rq *rq, struct task_struct *p,
- const struct sched_attr *attr)
+ const struct sched_attr *attr, bool keep_boost)
{
__setscheduler_params(p, attr);
/*
- * If we get here, there was no pi waiters boosting the
- * task. It is safe to use the normal prio.
+ * Keep a potential priority boosting if called from
+ * sched_setscheduler().
*/
- p->prio = normal_prio(p);
+ if (keep_boost)
+ p->prio = rt_mutex_get_effective_prio(p, normal_prio(p));
+ else
+ p->prio = normal_prio(p);
if (dl_prio(p->prio))
p->sched_class = &dl_sched_class;
@@ -3383,12 +3715,12 @@ static bool dl_param_changed(struct task_struct *p,
static int __sched_setscheduler(struct task_struct *p,
const struct sched_attr *attr,
- bool user)
+ bool user, bool pi)
{
int newprio = dl_policy(attr->sched_policy) ? MAX_DL_PRIO - 1 :
MAX_RT_PRIO - 1 - attr->sched_priority;
int retval, oldprio, oldpolicy = -1, queued, running;
- int policy = attr->sched_policy;
+ int new_effective_prio, policy = attr->sched_policy;
unsigned long flags;
const struct sched_class *prev_class;
struct rq *rq;
@@ -3569,19 +3901,20 @@ change:
p->sched_reset_on_fork = reset_on_fork;
oldprio = p->prio;
- /*
- * Special case for priority boosted tasks.
- *
- * If the new priority is lower or equal (user space view)
- * than the current (boosted) priority, we just store the new
- * normal parameters and do not touch the scheduler class and
- * the runqueue. This will be done when the task deboost
- * itself.
- */
- if (rt_mutex_check_prio(p, newprio)) {
- __setscheduler_params(p, attr);
- task_rq_unlock(rq, p, &flags);
- return 0;
+ if (pi) {
+ /*
+ * Take priority boosted tasks into account. If the new
+ * effective priority is unchanged, we just store the new
+ * normal parameters and do not touch the scheduler class and
+ * the runqueue. This will be done when the task deboost
+ * itself.
+ */
+ new_effective_prio = rt_mutex_get_effective_prio(p, newprio);
+ if (new_effective_prio == oldprio) {
+ __setscheduler_params(p, attr);
+ task_rq_unlock(rq, p, &flags);
+ return 0;
+ }
}
queued = task_on_rq_queued(p);
@@ -3592,7 +3925,7 @@ change:
put_prev_task(rq, p);
prev_class = p->sched_class;
- __setscheduler(rq, p, attr);
+ __setscheduler(rq, p, attr, pi);
if (running)
p->sched_class->set_curr_task(rq);
@@ -3605,9 +3938,17 @@ change:
}
check_class_changed(rq, p, prev_class, oldprio);
+ preempt_disable(); /* avoid rq from going away on us */
task_rq_unlock(rq, p, &flags);
- rt_mutex_adjust_pi(p);
+ if (pi)
+ rt_mutex_adjust_pi(p);
+
+ /*
+ * Run balance callbacks after we've adjusted the PI chain.
+ */
+ balance_callback(rq);
+ preempt_enable();
return 0;
}
@@ -3628,7 +3969,7 @@ static int _sched_setscheduler(struct task_struct *p, int policy,
attr.sched_policy = policy;
}
- return __sched_setscheduler(p, &attr, check);
+ return __sched_setscheduler(p, &attr, check, true);
}
/**
* sched_setscheduler - change the scheduling policy and/or RT priority of a thread.
@@ -3649,7 +3990,7 @@ EXPORT_SYMBOL_GPL(sched_setscheduler);
int sched_setattr(struct task_struct *p, const struct sched_attr *attr)
{
- return __sched_setscheduler(p, attr, true);
+ return __sched_setscheduler(p, attr, true, true);
}
EXPORT_SYMBOL_GPL(sched_setattr);
@@ -4051,7 +4392,7 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
}
#endif
again:
- retval = set_cpus_allowed_ptr(p, new_mask);
+ retval = __set_cpus_allowed_ptr(p, new_mask, true);
if (!retval) {
cpuset_cpus_allowed(p, cpus_allowed);
@@ -4203,7 +4544,7 @@ SYSCALL_DEFINE0(sched_yield)
int __sched _cond_resched(void)
{
- if (should_resched()) {
+ if (should_resched(0)) {
preempt_schedule_common();
return 1;
}
@@ -4221,7 +4562,7 @@ EXPORT_SYMBOL(_cond_resched);
*/
int __cond_resched_lock(spinlock_t *lock)
{
- int resched = should_resched();
+ int resched = should_resched(PREEMPT_LOCK_OFFSET);
int ret = 0;
lockdep_assert_held(lock);
@@ -4243,7 +4584,7 @@ int __sched __cond_resched_softirq(void)
{
BUG_ON(!in_softirq());
- if (should_resched()) {
+ if (should_resched(SOFTIRQ_DISABLE_OFFSET)) {
local_bh_enable();
preempt_schedule_common();
local_bh_disable();
@@ -4367,10 +4708,7 @@ long __sched io_schedule_timeout(long timeout)
long ret;
current->in_iowait = 1;
- if (old_iowait)
- blk_schedule_flush_plug(current);
- else
- blk_flush_plug(current);
+ blk_schedule_flush_plug(current);
delayacct_blkio_start();
rq = raw_rq();
@@ -4579,7 +4917,8 @@ void init_idle(struct task_struct *idle, int cpu)
struct rq *rq = cpu_rq(cpu);
unsigned long flags;
- raw_spin_lock_irqsave(&rq->lock, flags);
+ raw_spin_lock_irqsave(&idle->pi_lock, flags);
+ raw_spin_lock(&rq->lock);
__sched_fork(0, idle);
idle->state = TASK_RUNNING;
@@ -4605,7 +4944,8 @@ void init_idle(struct task_struct *idle, int cpu)
#if defined(CONFIG_SMP)
idle->on_cpu = 1;
#endif
- raw_spin_unlock_irqrestore(&rq->lock, flags);
+ raw_spin_unlock(&rq->lock);
+ raw_spin_unlock_irqrestore(&idle->pi_lock, flags);
/* Set the preempt count _outside_ the spinlocks! */
init_idle_preempt_count(idle, cpu);
@@ -4700,149 +5040,6 @@ out:
}
#ifdef CONFIG_SMP
-/*
- * move_queued_task - move a queued task to new rq.
- *
- * Returns (locked) new rq. Old rq's lock is released.
- */
-static struct rq *move_queued_task(struct task_struct *p, int new_cpu)
-{
- struct rq *rq = task_rq(p);
-
- lockdep_assert_held(&rq->lock);
-
- dequeue_task(rq, p, 0);
- p->on_rq = TASK_ON_RQ_MIGRATING;
- set_task_cpu(p, new_cpu);
- raw_spin_unlock(&rq->lock);
-
- rq = cpu_rq(new_cpu);
-
- raw_spin_lock(&rq->lock);
- BUG_ON(task_cpu(p) != new_cpu);
- p->on_rq = TASK_ON_RQ_QUEUED;
- enqueue_task(rq, p, 0);
- check_preempt_curr(rq, p, 0);
-
- return rq;
-}
-
-void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
-{
- if (p->sched_class->set_cpus_allowed)
- p->sched_class->set_cpus_allowed(p, new_mask);
-
- cpumask_copy(&p->cpus_allowed, new_mask);
- p->nr_cpus_allowed = cpumask_weight(new_mask);
-}
-
-/*
- * This is how migration works:
- *
- * 1) we invoke migration_cpu_stop() on the target CPU using
- * stop_one_cpu().
- * 2) stopper starts to run (implicitly forcing the migrated thread
- * off the CPU)
- * 3) it checks whether the migrated task is still in the wrong runqueue.
- * 4) if it's in the wrong runqueue then the migration thread removes
- * it and puts it into the right queue.
- * 5) stopper completes and stop_one_cpu() returns and the migration
- * is done.
- */
-
-/*
- * Change a given task's CPU affinity. Migrate the thread to a
- * proper CPU and schedule it away if the CPU it's executing on
- * is removed from the allowed bitmask.
- *
- * NOTE: the caller must have a valid reference to the task, the
- * task must not exit() & deallocate itself prematurely. The
- * call is not atomic; no spinlocks may be held.
- */
-int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
-{
- unsigned long flags;
- struct rq *rq;
- unsigned int dest_cpu;
- int ret = 0;
-
- rq = task_rq_lock(p, &flags);
-
- if (cpumask_equal(&p->cpus_allowed, new_mask))
- goto out;
-
- if (!cpumask_intersects(new_mask, cpu_active_mask)) {
- ret = -EINVAL;
- goto out;
- }
-
- do_set_cpus_allowed(p, new_mask);
-
- /* Can the task run on the task's current CPU? If so, we're done */
- if (cpumask_test_cpu(task_cpu(p), new_mask))
- goto out;
-
- dest_cpu = cpumask_any_and(cpu_active_mask, new_mask);
- if (task_running(rq, p) || p->state == TASK_WAKING) {
- struct migration_arg arg = { p, dest_cpu };
- /* Need help from migration thread: drop lock and wait. */
- task_rq_unlock(rq, p, &flags);
- stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg);
- tlb_migrate_finish(p->mm);
- return 0;
- } else if (task_on_rq_queued(p))
- rq = move_queued_task(p, dest_cpu);
-out:
- task_rq_unlock(rq, p, &flags);
-
- return ret;
-}
-EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr);
-
-/*
- * Move (not current) task off this cpu, onto dest cpu. We're doing
- * this because either it can't run here any more (set_cpus_allowed()
- * away from this CPU, or CPU going down), or because we're
- * attempting to rebalance this task on exec (sched_exec).
- *
- * So we race with normal scheduler movements, but that's OK, as long
- * as the task is no longer on this CPU.
- *
- * Returns non-zero if task was successfully migrated.
- */
-static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
-{
- struct rq *rq;
- int ret = 0;
-
- if (unlikely(!cpu_active(dest_cpu)))
- return ret;
-
- rq = cpu_rq(src_cpu);
-
- raw_spin_lock(&p->pi_lock);
- raw_spin_lock(&rq->lock);
- /* Already moved. */
- if (task_cpu(p) != src_cpu)
- goto done;
-
- /* Affinity changed (again). */
- if (!cpumask_test_cpu(dest_cpu, tsk_cpus_allowed(p)))
- goto fail;
-
- /*
- * If we're not on a rq, the next wake-up will ensure we're
- * placed properly.
- */
- if (task_on_rq_queued(p))
- rq = move_queued_task(p, dest_cpu);
-done:
- ret = 1;
-fail:
- raw_spin_unlock(&rq->lock);
- raw_spin_unlock(&p->pi_lock);
- return ret;
-}
#ifdef CONFIG_NUMA_BALANCING
/* Migrate current task p to target_cpu */
@@ -4890,35 +5087,9 @@ void sched_setnuma(struct task_struct *p, int nid)
enqueue_task(rq, p, 0);
task_rq_unlock(rq, p, &flags);
}
-#endif
-
-/*
- * migration_cpu_stop - this will be executed by a highprio stopper thread
- * and performs thread migration by bumping thread off CPU then
- * 'pushing' onto another runqueue.
- */
-static int migration_cpu_stop(void *data)
-{
- struct migration_arg *arg = data;
-
- /*
- * The original target cpu might have gone down and we might
- * be on another cpu but it doesn't matter.
- */
- local_irq_disable();
- /*
- * We need to explicitly wake pending tasks before running
- * __migrate_task() such that we will not miss enforcing cpus_allowed
- * during wakeups, see set_cpus_allowed_ptr()'s TASK_WAKING test.
- */
- sched_ttwu_pending();
- __migrate_task(arg->task, raw_smp_processor_id(), arg->dest_cpu);
- local_irq_enable();
- return 0;
-}
+#endif /* CONFIG_NUMA_BALANCING */
#ifdef CONFIG_HOTPLUG_CPU
-
/*
* Ensures that the idle task is using init_mm right before its cpu goes
* offline.
@@ -4974,9 +5145,9 @@ static struct task_struct fake_task = {
* there's no concurrency possible, we hold the required locks anyway
* because of lock validation efforts.
*/
-static void migrate_tasks(unsigned int dead_cpu)
+static void migrate_tasks(struct rq *dead_rq)
{
- struct rq *rq = cpu_rq(dead_cpu);
+ struct rq *rq = dead_rq;
struct task_struct *next, *stop = rq->stop;
int dest_cpu;
@@ -4998,7 +5169,7 @@ static void migrate_tasks(unsigned int dead_cpu)
*/
update_rq_clock(rq);
- for ( ; ; ) {
+ for (;;) {
/*
* There's this thread running, bail when that's the only
* remaining thread.
@@ -5006,22 +5177,29 @@ static void migrate_tasks(unsigned int dead_cpu)
if (rq->nr_running == 1)
break;
+ /*
+ * Ensure rq->lock covers the entire task selection
+ * until the migration.
+ */
+ lockdep_pin_lock(&rq->lock);
next = pick_next_task(rq, &fake_task);
BUG_ON(!next);
next->sched_class->put_prev_task(rq, next);
/* Find suitable destination for @next, with force if needed. */
- dest_cpu = select_fallback_rq(dead_cpu, next);
- raw_spin_unlock(&rq->lock);
-
- __migrate_task(next, dead_cpu, dest_cpu);
-
- raw_spin_lock(&rq->lock);
+ dest_cpu = select_fallback_rq(dead_rq->cpu, next);
+
+ lockdep_unpin_lock(&rq->lock);
+ rq = __migrate_task(rq, next, dest_cpu);
+ if (rq != dead_rq) {
+ raw_spin_unlock(&rq->lock);
+ rq = dead_rq;
+ raw_spin_lock(&rq->lock);
+ }
}
rq->stop = stop;
}
-
#endif /* CONFIG_HOTPLUG_CPU */
#if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL)
@@ -5187,8 +5365,7 @@ static void register_sched_domain_sysctl(void)
/* may be called multiple times per register */
static void unregister_sched_domain_sysctl(void)
{
- if (sd_sysctl_header)
- unregister_sysctl_table(sd_sysctl_header);
+ unregister_sysctl_table(sd_sysctl_header);
sd_sysctl_header = NULL;
if (sd_ctl_dir[0].child)
sd_free_ctl_entry(&sd_ctl_dir[0].child);
@@ -5200,7 +5377,7 @@ static void register_sched_domain_sysctl(void)
static void unregister_sched_domain_sysctl(void)
{
}
-#endif
+#endif /* CONFIG_SCHED_DEBUG && CONFIG_SYSCTL */
static void set_rq_online(struct rq *rq)
{
@@ -5269,7 +5446,7 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
set_rq_offline(rq);
}
- migrate_tasks(cpu);
+ migrate_tasks(rq);
BUG_ON(rq->nr_running != 1); /* the migration thread */
raw_spin_unlock_irqrestore(&rq->lock, flags);
break;
@@ -5295,7 +5472,7 @@ static struct notifier_block migration_notifier = {
.priority = CPU_PRI_MIGRATION,
};
-static void __cpuinit set_cpu_rq_start_time(void)
+static void set_cpu_rq_start_time(void)
{
int cpu = smp_processor_id();
struct rq *rq = cpu_rq(cpu);
@@ -5309,6 +5486,14 @@ static int sched_cpu_active(struct notifier_block *nfb,
case CPU_STARTING:
set_cpu_rq_start_time();
return NOTIFY_OK;
+ case CPU_ONLINE:
+ /*
+ * At this point a starting CPU has marked itself as online via
+ * set_cpu_online(). But it might not yet have marked itself
+ * as active, which is essential from here on.
+ *
+ * Thus, fall-through and help the starting CPU along.
+ */
case CPU_DOWN_FAILED:
set_cpu_active((long)hcpu, true);
return NOTIFY_OK;
@@ -5320,36 +5505,13 @@ static int sched_cpu_active(struct notifier_block *nfb,
static int sched_cpu_inactive(struct notifier_block *nfb,
unsigned long action, void *hcpu)
{
- unsigned long flags;
- long cpu = (long)hcpu;
- struct dl_bw *dl_b;
-
switch (action & ~CPU_TASKS_FROZEN) {
case CPU_DOWN_PREPARE:
- set_cpu_active(cpu, false);
-
- /* explicitly allow suspend */
- if (!(action & CPU_TASKS_FROZEN)) {
- bool overflow;
- int cpus;
-
- rcu_read_lock_sched();
- dl_b = dl_bw_of(cpu);
-
- raw_spin_lock_irqsave(&dl_b->lock, flags);
- cpus = dl_bw_cpus(cpu);
- overflow = __dl_overflow(dl_b, cpus, 0, 0);
- raw_spin_unlock_irqrestore(&dl_b->lock, flags);
-
- rcu_read_unlock_sched();
-
- if (overflow)
- return notifier_from_errno(-EBUSY);
- }
+ set_cpu_active((long)hcpu, false);
return NOTIFY_OK;
+ default:
+ return NOTIFY_DONE;
}
-
- return NOTIFY_DONE;
}
static int __init migration_init(void)
@@ -5370,9 +5532,6 @@ static int __init migration_init(void)
return 0;
}
early_initcall(migration_init);
-#endif
-
-#ifdef CONFIG_SMP
static cpumask_var_t sched_domains_tmpmask; /* sched_domains_mutex */
@@ -5430,17 +5589,6 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
break;
}
- /*
- * Even though we initialize ->capacity to something semi-sane,
- * we leave capacity_orig unset. This allows us to detect if
- * domain iteration is still funny without causing /0 traps.
- */
- if (!group->sgc->capacity_orig) {
- printk(KERN_CONT "\n");
- printk(KERN_ERR "ERROR: domain->cpu_capacity not set\n");
- break;
- }
-
if (!cpumask_weight(sched_group_cpus(group))) {
printk(KERN_CONT "\n");
printk(KERN_ERR "ERROR: empty group\n");
@@ -5813,9 +5961,6 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
update_top_cache_domain(cpu);
}
-/* cpus with isolated domains */
-static cpumask_var_t cpu_isolated_map;
-
/* Setup the mask of cpus configured for isolated domains */
static int __init isolated_cpu_setup(char *str)
{
@@ -5924,7 +6069,6 @@ build_overlap_sched_groups(struct sched_domain *sd, int cpu)
* die on a /0 trap.
*/
sg->sgc->capacity = SCHED_CAPACITY_SCALE * cpumask_weight(sg_span);
- sg->sgc->capacity_orig = sg->sgc->capacity;
/*
* Make sure the first group of this domain contains the
@@ -6235,6 +6379,7 @@ sd_init(struct sched_domain_topology_level *tl, int cpu)
*/
if (sd->flags & SD_SHARE_CPUCAPACITY) {
+ sd->flags |= SD_PREFER_SIBLING;
sd->imbalance_pct = 110;
sd->smt_gain = 1178; /* ~15% */
@@ -6361,8 +6506,10 @@ static void init_numa_topology_type(void)
n = sched_max_numa_distance;
- if (n <= 1)
+ if (sched_domains_numa_levels <= 1) {
sched_numa_topology_type = NUMA_DIRECT;
+ return;
+ }
for_each_online_node(a) {
for_each_online_node(b) {
@@ -6612,7 +6759,7 @@ static int __sdt_alloc(const struct cpumask *cpu_map)
struct sched_group *sg;
struct sched_group_capacity *sgc;
- sd = kzalloc_node(sizeof(struct sched_domain) + cpumask_size(),
+ sd = kzalloc_node(sizeof(struct sched_domain) + cpumask_size(),
GFP_KERNEL, cpu_to_node(j));
if (!sd)
return -ENOMEM;
@@ -7000,7 +7147,6 @@ static int cpuset_cpu_active(struct notifier_block *nfb, unsigned long action,
*/
case CPU_ONLINE:
- case CPU_DOWN_FAILED:
cpuset_update_active_cpus(true);
break;
default:
@@ -7012,8 +7158,26 @@ static int cpuset_cpu_active(struct notifier_block *nfb, unsigned long action,
static int cpuset_cpu_inactive(struct notifier_block *nfb, unsigned long action,
void *hcpu)
{
+ unsigned long flags;
+ long cpu = (long)hcpu;
+ struct dl_bw *dl_b;
+ bool overflow;
+ int cpus;
+
switch (action) {
case CPU_DOWN_PREPARE:
+ rcu_read_lock_sched();
+ dl_b = dl_bw_of(cpu);
+
+ raw_spin_lock_irqsave(&dl_b->lock, flags);
+ cpus = dl_bw_cpus(cpu);
+ overflow = __dl_overflow(dl_b, cpus, 0, 0);
+ raw_spin_unlock_irqrestore(&dl_b->lock, flags);
+
+ rcu_read_unlock_sched();
+
+ if (overflow)
+ return notifier_from_errno(-EBUSY);
cpuset_update_active_cpus(false);
break;
case CPU_DOWN_PREPARE_FROZEN:
@@ -7033,6 +7197,9 @@ void __init sched_init_smp(void)
alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL);
alloc_cpumask_var(&fallback_doms, GFP_KERNEL);
+ /* nohz_full won't take effect without isolating the cpus. */
+ tick_nohz_full_add_cpus_to(cpu_isolated_map);
+
sched_init_numa();
/*
@@ -7069,8 +7236,6 @@ void __init sched_init_smp(void)
}
#endif /* CONFIG_SMP */
-const_debug unsigned int sysctl_timer_migration = 1;
-
int in_sched_functions(unsigned long addr)
{
return in_lock_functions(addr) ||
@@ -7158,8 +7323,8 @@ void __init sched_init(void)
rq->calc_load_active = 0;
rq->calc_load_update = jiffies + LOAD_FREQ;
init_cfs_rq(&rq->cfs);
- init_rt_rq(&rq->rt, rq);
- init_dl_rq(&rq->dl, rq);
+ init_rt_rq(&rq->rt);
+ init_dl_rq(&rq->dl);
#ifdef CONFIG_FAIR_GROUP_SCHED
root_task_group.shares = ROOT_TASK_GROUP_LOAD;
INIT_LIST_HEAD(&rq->leaf_cfs_rq_list);
@@ -7199,8 +7364,8 @@ void __init sched_init(void)
#ifdef CONFIG_SMP
rq->sd = NULL;
rq->rd = NULL;
- rq->cpu_capacity = SCHED_CAPACITY_SCALE;
- rq->post_schedule = 0;
+ rq->cpu_capacity = rq->cpu_capacity_orig = SCHED_CAPACITY_SCALE;
+ rq->balance_callback = NULL;
rq->active_balance = 0;
rq->next_balance = jiffies;
rq->push_cpu = 0;
@@ -7330,32 +7495,12 @@ EXPORT_SYMBOL(___might_sleep);
#endif
#ifdef CONFIG_MAGIC_SYSRQ
-static void normalize_task(struct rq *rq, struct task_struct *p)
+void normalize_rt_tasks(void)
{
- const struct sched_class *prev_class = p->sched_class;
+ struct task_struct *g, *p;
struct sched_attr attr = {
.sched_policy = SCHED_NORMAL,
};
- int old_prio = p->prio;
- int queued;
-
- queued = task_on_rq_queued(p);
- if (queued)
- dequeue_task(rq, p, 0);
- __setscheduler(rq, p, &attr);
- if (queued) {
- enqueue_task(rq, p, 0);
- resched_curr(rq);
- }
-
- check_class_changed(rq, p, prev_class, old_prio);
-}
-
-void normalize_rt_tasks(void)
-{
- struct task_struct *g, *p;
- unsigned long flags;
- struct rq *rq;
read_lock(&tasklist_lock);
for_each_process_thread(g, p) {
@@ -7382,9 +7527,7 @@ void normalize_rt_tasks(void)
continue;
}
- rq = task_rq_lock(p, &flags);
- normalize_task(rq, p);
- task_rq_unlock(rq, p, &flags);
+ __sched_setscheduler(p, &attr, false, false);
}
read_unlock(&tasklist_lock);
}
@@ -7735,11 +7878,11 @@ static long sched_group_rt_runtime(struct task_group *tg)
return rt_runtime_us;
}
-static int sched_group_set_rt_period(struct task_group *tg, long rt_period_us)
+static int sched_group_set_rt_period(struct task_group *tg, u64 rt_period_us)
{
u64 rt_runtime, rt_period;
- rt_period = (u64)rt_period_us * NSEC_PER_USEC;
+ rt_period = rt_period_us * NSEC_PER_USEC;
rt_runtime = tg->rt_bandwidth.rt_runtime;
return tg_set_rt_bandwidth(tg, rt_period, rt_runtime);
@@ -7798,7 +7941,7 @@ static int sched_rt_global_constraints(void)
}
#endif /* CONFIG_RT_GROUP_SCHED */
-static int sched_dl_global_constraints(void)
+static int sched_dl_global_validate(void)
{
u64 runtime = global_rt_runtime();
u64 period = global_rt_period();
@@ -7899,11 +8042,11 @@ int sched_rt_handler(struct ctl_table *table, int write,
if (ret)
goto undo;
- ret = sched_rt_global_constraints();
+ ret = sched_dl_global_validate();
if (ret)
goto undo;
- ret = sched_dl_global_constraints();
+ ret = sched_rt_global_constraints();
if (ret)
goto undo;
@@ -7988,7 +8131,7 @@ static void cpu_cgroup_css_offline(struct cgroup_subsys_state *css)
sched_offline_group(tg);
}
-static void cpu_cgroup_fork(struct task_struct *task)
+static void cpu_cgroup_fork(struct task_struct *task, void *private)
{
sched_move_task(task);
}
@@ -8106,10 +8249,8 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota)
__refill_cfs_bandwidth_runtime(cfs_b);
/* restart the period timer (if active) to handle new period expiry */
- if (runtime_enabled && cfs_b->timer_active) {
- /* force a reprogram */
- __start_cfs_bandwidth(cfs_b, true);
- }
+ if (runtime_enabled)
+ start_cfs_bandwidth(cfs_b);
raw_spin_unlock_irq(&cfs_b->lock);
for_each_online_cpu(i) {
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index 8394b1ee600c..8cbc3db671df 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -555,48 +555,43 @@ drop_precision:
}
/*
- * Atomically advance counter to the new value. Interrupts, vcpu
- * scheduling, and scaling inaccuracies can cause cputime_advance
- * to be occasionally called with a new value smaller than counter.
- * Let's enforce atomicity.
+ * Adjust tick based cputime random precision against scheduler runtime
+ * accounting.
*
- * Normally a caller will only go through this loop once, or not
- * at all in case a previous caller updated counter the same jiffy.
- */
-static void cputime_advance(cputime_t *counter, cputime_t new)
-{
- cputime_t old;
-
- while (new > (old = ACCESS_ONCE(*counter)))
- cmpxchg_cputime(counter, old, new);
-}
-
-/*
- * Adjust tick based cputime random precision against scheduler
- * runtime accounting.
+ * Tick based cputime accounting depend on random scheduling timeslices of a
+ * task to be interrupted or not by the timer. Depending on these
+ * circumstances, the number of these interrupts may be over or
+ * under-optimistic, matching the real user and system cputime with a variable
+ * precision.
+ *
+ * Fix this by scaling these tick based values against the total runtime
+ * accounted by the CFS scheduler.
+ *
+ * This code provides the following guarantees:
+ *
+ * stime + utime == rtime
+ * stime_i+1 >= stime_i, utime_i+1 >= utime_i
+ *
+ * Assuming that rtime_i+1 >= rtime_i.
*/
static void cputime_adjust(struct task_cputime *curr,
- struct cputime *prev,
+ struct prev_cputime *prev,
cputime_t *ut, cputime_t *st)
{
cputime_t rtime, stime, utime;
+ unsigned long flags;
- /*
- * Tick based cputime accounting depend on random scheduling
- * timeslices of a task to be interrupted or not by the timer.
- * Depending on these circumstances, the number of these interrupts
- * may be over or under-optimistic, matching the real user and system
- * cputime with a variable precision.
- *
- * Fix this by scaling these tick based values against the total
- * runtime accounted by the CFS scheduler.
- */
+ /* Serialize concurrent callers such that we can honour our guarantees */
+ raw_spin_lock_irqsave(&prev->lock, flags);
rtime = nsecs_to_cputime(curr->sum_exec_runtime);
/*
- * Update userspace visible utime/stime values only if actual execution
- * time is bigger than already exported. Note that can happen, that we
- * provided bigger values due to scaling inaccuracy on big numbers.
+ * This is possible under two circumstances:
+ * - rtime isn't monotonic after all (a bug);
+ * - we got reordered by the lock.
+ *
+ * In both cases this acts as a filter such that the rest of the code
+ * can assume it is monotonic regardless of anything else.
*/
if (prev->stime + prev->utime >= rtime)
goto out;
@@ -606,22 +601,46 @@ static void cputime_adjust(struct task_cputime *curr,
if (utime == 0) {
stime = rtime;
- } else if (stime == 0) {
- utime = rtime;
- } else {
- cputime_t total = stime + utime;
+ goto update;
+ }
- stime = scale_stime((__force u64)stime,
- (__force u64)rtime, (__force u64)total);
- utime = rtime - stime;
+ if (stime == 0) {
+ utime = rtime;
+ goto update;
}
- cputime_advance(&prev->stime, stime);
- cputime_advance(&prev->utime, utime);
+ stime = scale_stime((__force u64)stime, (__force u64)rtime,
+ (__force u64)(stime + utime));
+
+ /*
+ * Make sure stime doesn't go backwards; this preserves monotonicity
+ * for utime because rtime is monotonic.
+ *
+ * utime_i+1 = rtime_i+1 - stime_i
+ * = rtime_i+1 - (rtime_i - utime_i)
+ * = (rtime_i+1 - rtime_i) + utime_i
+ * >= utime_i
+ */
+ if (stime < prev->stime)
+ stime = prev->stime;
+ utime = rtime - stime;
+
+ /*
+ * Make sure utime doesn't go backwards; this still preserves
+ * monotonicity for stime, analogous argument to above.
+ */
+ if (utime < prev->utime) {
+ utime = prev->utime;
+ stime = rtime - utime;
+ }
+update:
+ prev->stime = stime;
+ prev->utime = utime;
out:
*ut = prev->utime;
*st = prev->stime;
+ raw_spin_unlock_irqrestore(&prev->lock, flags);
}
void task_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st)
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index 3fa8fa6d9403..fc8f01083527 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -69,7 +69,7 @@ void init_dl_bw(struct dl_bw *dl_b)
dl_b->total_bw = 0;
}
-void init_dl_rq(struct dl_rq *dl_rq, struct rq *rq)
+void init_dl_rq(struct dl_rq *dl_rq)
{
dl_rq->rb_root = RB_ROOT;
@@ -213,9 +213,74 @@ static inline bool need_pull_dl_task(struct rq *rq, struct task_struct *prev)
return dl_task(prev);
}
-static inline void set_post_schedule(struct rq *rq)
+static DEFINE_PER_CPU(struct callback_head, dl_push_head);
+static DEFINE_PER_CPU(struct callback_head, dl_pull_head);
+
+static void push_dl_tasks(struct rq *);
+static void pull_dl_task(struct rq *);
+
+static inline void queue_push_tasks(struct rq *rq)
+{
+ if (!has_pushable_dl_tasks(rq))
+ return;
+
+ queue_balance_callback(rq, &per_cpu(dl_push_head, rq->cpu), push_dl_tasks);
+}
+
+static inline void queue_pull_task(struct rq *rq)
{
- rq->post_schedule = has_pushable_dl_tasks(rq);
+ queue_balance_callback(rq, &per_cpu(dl_pull_head, rq->cpu), pull_dl_task);
+}
+
+static struct rq *find_lock_later_rq(struct task_struct *task, struct rq *rq);
+
+static struct rq *dl_task_offline_migration(struct rq *rq, struct task_struct *p)
+{
+ struct rq *later_rq = NULL;
+ bool fallback = false;
+
+ later_rq = find_lock_later_rq(p, rq);
+
+ if (!later_rq) {
+ int cpu;
+
+ /*
+ * If we cannot preempt any rq, fall back to pick any
+ * online cpu.
+ */
+ fallback = true;
+ cpu = cpumask_any_and(cpu_active_mask, tsk_cpus_allowed(p));
+ if (cpu >= nr_cpu_ids) {
+ /*
+ * Fail to find any suitable cpu.
+ * The task will never come back!
+ */
+ BUG_ON(dl_bandwidth_enabled());
+
+ /*
+ * If admission control is disabled we
+ * try a little harder to let the task
+ * run.
+ */
+ cpu = cpumask_any(cpu_active_mask);
+ }
+ later_rq = cpu_rq(cpu);
+ double_lock_balance(rq, later_rq);
+ }
+
+ /*
+ * By now the task is replenished and enqueued; migrate it.
+ */
+ deactivate_task(rq, p, 0);
+ set_task_cpu(p, later_rq->cpu);
+ activate_task(later_rq, p, 0);
+
+ if (!fallback)
+ resched_curr(later_rq);
+
+ double_unlock_balance(later_rq, rq);
+
+ return later_rq;
}
#else
@@ -245,12 +310,15 @@ static inline bool need_pull_dl_task(struct rq *rq, struct task_struct *prev)
return false;
}
-static inline int pull_dl_task(struct rq *rq)
+static inline void pull_dl_task(struct rq *rq)
{
- return 0;
}
-static inline void set_post_schedule(struct rq *rq)
+static inline void queue_push_tasks(struct rq *rq)
+{
+}
+
+static inline void queue_pull_task(struct rq *rq)
{
}
#endif /* CONFIG_SMP */
@@ -452,24 +520,23 @@ static void update_dl_entity(struct sched_dl_entity *dl_se,
* actually started or not (i.e., the replenishment instant is in
* the future or in the past).
*/
-static int start_dl_timer(struct sched_dl_entity *dl_se, bool boosted)
+static int start_dl_timer(struct task_struct *p)
{
- struct dl_rq *dl_rq = dl_rq_of_se(dl_se);
- struct rq *rq = rq_of_dl_rq(dl_rq);
+ struct sched_dl_entity *dl_se = &p->dl;
+ struct hrtimer *timer = &dl_se->dl_timer;
+ struct rq *rq = task_rq(p);
ktime_t now, act;
- ktime_t soft, hard;
- unsigned long range;
s64 delta;
- if (boosted)
- return 0;
+ lockdep_assert_held(&rq->lock);
+
/*
* We want the timer to fire at the deadline, but considering
* that it is actually coming from rq->clock and not from
* hrtimer's time base reading.
*/
act = ns_to_ktime(dl_se->deadline);
- now = hrtimer_cb_get_time(&dl_se->dl_timer);
+ now = hrtimer_cb_get_time(timer);
delta = ktime_to_ns(now) - rq_clock(rq);
act = ktime_add_ns(act, delta);
@@ -481,15 +548,21 @@ static int start_dl_timer(struct sched_dl_entity *dl_se, bool boosted)
if (ktime_us_delta(act, now) < 0)
return 0;
- hrtimer_set_expires(&dl_se->dl_timer, act);
-
- soft = hrtimer_get_softexpires(&dl_se->dl_timer);
- hard = hrtimer_get_expires(&dl_se->dl_timer);
- range = ktime_to_ns(ktime_sub(hard, soft));
- __hrtimer_start_range_ns(&dl_se->dl_timer, soft,
- range, HRTIMER_MODE_ABS, 0);
+ /*
+ * !enqueued will guarantee another callback; even if one is already in
+ * progress. This ensures a balanced {get,put}_task_struct().
+ *
+ * The race against __run_timer() clearing the enqueued state is
+ * harmless because we're holding task_rq()->lock, therefore the timer
+ * expiring after we've done the check will wait on its task_rq_lock()
+ * and observe our state.
+ */
+ if (!hrtimer_is_queued(timer)) {
+ get_task_struct(p);
+ hrtimer_start(timer, act, HRTIMER_MODE_ABS);
+ }
- return hrtimer_active(&dl_se->dl_timer);
+ return 1;
}
/*
@@ -514,23 +587,39 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer)
unsigned long flags;
struct rq *rq;
- rq = task_rq_lock(current, &flags);
+ rq = task_rq_lock(p, &flags);
/*
- * We need to take care of several possible races here:
- *
- * - the task might have changed its scheduling policy
- * to something different than SCHED_DEADLINE
- * - the task might have changed its reservation parameters
- * (through sched_setattr())
- * - the task might have been boosted by someone else and
- * might be in the boosting/deboosting path
+ * The task might have changed its scheduling policy to something
+ * different than SCHED_DEADLINE (through switched_fromd_dl()).
+ */
+ if (!dl_task(p)) {
+ __dl_clear_params(p);
+ goto unlock;
+ }
+
+ /*
+ * This is possible if switched_from_dl() raced against a running
+ * callback that took the above !dl_task() path and we've since then
+ * switched back into SCHED_DEADLINE.
*
- * In all this cases we bail out, as the task is already
- * in the runqueue or is going to be enqueued back anyway.
+ * There's nothing to do except drop our task reference.
+ */
+ if (dl_se->dl_new)
+ goto unlock;
+
+ /*
+ * The task might have been boosted by someone else and might be in the
+ * boosting/deboosting path, its not throttled.
+ */
+ if (dl_se->dl_boosted)
+ goto unlock;
+
+ /*
+ * Spurious timer due to start_dl_timer() race; or we already received
+ * a replenishment from rt_mutex_setprio().
*/
- if (!dl_task(p) || dl_se->dl_new ||
- dl_se->dl_boosted || !dl_se->dl_throttled)
+ if (!dl_se->dl_throttled)
goto unlock;
sched_clock_tick();
@@ -560,16 +649,37 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer)
check_preempt_curr_dl(rq, p, 0);
else
resched_curr(rq);
+
#ifdef CONFIG_SMP
/*
- * Queueing this task back might have overloaded rq,
- * check if we need to kick someone away.
+ * Perform balancing operations here; after the replenishments. We
+ * cannot drop rq->lock before this, otherwise the assertion in
+ * start_dl_timer() about not missing updates is not true.
+ *
+ * If we find that the rq the task was on is no longer available, we
+ * need to select a new rq.
+ *
+ * XXX figure out if select_task_rq_dl() deals with offline cpus.
+ */
+ if (unlikely(!rq->online))
+ rq = dl_task_offline_migration(rq, p);
+
+ /*
+ * Queueing this task back might have overloaded rq, check if we need
+ * to kick someone away.
*/
if (has_pushable_dl_tasks(rq))
push_dl_task(rq);
#endif
+
unlock:
- task_rq_unlock(rq, current, &flags);
+ task_rq_unlock(rq, p, &flags);
+
+ /*
+ * This can free the task_struct, including this hrtimer, do not touch
+ * anything related to that after this.
+ */
+ put_task_struct(p);
return HRTIMER_NORESTART;
}
@@ -583,7 +693,7 @@ void init_dl_task_timer(struct sched_dl_entity *dl_se)
}
static
-int dl_runtime_exceeded(struct rq *rq, struct sched_dl_entity *dl_se)
+int dl_runtime_exceeded(struct sched_dl_entity *dl_se)
{
return (dl_se->runtime <= 0);
}
@@ -627,10 +737,10 @@ static void update_curr_dl(struct rq *rq)
sched_rt_avg_update(rq, delta_exec);
dl_se->runtime -= dl_se->dl_yielded ? 0 : delta_exec;
- if (dl_runtime_exceeded(rq, dl_se)) {
+ if (dl_runtime_exceeded(dl_se)) {
dl_se->dl_throttled = 1;
__dequeue_task_dl(rq, curr, 0);
- if (unlikely(!start_dl_timer(dl_se, curr->dl.dl_boosted)))
+ if (unlikely(dl_se->dl_boosted || !start_dl_timer(curr)))
enqueue_task_dl(rq, curr, ENQUEUE_REPLENISH);
if (!is_leftmost(curr, &rq->dl))
@@ -843,7 +953,7 @@ static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags)
/*
* Use the scheduling parameters of the top pi-waiter
- * task if we have one and its (relative) deadline is
+ * task if we have one and its (absolute) deadline is
* smaller than our one... OTW we keep our runtime and
* deadline.
*/
@@ -914,6 +1024,12 @@ static void yield_task_dl(struct rq *rq)
}
update_rq_clock(rq);
update_curr_dl(rq);
+ /*
+ * Tell update_rq_clock() that we've just updated,
+ * so we don't do microscopic update in schedule()
+ * and double the fastpath cost.
+ */
+ rq_clock_skip_update(rq, true);
}
#ifdef CONFIG_SMP
@@ -932,7 +1048,7 @@ select_task_rq_dl(struct task_struct *p, int cpu, int sd_flag, int flags)
rq = cpu_rq(cpu);
rcu_read_lock();
- curr = ACCESS_ONCE(rq->curr); /* unlocked access */
+ curr = READ_ONCE(rq->curr); /* unlocked access */
/*
* If we are dealing with a -deadline task, we must
@@ -949,7 +1065,9 @@ select_task_rq_dl(struct task_struct *p, int cpu, int sd_flag, int flags)
(p->nr_cpus_allowed > 1)) {
int target = find_later_rq(p);
- if (target != -1)
+ if (target != -1 &&
+ dl_time_before(p->dl.deadline,
+ cpu_rq(target)->dl.earliest_dl.curr))
cpu = target;
}
rcu_read_unlock();
@@ -979,8 +1097,6 @@ static void check_preempt_equal_dl(struct rq *rq, struct task_struct *p)
resched_curr(rq);
}
-static int pull_dl_task(struct rq *this_rq);
-
#endif /* CONFIG_SMP */
/*
@@ -1037,7 +1153,15 @@ struct task_struct *pick_next_task_dl(struct rq *rq, struct task_struct *prev)
dl_rq = &rq->dl;
if (need_pull_dl_task(rq, prev)) {
+ /*
+ * This is OK, because current is on_cpu, which avoids it being
+ * picked for load-balance and preemption/IRQs are still
+ * disabled avoiding further scheduler activity on it and we're
+ * being very careful to re-start the picking loop.
+ */
+ lockdep_unpin_lock(&rq->lock);
pull_dl_task(rq);
+ lockdep_pin_lock(&rq->lock);
/*
* pull_rt_task() can drop (and re-acquire) rq->lock; this
* means a stop task can slip in, in which case we need to
@@ -1071,7 +1195,7 @@ struct task_struct *pick_next_task_dl(struct rq *rq, struct task_struct *prev)
if (hrtick_enabled(rq))
start_hrtick_dl(rq, p);
- set_post_schedule(rq);
+ queue_push_tasks(rq);
return p;
}
@@ -1108,7 +1232,6 @@ static void task_fork_dl(struct task_struct *p)
static void task_dead_dl(struct task_struct *p)
{
- struct hrtimer *timer = &p->dl.dl_timer;
struct dl_bw *dl_b = dl_bw_of(task_cpu(p));
/*
@@ -1118,8 +1241,6 @@ static void task_dead_dl(struct task_struct *p)
/* XXX we should retain the bw until 0-lag */
dl_b->total_bw -= p->dl.dl_bw;
raw_spin_unlock_irq(&dl_b->lock);
-
- hrtimer_cancel(timer);
}
static void set_curr_task_dl(struct rq *rq)
@@ -1167,6 +1288,32 @@ next_node:
return NULL;
}
+/*
+ * Return the earliest pushable rq's task, which is suitable to be executed
+ * on the CPU, NULL otherwise:
+ */
+static struct task_struct *pick_earliest_pushable_dl_task(struct rq *rq, int cpu)
+{
+ struct rb_node *next_node = rq->dl.pushable_dl_tasks_leftmost;
+ struct task_struct *p = NULL;
+
+ if (!has_pushable_dl_tasks(rq))
+ return NULL;
+
+next_node:
+ if (next_node) {
+ p = rb_entry(next_node, struct task_struct, pushable_dl_tasks);
+
+ if (pick_dl_task(rq, p, cpu))
+ return p;
+
+ next_node = rb_next(next_node);
+ goto next_node;
+ }
+
+ return NULL;
+}
+
static DEFINE_PER_CPU(cpumask_var_t, local_cpu_mask_dl);
static int find_later_rq(struct task_struct *task)
@@ -1270,6 +1417,17 @@ static struct rq *find_lock_later_rq(struct task_struct *task, struct rq *rq)
later_rq = cpu_rq(cpu);
+ if (!dl_time_before(task->dl.deadline,
+ later_rq->dl.earliest_dl.curr)) {
+ /*
+ * Target rq has tasks of equal or earlier deadline,
+ * retrying does not release any lock and is unlikely
+ * to yield a different result.
+ */
+ later_rq = NULL;
+ break;
+ }
+
/* Retry if something changed. */
if (double_lock_balance(rq, later_rq)) {
if (unlikely(task_rq(task) != rq ||
@@ -1405,20 +1563,21 @@ out:
static void push_dl_tasks(struct rq *rq)
{
- /* Terminates as it moves a -deadline task */
+ /* push_dl_task() will return true if it moved a -deadline task */
while (push_dl_task(rq))
;
}
-static int pull_dl_task(struct rq *this_rq)
+static void pull_dl_task(struct rq *this_rq)
{
- int this_cpu = this_rq->cpu, ret = 0, cpu;
+ int this_cpu = this_rq->cpu, cpu;
struct task_struct *p;
+ bool resched = false;
struct rq *src_rq;
u64 dmin = LONG_MAX;
if (likely(!dl_overloaded(this_rq)))
- return 0;
+ return;
/*
* Match the barrier from dl_set_overloaded; this guarantees that if we
@@ -1451,7 +1610,7 @@ static int pull_dl_task(struct rq *this_rq)
if (src_rq->dl.dl_nr_running <= 1)
goto skip;
- p = pick_next_earliest_dl_task(src_rq, this_cpu);
+ p = pick_earliest_pushable_dl_task(src_rq, this_cpu);
/*
* We found a task to be pulled if:
@@ -1473,7 +1632,7 @@ static int pull_dl_task(struct rq *this_rq)
src_rq->curr->dl.deadline))
goto skip;
- ret = 1;
+ resched = true;
deactivate_task(src_rq, p, 0);
set_task_cpu(p, this_cpu);
@@ -1486,12 +1645,8 @@ skip:
double_unlock_balance(this_rq, src_rq);
}
- return ret;
-}
-
-static void post_schedule_dl(struct rq *rq)
-{
- push_dl_tasks(rq);
+ if (resched)
+ resched_curr(this_rq);
}
/*
@@ -1502,7 +1657,6 @@ static void task_woken_dl(struct rq *rq, struct task_struct *p)
{
if (!task_running(rq, p) &&
!test_tsk_need_resched(rq->curr) &&
- has_pushable_dl_tasks(rq) &&
p->nr_cpus_allowed > 1 &&
dl_task(rq->curr) &&
(rq->curr->nr_cpus_allowed < 2 ||
@@ -1514,9 +1668,8 @@ static void task_woken_dl(struct rq *rq, struct task_struct *p)
static void set_cpus_allowed_dl(struct task_struct *p,
const struct cpumask *new_mask)
{
- struct rq *rq;
struct root_domain *src_rd;
- int weight;
+ struct rq *rq;
BUG_ON(!dl_task(p));
@@ -1542,37 +1695,7 @@ static void set_cpus_allowed_dl(struct task_struct *p,
raw_spin_unlock(&src_dl_b->lock);
}
- /*
- * Update only if the task is actually running (i.e.,
- * it is on the rq AND it is not throttled).
- */
- if (!on_dl_rq(&p->dl))
- return;
-
- weight = cpumask_weight(new_mask);
-
- /*
- * Only update if the process changes its state from whether it
- * can migrate or not.
- */
- if ((p->nr_cpus_allowed > 1) == (weight > 1))
- return;
-
- /*
- * The process used to be able to migrate OR it can now migrate
- */
- if (weight <= 1) {
- if (!task_current(rq, p))
- dequeue_pushable_dl_task(rq, p);
- BUG_ON(!rq->dl.dl_nr_migratory);
- rq->dl.dl_nr_migratory--;
- } else {
- if (!task_current(rq, p))
- enqueue_pushable_dl_task(rq, p);
- rq->dl.dl_nr_migratory++;
- }
-
- update_dl_migration(&rq->dl);
+ set_cpus_allowed_common(p, new_mask);
}
/* Assumes rq->lock is held */
@@ -1596,7 +1719,7 @@ static void rq_offline_dl(struct rq *rq)
cpudl_clear_freecpu(&rq->rd->cpudl, rq->cpu);
}
-void init_sched_dl_class(void)
+void __init init_sched_dl_class(void)
{
unsigned int i;
@@ -1607,37 +1730,16 @@ void init_sched_dl_class(void)
#endif /* CONFIG_SMP */
-/*
- * Ensure p's dl_timer is cancelled. May drop rq->lock for a while.
- */
-static void cancel_dl_timer(struct rq *rq, struct task_struct *p)
-{
- struct hrtimer *dl_timer = &p->dl.dl_timer;
-
- /* Nobody will change task's class if pi_lock is held */
- lockdep_assert_held(&p->pi_lock);
-
- if (hrtimer_active(dl_timer)) {
- int ret = hrtimer_try_to_cancel(dl_timer);
-
- if (unlikely(ret == -1)) {
- /*
- * Note, p may migrate OR new deadline tasks
- * may appear in rq when we are unlocking it.
- * A caller of us must be fine with that.
- */
- raw_spin_unlock(&rq->lock);
- hrtimer_cancel(dl_timer);
- raw_spin_lock(&rq->lock);
- }
- }
-}
-
static void switched_from_dl(struct rq *rq, struct task_struct *p)
{
- /* XXX we should retain the bw until 0-lag */
- cancel_dl_timer(rq, p);
- __dl_clear_params(p);
+ /*
+ * Start the deadline timer; if we switch back to dl before this we'll
+ * continue consuming our current CBS slice. If we stay outside of
+ * SCHED_DEADLINE until the deadline passes, the timer will reset the
+ * task.
+ */
+ if (!start_dl_timer(p))
+ __dl_clear_params(p);
/*
* Since this might be the only -deadline task on the rq,
@@ -1647,8 +1749,7 @@ static void switched_from_dl(struct rq *rq, struct task_struct *p)
if (!task_on_rq_queued(p) || rq->dl.dl_nr_running)
return;
- if (pull_dl_task(rq))
- resched_curr(rq);
+ queue_pull_task(rq);
}
/*
@@ -1657,29 +1758,16 @@ static void switched_from_dl(struct rq *rq, struct task_struct *p)
*/
static void switched_to_dl(struct rq *rq, struct task_struct *p)
{
- int check_resched = 1;
-
- /*
- * If p is throttled, don't consider the possibility
- * of preempting rq->curr, the check will be done right
- * after its runtime will get replenished.
- */
- if (unlikely(p->dl.dl_throttled))
- return;
-
if (task_on_rq_queued(p) && rq->curr != p) {
#ifdef CONFIG_SMP
- if (p->nr_cpus_allowed > 1 && rq->dl.overloaded &&
- push_dl_task(rq) && rq != task_rq(p))
- /* Only reschedule if pushing failed */
- check_resched = 0;
-#endif /* CONFIG_SMP */
- if (check_resched) {
- if (dl_task(rq->curr))
- check_preempt_curr_dl(rq, p, 0);
- else
- resched_curr(rq);
- }
+ if (p->nr_cpus_allowed > 1 && rq->dl.overloaded)
+ queue_push_tasks(rq);
+#else
+ if (dl_task(rq->curr))
+ check_preempt_curr_dl(rq, p, 0);
+ else
+ resched_curr(rq);
+#endif
}
}
@@ -1699,15 +1787,14 @@ static void prio_changed_dl(struct rq *rq, struct task_struct *p,
* or lowering its prio, so...
*/
if (!rq->dl.overloaded)
- pull_dl_task(rq);
+ queue_pull_task(rq);
/*
* If we now have a earlier deadline task than p,
* then reschedule, provided p is still on this
* runqueue.
*/
- if (dl_time_before(rq->dl.earliest_dl.curr, p->dl.deadline) &&
- rq->curr == p)
+ if (dl_time_before(rq->dl.earliest_dl.curr, p->dl.deadline))
resched_curr(rq);
#else
/*
@@ -1737,7 +1824,6 @@ const struct sched_class dl_sched_class = {
.set_cpus_allowed = set_cpus_allowed_dl,
.rq_online = rq_online_dl,
.rq_offline = rq_offline_dl,
- .post_schedule = post_schedule_dl,
.task_woken = task_woken_dl,
#endif
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 8baaf858d25c..641511771ae6 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -68,13 +68,8 @@ static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group
#define PN(F) \
SEQ_printf(m, " .%-30s: %lld.%06ld\n", #F, SPLIT_NS((long long)F))
- if (!se) {
- struct sched_avg *avg = &cpu_rq(cpu)->avg;
- P(avg->runnable_avg_sum);
- P(avg->runnable_avg_period);
+ if (!se)
return;
- }
-
PN(se->exec_start);
PN(se->vruntime);
@@ -93,10 +88,8 @@ static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group
#endif
P(se->load.weight);
#ifdef CONFIG_SMP
- P(se->avg.runnable_avg_sum);
- P(se->avg.runnable_avg_period);
- P(se->avg.load_avg_contrib);
- P(se->avg.decay_count);
+ P(se->avg.load_avg);
+ P(se->avg.util_avg);
#endif
#undef PN
#undef P
@@ -130,15 +123,17 @@ print_task(struct seq_file *m, struct rq *rq, struct task_struct *p)
p->prio);
#ifdef CONFIG_SCHEDSTATS
SEQ_printf(m, "%9Ld.%06ld %9Ld.%06ld %9Ld.%06ld",
- SPLIT_NS(p->se.vruntime),
+ SPLIT_NS(p->se.statistics.wait_sum),
SPLIT_NS(p->se.sum_exec_runtime),
SPLIT_NS(p->se.statistics.sum_sleep_runtime));
#else
- SEQ_printf(m, "%15Ld %15Ld %15Ld.%06ld %15Ld.%06ld %15Ld.%06ld",
- 0LL, 0LL, 0LL, 0L, 0LL, 0L, 0LL, 0L);
+ SEQ_printf(m, "%9Ld.%06ld %9Ld.%06ld %9Ld.%06ld",
+ 0LL, 0L,
+ SPLIT_NS(p->se.sum_exec_runtime),
+ 0LL, 0L);
#endif
#ifdef CONFIG_NUMA_BALANCING
- SEQ_printf(m, " %d", task_node(p));
+ SEQ_printf(m, " %d %d", task_node(p), task_numa_group_id(p));
#endif
#ifdef CONFIG_CGROUP_SCHED
SEQ_printf(m, " %s", task_group_path(task_group(p)));
@@ -154,7 +149,7 @@ static void print_rq(struct seq_file *m, struct rq *rq, int rq_cpu)
SEQ_printf(m,
"\nrunnable tasks:\n"
" task PID tree-key switches prio"
- " exec-runtime sum-exec sum-sleep\n"
+ " wait-time sum-exec sum-sleep\n"
"------------------------------------------------------"
"----------------------------------------------------\n");
@@ -210,24 +205,24 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
SEQ_printf(m, " .%-30s: %d\n", "nr_running", cfs_rq->nr_running);
SEQ_printf(m, " .%-30s: %ld\n", "load", cfs_rq->load.weight);
#ifdef CONFIG_SMP
- SEQ_printf(m, " .%-30s: %ld\n", "runnable_load_avg",
+ SEQ_printf(m, " .%-30s: %lu\n", "load_avg",
+ cfs_rq->avg.load_avg);
+ SEQ_printf(m, " .%-30s: %lu\n", "runnable_load_avg",
cfs_rq->runnable_load_avg);
- SEQ_printf(m, " .%-30s: %ld\n", "blocked_load_avg",
- cfs_rq->blocked_load_avg);
+ SEQ_printf(m, " .%-30s: %lu\n", "util_avg",
+ cfs_rq->avg.util_avg);
+ SEQ_printf(m, " .%-30s: %ld\n", "removed_load_avg",
+ atomic_long_read(&cfs_rq->removed_load_avg));
+ SEQ_printf(m, " .%-30s: %ld\n", "removed_util_avg",
+ atomic_long_read(&cfs_rq->removed_util_avg));
#ifdef CONFIG_FAIR_GROUP_SCHED
- SEQ_printf(m, " .%-30s: %ld\n", "tg_load_contrib",
- cfs_rq->tg_load_contrib);
- SEQ_printf(m, " .%-30s: %d\n", "tg_runnable_contrib",
- cfs_rq->tg_runnable_contrib);
+ SEQ_printf(m, " .%-30s: %lu\n", "tg_load_avg_contrib",
+ cfs_rq->tg_load_avg_contrib);
SEQ_printf(m, " .%-30s: %ld\n", "tg_load_avg",
atomic_long_read(&cfs_rq->tg->load_avg));
- SEQ_printf(m, " .%-30s: %d\n", "tg->runnable_avg",
- atomic_read(&cfs_rq->tg->runnable_avg));
#endif
#endif
#ifdef CONFIG_CFS_BANDWIDTH
- SEQ_printf(m, " .%-30s: %d\n", "tg->cfs_bandwidth.timer_active",
- cfs_rq->tg->cfs_bandwidth.timer_active);
SEQ_printf(m, " .%-30s: %d\n", "throttled",
cfs_rq->throttled);
SEQ_printf(m, " .%-30s: %d\n", "throttle_count",
@@ -513,11 +508,21 @@ __initcall(init_sched_debug_procfs);
SEQ_printf(m, "%-45s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)p->F))
+#ifdef CONFIG_NUMA_BALANCING
+void print_numa_stats(struct seq_file *m, int node, unsigned long tsf,
+ unsigned long tpf, unsigned long gsf, unsigned long gpf)
+{
+ SEQ_printf(m, "numa_faults node=%d ", node);
+ SEQ_printf(m, "task_private=%lu task_shared=%lu ", tsf, tpf);
+ SEQ_printf(m, "group_private=%lu group_shared=%lu\n", gsf, gpf);
+}
+#endif
+
+
static void sched_show_numa(struct task_struct *p, struct seq_file *m)
{
#ifdef CONFIG_NUMA_BALANCING
struct mempolicy *pol;
- int node, i;
if (p->mm)
P(mm->numa_scan_seq);
@@ -529,26 +534,12 @@ static void sched_show_numa(struct task_struct *p, struct seq_file *m)
mpol_get(pol);
task_unlock(p);
- SEQ_printf(m, "numa_migrations, %ld\n", xchg(&p->numa_pages_migrated, 0));
-
- for_each_online_node(node) {
- for (i = 0; i < 2; i++) {
- unsigned long nr_faults = -1;
- int cpu_current, home_node;
-
- if (p->numa_faults)
- nr_faults = p->numa_faults[2*node + i];
-
- cpu_current = !i ? (task_node(p) == node) :
- (pol && node_isset(node, pol->v.nodes));
-
- home_node = (p->numa_preferred_nid == node);
-
- SEQ_printf(m, "numa_faults_memory, %d, %d, %d, %d, %ld\n",
- i, node, cpu_current, home_node, nr_faults);
- }
- }
-
+ P(numa_pages_migrated);
+ P(numa_preferred_nid);
+ P(total_numa_faults);
+ SEQ_printf(m, "current_node=%d, numa_group_id=%d\n",
+ task_node(p), task_numa_group_id(p));
+ show_numa_stats(p, m);
mpol_put(pol);
#endif
}
@@ -578,6 +569,7 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
nr_switches = p->nvcsw + p->nivcsw;
#ifdef CONFIG_SCHEDSTATS
+ PN(se.statistics.sum_sleep_runtime);
PN(se.statistics.wait_start);
PN(se.statistics.sleep_start);
PN(se.statistics.block_start);
@@ -635,10 +627,11 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
P(se.load.weight);
#ifdef CONFIG_SMP
- P(se.avg.runnable_avg_sum);
- P(se.avg.runnable_avg_period);
- P(se.avg.load_avg_contrib);
- P(se.avg.decay_count);
+ P(se.avg.load_sum);
+ P(se.avg.util_sum);
+ P(se.avg.load_avg);
+ P(se.avg.util_avg);
+ P(se.avg.last_update_time);
#endif
P(policy);
P(prio);
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 241213be507c..6e2e3483b1ec 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -141,9 +141,9 @@ static inline void update_load_set(struct load_weight *lw, unsigned long w)
*
* This idea comes from the SD scheduler of Con Kolivas:
*/
-static int get_update_sysctl_factor(void)
+static unsigned int get_update_sysctl_factor(void)
{
- unsigned int cpus = min_t(int, num_online_cpus(), 8);
+ unsigned int cpus = min_t(unsigned int, num_online_cpus(), 8);
unsigned int factor;
switch (sysctl_sched_tunable_scaling) {
@@ -283,9 +283,6 @@ static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
return grp->my_q;
}
-static void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq,
- int force_update);
-
static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
{
if (!cfs_rq->on_list) {
@@ -305,8 +302,6 @@ static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
}
cfs_rq->on_list = 1;
- /* We should have no load, but we need to update last_decay. */
- update_cfs_rq_blocked_load(cfs_rq, 0);
}
}
@@ -576,7 +571,7 @@ int sched_proc_update_handler(struct ctl_table *table, int write,
loff_t *ppos)
{
int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
- int factor = get_update_sysctl_factor();
+ unsigned int factor = get_update_sysctl_factor();
if (ret || !write)
return ret;
@@ -616,15 +611,10 @@ static inline u64 calc_delta_fair(u64 delta, struct sched_entity *se)
*/
static u64 __sched_period(unsigned long nr_running)
{
- u64 period = sysctl_sched_latency;
- unsigned long nr_latency = sched_nr_latency;
-
- if (unlikely(nr_running > nr_latency)) {
- period = sysctl_sched_min_granularity;
- period *= nr_running;
- }
-
- return period;
+ if (unlikely(nr_running > sched_nr_latency))
+ return nr_running * sysctl_sched_min_granularity;
+ else
+ return sysctl_sched_latency;
}
/*
@@ -669,20 +659,37 @@ static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se)
static int select_idle_sibling(struct task_struct *p, int cpu);
static unsigned long task_h_load(struct task_struct *p);
-static inline void __update_task_entity_contrib(struct sched_entity *se);
+/*
+ * We choose a half-life close to 1 scheduling period.
+ * Note: The tables below are dependent on this value.
+ */
+#define LOAD_AVG_PERIOD 32
+#define LOAD_AVG_MAX 47742 /* maximum possible load avg */
+#define LOAD_AVG_MAX_N 345 /* number of full periods to produce LOAD_MAX_AVG */
-/* Give new task start runnable values to heavy its load in infant time */
-void init_task_runnable_average(struct task_struct *p)
+/* Give new sched_entity start runnable values to heavy its load in infant time */
+void init_entity_runnable_average(struct sched_entity *se)
{
- u32 slice;
+ struct sched_avg *sa = &se->avg;
- slice = sched_slice(task_cfs_rq(p), &p->se) >> 10;
- p->se.avg.runnable_avg_sum = slice;
- p->se.avg.runnable_avg_period = slice;
- __update_task_entity_contrib(&p->se);
+ sa->last_update_time = 0;
+ /*
+ * sched_avg's period_contrib should be strictly less then 1024, so
+ * we give it 1023 to make sure it is almost a period (1024us), and
+ * will definitely be update (after enqueue).
+ */
+ sa->period_contrib = 1023;
+ sa->load_avg = scale_load_down(se->load.weight);
+ sa->load_sum = sa->load_avg * LOAD_AVG_MAX;
+ sa->util_avg = scale_load_down(SCHED_LOAD_SCALE);
+ sa->util_sum = LOAD_AVG_MAX;
+ /* when this task enqueue'ed, it will contribute to its cfs_rq's load_avg */
}
+
+static inline unsigned long cfs_rq_runnable_load_avg(struct cfs_rq *cfs_rq);
+static inline unsigned long cfs_rq_load_avg(struct cfs_rq *cfs_rq);
#else
-void init_task_runnable_average(struct task_struct *p)
+void init_entity_runnable_average(struct sched_entity *se)
{
}
#endif
@@ -832,7 +839,7 @@ static unsigned int task_nr_scan_windows(struct task_struct *p)
static unsigned int task_scan_min(struct task_struct *p)
{
- unsigned int scan_size = ACCESS_ONCE(sysctl_numa_balancing_scan_size);
+ unsigned int scan_size = READ_ONCE(sysctl_numa_balancing_scan_size);
unsigned int scan, floor;
unsigned int windows = 1;
@@ -1396,6 +1403,31 @@ static void task_numa_find_cpu(struct task_numa_env *env,
}
}
+/* Only move tasks to a NUMA node less busy than the current node. */
+static bool numa_has_capacity(struct task_numa_env *env)
+{
+ struct numa_stats *src = &env->src_stats;
+ struct numa_stats *dst = &env->dst_stats;
+
+ if (src->has_free_capacity && !dst->has_free_capacity)
+ return false;
+
+ /*
+ * Only consider a task move if the source has a higher load
+ * than the destination, corrected for CPU capacity on each node.
+ *
+ * src->load dst->load
+ * --------------------- vs ---------------------
+ * src->compute_capacity dst->compute_capacity
+ */
+ if (src->load * dst->compute_capacity * env->imbalance_pct >
+
+ dst->load * src->compute_capacity * 100)
+ return true;
+
+ return false;
+}
+
static int task_numa_migrate(struct task_struct *p)
{
struct task_numa_env env = {
@@ -1450,7 +1482,8 @@ static int task_numa_migrate(struct task_struct *p)
update_numa_stats(&env.dst_stats, env.dst_nid);
/* Try to find a spot on the preferred nid. */
- task_numa_find_cpu(&env, taskimp, groupimp);
+ if (numa_has_capacity(&env))
+ task_numa_find_cpu(&env, taskimp, groupimp);
/*
* Look at other nodes in these cases:
@@ -1481,7 +1514,8 @@ static int task_numa_migrate(struct task_struct *p)
env.dist = dist;
env.dst_nid = nid;
update_numa_stats(&env.dst_stats, env.dst_nid);
- task_numa_find_cpu(&env, taskimp, groupimp);
+ if (numa_has_capacity(&env))
+ task_numa_find_cpu(&env, taskimp, groupimp);
}
}
@@ -1674,8 +1708,8 @@ static u64 numa_get_avg_runtime(struct task_struct *p, u64 *period)
delta = runtime - p->last_sum_exec_runtime;
*period = now - p->last_task_numa_placement;
} else {
- delta = p->se.avg.runnable_avg_sum;
- *period = p->se.avg.runnable_avg_period;
+ delta = p->se.avg.load_sum / p->se.load.weight;
+ *period = LOAD_AVG_MAX;
}
p->last_sum_exec_runtime = runtime;
@@ -1765,6 +1799,8 @@ static int preferred_group_nid(struct task_struct *p, int nid)
}
}
/* Next round, evaluate the nodes within max_group. */
+ if (!max_faults)
+ break;
nodes = max_group;
}
return nid;
@@ -1779,7 +1815,12 @@ static void task_numa_placement(struct task_struct *p)
u64 runtime, period;
spinlock_t *group_lock = NULL;
- seq = ACCESS_ONCE(p->mm->numa_scan_seq);
+ /*
+ * The p->mm->numa_scan_seq field gets updated without
+ * exclusive access. Use READ_ONCE() here to ensure
+ * that the field is read in a single access:
+ */
+ seq = READ_ONCE(p->mm->numa_scan_seq);
if (p->numa_scan_seq == seq)
return;
p->numa_scan_seq = seq;
@@ -1923,7 +1964,7 @@ static void task_numa_group(struct task_struct *p, int cpupid, int flags,
}
rcu_read_lock();
- tsk = ACCESS_ONCE(cpu_rq(cpu)->curr);
+ tsk = READ_ONCE(cpu_rq(cpu)->curr);
if (!cpupid_match_pid(tsk, cpupid))
goto no_join;
@@ -2092,7 +2133,15 @@ void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags)
static void reset_ptenuma_scan(struct task_struct *p)
{
- ACCESS_ONCE(p->mm->numa_scan_seq)++;
+ /*
+ * We only did a read acquisition of the mmap sem, so
+ * p->mm->numa_scan_seq is written to without exclusive access
+ * and the update is not guaranteed to be atomic. That's not
+ * much of an issue though, since this is just used for
+ * statistical sampling. Use READ_ONCE/WRITE_ONCE, which are not
+ * expensive, to avoid any form of compiler optimizations:
+ */
+ WRITE_ONCE(p->mm->numa_scan_seq, READ_ONCE(p->mm->numa_scan_seq) + 1);
p->mm->numa_scan_offset = 0;
}
@@ -2166,7 +2215,7 @@ void task_numa_work(struct callback_head *work)
}
for (; vma; vma = vma->vm_next) {
if (!vma_migratable(vma) || !vma_policy_mof(vma) ||
- is_vm_hugetlb_page(vma)) {
+ is_vm_hugetlb_page(vma) || (vma->vm_flags & VM_MIXEDMAP)) {
continue;
}
@@ -2308,13 +2357,13 @@ static inline long calc_tg_weight(struct task_group *tg, struct cfs_rq *cfs_rq)
long tg_weight;
/*
- * Use this CPU's actual weight instead of the last load_contribution
- * to gain a more accurate current total weight. See
- * update_cfs_rq_load_contribution().
+ * Use this CPU's real-time load instead of the last load contribution
+ * as the updating of the contribution is delayed, and we will use the
+ * the real-time load to calc the share. See update_tg_load_avg().
*/
tg_weight = atomic_long_read(&tg->load_avg);
- tg_weight -= cfs_rq->tg_load_contrib;
- tg_weight += cfs_rq->load.weight;
+ tg_weight -= cfs_rq->tg_load_avg_contrib;
+ tg_weight += cfs_rq_load_avg(cfs_rq);
return tg_weight;
}
@@ -2324,7 +2373,7 @@ static long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg)
long tg_weight, load, shares;
tg_weight = calc_tg_weight(tg, cfs_rq);
- load = cfs_rq->load.weight;
+ load = cfs_rq_load_avg(cfs_rq);
shares = (tg->shares * load);
if (tg_weight)
@@ -2386,14 +2435,6 @@ static inline void update_cfs_shares(struct cfs_rq *cfs_rq)
#endif /* CONFIG_FAIR_GROUP_SCHED */
#ifdef CONFIG_SMP
-/*
- * We choose a half-life close to 1 scheduling period.
- * Note: The tables below are dependent on this value.
- */
-#define LOAD_AVG_PERIOD 32
-#define LOAD_AVG_MAX 47742 /* maximum possible load avg */
-#define LOAD_AVG_MAX_N 345 /* number of full periods to produce LOAD_MAX_AVG */
-
/* Precomputed fixed inverse multiplies for multiplication by y^n */
static const u32 runnable_avg_yN_inv[] = {
0xffffffff, 0xfa83b2da, 0xf5257d14, 0xefe4b99a, 0xeac0c6e6, 0xe5b906e6,
@@ -2442,9 +2483,8 @@ static __always_inline u64 decay_load(u64 val, u64 n)
local_n %= LOAD_AVG_PERIOD;
}
- val *= runnable_avg_yN_inv[local_n];
- /* We don't use SRR here since we always want to round down. */
- return val >> 32;
+ val = mul_u64_u32_shr(val, runnable_avg_yN_inv[local_n], 32);
+ return val;
}
/*
@@ -2503,21 +2543,22 @@ static u32 __compute_runnable_contrib(u64 n)
* load_avg = u_0` + y*(u_0 + u_1*y + u_2*y^2 + ... )
* = u_0 + u_1*y + u_2*y^2 + ... [re-labeling u_i --> u_{i+1}]
*/
-static __always_inline int __update_entity_runnable_avg(u64 now,
- struct sched_avg *sa,
- int runnable)
+static __always_inline int
+__update_load_avg(u64 now, int cpu, struct sched_avg *sa,
+ unsigned long weight, int running, struct cfs_rq *cfs_rq)
{
u64 delta, periods;
- u32 runnable_contrib;
+ u32 contrib;
int delta_w, decayed = 0;
+ unsigned long scale_freq = arch_scale_freq_capacity(NULL, cpu);
- delta = now - sa->last_runnable_update;
+ delta = now - sa->last_update_time;
/*
* This should only happen when time goes backwards, which it
* unfortunately does during sched clock init when we swap over to TSC.
*/
if ((s64)delta < 0) {
- sa->last_runnable_update = now;
+ sa->last_update_time = now;
return 0;
}
@@ -2528,23 +2569,29 @@ static __always_inline int __update_entity_runnable_avg(u64 now,
delta >>= 10;
if (!delta)
return 0;
- sa->last_runnable_update = now;
+ sa->last_update_time = now;
/* delta_w is the amount already accumulated against our next period */
- delta_w = sa->runnable_avg_period % 1024;
+ delta_w = sa->period_contrib;
if (delta + delta_w >= 1024) {
- /* period roll-over */
decayed = 1;
+ /* how much left for next period will start over, we don't know yet */
+ sa->period_contrib = 0;
+
/*
* Now that we know we're crossing a period boundary, figure
* out how much from delta we need to complete the current
* period and accrue it.
*/
delta_w = 1024 - delta_w;
- if (runnable)
- sa->runnable_avg_sum += delta_w;
- sa->runnable_avg_period += delta_w;
+ if (weight) {
+ sa->load_sum += weight * delta_w;
+ if (cfs_rq)
+ cfs_rq->runnable_load_sum += weight * delta_w;
+ }
+ if (running)
+ sa->util_sum += delta_w * scale_freq >> SCHED_CAPACITY_SHIFT;
delta -= delta_w;
@@ -2552,299 +2599,186 @@ static __always_inline int __update_entity_runnable_avg(u64 now,
periods = delta / 1024;
delta %= 1024;
- sa->runnable_avg_sum = decay_load(sa->runnable_avg_sum,
- periods + 1);
- sa->runnable_avg_period = decay_load(sa->runnable_avg_period,
- periods + 1);
+ sa->load_sum = decay_load(sa->load_sum, periods + 1);
+ if (cfs_rq) {
+ cfs_rq->runnable_load_sum =
+ decay_load(cfs_rq->runnable_load_sum, periods + 1);
+ }
+ sa->util_sum = decay_load((u64)(sa->util_sum), periods + 1);
/* Efficiently calculate \sum (1..n_period) 1024*y^i */
- runnable_contrib = __compute_runnable_contrib(periods);
- if (runnable)
- sa->runnable_avg_sum += runnable_contrib;
- sa->runnable_avg_period += runnable_contrib;
+ contrib = __compute_runnable_contrib(periods);
+ if (weight) {
+ sa->load_sum += weight * contrib;
+ if (cfs_rq)
+ cfs_rq->runnable_load_sum += weight * contrib;
+ }
+ if (running)
+ sa->util_sum += contrib * scale_freq >> SCHED_CAPACITY_SHIFT;
}
/* Remainder of delta accrued against u_0` */
- if (runnable)
- sa->runnable_avg_sum += delta;
- sa->runnable_avg_period += delta;
-
- return decayed;
-}
-
-/* Synchronize an entity's decay with its parenting cfs_rq.*/
-static inline u64 __synchronize_entity_decay(struct sched_entity *se)
-{
- struct cfs_rq *cfs_rq = cfs_rq_of(se);
- u64 decays = atomic64_read(&cfs_rq->decay_counter);
+ if (weight) {
+ sa->load_sum += weight * delta;
+ if (cfs_rq)
+ cfs_rq->runnable_load_sum += weight * delta;
+ }
+ if (running)
+ sa->util_sum += delta * scale_freq >> SCHED_CAPACITY_SHIFT;
- decays -= se->avg.decay_count;
- se->avg.decay_count = 0;
- if (!decays)
- return 0;
+ sa->period_contrib += delta;
- se->avg.load_avg_contrib = decay_load(se->avg.load_avg_contrib, decays);
+ if (decayed) {
+ sa->load_avg = div_u64(sa->load_sum, LOAD_AVG_MAX);
+ if (cfs_rq) {
+ cfs_rq->runnable_load_avg =
+ div_u64(cfs_rq->runnable_load_sum, LOAD_AVG_MAX);
+ }
+ sa->util_avg = (sa->util_sum << SCHED_LOAD_SHIFT) / LOAD_AVG_MAX;
+ }
- return decays;
+ return decayed;
}
#ifdef CONFIG_FAIR_GROUP_SCHED
-static inline void __update_cfs_rq_tg_load_contrib(struct cfs_rq *cfs_rq,
- int force_update)
-{
- struct task_group *tg = cfs_rq->tg;
- long tg_contrib;
-
- tg_contrib = cfs_rq->runnable_load_avg + cfs_rq->blocked_load_avg;
- tg_contrib -= cfs_rq->tg_load_contrib;
-
- if (!tg_contrib)
- return;
-
- if (force_update || abs(tg_contrib) > cfs_rq->tg_load_contrib / 8) {
- atomic_long_add(tg_contrib, &tg->load_avg);
- cfs_rq->tg_load_contrib += tg_contrib;
- }
-}
-
/*
- * Aggregate cfs_rq runnable averages into an equivalent task_group
- * representation for computing load contributions.
+ * Updating tg's load_avg is necessary before update_cfs_share (which is done)
+ * and effective_load (which is not done because it is too costly).
*/
-static inline void __update_tg_runnable_avg(struct sched_avg *sa,
- struct cfs_rq *cfs_rq)
+static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force)
{
- struct task_group *tg = cfs_rq->tg;
- long contrib;
+ long delta = cfs_rq->avg.load_avg - cfs_rq->tg_load_avg_contrib;
- /* The fraction of a cpu used by this cfs_rq */
- contrib = div_u64((u64)sa->runnable_avg_sum << NICE_0_SHIFT,
- sa->runnable_avg_period + 1);
- contrib -= cfs_rq->tg_runnable_contrib;
-
- if (abs(contrib) > cfs_rq->tg_runnable_contrib / 64) {
- atomic_add(contrib, &tg->runnable_avg);
- cfs_rq->tg_runnable_contrib += contrib;
+ if (force || abs(delta) > cfs_rq->tg_load_avg_contrib / 64) {
+ atomic_long_add(delta, &cfs_rq->tg->load_avg);
+ cfs_rq->tg_load_avg_contrib = cfs_rq->avg.load_avg;
}
}
-static inline void __update_group_entity_contrib(struct sched_entity *se)
-{
- struct cfs_rq *cfs_rq = group_cfs_rq(se);
- struct task_group *tg = cfs_rq->tg;
- int runnable_avg;
-
- u64 contrib;
-
- contrib = cfs_rq->tg_load_contrib * tg->shares;
- se->avg.load_avg_contrib = div_u64(contrib,
- atomic_long_read(&tg->load_avg) + 1);
-
- /*
- * For group entities we need to compute a correction term in the case
- * that they are consuming <1 cpu so that we would contribute the same
- * load as a task of equal weight.
- *
- * Explicitly co-ordinating this measurement would be expensive, but
- * fortunately the sum of each cpus contribution forms a usable
- * lower-bound on the true value.
- *
- * Consider the aggregate of 2 contributions. Either they are disjoint
- * (and the sum represents true value) or they are disjoint and we are
- * understating by the aggregate of their overlap.
- *
- * Extending this to N cpus, for a given overlap, the maximum amount we
- * understand is then n_i(n_i+1)/2 * w_i where n_i is the number of
- * cpus that overlap for this interval and w_i is the interval width.
- *
- * On a small machine; the first term is well-bounded which bounds the
- * total error since w_i is a subset of the period. Whereas on a
- * larger machine, while this first term can be larger, if w_i is the
- * of consequential size guaranteed to see n_i*w_i quickly converge to
- * our upper bound of 1-cpu.
- */
- runnable_avg = atomic_read(&tg->runnable_avg);
- if (runnable_avg < NICE_0_LOAD) {
- se->avg.load_avg_contrib *= runnable_avg;
- se->avg.load_avg_contrib >>= NICE_0_SHIFT;
- }
-}
-
-static inline void update_rq_runnable_avg(struct rq *rq, int runnable)
-{
- __update_entity_runnable_avg(rq_clock_task(rq), &rq->avg, runnable);
- __update_tg_runnable_avg(&rq->avg, &rq->cfs);
-}
#else /* CONFIG_FAIR_GROUP_SCHED */
-static inline void __update_cfs_rq_tg_load_contrib(struct cfs_rq *cfs_rq,
- int force_update) {}
-static inline void __update_tg_runnable_avg(struct sched_avg *sa,
- struct cfs_rq *cfs_rq) {}
-static inline void __update_group_entity_contrib(struct sched_entity *se) {}
-static inline void update_rq_runnable_avg(struct rq *rq, int runnable) {}
+static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force) {}
#endif /* CONFIG_FAIR_GROUP_SCHED */
-static inline void __update_task_entity_contrib(struct sched_entity *se)
-{
- u32 contrib;
-
- /* avoid overflowing a 32-bit type w/ SCHED_LOAD_SCALE */
- contrib = se->avg.runnable_avg_sum * scale_load_down(se->load.weight);
- contrib /= (se->avg.runnable_avg_period + 1);
- se->avg.load_avg_contrib = scale_load(contrib);
-}
+static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq);
-/* Compute the current contribution to load_avg by se, return any delta */
-static long __update_entity_load_avg_contrib(struct sched_entity *se)
+/* Group cfs_rq's load_avg is used for task_h_load and update_cfs_share */
+static inline int update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
{
- long old_contrib = se->avg.load_avg_contrib;
+ int decayed;
+ struct sched_avg *sa = &cfs_rq->avg;
- if (entity_is_task(se)) {
- __update_task_entity_contrib(se);
- } else {
- __update_tg_runnable_avg(&se->avg, group_cfs_rq(se));
- __update_group_entity_contrib(se);
+ if (atomic_long_read(&cfs_rq->removed_load_avg)) {
+ long r = atomic_long_xchg(&cfs_rq->removed_load_avg, 0);
+ sa->load_avg = max_t(long, sa->load_avg - r, 0);
+ sa->load_sum = max_t(s64, sa->load_sum - r * LOAD_AVG_MAX, 0);
}
- return se->avg.load_avg_contrib - old_contrib;
-}
+ if (atomic_long_read(&cfs_rq->removed_util_avg)) {
+ long r = atomic_long_xchg(&cfs_rq->removed_util_avg, 0);
+ sa->util_avg = max_t(long, sa->util_avg - r, 0);
+ sa->util_sum = max_t(s32, sa->util_sum -
+ ((r * LOAD_AVG_MAX) >> SCHED_LOAD_SHIFT), 0);
+ }
-static inline void subtract_blocked_load_contrib(struct cfs_rq *cfs_rq,
- long load_contrib)
-{
- if (likely(load_contrib < cfs_rq->blocked_load_avg))
- cfs_rq->blocked_load_avg -= load_contrib;
- else
- cfs_rq->blocked_load_avg = 0;
-}
+ decayed = __update_load_avg(now, cpu_of(rq_of(cfs_rq)), sa,
+ scale_load_down(cfs_rq->load.weight), cfs_rq->curr != NULL, cfs_rq);
-static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq);
+#ifndef CONFIG_64BIT
+ smp_wmb();
+ cfs_rq->load_last_update_time_copy = sa->last_update_time;
+#endif
-/* Update a sched_entity's runnable average */
-static inline void update_entity_load_avg(struct sched_entity *se,
- int update_cfs_rq)
+ return decayed;
+}
+
+/* Update task and its cfs_rq load average */
+static inline void update_load_avg(struct sched_entity *se, int update_tg)
{
struct cfs_rq *cfs_rq = cfs_rq_of(se);
- long contrib_delta;
- u64 now;
+ int cpu = cpu_of(rq_of(cfs_rq));
+ u64 now = cfs_rq_clock_task(cfs_rq);
/*
- * For a group entity we need to use their owned cfs_rq_clock_task() in
- * case they are the parent of a throttled hierarchy.
+ * Track task load average for carrying it to new CPU after migrated, and
+ * track group sched_entity load average for task_h_load calc in migration
*/
- if (entity_is_task(se))
- now = cfs_rq_clock_task(cfs_rq);
- else
- now = cfs_rq_clock_task(group_cfs_rq(se));
-
- if (!__update_entity_runnable_avg(now, &se->avg, se->on_rq))
- return;
+ __update_load_avg(now, cpu, &se->avg,
+ se->on_rq * scale_load_down(se->load.weight), cfs_rq->curr == se, NULL);
- contrib_delta = __update_entity_load_avg_contrib(se);
-
- if (!update_cfs_rq)
- return;
-
- if (se->on_rq)
- cfs_rq->runnable_load_avg += contrib_delta;
- else
- subtract_blocked_load_contrib(cfs_rq, -contrib_delta);
+ if (update_cfs_rq_load_avg(now, cfs_rq) && update_tg)
+ update_tg_load_avg(cfs_rq, 0);
}
-/*
- * Decay the load contributed by all blocked children and account this so that
- * their contribution may appropriately discounted when they wake up.
- */
-static void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq, int force_update)
+/* Add the load generated by se into cfs_rq's load average */
+static inline void
+enqueue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
- u64 now = cfs_rq_clock_task(cfs_rq) >> 20;
- u64 decays;
+ struct sched_avg *sa = &se->avg;
+ u64 now = cfs_rq_clock_task(cfs_rq);
+ int migrated = 0, decayed;
- decays = now - cfs_rq->last_decay;
- if (!decays && !force_update)
- return;
-
- if (atomic_long_read(&cfs_rq->removed_load)) {
- unsigned long removed_load;
- removed_load = atomic_long_xchg(&cfs_rq->removed_load, 0);
- subtract_blocked_load_contrib(cfs_rq, removed_load);
+ if (sa->last_update_time == 0) {
+ sa->last_update_time = now;
+ migrated = 1;
+ }
+ else {
+ __update_load_avg(now, cpu_of(rq_of(cfs_rq)), sa,
+ se->on_rq * scale_load_down(se->load.weight),
+ cfs_rq->curr == se, NULL);
}
- if (decays) {
- cfs_rq->blocked_load_avg = decay_load(cfs_rq->blocked_load_avg,
- decays);
- atomic64_add(decays, &cfs_rq->decay_counter);
- cfs_rq->last_decay = now;
+ decayed = update_cfs_rq_load_avg(now, cfs_rq);
+
+ cfs_rq->runnable_load_avg += sa->load_avg;
+ cfs_rq->runnable_load_sum += sa->load_sum;
+
+ if (migrated) {
+ cfs_rq->avg.load_avg += sa->load_avg;
+ cfs_rq->avg.load_sum += sa->load_sum;
+ cfs_rq->avg.util_avg += sa->util_avg;
+ cfs_rq->avg.util_sum += sa->util_sum;
}
- __update_cfs_rq_tg_load_contrib(cfs_rq, force_update);
+ if (decayed || migrated)
+ update_tg_load_avg(cfs_rq, 0);
}
-/* Add the load generated by se into cfs_rq's child load-average */
-static inline void enqueue_entity_load_avg(struct cfs_rq *cfs_rq,
- struct sched_entity *se,
- int wakeup)
+/* Remove the runnable load generated by se from cfs_rq's runnable load average */
+static inline void
+dequeue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
- /*
- * We track migrations using entity decay_count <= 0, on a wake-up
- * migration we use a negative decay count to track the remote decays
- * accumulated while sleeping.
- *
- * Newly forked tasks are enqueued with se->avg.decay_count == 0, they
- * are seen by enqueue_entity_load_avg() as a migration with an already
- * constructed load_avg_contrib.
- */
- if (unlikely(se->avg.decay_count <= 0)) {
- se->avg.last_runnable_update = rq_clock_task(rq_of(cfs_rq));
- if (se->avg.decay_count) {
- /*
- * In a wake-up migration we have to approximate the
- * time sleeping. This is because we can't synchronize
- * clock_task between the two cpus, and it is not
- * guaranteed to be read-safe. Instead, we can
- * approximate this using our carried decays, which are
- * explicitly atomically readable.
- */
- se->avg.last_runnable_update -= (-se->avg.decay_count)
- << 20;
- update_entity_load_avg(se, 0);
- /* Indicate that we're now synchronized and on-rq */
- se->avg.decay_count = 0;
- }
- wakeup = 0;
- } else {
- __synchronize_entity_decay(se);
- }
+ update_load_avg(se, 1);
- /* migrated tasks did not contribute to our blocked load */
- if (wakeup) {
- subtract_blocked_load_contrib(cfs_rq, se->avg.load_avg_contrib);
- update_entity_load_avg(se, 0);
- }
-
- cfs_rq->runnable_load_avg += se->avg.load_avg_contrib;
- /* we force update consideration on load-balancer moves */
- update_cfs_rq_blocked_load(cfs_rq, !wakeup);
+ cfs_rq->runnable_load_avg =
+ max_t(long, cfs_rq->runnable_load_avg - se->avg.load_avg, 0);
+ cfs_rq->runnable_load_sum =
+ max_t(s64, cfs_rq->runnable_load_sum - se->avg.load_sum, 0);
}
/*
- * Remove se's load from this cfs_rq child load-average, if the entity is
- * transitioning to a blocked state we track its projected decay using
- * blocked_load_avg.
+ * Task first catches up with cfs_rq, and then subtract
+ * itself from the cfs_rq (task must be off the queue now).
*/
-static inline void dequeue_entity_load_avg(struct cfs_rq *cfs_rq,
- struct sched_entity *se,
- int sleep)
+void remove_entity_load_avg(struct sched_entity *se)
{
- update_entity_load_avg(se, 1);
- /* we force update consideration on load-balancer moves */
- update_cfs_rq_blocked_load(cfs_rq, !sleep);
+ struct cfs_rq *cfs_rq = cfs_rq_of(se);
+ u64 last_update_time;
+
+#ifndef CONFIG_64BIT
+ u64 last_update_time_copy;
- cfs_rq->runnable_load_avg -= se->avg.load_avg_contrib;
- if (sleep) {
- cfs_rq->blocked_load_avg += se->avg.load_avg_contrib;
- se->avg.decay_count = atomic64_read(&cfs_rq->decay_counter);
- } /* migrations, e.g. sleep=0 leave decay_count == 0 */
+ do {
+ last_update_time_copy = cfs_rq->load_last_update_time_copy;
+ smp_rmb();
+ last_update_time = cfs_rq->avg.last_update_time;
+ } while (last_update_time != last_update_time_copy);
+#else
+ last_update_time = cfs_rq->avg.last_update_time;
+#endif
+
+ __update_load_avg(last_update_time, cpu_of(rq_of(cfs_rq)), &se->avg, 0, 0, NULL);
+ atomic_long_add(se->avg.load_avg, &cfs_rq->removed_load_avg);
+ atomic_long_add(se->avg.util_avg, &cfs_rq->removed_util_avg);
}
/*
@@ -2854,7 +2788,6 @@ static inline void dequeue_entity_load_avg(struct cfs_rq *cfs_rq,
*/
void idle_enter_fair(struct rq *this_rq)
{
- update_rq_runnable_avg(this_rq, 1);
}
/*
@@ -2864,24 +2797,28 @@ void idle_enter_fair(struct rq *this_rq)
*/
void idle_exit_fair(struct rq *this_rq)
{
- update_rq_runnable_avg(this_rq, 0);
+}
+
+static inline unsigned long cfs_rq_runnable_load_avg(struct cfs_rq *cfs_rq)
+{
+ return cfs_rq->runnable_load_avg;
+}
+
+static inline unsigned long cfs_rq_load_avg(struct cfs_rq *cfs_rq)
+{
+ return cfs_rq->avg.load_avg;
}
static int idle_balance(struct rq *this_rq);
#else /* CONFIG_SMP */
-static inline void update_entity_load_avg(struct sched_entity *se,
- int update_cfs_rq) {}
-static inline void update_rq_runnable_avg(struct rq *rq, int runnable) {}
-static inline void enqueue_entity_load_avg(struct cfs_rq *cfs_rq,
- struct sched_entity *se,
- int wakeup) {}
-static inline void dequeue_entity_load_avg(struct cfs_rq *cfs_rq,
- struct sched_entity *se,
- int sleep) {}
-static inline void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq,
- int force_update) {}
+static inline void update_load_avg(struct sched_entity *se, int update_tg) {}
+static inline void
+enqueue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {}
+static inline void
+dequeue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {}
+static inline void remove_entity_load_avg(struct sched_entity *se) {}
static inline int idle_balance(struct rq *rq)
{
@@ -3013,7 +2950,7 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
* Update run-time statistics of the 'current'.
*/
update_curr(cfs_rq);
- enqueue_entity_load_avg(cfs_rq, se, flags & ENQUEUE_WAKEUP);
+ enqueue_entity_load_avg(cfs_rq, se);
account_entity_enqueue(cfs_rq, se);
update_cfs_shares(cfs_rq);
@@ -3088,7 +3025,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
* Update run-time statistics of the 'current'.
*/
update_curr(cfs_rq);
- dequeue_entity_load_avg(cfs_rq, se, flags & DEQUEUE_SLEEP);
+ dequeue_entity_load_avg(cfs_rq, se);
update_stats_dequeue(cfs_rq, se);
if (flags & DEQUEUE_SLEEP) {
@@ -3178,6 +3115,7 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
*/
update_stats_wait_end(cfs_rq, se);
__dequeue_entity(cfs_rq, se);
+ update_load_avg(se, 1);
}
update_stats_curr_start(cfs_rq, se);
@@ -3277,7 +3215,7 @@ static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev)
/* Put 'current' back into the tree. */
__enqueue_entity(cfs_rq, prev);
/* in !on_rq case, update occurred at dequeue */
- update_entity_load_avg(prev, 1);
+ update_load_avg(prev, 0);
}
cfs_rq->curr = NULL;
}
@@ -3293,8 +3231,7 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
/*
* Ensure that runnable average is periodically updated.
*/
- update_entity_load_avg(curr, 1);
- update_cfs_rq_blocked_load(cfs_rq, 1);
+ update_load_avg(curr, 1);
update_cfs_shares(cfs_rq);
#ifdef CONFIG_SCHED_HRTICK
@@ -3413,16 +3350,7 @@ static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq)
if (cfs_b->quota == RUNTIME_INF)
amount = min_amount;
else {
- /*
- * If the bandwidth pool has become inactive, then at least one
- * period must have elapsed since the last consumption.
- * Refresh the global state and ensure bandwidth timer becomes
- * active.
- */
- if (!cfs_b->timer_active) {
- __refill_cfs_bandwidth_runtime(cfs_b);
- __start_cfs_bandwidth(cfs_b, false);
- }
+ start_cfs_bandwidth(cfs_b);
if (cfs_b->runtime > 0) {
amount = min(cfs_b->runtime, min_amount);
@@ -3571,6 +3499,7 @@ static void throttle_cfs_rq(struct cfs_rq *cfs_rq)
struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
struct sched_entity *se;
long task_delta, dequeue = 1;
+ bool empty;
se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))];
@@ -3600,13 +3529,21 @@ static void throttle_cfs_rq(struct cfs_rq *cfs_rq)
cfs_rq->throttled = 1;
cfs_rq->throttled_clock = rq_clock(rq);
raw_spin_lock(&cfs_b->lock);
+ empty = list_empty(&cfs_b->throttled_cfs_rq);
+
/*
* Add to the _head_ of the list, so that an already-started
* distribute_cfs_runtime will not see us
*/
list_add_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq);
- if (!cfs_b->timer_active)
- __start_cfs_bandwidth(cfs_b, false);
+
+ /*
+ * If we're the first throttled task, make sure the bandwidth
+ * timer is running.
+ */
+ if (empty)
+ start_cfs_bandwidth(cfs_b);
+
raw_spin_unlock(&cfs_b->lock);
}
@@ -3721,13 +3658,6 @@ static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun)
if (cfs_b->idle && !throttled)
goto out_deactivate;
- /*
- * if we have relooped after returning idle once, we need to update our
- * status as actually running, so that other cpus doing
- * __start_cfs_bandwidth will stop trying to cancel us.
- */
- cfs_b->timer_active = 1;
-
__refill_cfs_bandwidth_runtime(cfs_b);
if (!throttled) {
@@ -3772,7 +3702,6 @@ static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun)
return 0;
out_deactivate:
- cfs_b->timer_active = 0;
return 1;
}
@@ -3787,7 +3716,7 @@ static const u64 cfs_bandwidth_slack_period = 5 * NSEC_PER_MSEC;
* Are we near the end of the current quota period?
*
* Requires cfs_b->lock for hrtimer_expires_remaining to be safe against the
- * hrtimer base being cleared by __hrtimer_start_range_ns. In the case of
+ * hrtimer base being cleared by hrtimer_start. In the case of
* migrate_hrtimers, base is never cleared, so we are fine.
*/
static int runtime_refresh_within(struct cfs_bandwidth *cfs_b, u64 min_expire)
@@ -3815,8 +3744,9 @@ static void start_cfs_slack_bandwidth(struct cfs_bandwidth *cfs_b)
if (runtime_refresh_within(cfs_b, min_left))
return;
- start_bandwidth_timer(&cfs_b->slack_timer,
- ns_to_ktime(cfs_bandwidth_slack_period));
+ hrtimer_start(&cfs_b->slack_timer,
+ ns_to_ktime(cfs_bandwidth_slack_period),
+ HRTIMER_MODE_REL);
}
/* we know any runtime found here is valid as update_curr() precedes return */
@@ -3936,6 +3866,7 @@ static enum hrtimer_restart sched_cfs_slack_timer(struct hrtimer *timer)
{
struct cfs_bandwidth *cfs_b =
container_of(timer, struct cfs_bandwidth, slack_timer);
+
do_sched_cfs_slack_timer(cfs_b);
return HRTIMER_NORESTART;
@@ -3945,20 +3876,19 @@ static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer)
{
struct cfs_bandwidth *cfs_b =
container_of(timer, struct cfs_bandwidth, period_timer);
- ktime_t now;
int overrun;
int idle = 0;
raw_spin_lock(&cfs_b->lock);
for (;;) {
- now = hrtimer_cb_get_time(timer);
- overrun = hrtimer_forward(timer, now, cfs_b->period);
-
+ overrun = hrtimer_forward_now(timer, cfs_b->period);
if (!overrun)
break;
idle = do_sched_cfs_period_timer(cfs_b, overrun);
}
+ if (idle)
+ cfs_b->period_active = 0;
raw_spin_unlock(&cfs_b->lock);
return idle ? HRTIMER_NORESTART : HRTIMER_RESTART;
@@ -3972,7 +3902,7 @@ void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
cfs_b->period = ns_to_ktime(default_cfs_period());
INIT_LIST_HEAD(&cfs_b->throttled_cfs_rq);
- hrtimer_init(&cfs_b->period_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+ hrtimer_init(&cfs_b->period_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED);
cfs_b->period_timer.function = sched_cfs_period_timer;
hrtimer_init(&cfs_b->slack_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
cfs_b->slack_timer.function = sched_cfs_slack_timer;
@@ -3984,28 +3914,15 @@ static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq)
INIT_LIST_HEAD(&cfs_rq->throttled_list);
}
-/* requires cfs_b->lock, may release to reprogram timer */
-void __start_cfs_bandwidth(struct cfs_bandwidth *cfs_b, bool force)
+void start_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
{
- /*
- * The timer may be active because we're trying to set a new bandwidth
- * period or because we're racing with the tear-down path
- * (timer_active==0 becomes visible before the hrtimer call-back
- * terminates). In either case we ensure that it's re-programmed
- */
- while (unlikely(hrtimer_active(&cfs_b->period_timer)) &&
- hrtimer_try_to_cancel(&cfs_b->period_timer) < 0) {
- /* bounce the lock to allow do_sched_cfs_period_timer to run */
- raw_spin_unlock(&cfs_b->lock);
- cpu_relax();
- raw_spin_lock(&cfs_b->lock);
- /* if someone else restarted the timer then we're done */
- if (!force && cfs_b->timer_active)
- return;
- }
+ lockdep_assert_held(&cfs_b->lock);
- cfs_b->timer_active = 1;
- start_bandwidth_timer(&cfs_b->period_timer, cfs_b->period);
+ if (!cfs_b->period_active) {
+ cfs_b->period_active = 1;
+ hrtimer_forward_now(&cfs_b->period_timer, cfs_b->period);
+ hrtimer_start_expires(&cfs_b->period_timer, HRTIMER_MODE_ABS_PINNED);
+ }
}
static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
@@ -4187,14 +4104,13 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
if (cfs_rq_throttled(cfs_rq))
break;
+ update_load_avg(se, 1);
update_cfs_shares(cfs_rq);
- update_entity_load_avg(se, 1);
}
- if (!se) {
- update_rq_runnable_avg(rq, rq->nr_running);
+ if (!se)
add_nr_running(rq, 1);
- }
+
hrtick_update(rq);
}
@@ -4248,22 +4164,204 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
if (cfs_rq_throttled(cfs_rq))
break;
+ update_load_avg(se, 1);
update_cfs_shares(cfs_rq);
- update_entity_load_avg(se, 1);
}
- if (!se) {
+ if (!se)
sub_nr_running(rq, 1);
- update_rq_runnable_avg(rq, 1);
- }
+
hrtick_update(rq);
}
#ifdef CONFIG_SMP
+
+/*
+ * per rq 'load' arrray crap; XXX kill this.
+ */
+
+/*
+ * The exact cpuload at various idx values, calculated at every tick would be
+ * load = (2^idx - 1) / 2^idx * load + 1 / 2^idx * cur_load
+ *
+ * If a cpu misses updates for n-1 ticks (as it was idle) and update gets called
+ * on nth tick when cpu may be busy, then we have:
+ * load = ((2^idx - 1) / 2^idx)^(n-1) * load
+ * load = (2^idx - 1) / 2^idx) * load + 1 / 2^idx * cur_load
+ *
+ * decay_load_missed() below does efficient calculation of
+ * load = ((2^idx - 1) / 2^idx)^(n-1) * load
+ * avoiding 0..n-1 loop doing load = ((2^idx - 1) / 2^idx) * load
+ *
+ * The calculation is approximated on a 128 point scale.
+ * degrade_zero_ticks is the number of ticks after which load at any
+ * particular idx is approximated to be zero.
+ * degrade_factor is a precomputed table, a row for each load idx.
+ * Each column corresponds to degradation factor for a power of two ticks,
+ * based on 128 point scale.
+ * Example:
+ * row 2, col 3 (=12) says that the degradation at load idx 2 after
+ * 8 ticks is 12/128 (which is an approximation of exact factor 3^8/4^8).
+ *
+ * With this power of 2 load factors, we can degrade the load n times
+ * by looking at 1 bits in n and doing as many mult/shift instead of
+ * n mult/shifts needed by the exact degradation.
+ */
+#define DEGRADE_SHIFT 7
+static const unsigned char
+ degrade_zero_ticks[CPU_LOAD_IDX_MAX] = {0, 8, 32, 64, 128};
+static const unsigned char
+ degrade_factor[CPU_LOAD_IDX_MAX][DEGRADE_SHIFT + 1] = {
+ {0, 0, 0, 0, 0, 0, 0, 0},
+ {64, 32, 8, 0, 0, 0, 0, 0},
+ {96, 72, 40, 12, 1, 0, 0},
+ {112, 98, 75, 43, 15, 1, 0},
+ {120, 112, 98, 76, 45, 16, 2} };
+
+/*
+ * Update cpu_load for any missed ticks, due to tickless idle. The backlog
+ * would be when CPU is idle and so we just decay the old load without
+ * adding any new load.
+ */
+static unsigned long
+decay_load_missed(unsigned long load, unsigned long missed_updates, int idx)
+{
+ int j = 0;
+
+ if (!missed_updates)
+ return load;
+
+ if (missed_updates >= degrade_zero_ticks[idx])
+ return 0;
+
+ if (idx == 1)
+ return load >> missed_updates;
+
+ while (missed_updates) {
+ if (missed_updates % 2)
+ load = (load * degrade_factor[idx][j]) >> DEGRADE_SHIFT;
+
+ missed_updates >>= 1;
+ j++;
+ }
+ return load;
+}
+
+/*
+ * Update rq->cpu_load[] statistics. This function is usually called every
+ * scheduler tick (TICK_NSEC). With tickless idle this will not be called
+ * every tick. We fix it up based on jiffies.
+ */
+static void __update_cpu_load(struct rq *this_rq, unsigned long this_load,
+ unsigned long pending_updates)
+{
+ int i, scale;
+
+ this_rq->nr_load_updates++;
+
+ /* Update our load: */
+ this_rq->cpu_load[0] = this_load; /* Fasttrack for idx 0 */
+ for (i = 1, scale = 2; i < CPU_LOAD_IDX_MAX; i++, scale += scale) {
+ unsigned long old_load, new_load;
+
+ /* scale is effectively 1 << i now, and >> i divides by scale */
+
+ old_load = this_rq->cpu_load[i];
+ old_load = decay_load_missed(old_load, pending_updates - 1, i);
+ new_load = this_load;
+ /*
+ * Round up the averaging division if load is increasing. This
+ * prevents us from getting stuck on 9 if the load is 10, for
+ * example.
+ */
+ if (new_load > old_load)
+ new_load += scale - 1;
+
+ this_rq->cpu_load[i] = (old_load * (scale - 1) + new_load) >> i;
+ }
+
+ sched_avg_update(this_rq);
+}
+
/* Used instead of source_load when we know the type == 0 */
static unsigned long weighted_cpuload(const int cpu)
{
- return cpu_rq(cpu)->cfs.runnable_load_avg;
+ return cfs_rq_runnable_load_avg(&cpu_rq(cpu)->cfs);
+}
+
+#ifdef CONFIG_NO_HZ_COMMON
+/*
+ * There is no sane way to deal with nohz on smp when using jiffies because the
+ * cpu doing the jiffies update might drift wrt the cpu doing the jiffy reading
+ * causing off-by-one errors in observed deltas; {0,2} instead of {1,1}.
+ *
+ * Therefore we cannot use the delta approach from the regular tick since that
+ * would seriously skew the load calculation. However we'll make do for those
+ * updates happening while idle (nohz_idle_balance) or coming out of idle
+ * (tick_nohz_idle_exit).
+ *
+ * This means we might still be one tick off for nohz periods.
+ */
+
+/*
+ * Called from nohz_idle_balance() to update the load ratings before doing the
+ * idle balance.
+ */
+static void update_idle_cpu_load(struct rq *this_rq)
+{
+ unsigned long curr_jiffies = READ_ONCE(jiffies);
+ unsigned long load = weighted_cpuload(cpu_of(this_rq));
+ unsigned long pending_updates;
+
+ /*
+ * bail if there's load or we're actually up-to-date.
+ */
+ if (load || curr_jiffies == this_rq->last_load_update_tick)
+ return;
+
+ pending_updates = curr_jiffies - this_rq->last_load_update_tick;
+ this_rq->last_load_update_tick = curr_jiffies;
+
+ __update_cpu_load(this_rq, load, pending_updates);
+}
+
+/*
+ * Called from tick_nohz_idle_exit() -- try and fix up the ticks we missed.
+ */
+void update_cpu_load_nohz(void)
+{
+ struct rq *this_rq = this_rq();
+ unsigned long curr_jiffies = READ_ONCE(jiffies);
+ unsigned long pending_updates;
+
+ if (curr_jiffies == this_rq->last_load_update_tick)
+ return;
+
+ raw_spin_lock(&this_rq->lock);
+ pending_updates = curr_jiffies - this_rq->last_load_update_tick;
+ if (pending_updates) {
+ this_rq->last_load_update_tick = curr_jiffies;
+ /*
+ * We were idle, this means load 0, the current load might be
+ * !0 due to remote wakeups and the sort.
+ */
+ __update_cpu_load(this_rq, 0, pending_updates);
+ }
+ raw_spin_unlock(&this_rq->lock);
+}
+#endif /* CONFIG_NO_HZ */
+
+/*
+ * Called from scheduler_tick()
+ */
+void update_cpu_load_active(struct rq *this_rq)
+{
+ unsigned long load = weighted_cpuload(cpu_of(this_rq));
+ /*
+ * See the mess around update_idle_cpu_load() / update_cpu_load_nohz().
+ */
+ this_rq->last_load_update_tick = jiffies;
+ __update_cpu_load(this_rq, load, 1);
}
/*
@@ -4304,11 +4402,16 @@ static unsigned long capacity_of(int cpu)
return cpu_rq(cpu)->cpu_capacity;
}
+static unsigned long capacity_orig_of(int cpu)
+{
+ return cpu_rq(cpu)->cpu_capacity_orig;
+}
+
static unsigned long cpu_avg_load_per_task(int cpu)
{
struct rq *rq = cpu_rq(cpu);
- unsigned long nr_running = ACCESS_ONCE(rq->cfs.h_nr_running);
- unsigned long load_avg = rq->cfs.runnable_load_avg;
+ unsigned long nr_running = READ_ONCE(rq->cfs.h_nr_running);
+ unsigned long load_avg = weighted_cpuload(cpu);
if (nr_running)
return load_avg / nr_running;
@@ -4427,7 +4530,7 @@ static long effective_load(struct task_group *tg, int cpu, long wl, long wg)
/*
* w = rw_i + @wl
*/
- w = se->my_q->load.weight + wl;
+ w = cfs_rq_load_avg(se->my_q) + wl;
/*
* wl = S * s'_i; see (2)
@@ -4448,7 +4551,7 @@ static long effective_load(struct task_group *tg, int cpu, long wl, long wg)
/*
* wl = dw_i = S * (s'_i - s_i); see (3)
*/
- wl -= se->load.weight;
+ wl -= se->avg.load_avg;
/*
* Recursively apply this logic to all parent groups to compute
@@ -4471,26 +4574,29 @@ static long effective_load(struct task_group *tg, int cpu, long wl, long wg)
#endif
+/*
+ * Detect M:N waker/wakee relationships via a switching-frequency heuristic.
+ * A waker of many should wake a different task than the one last awakened
+ * at a frequency roughly N times higher than one of its wakees. In order
+ * to determine whether we should let the load spread vs consolodating to
+ * shared cache, we look for a minimum 'flip' frequency of llc_size in one
+ * partner, and a factor of lls_size higher frequency in the other. With
+ * both conditions met, we can be relatively sure that the relationship is
+ * non-monogamous, with partner count exceeding socket size. Waker/wakee
+ * being client/server, worker/dispatcher, interrupt source or whatever is
+ * irrelevant, spread criteria is apparent partner count exceeds socket size.
+ */
static int wake_wide(struct task_struct *p)
{
+ unsigned int master = current->wakee_flips;
+ unsigned int slave = p->wakee_flips;
int factor = this_cpu_read(sd_llc_size);
- /*
- * Yeah, it's the switching-frequency, could means many wakee or
- * rapidly switch, use factor here will just help to automatically
- * adjust the loose-degree, so bigger node will lead to more pull.
- */
- if (p->wakee_flips > factor) {
- /*
- * wakee is somewhat hot, it needs certain amount of cpu
- * resource, so if waker is far more hot, prefer to leave
- * it alone.
- */
- if (current->wakee_flips > (factor * p->wakee_flips))
- return 1;
- }
-
- return 0;
+ if (master < slave)
+ swap(master, slave);
+ if (slave < factor || master < slave * factor)
+ return 0;
+ return 1;
}
static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
@@ -4502,13 +4608,6 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
unsigned long weight;
int balanced;
- /*
- * If we wake multiple tasks be careful to not bounce
- * ourselves around too much.
- */
- if (wake_wide(p))
- return 0;
-
idx = sd->wake_idx;
this_cpu = smp_processor_id();
prev_cpu = task_cpu(p);
@@ -4522,14 +4621,14 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
*/
if (sync) {
tg = task_group(current);
- weight = current->se.load.weight;
+ weight = current->se.avg.load_avg;
this_load += effective_load(tg, this_cpu, -weight, -weight);
load += effective_load(tg, prev_cpu, 0, -weight);
}
tg = task_group(p);
- weight = p->se.load.weight;
+ weight = p->se.avg.load_avg;
/*
* In low-load situations, where prev_cpu is idle and this_cpu is idle
@@ -4717,6 +4816,33 @@ next:
done:
return target;
}
+/*
+ * get_cpu_usage returns the amount of capacity of a CPU that is used by CFS
+ * tasks. The unit of the return value must be the one of capacity so we can
+ * compare the usage with the capacity of the CPU that is available for CFS
+ * task (ie cpu_capacity).
+ * cfs.avg.util_avg is the sum of running time of runnable tasks on a
+ * CPU. It represents the amount of utilization of a CPU in the range
+ * [0..SCHED_LOAD_SCALE]. The usage of a CPU can't be higher than the full
+ * capacity of the CPU because it's about the running time on this CPU.
+ * Nevertheless, cfs.avg.util_avg can be higher than SCHED_LOAD_SCALE
+ * because of unfortunate rounding in util_avg or just
+ * after migrating tasks until the average stabilizes with the new running
+ * time. So we need to check that the usage stays into the range
+ * [0..cpu_capacity_orig] and cap if necessary.
+ * Without capping the usage, a group could be seen as overloaded (CPU0 usage
+ * at 121% + CPU1 usage at 80%) whereas CPU1 has 20% of available capacity
+ */
+static int get_cpu_usage(int cpu)
+{
+ unsigned long usage = cpu_rq(cpu)->cfs.avg.util_avg;
+ unsigned long capacity = capacity_orig_of(cpu);
+
+ if (usage >= SCHED_LOAD_SCALE)
+ return capacity;
+
+ return (usage * capacity) >> SCHED_LOAD_SHIFT;
+}
/*
* select_task_rq_fair: Select target runqueue for the waking task in domains
@@ -4735,17 +4861,17 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f
{
struct sched_domain *tmp, *affine_sd = NULL, *sd = NULL;
int cpu = smp_processor_id();
- int new_cpu = cpu;
+ int new_cpu = prev_cpu;
int want_affine = 0;
int sync = wake_flags & WF_SYNC;
if (sd_flag & SD_BALANCE_WAKE)
- want_affine = cpumask_test_cpu(cpu, tsk_cpus_allowed(p));
+ want_affine = !wake_wide(p) && cpumask_test_cpu(cpu, tsk_cpus_allowed(p));
rcu_read_lock();
for_each_domain(cpu, tmp) {
if (!(tmp->flags & SD_LOAD_BALANCE))
- continue;
+ break;
/*
* If both cpu and prev_cpu are part of this domain,
@@ -4759,17 +4885,21 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f
if (tmp->flags & sd_flag)
sd = tmp;
+ else if (!want_affine)
+ break;
}
- if (affine_sd && cpu != prev_cpu && wake_affine(affine_sd, p, sync))
- prev_cpu = cpu;
-
- if (sd_flag & SD_BALANCE_WAKE) {
- new_cpu = select_idle_sibling(p, prev_cpu);
- goto unlock;
+ if (affine_sd) {
+ sd = NULL; /* Prefer wake_affine over balance flags */
+ if (cpu != prev_cpu && wake_affine(affine_sd, p, sync))
+ new_cpu = cpu;
}
- while (sd) {
+ if (!sd) {
+ if (sd_flag & SD_BALANCE_WAKE) /* XXX always ? */
+ new_cpu = select_idle_sibling(p, new_cpu);
+
+ } else while (sd) {
struct sched_group *group;
int weight;
@@ -4803,7 +4933,6 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f
}
/* while loop will break here if sd == NULL */
}
-unlock:
rcu_read_unlock();
return new_cpu;
@@ -4815,26 +4944,27 @@ unlock:
* previous cpu. However, the caller only guarantees p->pi_lock is held; no
* other assumptions, including the state of rq->lock, should be made.
*/
-static void
-migrate_task_rq_fair(struct task_struct *p, int next_cpu)
+static void migrate_task_rq_fair(struct task_struct *p, int next_cpu)
{
- struct sched_entity *se = &p->se;
- struct cfs_rq *cfs_rq = cfs_rq_of(se);
-
/*
- * Load tracking: accumulate removed load so that it can be processed
- * when we next update owning cfs_rq under rq->lock. Tasks contribute
- * to blocked load iff they have a positive decay-count. It can never
- * be negative here since on-rq tasks have decay-count == 0.
+ * We are supposed to update the task to "current" time, then its up to date
+ * and ready to go to new CPU/cfs_rq. But we have difficulty in getting
+ * what current time is, so simply throw away the out-of-date time. This
+ * will result in the wakee task is less decayed, but giving the wakee more
+ * load sounds not bad.
*/
- if (se->avg.decay_count) {
- se->avg.decay_count = -__synchronize_entity_decay(se);
- atomic_long_add(se->avg.load_avg_contrib,
- &cfs_rq->removed_load);
- }
+ remove_entity_load_avg(&p->se);
+
+ /* Tell new CPU we are migrated */
+ p->se.avg.last_update_time = 0;
/* We have migrated, no longer consider this task hot */
- se->exec_start = 0;
+ p->se.exec_start = 0;
+}
+
+static void task_dead_fair(struct task_struct *p)
+{
+ remove_entity_load_avg(&p->se);
}
#endif /* CONFIG_SMP */
@@ -5031,18 +5161,21 @@ again:
* entity, update_curr() will update its vruntime, otherwise
* forget we've ever seen it.
*/
- if (curr && curr->on_rq)
- update_curr(cfs_rq);
- else
- curr = NULL;
+ if (curr) {
+ if (curr->on_rq)
+ update_curr(cfs_rq);
+ else
+ curr = NULL;
- /*
- * This call to check_cfs_rq_runtime() will do the throttle and
- * dequeue its entity in the parent(s). Therefore the 'simple'
- * nr_running test will indeed be correct.
- */
- if (unlikely(check_cfs_rq_runtime(cfs_rq)))
- goto simple;
+ /*
+ * This call to check_cfs_rq_runtime() will do the
+ * throttle and dequeue its entity in the parent(s).
+ * Therefore the 'simple' nr_running test will indeed
+ * be correct.
+ */
+ if (unlikely(check_cfs_rq_runtime(cfs_rq)))
+ goto simple;
+ }
se = pick_next_entity(cfs_rq, curr);
cfs_rq = group_cfs_rq(se);
@@ -5103,7 +5236,15 @@ simple:
return p;
idle:
+ /*
+ * This is OK, because current is on_cpu, which avoids it being picked
+ * for load-balance and preemption/IRQs are still disabled avoiding
+ * further scheduler activity on it and we're being very careful to
+ * re-start the picking loop.
+ */
+ lockdep_unpin_lock(&rq->lock);
new_tasks = idle_balance(rq);
+ lockdep_pin_lock(&rq->lock);
/*
* Because idle_balance() releases (and re-acquires) rq->lock, it is
* possible for any higher priority task to appear. In that case we
@@ -5372,90 +5513,57 @@ static int task_hot(struct task_struct *p, struct lb_env *env)
}
#ifdef CONFIG_NUMA_BALANCING
-/* Returns true if the destination node has incurred more faults */
-static bool migrate_improves_locality(struct task_struct *p, struct lb_env *env)
+/*
+ * Returns 1, if task migration degrades locality
+ * Returns 0, if task migration improves locality i.e migration preferred.
+ * Returns -1, if task migration is not affected by locality.
+ */
+static int migrate_degrades_locality(struct task_struct *p, struct lb_env *env)
{
struct numa_group *numa_group = rcu_dereference(p->numa_group);
+ unsigned long src_faults, dst_faults;
int src_nid, dst_nid;
- if (!sched_feat(NUMA_FAVOUR_HIGHER) || !p->numa_faults ||
- !(env->sd->flags & SD_NUMA)) {
- return false;
- }
+ if (!p->numa_faults || !(env->sd->flags & SD_NUMA))
+ return -1;
+
+ if (!sched_feat(NUMA))
+ return -1;
src_nid = cpu_to_node(env->src_cpu);
dst_nid = cpu_to_node(env->dst_cpu);
if (src_nid == dst_nid)
- return false;
-
- if (numa_group) {
- /* Task is already in the group's interleave set. */
- if (node_isset(src_nid, numa_group->active_nodes))
- return false;
-
- /* Task is moving into the group's interleave set. */
- if (node_isset(dst_nid, numa_group->active_nodes))
- return true;
+ return -1;
- return group_faults(p, dst_nid) > group_faults(p, src_nid);
+ /* Migrating away from the preferred node is always bad. */
+ if (src_nid == p->numa_preferred_nid) {
+ if (env->src_rq->nr_running > env->src_rq->nr_preferred_running)
+ return 1;
+ else
+ return -1;
}
/* Encourage migration to the preferred node. */
if (dst_nid == p->numa_preferred_nid)
- return true;
-
- return task_faults(p, dst_nid) > task_faults(p, src_nid);
-}
-
-
-static bool migrate_degrades_locality(struct task_struct *p, struct lb_env *env)
-{
- struct numa_group *numa_group = rcu_dereference(p->numa_group);
- int src_nid, dst_nid;
-
- if (!sched_feat(NUMA) || !sched_feat(NUMA_RESIST_LOWER))
- return false;
-
- if (!p->numa_faults || !(env->sd->flags & SD_NUMA))
- return false;
-
- src_nid = cpu_to_node(env->src_cpu);
- dst_nid = cpu_to_node(env->dst_cpu);
-
- if (src_nid == dst_nid)
- return false;
+ return 0;
if (numa_group) {
- /* Task is moving within/into the group's interleave set. */
- if (node_isset(dst_nid, numa_group->active_nodes))
- return false;
-
- /* Task is moving out of the group's interleave set. */
- if (node_isset(src_nid, numa_group->active_nodes))
- return true;
-
- return group_faults(p, dst_nid) < group_faults(p, src_nid);
+ src_faults = group_faults(p, src_nid);
+ dst_faults = group_faults(p, dst_nid);
+ } else {
+ src_faults = task_faults(p, src_nid);
+ dst_faults = task_faults(p, dst_nid);
}
- /* Migrating away from the preferred node is always bad. */
- if (src_nid == p->numa_preferred_nid)
- return true;
-
- return task_faults(p, dst_nid) < task_faults(p, src_nid);
+ return dst_faults < src_faults;
}
#else
-static inline bool migrate_improves_locality(struct task_struct *p,
+static inline int migrate_degrades_locality(struct task_struct *p,
struct lb_env *env)
{
- return false;
-}
-
-static inline bool migrate_degrades_locality(struct task_struct *p,
- struct lb_env *env)
-{
- return false;
+ return -1;
}
#endif
@@ -5465,7 +5573,7 @@ static inline bool migrate_degrades_locality(struct task_struct *p,
static
int can_migrate_task(struct task_struct *p, struct lb_env *env)
{
- int tsk_cache_hot = 0;
+ int tsk_cache_hot;
lockdep_assert_held(&env->src_rq->lock);
@@ -5523,13 +5631,13 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
* 2) task is cache cold, or
* 3) too many balance attempts have failed.
*/
- tsk_cache_hot = task_hot(p, env);
- if (!tsk_cache_hot)
- tsk_cache_hot = migrate_degrades_locality(p, env);
+ tsk_cache_hot = migrate_degrades_locality(p, env);
+ if (tsk_cache_hot == -1)
+ tsk_cache_hot = task_hot(p, env);
- if (migrate_improves_locality(p, env) || !tsk_cache_hot ||
+ if (tsk_cache_hot <= 0 ||
env->sd->nr_balance_failed > env->sd->cache_nice_tries) {
- if (tsk_cache_hot) {
+ if (tsk_cache_hot == 1) {
schedstat_inc(env->sd, lb_hot_gained[env->idle]);
schedstat_inc(p, se.statistics.nr_forced_migrations);
}
@@ -5603,6 +5711,13 @@ static int detach_tasks(struct lb_env *env)
return 0;
while (!list_empty(tasks)) {
+ /*
+ * We don't want to steal all, otherwise we may be treated likewise,
+ * which could at worst lead to a livelock crash.
+ */
+ if (env->idle != CPU_NOT_IDLE && env->src_rq->nr_running <= 1)
+ break;
+
p = list_first_entry(tasks, struct task_struct, se.group_node);
env->loop++;
@@ -5712,39 +5827,6 @@ static void attach_tasks(struct lb_env *env)
}
#ifdef CONFIG_FAIR_GROUP_SCHED
-/*
- * update tg->load_weight by folding this cpu's load_avg
- */
-static void __update_blocked_averages_cpu(struct task_group *tg, int cpu)
-{
- struct sched_entity *se = tg->se[cpu];
- struct cfs_rq *cfs_rq = tg->cfs_rq[cpu];
-
- /* throttled entities do not contribute to load */
- if (throttled_hierarchy(cfs_rq))
- return;
-
- update_cfs_rq_blocked_load(cfs_rq, 1);
-
- if (se) {
- update_entity_load_avg(se, 1);
- /*
- * We pivot on our runnable average having decayed to zero for
- * list removal. This generally implies that all our children
- * have also been removed (modulo rounding error or bandwidth
- * control); however, such cases are rare and we can fix these
- * at enqueue.
- *
- * TODO: fix up out-of-order children on enqueue.
- */
- if (!se->avg.runnable_avg_sum && !cfs_rq->nr_running)
- list_del_leaf_cfs_rq(cfs_rq);
- } else {
- struct rq *rq = rq_of(cfs_rq);
- update_rq_runnable_avg(rq, rq->nr_running);
- }
-}
-
static void update_blocked_averages(int cpu)
{
struct rq *rq = cpu_rq(cpu);
@@ -5753,19 +5835,19 @@ static void update_blocked_averages(int cpu)
raw_spin_lock_irqsave(&rq->lock, flags);
update_rq_clock(rq);
+
/*
* Iterates the task_group tree in a bottom up fashion, see
* list_add_leaf_cfs_rq() for details.
*/
for_each_leaf_cfs_rq(rq, cfs_rq) {
- /*
- * Note: We may want to consider periodically releasing
- * rq->lock about these updates so that creating many task
- * groups does not result in continually extending hold time.
- */
- __update_blocked_averages_cpu(cfs_rq->tg, rq->cpu);
- }
+ /* throttled entities do not contribute to load */
+ if (throttled_hierarchy(cfs_rq))
+ continue;
+ if (update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq))
+ update_tg_load_avg(cfs_rq, 0);
+ }
raw_spin_unlock_irqrestore(&rq->lock, flags);
}
@@ -5793,14 +5875,14 @@ static void update_cfs_rq_h_load(struct cfs_rq *cfs_rq)
}
if (!se) {
- cfs_rq->h_load = cfs_rq->runnable_load_avg;
+ cfs_rq->h_load = cfs_rq_load_avg(cfs_rq);
cfs_rq->last_h_load_update = now;
}
while ((se = cfs_rq->h_load_next) != NULL) {
load = cfs_rq->h_load;
- load = div64_ul(load * se->avg.load_avg_contrib,
- cfs_rq->runnable_load_avg + 1);
+ load = div64_ul(load * se->avg.load_avg,
+ cfs_rq_load_avg(cfs_rq) + 1);
cfs_rq = group_cfs_rq(se);
cfs_rq->h_load = load;
cfs_rq->last_h_load_update = now;
@@ -5812,17 +5894,25 @@ static unsigned long task_h_load(struct task_struct *p)
struct cfs_rq *cfs_rq = task_cfs_rq(p);
update_cfs_rq_h_load(cfs_rq);
- return div64_ul(p->se.avg.load_avg_contrib * cfs_rq->h_load,
- cfs_rq->runnable_load_avg + 1);
+ return div64_ul(p->se.avg.load_avg * cfs_rq->h_load,
+ cfs_rq_load_avg(cfs_rq) + 1);
}
#else
static inline void update_blocked_averages(int cpu)
{
+ struct rq *rq = cpu_rq(cpu);
+ struct cfs_rq *cfs_rq = &rq->cfs;
+ unsigned long flags;
+
+ raw_spin_lock_irqsave(&rq->lock, flags);
+ update_rq_clock(rq);
+ update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq);
+ raw_spin_unlock_irqrestore(&rq->lock, flags);
}
static unsigned long task_h_load(struct task_struct *p)
{
- return p->se.avg.load_avg_contrib;
+ return p->se.avg.load_avg;
}
#endif
@@ -5843,12 +5933,12 @@ struct sg_lb_stats {
unsigned long sum_weighted_load; /* Weighted load of group's tasks */
unsigned long load_per_task;
unsigned long group_capacity;
+ unsigned long group_usage; /* Total usage of the group */
unsigned int sum_nr_running; /* Nr tasks running in the group */
- unsigned int group_capacity_factor;
unsigned int idle_cpus;
unsigned int group_weight;
enum group_type group_type;
- int group_has_free_capacity;
+ int group_no_capacity;
#ifdef CONFIG_NUMA_BALANCING
unsigned int nr_numa_running;
unsigned int nr_preferred_running;
@@ -5919,16 +6009,6 @@ static inline int get_sd_load_idx(struct sched_domain *sd,
return load_idx;
}
-static unsigned long default_scale_capacity(struct sched_domain *sd, int cpu)
-{
- return SCHED_CAPACITY_SCALE;
-}
-
-unsigned long __weak arch_scale_freq_capacity(struct sched_domain *sd, int cpu)
-{
- return default_scale_capacity(sd, cpu);
-}
-
static unsigned long default_scale_cpu_capacity(struct sched_domain *sd, int cpu)
{
if ((sd->flags & SD_SHARE_CPUCAPACITY) && (sd->span_weight > 1))
@@ -5945,15 +6025,15 @@ unsigned long __weak arch_scale_cpu_capacity(struct sched_domain *sd, int cpu)
static unsigned long scale_rt_capacity(int cpu)
{
struct rq *rq = cpu_rq(cpu);
- u64 total, available, age_stamp, avg;
+ u64 total, used, age_stamp, avg;
s64 delta;
/*
* Since we're reading these variables without serialization make sure
* we read them once before doing sanity checks on them.
*/
- age_stamp = ACCESS_ONCE(rq->age_stamp);
- avg = ACCESS_ONCE(rq->rt_avg);
+ age_stamp = READ_ONCE(rq->age_stamp);
+ avg = READ_ONCE(rq->rt_avg);
delta = __rq_clock_broken(rq) - age_stamp;
if (unlikely(delta < 0))
@@ -5961,19 +6041,12 @@ static unsigned long scale_rt_capacity(int cpu)
total = sched_avg_period() + delta;
- if (unlikely(total < avg)) {
- /* Ensures that capacity won't end up being negative */
- available = 0;
- } else {
- available = total - avg;
- }
-
- if (unlikely((s64)total < SCHED_CAPACITY_SCALE))
- total = SCHED_CAPACITY_SCALE;
+ used = div_u64(avg, total);
- total >>= SCHED_CAPACITY_SHIFT;
+ if (likely(used < SCHED_CAPACITY_SCALE))
+ return SCHED_CAPACITY_SCALE - used;
- return div_u64(available, total);
+ return 1;
}
static void update_cpu_capacity(struct sched_domain *sd, int cpu)
@@ -5988,14 +6061,7 @@ static void update_cpu_capacity(struct sched_domain *sd, int cpu)
capacity >>= SCHED_CAPACITY_SHIFT;
- sdg->sgc->capacity_orig = capacity;
-
- if (sched_feat(ARCH_CAPACITY))
- capacity *= arch_scale_freq_capacity(sd, cpu);
- else
- capacity *= default_scale_capacity(sd, cpu);
-
- capacity >>= SCHED_CAPACITY_SHIFT;
+ cpu_rq(cpu)->cpu_capacity_orig = capacity;
capacity *= scale_rt_capacity(cpu);
capacity >>= SCHED_CAPACITY_SHIFT;
@@ -6011,7 +6077,7 @@ void update_group_capacity(struct sched_domain *sd, int cpu)
{
struct sched_domain *child = sd->child;
struct sched_group *group, *sdg = sd->groups;
- unsigned long capacity, capacity_orig;
+ unsigned long capacity;
unsigned long interval;
interval = msecs_to_jiffies(sd->balance_interval);
@@ -6023,7 +6089,7 @@ void update_group_capacity(struct sched_domain *sd, int cpu)
return;
}
- capacity_orig = capacity = 0;
+ capacity = 0;
if (child->flags & SD_OVERLAP) {
/*
@@ -6043,19 +6109,15 @@ void update_group_capacity(struct sched_domain *sd, int cpu)
* Use capacity_of(), which is set irrespective of domains
* in update_cpu_capacity().
*
- * This avoids capacity/capacity_orig from being 0 and
+ * This avoids capacity from being 0 and
* causing divide-by-zero issues on boot.
- *
- * Runtime updates will correct capacity_orig.
*/
if (unlikely(!rq->sd)) {
- capacity_orig += capacity_of(cpu);
capacity += capacity_of(cpu);
continue;
}
sgc = rq->sd->groups->sgc;
- capacity_orig += sgc->capacity_orig;
capacity += sgc->capacity;
}
} else {
@@ -6066,39 +6128,24 @@ void update_group_capacity(struct sched_domain *sd, int cpu)
group = child->groups;
do {
- capacity_orig += group->sgc->capacity_orig;
capacity += group->sgc->capacity;
group = group->next;
} while (group != child->groups);
}
- sdg->sgc->capacity_orig = capacity_orig;
sdg->sgc->capacity = capacity;
}
/*
- * Try and fix up capacity for tiny siblings, this is needed when
- * things like SD_ASYM_PACKING need f_b_g to select another sibling
- * which on its own isn't powerful enough.
- *
- * See update_sd_pick_busiest() and check_asym_packing().
+ * Check whether the capacity of the rq has been noticeably reduced by side
+ * activity. The imbalance_pct is used for the threshold.
+ * Return true is the capacity is reduced
*/
static inline int
-fix_small_capacity(struct sched_domain *sd, struct sched_group *group)
+check_cpu_capacity(struct rq *rq, struct sched_domain *sd)
{
- /*
- * Only siblings can have significantly less than SCHED_CAPACITY_SCALE
- */
- if (!(sd->flags & SD_SHARE_CPUCAPACITY))
- return 0;
-
- /*
- * If ~90% of the cpu_capacity is still there, we're good.
- */
- if (group->sgc->capacity * 32 > group->sgc->capacity_orig * 29)
- return 1;
-
- return 0;
+ return ((rq->cpu_capacity * sd->imbalance_pct) <
+ (rq->cpu_capacity_orig * 100));
}
/*
@@ -6136,37 +6183,56 @@ static inline int sg_imbalanced(struct sched_group *group)
}
/*
- * Compute the group capacity factor.
- *
- * Avoid the issue where N*frac(smt_capacity) >= 1 creates 'phantom' cores by
- * first dividing out the smt factor and computing the actual number of cores
- * and limit unit capacity with that.
+ * group_has_capacity returns true if the group has spare capacity that could
+ * be used by some tasks.
+ * We consider that a group has spare capacity if the * number of task is
+ * smaller than the number of CPUs or if the usage is lower than the available
+ * capacity for CFS tasks.
+ * For the latter, we use a threshold to stabilize the state, to take into
+ * account the variance of the tasks' load and to return true if the available
+ * capacity in meaningful for the load balancer.
+ * As an example, an available capacity of 1% can appear but it doesn't make
+ * any benefit for the load balance.
*/
-static inline int sg_capacity_factor(struct lb_env *env, struct sched_group *group)
+static inline bool
+group_has_capacity(struct lb_env *env, struct sg_lb_stats *sgs)
{
- unsigned int capacity_factor, smt, cpus;
- unsigned int capacity, capacity_orig;
+ if (sgs->sum_nr_running < sgs->group_weight)
+ return true;
- capacity = group->sgc->capacity;
- capacity_orig = group->sgc->capacity_orig;
- cpus = group->group_weight;
+ if ((sgs->group_capacity * 100) >
+ (sgs->group_usage * env->sd->imbalance_pct))
+ return true;
- /* smt := ceil(cpus / capacity), assumes: 1 < smt_capacity < 2 */
- smt = DIV_ROUND_UP(SCHED_CAPACITY_SCALE * cpus, capacity_orig);
- capacity_factor = cpus / smt; /* cores */
+ return false;
+}
+
+/*
+ * group_is_overloaded returns true if the group has more tasks than it can
+ * handle.
+ * group_is_overloaded is not equals to !group_has_capacity because a group
+ * with the exact right number of tasks, has no more spare capacity but is not
+ * overloaded so both group_has_capacity and group_is_overloaded return
+ * false.
+ */
+static inline bool
+group_is_overloaded(struct lb_env *env, struct sg_lb_stats *sgs)
+{
+ if (sgs->sum_nr_running <= sgs->group_weight)
+ return false;
- capacity_factor = min_t(unsigned,
- capacity_factor, DIV_ROUND_CLOSEST(capacity, SCHED_CAPACITY_SCALE));
- if (!capacity_factor)
- capacity_factor = fix_small_capacity(env->sd, group);
+ if ((sgs->group_capacity * 100) <
+ (sgs->group_usage * env->sd->imbalance_pct))
+ return true;
- return capacity_factor;
+ return false;
}
-static enum group_type
-group_classify(struct sched_group *group, struct sg_lb_stats *sgs)
+static enum group_type group_classify(struct lb_env *env,
+ struct sched_group *group,
+ struct sg_lb_stats *sgs)
{
- if (sgs->sum_nr_running > sgs->group_capacity_factor)
+ if (sgs->group_no_capacity)
return group_overloaded;
if (sg_imbalanced(group))
@@ -6204,6 +6270,7 @@ static inline void update_sg_lb_stats(struct lb_env *env,
load = source_load(i, load_idx);
sgs->group_load += load;
+ sgs->group_usage += get_cpu_usage(i);
sgs->sum_nr_running += rq->cfs.h_nr_running;
if (rq->nr_running > 1)
@@ -6226,11 +6293,9 @@ static inline void update_sg_lb_stats(struct lb_env *env,
sgs->load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running;
sgs->group_weight = group->group_weight;
- sgs->group_capacity_factor = sg_capacity_factor(env, group);
- sgs->group_type = group_classify(group, sgs);
- if (sgs->group_capacity_factor > sgs->sum_nr_running)
- sgs->group_has_free_capacity = 1;
+ sgs->group_no_capacity = group_is_overloaded(env, sgs);
+ sgs->group_type = group_classify(env, group, sgs);
}
/**
@@ -6352,18 +6417,19 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd
/*
* In case the child domain prefers tasks go to siblings
- * first, lower the sg capacity factor to one so that we'll try
+ * first, lower the sg capacity so that we'll try
* and move all the excess tasks away. We lower the capacity
* of a group only if the local group has the capacity to fit
- * these excess tasks, i.e. nr_running < group_capacity_factor. The
- * extra check prevents the case where you always pull from the
- * heaviest group when it is already under-utilized (possible
- * with a large weight task outweighs the tasks on the system).
+ * these excess tasks. The extra check prevents the case where
+ * you always pull from the heaviest group when it is already
+ * under-utilized (possible with a large weight task outweighs
+ * the tasks on the system).
*/
if (prefer_sibling && sds->local &&
- sds->local_stat.group_has_free_capacity) {
- sgs->group_capacity_factor = min(sgs->group_capacity_factor, 1U);
- sgs->group_type = group_classify(sg, sgs);
+ group_has_capacity(env, &sds->local_stat) &&
+ (sgs->sum_nr_running > 1)) {
+ sgs->group_no_capacity = 1;
+ sgs->group_type = group_overloaded;
}
if (update_sd_pick_busiest(env, sds, sg, sgs)) {
@@ -6543,11 +6609,12 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
*/
if (busiest->group_type == group_overloaded &&
local->group_type == group_overloaded) {
- load_above_capacity =
- (busiest->sum_nr_running - busiest->group_capacity_factor);
-
- load_above_capacity *= (SCHED_LOAD_SCALE * SCHED_CAPACITY_SCALE);
- load_above_capacity /= busiest->group_capacity;
+ load_above_capacity = busiest->sum_nr_running *
+ SCHED_LOAD_SCALE;
+ if (load_above_capacity > busiest->group_capacity)
+ load_above_capacity -= busiest->group_capacity;
+ else
+ load_above_capacity = ~0UL;
}
/*
@@ -6610,6 +6677,7 @@ static struct sched_group *find_busiest_group(struct lb_env *env)
local = &sds.local_stat;
busiest = &sds.busiest_stat;
+ /* ASYM feature bypasses nice load balance check */
if ((env->idle == CPU_IDLE || env->idle == CPU_NEWLY_IDLE) &&
check_asym_packing(env, &sds))
return sds.busiest;
@@ -6630,8 +6698,8 @@ static struct sched_group *find_busiest_group(struct lb_env *env)
goto force_balance;
/* SD_BALANCE_NEWIDLE trumps SMP nice when underutilized */
- if (env->idle == CPU_NEWLY_IDLE && local->group_has_free_capacity &&
- !busiest->group_has_free_capacity)
+ if (env->idle == CPU_NEWLY_IDLE && group_has_capacity(env, local) &&
+ busiest->group_no_capacity)
goto force_balance;
/*
@@ -6690,7 +6758,7 @@ static struct rq *find_busiest_queue(struct lb_env *env,
int i;
for_each_cpu_and(i, sched_group_cpus(group), env->cpus) {
- unsigned long capacity, capacity_factor, wl;
+ unsigned long capacity, wl;
enum fbq_type rt;
rq = cpu_rq(i);
@@ -6719,9 +6787,6 @@ static struct rq *find_busiest_queue(struct lb_env *env,
continue;
capacity = capacity_of(i);
- capacity_factor = DIV_ROUND_CLOSEST(capacity, SCHED_CAPACITY_SCALE);
- if (!capacity_factor)
- capacity_factor = fix_small_capacity(env->sd, group);
wl = weighted_cpuload(i);
@@ -6729,7 +6794,9 @@ static struct rq *find_busiest_queue(struct lb_env *env,
* When comparing with imbalance, use weighted_cpuload()
* which is not scaled with the cpu capacity.
*/
- if (capacity_factor && rq->nr_running == 1 && wl > env->imbalance)
+
+ if (rq->nr_running == 1 && wl > env->imbalance &&
+ !check_cpu_capacity(rq, env->sd))
continue;
/*
@@ -6777,6 +6844,19 @@ static int need_active_balance(struct lb_env *env)
return 1;
}
+ /*
+ * The dst_cpu is idle and the src_cpu CPU has only 1 CFS task.
+ * It's worth migrating the task if the src_cpu's capacity is reduced
+ * because of other sched_class or IRQs if more capacity stays
+ * available on dst_cpu.
+ */
+ if ((env->idle != CPU_NOT_IDLE) &&
+ (env->src_rq->cfs.h_nr_running == 1)) {
+ if ((check_cpu_capacity(env->src_rq, sd)) &&
+ (capacity_of(env->src_cpu)*sd->imbalance_pct < capacity_of(env->dst_cpu)*100))
+ return 1;
+ }
+
return unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2);
}
@@ -6876,6 +6956,9 @@ redo:
schedstat_add(sd, lb_imbalance[idle], env.imbalance);
+ env.src_cpu = busiest->cpu;
+ env.src_rq = busiest;
+
ld_moved = 0;
if (busiest->nr_running > 1) {
/*
@@ -6885,8 +6968,6 @@ redo:
* correctly treated as an imbalance.
*/
env.flags |= LBF_ALL_PINNED;
- env.src_cpu = busiest->cpu;
- env.src_rq = busiest;
env.loop_max = min(sysctl_sched_nr_migrate, busiest->nr_running);
more_balance:
@@ -7140,9 +7221,6 @@ static int idle_balance(struct rq *this_rq)
goto out;
}
- /*
- * Drop the rq->lock, but keep IRQ/preempt disabled.
- */
raw_spin_unlock(&this_rq->lock);
update_blocked_averages(this_cpu);
@@ -7586,22 +7664,25 @@ end:
/*
* Current heuristic for kicking the idle load balancer in the presence
- * of an idle cpu is the system.
+ * of an idle cpu in the system.
* - This rq has more than one task.
- * - At any scheduler domain level, this cpu's scheduler group has multiple
- * busy cpu's exceeding the group's capacity.
+ * - This rq has at least one CFS task and the capacity of the CPU is
+ * significantly reduced because of RT tasks or IRQs.
+ * - At parent of LLC scheduler domain level, this cpu's scheduler group has
+ * multiple busy cpu.
* - For SD_ASYM_PACKING, if the lower numbered cpu's in the scheduler
* domain span are idle.
*/
-static inline int nohz_kick_needed(struct rq *rq)
+static inline bool nohz_kick_needed(struct rq *rq)
{
unsigned long now = jiffies;
struct sched_domain *sd;
struct sched_group_capacity *sgc;
int nr_busy, cpu = rq->cpu;
+ bool kick = false;
if (unlikely(rq->idle_balance))
- return 0;
+ return false;
/*
* We may be recently in ticked or tickless idle mode. At the first
@@ -7615,38 +7696,46 @@ static inline int nohz_kick_needed(struct rq *rq)
* balancing.
*/
if (likely(!atomic_read(&nohz.nr_cpus)))
- return 0;
+ return false;
if (time_before(now, nohz.next_balance))
- return 0;
+ return false;
if (rq->nr_running >= 2)
- goto need_kick;
+ return true;
rcu_read_lock();
sd = rcu_dereference(per_cpu(sd_busy, cpu));
-
if (sd) {
sgc = sd->groups->sgc;
nr_busy = atomic_read(&sgc->nr_busy_cpus);
- if (nr_busy > 1)
- goto need_kick_unlock;
+ if (nr_busy > 1) {
+ kick = true;
+ goto unlock;
+ }
+
}
- sd = rcu_dereference(per_cpu(sd_asym, cpu));
+ sd = rcu_dereference(rq->sd);
+ if (sd) {
+ if ((rq->cfs.h_nr_running >= 1) &&
+ check_cpu_capacity(rq, sd)) {
+ kick = true;
+ goto unlock;
+ }
+ }
+ sd = rcu_dereference(per_cpu(sd_asym, cpu));
if (sd && (cpumask_first_and(nohz.idle_cpus_mask,
- sched_domain_span(sd)) < cpu))
- goto need_kick_unlock;
-
- rcu_read_unlock();
- return 0;
+ sched_domain_span(sd)) < cpu)) {
+ kick = true;
+ goto unlock;
+ }
-need_kick_unlock:
+unlock:
rcu_read_unlock();
-need_kick:
- return 1;
+ return kick;
}
#else
static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle) { }
@@ -7662,14 +7751,16 @@ static void run_rebalance_domains(struct softirq_action *h)
enum cpu_idle_type idle = this_rq->idle_balance ?
CPU_IDLE : CPU_NOT_IDLE;
- rebalance_domains(this_rq, idle);
-
/*
* If this cpu has a pending nohz_balance_kick, then do the
* balancing on behalf of the other idle cpus whose ticks are
- * stopped.
+ * stopped. Do nohz_idle_balance *before* rebalance_domains to
+ * give the idle cpus a chance to load balance. Else we may
+ * load balance only within the local sched_domain hierarchy
+ * and abort nohz_idle_balance altogether if we pull some load.
*/
nohz_idle_balance(this_rq, idle);
+ rebalance_domains(this_rq, idle);
}
/*
@@ -7721,8 +7812,6 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
if (numabalancing_enabled)
task_tick_numa(rq, curr);
-
- update_rq_runnable_avg(rq, 1);
}
/*
@@ -7821,15 +7910,18 @@ static void switched_from_fair(struct rq *rq, struct task_struct *p)
}
#ifdef CONFIG_SMP
- /*
- * Remove our load from contribution when we leave sched_fair
- * and ensure we don't carry in an old decay_count if we
- * switch back.
- */
- if (se->avg.decay_count) {
- __synchronize_entity_decay(se);
- subtract_blocked_load_contrib(cfs_rq, se->avg.load_avg_contrib);
- }
+ /* Catch up with the cfs_rq and remove our load when we leave */
+ __update_load_avg(cfs_rq->avg.last_update_time, cpu_of(rq), &se->avg,
+ se->on_rq * scale_load_down(se->load.weight), cfs_rq->curr == se, NULL);
+
+ cfs_rq->avg.load_avg =
+ max_t(long, cfs_rq->avg.load_avg - se->avg.load_avg, 0);
+ cfs_rq->avg.load_sum =
+ max_t(s64, cfs_rq->avg.load_sum - se->avg.load_sum, 0);
+ cfs_rq->avg.util_avg =
+ max_t(long, cfs_rq->avg.util_avg - se->avg.util_avg, 0);
+ cfs_rq->avg.util_sum =
+ max_t(s32, cfs_rq->avg.util_sum - se->avg.util_sum, 0);
#endif
}
@@ -7838,16 +7930,31 @@ static void switched_from_fair(struct rq *rq, struct task_struct *p)
*/
static void switched_to_fair(struct rq *rq, struct task_struct *p)
{
-#ifdef CONFIG_FAIR_GROUP_SCHED
struct sched_entity *se = &p->se;
+
+#ifdef CONFIG_FAIR_GROUP_SCHED
/*
* Since the real-depth could have been changed (only FAIR
* class maintain depth value), reset depth properly.
*/
se->depth = se->parent ? se->parent->depth + 1 : 0;
#endif
- if (!task_on_rq_queued(p))
+
+ if (!task_on_rq_queued(p)) {
+
+ /*
+ * Ensure the task has a non-normalized vruntime when it is switched
+ * back to the fair class with !queued, so that enqueue_entity() at
+ * wake-up time will do the right thing.
+ *
+ * If it's queued, then the enqueue_entity(.flags=0) makes the task
+ * has non-normalized vruntime, if it's !queued, then it still has
+ * normalized vruntime.
+ */
+ if (p->state != TASK_RUNNING)
+ se->vruntime += cfs_rq_of(se)->min_vruntime;
return;
+ }
/*
* We were most likely switched from sched_rt, so
@@ -7886,8 +7993,8 @@ void init_cfs_rq(struct cfs_rq *cfs_rq)
cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime;
#endif
#ifdef CONFIG_SMP
- atomic64_set(&cfs_rq->decay_counter, 1);
- atomic_long_set(&cfs_rq->removed_load, 0);
+ atomic_long_set(&cfs_rq->removed_load_avg, 0);
+ atomic_long_set(&cfs_rq->removed_util_avg, 0);
#endif
}
@@ -7932,14 +8039,14 @@ static void task_move_group_fair(struct task_struct *p, int queued)
if (!queued) {
cfs_rq = cfs_rq_of(se);
se->vruntime += cfs_rq->min_vruntime;
+
#ifdef CONFIG_SMP
- /*
- * migrate_task_rq_fair() will have removed our previous
- * contribution, but we must synchronize for ongoing future
- * decay.
- */
- se->avg.decay_count = atomic64_read(&cfs_rq->decay_counter);
- cfs_rq->blocked_load_avg += se->avg.load_avg_contrib;
+ /* Virtually synchronize task with its new cfs_rq */
+ p->se.avg.last_update_time = cfs_rq->avg.last_update_time;
+ cfs_rq->avg.load_avg += p->se.avg.load_avg;
+ cfs_rq->avg.load_sum += p->se.avg.load_sum;
+ cfs_rq->avg.util_avg += p->se.avg.util_avg;
+ cfs_rq->avg.util_sum += p->se.avg.util_sum;
#endif
}
}
@@ -7953,8 +8060,11 @@ void free_fair_sched_group(struct task_group *tg)
for_each_possible_cpu(i) {
if (tg->cfs_rq)
kfree(tg->cfs_rq[i]);
- if (tg->se)
+ if (tg->se) {
+ if (tg->se[i])
+ remove_entity_load_avg(tg->se[i]);
kfree(tg->se[i]);
+ }
}
kfree(tg->cfs_rq);
@@ -7991,6 +8101,7 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
init_cfs_rq(cfs_rq);
init_tg_cfs_entry(tg, cfs_rq, se, i, parent->se[i]);
+ init_entity_runnable_average(se);
}
return 1;
@@ -8140,6 +8251,8 @@ const struct sched_class fair_sched_class = {
.rq_offline = rq_offline_fair,
.task_waking = task_waking_fair,
+ .task_dead = task_dead_fair,
+ .set_cpus_allowed = set_cpus_allowed_common,
#endif
.set_curr_task = set_curr_task_fair,
@@ -8169,7 +8282,27 @@ void print_cfs_stats(struct seq_file *m, int cpu)
print_cfs_rq(m, cpu, cfs_rq);
rcu_read_unlock();
}
-#endif
+
+#ifdef CONFIG_NUMA_BALANCING
+void show_numa_stats(struct task_struct *p, struct seq_file *m)
+{
+ int node;
+ unsigned long tsf = 0, tpf = 0, gsf = 0, gpf = 0;
+
+ for_each_online_node(node) {
+ if (p->numa_faults) {
+ tsf = p->numa_faults[task_faults_idx(NUMA_MEM, node, 0)];
+ tpf = p->numa_faults[task_faults_idx(NUMA_MEM, node, 1)];
+ }
+ if (p->numa_group) {
+ gsf = p->numa_group->faults[task_faults_idx(NUMA_MEM, node, 0)],
+ gpf = p->numa_group->faults[task_faults_idx(NUMA_MEM, node, 1)];
+ }
+ print_numa_stats(m, node, tsf, tpf, gsf, gpf);
+ }
+}
+#endif /* CONFIG_NUMA_BALANCING */
+#endif /* CONFIG_SCHED_DEBUG */
__init void init_sched_fair_class(void)
{
diff --git a/kernel/sched/features.h b/kernel/sched/features.h
index 90284d117fe6..83a50e7ca533 100644
--- a/kernel/sched/features.h
+++ b/kernel/sched/features.h
@@ -56,6 +56,19 @@ SCHED_FEAT(NONTASK_CAPACITY, true)
*/
SCHED_FEAT(TTWU_QUEUE, true)
+#ifdef HAVE_RT_PUSH_IPI
+/*
+ * In order to avoid a thundering herd attack of CPUs that are
+ * lowering their priorities at the same time, and there being
+ * a single CPU that has an RT task that can migrate and is waiting
+ * to run, where the other CPUs will try to take that CPUs
+ * rq lock and possibly create a large contention, sending an
+ * IPI to that CPU and let that CPU push the RT task to where
+ * it should go may be a better scenario.
+ */
+SCHED_FEAT(RT_PUSH_IPI, true)
+#endif
+
SCHED_FEAT(FORCE_SD_OVERLAP, false)
SCHED_FEAT(RT_RUNTIME_SHARE, true)
SCHED_FEAT(LB_MIN, false)
@@ -66,20 +79,12 @@ SCHED_FEAT(LB_MIN, false)
* numa_balancing=
*/
#ifdef CONFIG_NUMA_BALANCING
-SCHED_FEAT(NUMA, false)
-
-/*
- * NUMA_FAVOUR_HIGHER will favor moving tasks towards nodes where a
- * higher number of hinting faults are recorded during active load
- * balancing.
- */
-SCHED_FEAT(NUMA_FAVOUR_HIGHER, true)
/*
- * NUMA_RESIST_LOWER will resist moving tasks towards nodes where a
- * lower number of hinting faults have been recorded. As this has
- * the potential to prevent a task ever migrating to a new node
- * due to CPU overload it is disabled by default.
+ * NUMA will favor moving tasks towards nodes where a higher number of
+ * hinting faults are recorded during active load balancing. It will
+ * resist moving tasks towards nodes where a lower number of hinting
+ * faults have been recorded.
*/
-SCHED_FEAT(NUMA_RESIST_LOWER, false)
+SCHED_FEAT(NUMA, true)
#endif
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
index 80014a178342..8f177c73ae19 100644
--- a/kernel/sched/idle.c
+++ b/kernel/sched/idle.c
@@ -15,6 +15,15 @@
#include "sched.h"
+/**
+ * sched_idle_set_state - Record idle state for the current CPU.
+ * @idle_state: State to record.
+ */
+void sched_idle_set_state(struct cpuidle_state *idle_state)
+{
+ idle_set_state(this_rq(), idle_state);
+}
+
static int __read_mostly cpu_idle_force_poll;
void cpu_idle_poll_ctrl(bool enable)
@@ -68,6 +77,49 @@ void __weak arch_cpu_idle(void)
}
/**
+ * default_idle_call - Default CPU idle routine.
+ *
+ * To use when the cpuidle framework cannot be used.
+ */
+void default_idle_call(void)
+{
+ if (current_clr_polling_and_test()) {
+ local_irq_enable();
+ } else {
+ stop_critical_timings();
+ arch_cpu_idle();
+ start_critical_timings();
+ }
+}
+
+static int call_cpuidle(struct cpuidle_driver *drv, struct cpuidle_device *dev,
+ int next_state)
+{
+ /* Fall back to the default arch idle method on errors. */
+ if (next_state < 0) {
+ default_idle_call();
+ return next_state;
+ }
+
+ /*
+ * The idle task must be scheduled, it is pointless to go to idle, just
+ * update no idle residency and return.
+ */
+ if (current_clr_polling_and_test()) {
+ dev->last_residency = 0;
+ local_irq_enable();
+ return -EBUSY;
+ }
+
+ /*
+ * Enter the idle state previously returned by the governor decision.
+ * This function will block until an interrupt occurs and will take
+ * care of re-enabling the local interrupts
+ */
+ return cpuidle_enter(drv, dev, next_state);
+}
+
+/**
* cpuidle_idle_call - the main idle function
*
* NOTE: no locks or semaphores should be used here
@@ -81,8 +133,6 @@ static void cpuidle_idle_call(void)
struct cpuidle_device *dev = __this_cpu_read(cpuidle_devices);
struct cpuidle_driver *drv = cpuidle_get_cpu_driver(dev);
int next_state, entered_state;
- unsigned int broadcast;
- bool reflect;
/*
* Check if the idle task must be rescheduled. If it is the
@@ -94,20 +144,16 @@ static void cpuidle_idle_call(void)
}
/*
- * During the idle period, stop measuring the disabled irqs
- * critical sections latencies
- */
- stop_critical_timings();
-
- /*
* Tell the RCU framework we are entering an idle section,
* so no more rcu read side critical sections and one more
* step to the grace period
*/
rcu_idle_enter();
- if (cpuidle_not_available(drv, dev))
- goto use_default;
+ if (cpuidle_not_available(drv, dev)) {
+ default_idle_call();
+ goto exit_idle;
+ }
/*
* Suspend-to-idle ("freeze") is a system state in which all user space
@@ -125,64 +171,19 @@ static void cpuidle_idle_call(void)
goto exit_idle;
}
- reflect = false;
next_state = cpuidle_find_deepest_state(drv, dev);
+ call_cpuidle(drv, dev, next_state);
} else {
- reflect = true;
/*
* Ask the cpuidle framework to choose a convenient idle state.
*/
next_state = cpuidle_select(drv, dev);
- }
- /* Fall back to the default arch idle method on errors. */
- if (next_state < 0)
- goto use_default;
-
- /*
- * The idle task must be scheduled, it is pointless to
- * go to idle, just update no idle residency and get
- * out of this function
- */
- if (current_clr_polling_and_test()) {
- dev->last_residency = 0;
- entered_state = next_state;
- local_irq_enable();
- goto exit_idle;
- }
-
- broadcast = drv->states[next_state].flags & CPUIDLE_FLAG_TIMER_STOP;
-
- /*
- * Tell the time framework to switch to a broadcast timer
- * because our local timer will be shutdown. If a local timer
- * is used from another cpu as a broadcast timer, this call may
- * fail if it is not available
- */
- if (broadcast &&
- clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_ENTER, &dev->cpu))
- goto use_default;
-
- /* Take note of the planned idle state. */
- idle_set_state(this_rq(), &drv->states[next_state]);
-
- /*
- * Enter the idle state previously returned by the governor decision.
- * This function will block until an interrupt occurs and will take
- * care of re-enabling the local interrupts
- */
- entered_state = cpuidle_enter(drv, dev, next_state);
-
- /* The cpu is no longer idle or about to enter idle. */
- idle_set_state(this_rq(), NULL);
-
- if (broadcast)
- clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_EXIT, &dev->cpu);
-
- /*
- * Give the governor an opportunity to reflect on the outcome
- */
- if (reflect)
+ entered_state = call_cpuidle(drv, dev, next_state);
+ /*
+ * Give the governor an opportunity to reflect on the outcome
+ */
cpuidle_reflect(dev, entered_state);
+ }
exit_idle:
__current_set_polling();
@@ -194,22 +195,10 @@ exit_idle:
local_irq_enable();
rcu_idle_exit();
- start_critical_timings();
- return;
-
-use_default:
- /*
- * We can't use the cpuidle framework, let's use the default
- * idle routine.
- */
- if (current_clr_polling_and_test())
- local_irq_enable();
- else
- arch_cpu_idle();
-
- goto exit_idle;
}
+DEFINE_PER_CPU(bool, cpu_dead_idle);
+
/*
* Generic idle loop implementation
*
@@ -234,8 +223,13 @@ static void cpu_idle_loop(void)
check_pgt_cache();
rmb();
- if (cpu_is_offline(smp_processor_id()))
+ if (cpu_is_offline(smp_processor_id())) {
+ rcu_cpu_notify(NULL, CPU_DYING_IDLE,
+ (void *)(long)smp_processor_id());
+ smp_mb(); /* all activity before dead. */
+ this_cpu_write(cpu_dead_idle, true);
arch_cpu_idle_dead();
+ }
local_irq_disable();
arch_cpu_idle_enter();
diff --git a/kernel/sched/idle_task.c b/kernel/sched/idle_task.c
index c65dac8c97cd..c4ae0f1fdf9b 100644
--- a/kernel/sched/idle_task.c
+++ b/kernel/sched/idle_task.c
@@ -96,6 +96,7 @@ const struct sched_class idle_sched_class = {
#ifdef CONFIG_SMP
.select_task_rq = select_task_rq_idle,
+ .set_cpus_allowed = set_cpus_allowed_common,
#endif
.set_curr_task = set_curr_task_idle,
diff --git a/kernel/sched/proc.c b/kernel/sched/loadavg.c
index 8ecd552fe4f2..ef7159012cf3 100644
--- a/kernel/sched/proc.c
+++ b/kernel/sched/loadavg.c
@@ -1,7 +1,9 @@
/*
- * kernel/sched/proc.c
+ * kernel/sched/loadavg.c
*
- * Kernel load calculations, forked from sched/core.c
+ * This file contains the magic bits required to compute the global loadavg
+ * figure. Its a silly number but people think its important. We go through
+ * great pains to make it work on big machines and tickless kernels.
*/
#include <linux/export.h>
@@ -81,7 +83,7 @@ long calc_load_fold_active(struct rq *this_rq)
long nr_active, delta = 0;
nr_active = this_rq->nr_running;
- nr_active += (long) this_rq->nr_uninterruptible;
+ nr_active += (long)this_rq->nr_uninterruptible;
if (nr_active != this_rq->calc_load_active) {
delta = nr_active - this_rq->calc_load_active;
@@ -186,6 +188,7 @@ void calc_load_enter_idle(void)
delta = calc_load_fold_active(this_rq);
if (delta) {
int idx = calc_load_write_idx();
+
atomic_long_add(delta, &calc_load_idle[idx]);
}
}
@@ -241,18 +244,20 @@ fixed_power_int(unsigned long x, unsigned int frac_bits, unsigned int n)
{
unsigned long result = 1UL << frac_bits;
- if (n) for (;;) {
- if (n & 1) {
- result *= x;
- result += 1UL << (frac_bits - 1);
- result >>= frac_bits;
+ if (n) {
+ for (;;) {
+ if (n & 1) {
+ result *= x;
+ result += 1UL << (frac_bits - 1);
+ result >>= frac_bits;
+ }
+ n >>= 1;
+ if (!n)
+ break;
+ x *= x;
+ x += 1UL << (frac_bits - 1);
+ x >>= frac_bits;
}
- n >>= 1;
- if (!n)
- break;
- x *= x;
- x += 1UL << (frac_bits - 1);
- x >>= frac_bits;
}
return result;
@@ -285,7 +290,6 @@ static unsigned long
calc_load_n(unsigned long load, unsigned long exp,
unsigned long active, unsigned int n)
{
-
return calc_load(load, fixed_power_int(exp, FSHIFT, n), active);
}
@@ -339,6 +343,8 @@ static inline void calc_global_nohz(void) { }
/*
* calc_load - update the avenrun load estimates 10 ticks after the
* CPUs have updated calc_load_tasks.
+ *
+ * Called from the global timer code.
*/
void calc_global_load(unsigned long ticks)
{
@@ -370,10 +376,10 @@ void calc_global_load(unsigned long ticks)
}
/*
- * Called from update_cpu_load() to periodically update this CPU's
+ * Called from scheduler_tick() to periodically update this CPU's
* active count.
*/
-static void calc_load_account_active(struct rq *this_rq)
+void calc_global_load_tick(struct rq *this_rq)
{
long delta;
@@ -386,199 +392,3 @@ static void calc_load_account_active(struct rq *this_rq)
this_rq->calc_load_update += LOAD_FREQ;
}
-
-/*
- * End of global load-average stuff
- */
-
-/*
- * The exact cpuload at various idx values, calculated at every tick would be
- * load = (2^idx - 1) / 2^idx * load + 1 / 2^idx * cur_load
- *
- * If a cpu misses updates for n-1 ticks (as it was idle) and update gets called
- * on nth tick when cpu may be busy, then we have:
- * load = ((2^idx - 1) / 2^idx)^(n-1) * load
- * load = (2^idx - 1) / 2^idx) * load + 1 / 2^idx * cur_load
- *
- * decay_load_missed() below does efficient calculation of
- * load = ((2^idx - 1) / 2^idx)^(n-1) * load
- * avoiding 0..n-1 loop doing load = ((2^idx - 1) / 2^idx) * load
- *
- * The calculation is approximated on a 128 point scale.
- * degrade_zero_ticks is the number of ticks after which load at any
- * particular idx is approximated to be zero.
- * degrade_factor is a precomputed table, a row for each load idx.
- * Each column corresponds to degradation factor for a power of two ticks,
- * based on 128 point scale.
- * Example:
- * row 2, col 3 (=12) says that the degradation at load idx 2 after
- * 8 ticks is 12/128 (which is an approximation of exact factor 3^8/4^8).
- *
- * With this power of 2 load factors, we can degrade the load n times
- * by looking at 1 bits in n and doing as many mult/shift instead of
- * n mult/shifts needed by the exact degradation.
- */
-#define DEGRADE_SHIFT 7
-static const unsigned char
- degrade_zero_ticks[CPU_LOAD_IDX_MAX] = {0, 8, 32, 64, 128};
-static const unsigned char
- degrade_factor[CPU_LOAD_IDX_MAX][DEGRADE_SHIFT + 1] = {
- {0, 0, 0, 0, 0, 0, 0, 0},
- {64, 32, 8, 0, 0, 0, 0, 0},
- {96, 72, 40, 12, 1, 0, 0},
- {112, 98, 75, 43, 15, 1, 0},
- {120, 112, 98, 76, 45, 16, 2} };
-
-/*
- * Update cpu_load for any missed ticks, due to tickless idle. The backlog
- * would be when CPU is idle and so we just decay the old load without
- * adding any new load.
- */
-static unsigned long
-decay_load_missed(unsigned long load, unsigned long missed_updates, int idx)
-{
- int j = 0;
-
- if (!missed_updates)
- return load;
-
- if (missed_updates >= degrade_zero_ticks[idx])
- return 0;
-
- if (idx == 1)
- return load >> missed_updates;
-
- while (missed_updates) {
- if (missed_updates % 2)
- load = (load * degrade_factor[idx][j]) >> DEGRADE_SHIFT;
-
- missed_updates >>= 1;
- j++;
- }
- return load;
-}
-
-/*
- * Update rq->cpu_load[] statistics. This function is usually called every
- * scheduler tick (TICK_NSEC). With tickless idle this will not be called
- * every tick. We fix it up based on jiffies.
- */
-static void __update_cpu_load(struct rq *this_rq, unsigned long this_load,
- unsigned long pending_updates)
-{
- int i, scale;
-
- this_rq->nr_load_updates++;
-
- /* Update our load: */
- this_rq->cpu_load[0] = this_load; /* Fasttrack for idx 0 */
- for (i = 1, scale = 2; i < CPU_LOAD_IDX_MAX; i++, scale += scale) {
- unsigned long old_load, new_load;
-
- /* scale is effectively 1 << i now, and >> i divides by scale */
-
- old_load = this_rq->cpu_load[i];
- old_load = decay_load_missed(old_load, pending_updates - 1, i);
- new_load = this_load;
- /*
- * Round up the averaging division if load is increasing. This
- * prevents us from getting stuck on 9 if the load is 10, for
- * example.
- */
- if (new_load > old_load)
- new_load += scale - 1;
-
- this_rq->cpu_load[i] = (old_load * (scale - 1) + new_load) >> i;
- }
-
- sched_avg_update(this_rq);
-}
-
-#ifdef CONFIG_SMP
-static inline unsigned long get_rq_runnable_load(struct rq *rq)
-{
- return rq->cfs.runnable_load_avg;
-}
-#else
-static inline unsigned long get_rq_runnable_load(struct rq *rq)
-{
- return rq->load.weight;
-}
-#endif
-
-#ifdef CONFIG_NO_HZ_COMMON
-/*
- * There is no sane way to deal with nohz on smp when using jiffies because the
- * cpu doing the jiffies update might drift wrt the cpu doing the jiffy reading
- * causing off-by-one errors in observed deltas; {0,2} instead of {1,1}.
- *
- * Therefore we cannot use the delta approach from the regular tick since that
- * would seriously skew the load calculation. However we'll make do for those
- * updates happening while idle (nohz_idle_balance) or coming out of idle
- * (tick_nohz_idle_exit).
- *
- * This means we might still be one tick off for nohz periods.
- */
-
-/*
- * Called from nohz_idle_balance() to update the load ratings before doing the
- * idle balance.
- */
-void update_idle_cpu_load(struct rq *this_rq)
-{
- unsigned long curr_jiffies = ACCESS_ONCE(jiffies);
- unsigned long load = get_rq_runnable_load(this_rq);
- unsigned long pending_updates;
-
- /*
- * bail if there's load or we're actually up-to-date.
- */
- if (load || curr_jiffies == this_rq->last_load_update_tick)
- return;
-
- pending_updates = curr_jiffies - this_rq->last_load_update_tick;
- this_rq->last_load_update_tick = curr_jiffies;
-
- __update_cpu_load(this_rq, load, pending_updates);
-}
-
-/*
- * Called from tick_nohz_idle_exit() -- try and fix up the ticks we missed.
- */
-void update_cpu_load_nohz(void)
-{
- struct rq *this_rq = this_rq();
- unsigned long curr_jiffies = ACCESS_ONCE(jiffies);
- unsigned long pending_updates;
-
- if (curr_jiffies == this_rq->last_load_update_tick)
- return;
-
- raw_spin_lock(&this_rq->lock);
- pending_updates = curr_jiffies - this_rq->last_load_update_tick;
- if (pending_updates) {
- this_rq->last_load_update_tick = curr_jiffies;
- /*
- * We were idle, this means load 0, the current load might be
- * !0 due to remote wakeups and the sort.
- */
- __update_cpu_load(this_rq, 0, pending_updates);
- }
- raw_spin_unlock(&this_rq->lock);
-}
-#endif /* CONFIG_NO_HZ */
-
-/*
- * Called from scheduler_tick()
- */
-void update_cpu_load_active(struct rq *this_rq)
-{
- unsigned long load = get_rq_runnable_load(this_rq);
- /*
- * See the mess around update_idle_cpu_load() / update_cpu_load_nohz().
- */
- this_rq->last_load_update_tick = jiffies;
- __update_cpu_load(this_rq, load, 1);
-
- calc_load_account_active(this_rq);
-}
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index f4d4b077eba0..d2ea59364a1c 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -6,6 +6,7 @@
#include "sched.h"
#include <linux/slab.h>
+#include <linux/irq_work.h>
int sched_rr_timeslice = RR_TIMESLICE;
@@ -17,19 +18,22 @@ static enum hrtimer_restart sched_rt_period_timer(struct hrtimer *timer)
{
struct rt_bandwidth *rt_b =
container_of(timer, struct rt_bandwidth, rt_period_timer);
- ktime_t now;
- int overrun;
int idle = 0;
+ int overrun;
+ raw_spin_lock(&rt_b->rt_runtime_lock);
for (;;) {
- now = hrtimer_cb_get_time(timer);
- overrun = hrtimer_forward(timer, now, rt_b->rt_period);
-
+ overrun = hrtimer_forward_now(timer, rt_b->rt_period);
if (!overrun)
break;
+ raw_spin_unlock(&rt_b->rt_runtime_lock);
idle = do_sched_rt_period_timer(rt_b, overrun);
+ raw_spin_lock(&rt_b->rt_runtime_lock);
}
+ if (idle)
+ rt_b->rt_period_active = 0;
+ raw_spin_unlock(&rt_b->rt_runtime_lock);
return idle ? HRTIMER_NORESTART : HRTIMER_RESTART;
}
@@ -51,15 +55,20 @@ static void start_rt_bandwidth(struct rt_bandwidth *rt_b)
if (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF)
return;
- if (hrtimer_active(&rt_b->rt_period_timer))
- return;
-
raw_spin_lock(&rt_b->rt_runtime_lock);
- start_bandwidth_timer(&rt_b->rt_period_timer, rt_b->rt_period);
+ if (!rt_b->rt_period_active) {
+ rt_b->rt_period_active = 1;
+ hrtimer_forward_now(&rt_b->rt_period_timer, rt_b->rt_period);
+ hrtimer_start_expires(&rt_b->rt_period_timer, HRTIMER_MODE_ABS_PINNED);
+ }
raw_spin_unlock(&rt_b->rt_runtime_lock);
}
-void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq)
+#ifdef CONFIG_SMP
+static void push_irq_work_func(struct irq_work *work);
+#endif
+
+void init_rt_rq(struct rt_rq *rt_rq)
{
struct rt_prio_array *array;
int i;
@@ -78,7 +87,14 @@ void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq)
rt_rq->rt_nr_migratory = 0;
rt_rq->overloaded = 0;
plist_head_init(&rt_rq->pushable_tasks);
+
+#ifdef HAVE_RT_PUSH_IPI
+ rt_rq->push_flags = 0;
+ rt_rq->push_cpu = nr_cpu_ids;
+ raw_spin_lock_init(&rt_rq->push_lock);
+ init_irq_work(&rt_rq->push_work, push_irq_work_func);
#endif
+#endif /* CONFIG_SMP */
/* We start is dequeued state, because no RT tasks are queued */
rt_rq->rt_queued = 0;
@@ -193,7 +209,7 @@ int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
if (!rt_se)
goto err_free_rq;
- init_rt_rq(rt_rq, cpu_rq(i));
+ init_rt_rq(rt_rq);
rt_rq->rt_runtime = tg->rt_bandwidth.rt_runtime;
init_tg_rt_entry(tg, rt_rq, rt_se, i, parent->rt_se[i]);
}
@@ -244,7 +260,7 @@ int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
#ifdef CONFIG_SMP
-static int pull_rt_task(struct rq *this_rq);
+static void pull_rt_task(struct rq *this_rq);
static inline bool need_pull_rt_task(struct rq *rq, struct task_struct *prev)
{
@@ -338,13 +354,23 @@ static inline int has_pushable_tasks(struct rq *rq)
return !plist_head_empty(&rq->rt.pushable_tasks);
}
-static inline void set_post_schedule(struct rq *rq)
+static DEFINE_PER_CPU(struct callback_head, rt_push_head);
+static DEFINE_PER_CPU(struct callback_head, rt_pull_head);
+
+static void push_rt_tasks(struct rq *);
+static void pull_rt_task(struct rq *);
+
+static inline void queue_push_tasks(struct rq *rq)
{
- /*
- * We detect this state here so that we can avoid taking the RQ
- * lock again later if there is no need to push
- */
- rq->post_schedule = has_pushable_tasks(rq);
+ if (!has_pushable_tasks(rq))
+ return;
+
+ queue_balance_callback(rq, &per_cpu(rt_push_head, rq->cpu), push_rt_tasks);
+}
+
+static inline void queue_pull_task(struct rq *rq)
+{
+ queue_balance_callback(rq, &per_cpu(rt_pull_head, rq->cpu), pull_rt_task);
}
static void enqueue_pushable_task(struct rq *rq, struct task_struct *p)
@@ -396,12 +422,11 @@ static inline bool need_pull_rt_task(struct rq *rq, struct task_struct *prev)
return false;
}
-static inline int pull_rt_task(struct rq *this_rq)
+static inline void pull_rt_task(struct rq *this_rq)
{
- return 0;
}
-static inline void set_post_schedule(struct rq *rq)
+static inline void queue_push_tasks(struct rq *rq)
{
}
#endif /* CONFIG_SMP */
@@ -1311,7 +1336,7 @@ select_task_rq_rt(struct task_struct *p, int cpu, int sd_flag, int flags)
rq = cpu_rq(cpu);
rcu_read_lock();
- curr = ACCESS_ONCE(rq->curr); /* unlocked access */
+ curr = READ_ONCE(rq->curr); /* unlocked access */
/*
* If the current task on @p's runqueue is an RT task, then
@@ -1453,7 +1478,15 @@ pick_next_task_rt(struct rq *rq, struct task_struct *prev)
struct rt_rq *rt_rq = &rq->rt;
if (need_pull_rt_task(rq, prev)) {
+ /*
+ * This is OK, because current is on_cpu, which avoids it being
+ * picked for load-balance and preemption/IRQs are still
+ * disabled avoiding further scheduler activity on it and we're
+ * being very careful to re-start the picking loop.
+ */
+ lockdep_unpin_lock(&rq->lock);
pull_rt_task(rq);
+ lockdep_pin_lock(&rq->lock);
/*
* pull_rt_task() can drop (and re-acquire) rq->lock; this
* means a dl or stop task can slip in, in which case we need
@@ -1481,7 +1514,7 @@ pick_next_task_rt(struct rq *rq, struct task_struct *prev)
/* The running task is never eligible for pushing */
dequeue_pushable_task(rq, p);
- set_post_schedule(rq);
+ queue_push_tasks(rq);
return p;
}
@@ -1778,14 +1811,173 @@ static void push_rt_tasks(struct rq *rq)
;
}
-static int pull_rt_task(struct rq *this_rq)
+#ifdef HAVE_RT_PUSH_IPI
+/*
+ * The search for the next cpu always starts at rq->cpu and ends
+ * when we reach rq->cpu again. It will never return rq->cpu.
+ * This returns the next cpu to check, or nr_cpu_ids if the loop
+ * is complete.
+ *
+ * rq->rt.push_cpu holds the last cpu returned by this function,
+ * or if this is the first instance, it must hold rq->cpu.
+ */
+static int rto_next_cpu(struct rq *rq)
+{
+ int prev_cpu = rq->rt.push_cpu;
+ int cpu;
+
+ cpu = cpumask_next(prev_cpu, rq->rd->rto_mask);
+
+ /*
+ * If the previous cpu is less than the rq's CPU, then it already
+ * passed the end of the mask, and has started from the beginning.
+ * We end if the next CPU is greater or equal to rq's CPU.
+ */
+ if (prev_cpu < rq->cpu) {
+ if (cpu >= rq->cpu)
+ return nr_cpu_ids;
+
+ } else if (cpu >= nr_cpu_ids) {
+ /*
+ * We passed the end of the mask, start at the beginning.
+ * If the result is greater or equal to the rq's CPU, then
+ * the loop is finished.
+ */
+ cpu = cpumask_first(rq->rd->rto_mask);
+ if (cpu >= rq->cpu)
+ return nr_cpu_ids;
+ }
+ rq->rt.push_cpu = cpu;
+
+ /* Return cpu to let the caller know if the loop is finished or not */
+ return cpu;
+}
+
+static int find_next_push_cpu(struct rq *rq)
+{
+ struct rq *next_rq;
+ int cpu;
+
+ while (1) {
+ cpu = rto_next_cpu(rq);
+ if (cpu >= nr_cpu_ids)
+ break;
+ next_rq = cpu_rq(cpu);
+
+ /* Make sure the next rq can push to this rq */
+ if (next_rq->rt.highest_prio.next < rq->rt.highest_prio.curr)
+ break;
+ }
+
+ return cpu;
+}
+
+#define RT_PUSH_IPI_EXECUTING 1
+#define RT_PUSH_IPI_RESTART 2
+
+static void tell_cpu_to_push(struct rq *rq)
+{
+ int cpu;
+
+ if (rq->rt.push_flags & RT_PUSH_IPI_EXECUTING) {
+ raw_spin_lock(&rq->rt.push_lock);
+ /* Make sure it's still executing */
+ if (rq->rt.push_flags & RT_PUSH_IPI_EXECUTING) {
+ /*
+ * Tell the IPI to restart the loop as things have
+ * changed since it started.
+ */
+ rq->rt.push_flags |= RT_PUSH_IPI_RESTART;
+ raw_spin_unlock(&rq->rt.push_lock);
+ return;
+ }
+ raw_spin_unlock(&rq->rt.push_lock);
+ }
+
+ /* When here, there's no IPI going around */
+
+ rq->rt.push_cpu = rq->cpu;
+ cpu = find_next_push_cpu(rq);
+ if (cpu >= nr_cpu_ids)
+ return;
+
+ rq->rt.push_flags = RT_PUSH_IPI_EXECUTING;
+
+ irq_work_queue_on(&rq->rt.push_work, cpu);
+}
+
+/* Called from hardirq context */
+static void try_to_push_tasks(void *arg)
{
- int this_cpu = this_rq->cpu, ret = 0, cpu;
+ struct rt_rq *rt_rq = arg;
+ struct rq *rq, *src_rq;
+ int this_cpu;
+ int cpu;
+
+ this_cpu = rt_rq->push_cpu;
+
+ /* Paranoid check */
+ BUG_ON(this_cpu != smp_processor_id());
+
+ rq = cpu_rq(this_cpu);
+ src_rq = rq_of_rt_rq(rt_rq);
+
+again:
+ if (has_pushable_tasks(rq)) {
+ raw_spin_lock(&rq->lock);
+ push_rt_task(rq);
+ raw_spin_unlock(&rq->lock);
+ }
+
+ /* Pass the IPI to the next rt overloaded queue */
+ raw_spin_lock(&rt_rq->push_lock);
+ /*
+ * If the source queue changed since the IPI went out,
+ * we need to restart the search from that CPU again.
+ */
+ if (rt_rq->push_flags & RT_PUSH_IPI_RESTART) {
+ rt_rq->push_flags &= ~RT_PUSH_IPI_RESTART;
+ rt_rq->push_cpu = src_rq->cpu;
+ }
+
+ cpu = find_next_push_cpu(src_rq);
+
+ if (cpu >= nr_cpu_ids)
+ rt_rq->push_flags &= ~RT_PUSH_IPI_EXECUTING;
+ raw_spin_unlock(&rt_rq->push_lock);
+
+ if (cpu >= nr_cpu_ids)
+ return;
+
+ /*
+ * It is possible that a restart caused this CPU to be
+ * chosen again. Don't bother with an IPI, just see if we
+ * have more to push.
+ */
+ if (unlikely(cpu == rq->cpu))
+ goto again;
+
+ /* Try the next RT overloaded CPU */
+ irq_work_queue_on(&rt_rq->push_work, cpu);
+}
+
+static void push_irq_work_func(struct irq_work *work)
+{
+ struct rt_rq *rt_rq = container_of(work, struct rt_rq, push_work);
+
+ try_to_push_tasks(rt_rq);
+}
+#endif /* HAVE_RT_PUSH_IPI */
+
+static void pull_rt_task(struct rq *this_rq)
+{
+ int this_cpu = this_rq->cpu, cpu;
+ bool resched = false;
struct task_struct *p;
struct rq *src_rq;
if (likely(!rt_overloaded(this_rq)))
- return 0;
+ return;
/*
* Match the barrier from rt_set_overloaded; this guarantees that if we
@@ -1793,6 +1985,13 @@ static int pull_rt_task(struct rq *this_rq)
*/
smp_rmb();
+#ifdef HAVE_RT_PUSH_IPI
+ if (sched_feat(RT_PUSH_IPI)) {
+ tell_cpu_to_push(this_rq);
+ return;
+ }
+#endif
+
for_each_cpu(cpu, this_rq->rd->rto_mask) {
if (this_cpu == cpu)
continue;
@@ -1842,7 +2041,7 @@ static int pull_rt_task(struct rq *this_rq)
if (p->prio < src_rq->curr->prio)
goto skip;
- ret = 1;
+ resched = true;
deactivate_task(src_rq, p, 0);
set_task_cpu(p, this_cpu);
@@ -1858,12 +2057,8 @@ skip:
double_unlock_balance(this_rq, src_rq);
}
- return ret;
-}
-
-static void post_schedule_rt(struct rq *rq)
-{
- push_rt_tasks(rq);
+ if (resched)
+ resched_curr(this_rq);
}
/*
@@ -1874,7 +2069,6 @@ static void task_woken_rt(struct rq *rq, struct task_struct *p)
{
if (!task_running(rq, p) &&
!test_tsk_need_resched(rq->curr) &&
- has_pushable_tasks(rq) &&
p->nr_cpus_allowed > 1 &&
(dl_task(rq->curr) || rt_task(rq->curr)) &&
(rq->curr->nr_cpus_allowed < 2 ||
@@ -1882,45 +2076,6 @@ static void task_woken_rt(struct rq *rq, struct task_struct *p)
push_rt_tasks(rq);
}
-static void set_cpus_allowed_rt(struct task_struct *p,
- const struct cpumask *new_mask)
-{
- struct rq *rq;
- int weight;
-
- BUG_ON(!rt_task(p));
-
- if (!task_on_rq_queued(p))
- return;
-
- weight = cpumask_weight(new_mask);
-
- /*
- * Only update if the process changes its state from whether it
- * can migrate or not.
- */
- if ((p->nr_cpus_allowed > 1) == (weight > 1))
- return;
-
- rq = task_rq(p);
-
- /*
- * The process used to be able to migrate OR it can now migrate
- */
- if (weight <= 1) {
- if (!task_current(rq, p))
- dequeue_pushable_task(rq, p);
- BUG_ON(!rq->rt.rt_nr_migratory);
- rq->rt.rt_nr_migratory--;
- } else {
- if (!task_current(rq, p))
- enqueue_pushable_task(rq, p);
- rq->rt.rt_nr_migratory++;
- }
-
- update_rt_migration(&rq->rt);
-}
-
/* Assumes rq->lock is held */
static void rq_online_rt(struct rq *rq)
{
@@ -1959,8 +2114,7 @@ static void switched_from_rt(struct rq *rq, struct task_struct *p)
if (!task_on_rq_queued(p) || rq->rt.rt_nr_running)
return;
- if (pull_rt_task(rq))
- resched_curr(rq);
+ queue_pull_task(rq);
}
void __init init_sched_rt_class(void)
@@ -1981,8 +2135,6 @@ void __init init_sched_rt_class(void)
*/
static void switched_to_rt(struct rq *rq, struct task_struct *p)
{
- int check_resched = 1;
-
/*
* If we are already running, then there's nothing
* that needs to be done. But if we are not running
@@ -1992,13 +2144,12 @@ static void switched_to_rt(struct rq *rq, struct task_struct *p)
*/
if (task_on_rq_queued(p) && rq->curr != p) {
#ifdef CONFIG_SMP
- if (p->nr_cpus_allowed > 1 && rq->rt.overloaded &&
- /* Don't resched if we changed runqueues */
- push_rt_task(rq) && rq != task_rq(p))
- check_resched = 0;
-#endif /* CONFIG_SMP */
- if (check_resched && p->prio < rq->curr->prio)
+ if (p->nr_cpus_allowed > 1 && rq->rt.overloaded)
+ queue_push_tasks(rq);
+#else
+ if (p->prio < rq->curr->prio)
resched_curr(rq);
+#endif /* CONFIG_SMP */
}
}
@@ -2019,14 +2170,13 @@ prio_changed_rt(struct rq *rq, struct task_struct *p, int oldprio)
* may need to pull tasks to this runqueue.
*/
if (oldprio < p->prio)
- pull_rt_task(rq);
+ queue_pull_task(rq);
+
/*
* If there's a higher priority task waiting to run
- * then reschedule. Note, the above pull_rt_task
- * can release the rq lock and p could migrate.
- * Only reschedule if p is still on the same runqueue.
+ * then reschedule.
*/
- if (p->prio > rq->rt.highest_prio.curr && rq->curr == p)
+ if (p->prio > rq->rt.highest_prio.curr)
resched_curr(rq);
#else
/* For UP simply resched on drop of prio */
@@ -2134,10 +2284,9 @@ const struct sched_class rt_sched_class = {
#ifdef CONFIG_SMP
.select_task_rq = select_task_rq_rt,
- .set_cpus_allowed = set_cpus_allowed_rt,
+ .set_cpus_allowed = set_cpus_allowed_common,
.rq_online = rq_online_rt,
.rq_offline = rq_offline_rt,
- .post_schedule = post_schedule_rt,
.task_woken = task_woken_rt,
.switched_from = switched_from_rt,
#endif
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index dc0f435a2779..68cda117574c 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -6,6 +6,7 @@
#include <linux/mutex.h>
#include <linux/spinlock.h>
#include <linux/stop_machine.h>
+#include <linux/irq_work.h>
#include <linux/tick.h>
#include <linux/slab.h>
@@ -25,8 +26,14 @@ extern __read_mostly int scheduler_running;
extern unsigned long calc_load_update;
extern atomic_long_t calc_load_tasks;
+extern void calc_global_load_tick(struct rq *this_rq);
extern long calc_load_fold_active(struct rq *this_rq);
+
+#ifdef CONFIG_SMP
extern void update_cpu_load_active(struct rq *this_rq);
+#else
+static inline void update_cpu_load_active(struct rq *this_rq) { }
+#endif
/*
* Helpers for converting nanosecond timing to jiffy resolution
@@ -130,6 +137,7 @@ struct rt_bandwidth {
ktime_t rt_period;
u64 rt_runtime;
struct hrtimer rt_period_timer;
+ unsigned int rt_period_active;
};
void __dl_clear_params(struct task_struct *p);
@@ -214,7 +222,7 @@ struct cfs_bandwidth {
s64 hierarchical_quota;
u64 runtime_expires;
- int idle, timer_active;
+ int idle, period_active;
struct hrtimer period_timer, slack_timer;
struct list_head throttled_cfs_rq;
@@ -237,7 +245,6 @@ struct task_group {
#ifdef CONFIG_SMP
atomic_long_t load_avg;
- atomic_t runnable_avg;
#endif
#endif
@@ -305,7 +312,7 @@ extern void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b);
extern int sched_group_set_shares(struct task_group *tg, unsigned long shares);
extern void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b);
-extern void __start_cfs_bandwidth(struct cfs_bandwidth *cfs_b, bool force);
+extern void start_cfs_bandwidth(struct cfs_bandwidth *cfs_b);
extern void unthrottle_cfs_rq(struct cfs_rq *cfs_rq);
extern void free_rt_sched_group(struct task_group *tg);
@@ -358,21 +365,20 @@ struct cfs_rq {
#ifdef CONFIG_SMP
/*
- * CFS Load tracking
- * Under CFS, load is tracked on a per-entity basis and aggregated up.
- * This allows for the description of both thread and group usage (in
- * the FAIR_GROUP_SCHED case).
+ * CFS load tracking
*/
- unsigned long runnable_load_avg, blocked_load_avg;
- atomic64_t decay_counter;
- u64 last_decay;
- atomic_long_t removed_load;
-
+ struct sched_avg avg;
+ u64 runnable_load_sum;
+ unsigned long runnable_load_avg;
#ifdef CONFIG_FAIR_GROUP_SCHED
- /* Required to track per-cpu representation of a task_group */
- u32 tg_runnable_contrib;
- unsigned long tg_load_contrib;
+ unsigned long tg_load_avg_contrib;
+#endif
+ atomic_long_t removed_load_avg, removed_util_avg;
+#ifndef CONFIG_64BIT
+ u64 load_last_update_time_copy;
+#endif
+#ifdef CONFIG_FAIR_GROUP_SCHED
/*
* h_load = weight * f(tg)
*
@@ -418,6 +424,11 @@ static inline int rt_bandwidth_enabled(void)
return sysctl_sched_rt_runtime >= 0;
}
+/* RT IPI pull logic requires IRQ_WORK */
+#ifdef CONFIG_IRQ_WORK
+# define HAVE_RT_PUSH_IPI
+#endif
+
/* Real-Time classes' related field in a runqueue: */
struct rt_rq {
struct rt_prio_array active;
@@ -435,7 +446,13 @@ struct rt_rq {
unsigned long rt_nr_total;
int overloaded;
struct plist_head pushable_tasks;
+#ifdef HAVE_RT_PUSH_IPI
+ int push_flags;
+ int push_cpu;
+ struct irq_work push_work;
+ raw_spinlock_t push_lock;
#endif
+#endif /* CONFIG_SMP */
int rt_queued;
int rt_throttled;
@@ -570,8 +587,6 @@ struct rq {
#ifdef CONFIG_FAIR_GROUP_SCHED
/* list of leaf cfs_rq on this cpu: */
struct list_head leaf_cfs_rq_list;
-
- struct sched_avg avg;
#endif /* CONFIG_FAIR_GROUP_SCHED */
/*
@@ -597,10 +612,12 @@ struct rq {
struct sched_domain *sd;
unsigned long cpu_capacity;
+ unsigned long cpu_capacity_orig;
+
+ struct callback_head *balance_callback;
unsigned char idle_balance;
/* For active balancing */
- int post_schedule;
int active_balance;
int push_cpu;
struct cpu_stop_work active_balance_work;
@@ -688,7 +705,7 @@ DECLARE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
static inline u64 __rq_clock_broken(struct rq *rq)
{
- return ACCESS_ONCE(rq->clock);
+ return READ_ONCE(rq->clock);
}
static inline u64 rq_clock(struct rq *rq)
@@ -741,6 +758,21 @@ extern int migrate_swap(struct task_struct *, struct task_struct *);
#ifdef CONFIG_SMP
+static inline void
+queue_balance_callback(struct rq *rq,
+ struct callback_head *head,
+ void (*func)(struct rq *rq))
+{
+ lockdep_assert_held(&rq->lock);
+
+ if (unlikely(head->next))
+ return;
+
+ head->func = (void (*)(struct callback_head *))func;
+ head->next = rq->balance_callback;
+ rq->balance_callback = head;
+}
+
extern void sched_ttwu_pending(void);
#define rcu_dereference_check_sched_domain(p) \
@@ -807,7 +839,7 @@ struct sched_group_capacity {
* CPU capacity of this group, SCHED_LOAD_SCALE being max capacity
* for a single CPU.
*/
- unsigned int capacity, capacity_orig;
+ unsigned int capacity;
unsigned long next_update;
int imbalance; /* XXX unrelated to capacity but shared group state */
/*
@@ -1023,9 +1055,6 @@ static inline int task_on_rq_migrating(struct task_struct *p)
#ifndef prepare_arch_switch
# define prepare_arch_switch(next) do { } while (0)
#endif
-#ifndef finish_arch_switch
-# define finish_arch_switch(prev) do { } while (0)
-#endif
#ifndef finish_arch_post_lock_switch
# define finish_arch_post_lock_switch() do { } while (0)
#endif
@@ -1166,7 +1195,6 @@ struct sched_class {
int (*select_task_rq)(struct task_struct *p, int task_cpu, int sd_flag, int flags);
void (*migrate_task_rq)(struct task_struct *p, int next_cpu);
- void (*post_schedule) (struct rq *this_rq);
void (*task_waking) (struct task_struct *task);
void (*task_woken) (struct rq *this_rq, struct task_struct *task);
@@ -1227,6 +1255,8 @@ extern void trigger_load_balance(struct rq *rq);
extern void idle_enter_fair(struct rq *this_rq);
extern void idle_exit_fair(struct rq *this_rq);
+extern void set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_mask);
+
#else
static inline void idle_enter_fair(struct rq *rq) { }
@@ -1265,7 +1295,6 @@ extern void update_max_interval(void);
extern void init_sched_dl_class(void);
extern void init_sched_rt_class(void);
extern void init_sched_fair_class(void);
-extern void init_sched_dl_class(void);
extern void resched_curr(struct rq *rq);
extern void resched_cpu(int cpu);
@@ -1279,9 +1308,7 @@ extern void init_dl_task_timer(struct sched_dl_entity *dl_se);
unsigned long to_ratio(u64 period, u64 runtime);
-extern void update_idle_cpu_load(struct rq *this_rq);
-
-extern void init_task_runnable_average(struct task_struct *p);
+extern void init_entity_runnable_average(struct sched_entity *se);
static inline void add_nr_running(struct rq *rq, unsigned count)
{
@@ -1368,9 +1395,18 @@ static inline int hrtick_enabled(struct rq *rq)
#ifdef CONFIG_SMP
extern void sched_avg_update(struct rq *rq);
+
+#ifndef arch_scale_freq_capacity
+static __always_inline
+unsigned long arch_scale_freq_capacity(struct sched_domain *sd, int cpu)
+{
+ return SCHED_CAPACITY_SCALE;
+}
+#endif
+
static inline void sched_rt_avg_update(struct rq *rq, u64 rt_delta)
{
- rq->rt_avg += rt_delta;
+ rq->rt_avg += rt_delta * arch_scale_freq_capacity(NULL, cpu_of(rq));
sched_avg_update(rq);
}
#else
@@ -1378,8 +1414,6 @@ static inline void sched_rt_avg_update(struct rq *rq, u64 rt_delta) { }
static inline void sched_avg_update(struct rq *rq) { }
#endif
-extern void start_bandwidth_timer(struct hrtimer *period_timer, ktime_t period);
-
/*
* __task_rq_lock - lock the rq @p resides on.
*/
@@ -1393,8 +1427,10 @@ static inline struct rq *__task_rq_lock(struct task_struct *p)
for (;;) {
rq = task_rq(p);
raw_spin_lock(&rq->lock);
- if (likely(rq == task_rq(p) && !task_on_rq_migrating(p)))
+ if (likely(rq == task_rq(p) && !task_on_rq_migrating(p))) {
+ lockdep_pin_lock(&rq->lock);
return rq;
+ }
raw_spin_unlock(&rq->lock);
while (unlikely(task_on_rq_migrating(p)))
@@ -1431,8 +1467,10 @@ static inline struct rq *task_rq_lock(struct task_struct *p, unsigned long *flag
* If we observe the new cpu in task_rq_lock, the acquire will
* pair with the WMB to ensure we must then also see migrating.
*/
- if (likely(rq == task_rq(p) && !task_on_rq_migrating(p)))
+ if (likely(rq == task_rq(p) && !task_on_rq_migrating(p))) {
+ lockdep_pin_lock(&rq->lock);
return rq;
+ }
raw_spin_unlock(&rq->lock);
raw_spin_unlock_irqrestore(&p->pi_lock, *flags);
@@ -1444,6 +1482,7 @@ static inline struct rq *task_rq_lock(struct task_struct *p, unsigned long *flag
static inline void __task_rq_unlock(struct rq *rq)
__releases(rq->lock)
{
+ lockdep_unpin_lock(&rq->lock);
raw_spin_unlock(&rq->lock);
}
@@ -1452,6 +1491,7 @@ task_rq_unlock(struct rq *rq, struct task_struct *p, unsigned long *flags)
__releases(rq->lock)
__releases(p->pi_lock)
{
+ lockdep_unpin_lock(&rq->lock);
raw_spin_unlock(&rq->lock);
raw_spin_unlock_irqrestore(&p->pi_lock, *flags);
}
@@ -1638,13 +1678,26 @@ static inline void double_rq_unlock(struct rq *rq1, struct rq *rq2)
extern struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq);
extern struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq);
+
+#ifdef CONFIG_SCHED_DEBUG
extern void print_cfs_stats(struct seq_file *m, int cpu);
extern void print_rt_stats(struct seq_file *m, int cpu);
extern void print_dl_stats(struct seq_file *m, int cpu);
+extern void
+print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq);
+
+#ifdef CONFIG_NUMA_BALANCING
+extern void
+show_numa_stats(struct task_struct *p, struct seq_file *m);
+extern void
+print_numa_stats(struct seq_file *m, int node, unsigned long tsf,
+ unsigned long tpf, unsigned long gsf, unsigned long gpf);
+#endif /* CONFIG_NUMA_BALANCING */
+#endif /* CONFIG_SCHED_DEBUG */
extern void init_cfs_rq(struct cfs_rq *cfs_rq);
-extern void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq);
-extern void init_dl_rq(struct dl_rq *dl_rq, struct rq *rq);
+extern void init_rt_rq(struct rt_rq *rt_rq);
+extern void init_dl_rq(struct dl_rq *dl_rq);
extern void cfs_bandwidth_usage_inc(void);
extern void cfs_bandwidth_usage_dec(void);
diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h
index 4ab704339656..b0fbc7632de5 100644
--- a/kernel/sched/stats.h
+++ b/kernel/sched/stats.h
@@ -47,7 +47,7 @@ rq_sched_info_depart(struct rq *rq, unsigned long long delta)
# define schedstat_set(var, val) do { } while (0)
#endif
-#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
+#ifdef CONFIG_SCHED_INFO
static inline void sched_info_reset_dequeued(struct task_struct *t)
{
t->sched_info.last_queued = 0;
@@ -156,7 +156,7 @@ sched_info_switch(struct rq *rq,
#define sched_info_depart(rq, t) do { } while (0)
#define sched_info_arrive(rq, next) do { } while (0)
#define sched_info_switch(rq, t, next) do { } while (0)
-#endif /* CONFIG_SCHEDSTATS || CONFIG_TASK_DELAY_ACCT */
+#endif /* CONFIG_SCHED_INFO */
/*
* The following are functions that support scheduler-internal time accounting.
@@ -174,7 +174,8 @@ static inline bool cputimer_running(struct task_struct *tsk)
{
struct thread_group_cputimer *cputimer = &tsk->signal->cputimer;
- if (!cputimer->running)
+ /* Check if cputimer isn't running. This is accessed without locking. */
+ if (!READ_ONCE(cputimer->running))
return false;
/*
@@ -215,9 +216,7 @@ static inline void account_group_user_time(struct task_struct *tsk,
if (!cputimer_running(tsk))
return;
- raw_spin_lock(&cputimer->lock);
- cputimer->cputime.utime += cputime;
- raw_spin_unlock(&cputimer->lock);
+ atomic64_add(cputime, &cputimer->cputime_atomic.utime);
}
/**
@@ -238,9 +237,7 @@ static inline void account_group_system_time(struct task_struct *tsk,
if (!cputimer_running(tsk))
return;
- raw_spin_lock(&cputimer->lock);
- cputimer->cputime.stime += cputime;
- raw_spin_unlock(&cputimer->lock);
+ atomic64_add(cputime, &cputimer->cputime_atomic.stime);
}
/**
@@ -261,7 +258,5 @@ static inline void account_group_exec_runtime(struct task_struct *tsk,
if (!cputimer_running(tsk))
return;
- raw_spin_lock(&cputimer->lock);
- cputimer->cputime.sum_exec_runtime += ns;
- raw_spin_unlock(&cputimer->lock);
+ atomic64_add(ns, &cputimer->cputime_atomic.sum_exec_runtime);
}
diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c
index 79ffec45a6ac..cbc67da10954 100644
--- a/kernel/sched/stop_task.c
+++ b/kernel/sched/stop_task.c
@@ -123,6 +123,7 @@ const struct sched_class stop_sched_class = {
#ifdef CONFIG_SMP
.select_task_rq = select_task_rq_stop,
+ .set_cpus_allowed = set_cpus_allowed_common,
#endif
.set_curr_task = set_curr_task_stop,
diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c
index 852143a79f36..272d9322bc5d 100644
--- a/kernel/sched/wait.c
+++ b/kernel/sched/wait.c
@@ -106,9 +106,10 @@ void __wake_up_locked(wait_queue_head_t *q, unsigned int mode, int nr)
}
EXPORT_SYMBOL_GPL(__wake_up_locked);
-void __wake_up_locked_key(wait_queue_head_t *q, unsigned int mode, void *key)
+void __wake_up_locked_key(wait_queue_head_t *q, unsigned int mode, int nr,
+ void *key)
{
- __wake_up_common(q, mode, 1, 0, key);
+ __wake_up_common(q, mode, nr, 0, key);
}
EXPORT_SYMBOL_GPL(__wake_up_locked_key);
@@ -283,7 +284,7 @@ void abort_exclusive_wait(wait_queue_head_t *q, wait_queue_t *wait,
if (!list_empty(&wait->task_list))
list_del_init(&wait->task_list);
else if (waitqueue_active(q))
- __wake_up_locked_key(q, mode, key);
+ __wake_up_locked_key(q, mode, 1, key);
spin_unlock_irqrestore(&q->lock, flags);
}
EXPORT_SYMBOL(abort_exclusive_wait);
@@ -341,7 +342,7 @@ long wait_woken(wait_queue_t *wait, unsigned mode, long timeout)
* condition being true _OR_ WQ_FLAG_WOKEN such that we will not miss
* an event.
*/
- set_mb(wait->flags, wait->flags & ~WQ_FLAG_WOKEN); /* B */
+ smp_store_mb(wait->flags, wait->flags & ~WQ_FLAG_WOKEN); /* B */
return timeout;
}
@@ -354,7 +355,7 @@ int woken_wake_function(wait_queue_t *wait, unsigned mode, int sync, void *key)
* doesn't imply write barrier and the users expects write
* barrier semantics on wakeup functions. The following
* smp_wmb() is equivalent to smp_wmb() in try_to_wake_up()
- * and is paired with set_mb() in wait_woken().
+ * and is paired with smp_store_mb() in wait_woken().
*/
smp_wmb(); /* C */
wait->flags |= WQ_FLAG_WOKEN;
@@ -601,7 +602,7 @@ EXPORT_SYMBOL(bit_wait_io);
__sched int bit_wait_timeout(struct wait_bit_key *word)
{
- unsigned long now = ACCESS_ONCE(jiffies);
+ unsigned long now = READ_ONCE(jiffies);
if (signal_pending_state(current->state, current))
return 1;
if (time_after_eq(now, word->timeout))
@@ -613,7 +614,7 @@ EXPORT_SYMBOL_GPL(bit_wait_timeout);
__sched int bit_wait_io_timeout(struct wait_bit_key *word)
{
- unsigned long now = ACCESS_ONCE(jiffies);
+ unsigned long now = READ_ONCE(jiffies);
if (signal_pending_state(current->state, current))
return 1;
if (time_after_eq(now, word->timeout))
diff --git a/kernel/seccomp.c b/kernel/seccomp.c
index 4f44028943e6..5bd4779282df 100644
--- a/kernel/seccomp.c
+++ b/kernel/seccomp.c
@@ -175,17 +175,16 @@ static int seccomp_check_filter(struct sock_filter *filter, unsigned int flen)
*/
static u32 seccomp_run_filters(struct seccomp_data *sd)
{
- struct seccomp_filter *f = ACCESS_ONCE(current->seccomp.filter);
struct seccomp_data sd_local;
u32 ret = SECCOMP_RET_ALLOW;
+ /* Make sure cross-thread synced filter points somewhere sane. */
+ struct seccomp_filter *f =
+ lockless_dereference(current->seccomp.filter);
/* Ensure unexpected behavior doesn't result in failing open. */
if (unlikely(WARN_ON(f == NULL)))
return SECCOMP_RET_KILL;
- /* Make sure cross-thread synced filter points somewhere sane. */
- smp_read_barrier_depends();
-
if (!sd) {
populate_seccomp_data(&sd_local);
sd = &sd_local;
@@ -346,16 +345,13 @@ static inline void seccomp_sync_threads(void)
*/
static struct seccomp_filter *seccomp_prepare_filter(struct sock_fprog *fprog)
{
- struct seccomp_filter *filter;
- unsigned long fp_size;
- struct sock_filter *fp;
- int new_len;
- long ret;
+ struct seccomp_filter *sfilter;
+ int ret;
if (fprog->len == 0 || fprog->len > BPF_MAXINSNS)
return ERR_PTR(-EINVAL);
+
BUG_ON(INT_MAX / fprog->len < sizeof(struct sock_filter));
- fp_size = fprog->len * sizeof(struct sock_filter);
/*
* Installing a seccomp filter requires that the task has
@@ -368,60 +364,21 @@ static struct seccomp_filter *seccomp_prepare_filter(struct sock_fprog *fprog)
CAP_SYS_ADMIN) != 0)
return ERR_PTR(-EACCES);
- fp = kzalloc(fp_size, GFP_KERNEL|__GFP_NOWARN);
- if (!fp)
- return ERR_PTR(-ENOMEM);
-
- /* Copy the instructions from fprog. */
- ret = -EFAULT;
- if (copy_from_user(fp, fprog->filter, fp_size))
- goto free_prog;
-
- /* Check and rewrite the fprog via the skb checker */
- ret = bpf_check_classic(fp, fprog->len);
- if (ret)
- goto free_prog;
-
- /* Check and rewrite the fprog for seccomp use */
- ret = seccomp_check_filter(fp, fprog->len);
- if (ret)
- goto free_prog;
-
- /* Convert 'sock_filter' insns to 'bpf_insn' insns */
- ret = bpf_convert_filter(fp, fprog->len, NULL, &new_len);
- if (ret)
- goto free_prog;
-
/* Allocate a new seccomp_filter */
- ret = -ENOMEM;
- filter = kzalloc(sizeof(struct seccomp_filter),
- GFP_KERNEL|__GFP_NOWARN);
- if (!filter)
- goto free_prog;
-
- filter->prog = bpf_prog_alloc(bpf_prog_size(new_len), __GFP_NOWARN);
- if (!filter->prog)
- goto free_filter;
-
- ret = bpf_convert_filter(fp, fprog->len, filter->prog->insnsi, &new_len);
- if (ret)
- goto free_filter_prog;
-
- kfree(fp);
- atomic_set(&filter->usage, 1);
- filter->prog->len = new_len;
+ sfilter = kzalloc(sizeof(*sfilter), GFP_KERNEL | __GFP_NOWARN);
+ if (!sfilter)
+ return ERR_PTR(-ENOMEM);
- bpf_prog_select_runtime(filter->prog);
+ ret = bpf_prog_create_from_user(&sfilter->prog, fprog,
+ seccomp_check_filter);
+ if (ret < 0) {
+ kfree(sfilter);
+ return ERR_PTR(ret);
+ }
- return filter;
+ atomic_set(&sfilter->usage, 1);
-free_filter_prog:
- __bpf_prog_free(filter->prog);
-free_filter:
- kfree(filter);
-free_prog:
- kfree(fp);
- return ERR_PTR(ret);
+ return sfilter;
}
/**
@@ -591,7 +548,11 @@ void secure_computing_strict(int this_syscall)
{
int mode = current->seccomp.mode;
- if (mode == 0)
+ if (config_enabled(CONFIG_CHECKPOINT_RESTORE) &&
+ unlikely(current->ptrace & PT_SUSPEND_SECCOMP))
+ return;
+
+ if (mode == SECCOMP_MODE_DISABLED)
return;
else if (mode == SECCOMP_MODE_STRICT)
__secure_computing_strict(this_syscall);
@@ -692,6 +653,10 @@ u32 seccomp_phase1(struct seccomp_data *sd)
int this_syscall = sd ? sd->nr :
syscall_get_nr(current, task_pt_regs(current));
+ if (config_enabled(CONFIG_CHECKPOINT_RESTORE) &&
+ unlikely(current->ptrace & PT_SUSPEND_SECCOMP))
+ return SECCOMP_PHASE1_OK;
+
switch (mode) {
case SECCOMP_MODE_STRICT:
__secure_computing_strict(this_syscall); /* may call do_exit */
diff --git a/kernel/signal.c b/kernel/signal.c
index a390499943e4..0f6bbbe77b46 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -245,7 +245,7 @@ static inline void print_dropped_signal(int sig)
* RETURNS:
* %true if @mask is set, %false if made noop because @task was dying.
*/
-bool task_set_jobctl_pending(struct task_struct *task, unsigned int mask)
+bool task_set_jobctl_pending(struct task_struct *task, unsigned long mask)
{
BUG_ON(mask & ~(JOBCTL_PENDING_MASK | JOBCTL_STOP_CONSUME |
JOBCTL_STOP_SIGMASK | JOBCTL_TRAPPING));
@@ -297,7 +297,7 @@ void task_clear_jobctl_trapping(struct task_struct *task)
* CONTEXT:
* Must be called with @task->sighand->siglock held.
*/
-void task_clear_jobctl_pending(struct task_struct *task, unsigned int mask)
+void task_clear_jobctl_pending(struct task_struct *task, unsigned long mask)
{
BUG_ON(mask & ~JOBCTL_PENDING_MASK);
@@ -414,21 +414,16 @@ void flush_sigqueue(struct sigpending *queue)
}
/*
- * Flush all pending signals for a task.
+ * Flush all pending signals for this kthread.
*/
-void __flush_signals(struct task_struct *t)
-{
- clear_tsk_thread_flag(t, TIF_SIGPENDING);
- flush_sigqueue(&t->pending);
- flush_sigqueue(&t->signal->shared_pending);
-}
-
void flush_signals(struct task_struct *t)
{
unsigned long flags;
spin_lock_irqsave(&t->sighand->siglock, flags);
- __flush_signals(t);
+ clear_tsk_thread_flag(t, TIF_SIGPENDING);
+ flush_sigqueue(&t->pending);
+ flush_sigqueue(&t->signal->shared_pending);
spin_unlock_irqrestore(&t->sighand->siglock, flags);
}
@@ -2000,7 +1995,7 @@ static bool do_signal_stop(int signr)
struct signal_struct *sig = current->signal;
if (!(current->jobctl & JOBCTL_STOP_PENDING)) {
- unsigned int gstop = JOBCTL_STOP_PENDING | JOBCTL_STOP_CONSUME;
+ unsigned long gstop = JOBCTL_STOP_PENDING | JOBCTL_STOP_CONSUME;
struct task_struct *t;
/* signr will be recorded in task->jobctl for retries */
@@ -2753,12 +2748,15 @@ int copy_siginfo_to_user(siginfo_t __user *to, const siginfo_t *from)
* Other callers might not initialize the si_lsb field,
* so check explicitly for the right codes here.
*/
- if (from->si_code == BUS_MCEERR_AR || from->si_code == BUS_MCEERR_AO)
+ if (from->si_signo == SIGBUS &&
+ (from->si_code == BUS_MCEERR_AR || from->si_code == BUS_MCEERR_AO))
err |= __put_user(from->si_addr_lsb, &to->si_addr_lsb);
#endif
#ifdef SEGV_BNDERR
- err |= __put_user(from->si_lower, &to->si_lower);
- err |= __put_user(from->si_upper, &to->si_upper);
+ if (from->si_signo == SIGSEGV && from->si_code == SEGV_BNDERR) {
+ err |= __put_user(from->si_lower, &to->si_lower);
+ err |= __put_user(from->si_upper, &to->si_upper);
+ }
#endif
break;
case __SI_CHLD:
@@ -2992,11 +2990,9 @@ static int do_rt_sigqueueinfo(pid_t pid, int sig, siginfo_t *info)
* Nor can they impersonate a kill()/tgkill(), which adds source info.
*/
if ((info->si_code >= 0 || info->si_code == SI_TKILL) &&
- (task_pid_vnr(current) != pid)) {
- /* We used to allow any < 0 si_code */
- WARN_ON_ONCE(info->si_code < 0);
+ (task_pid_vnr(current) != pid))
return -EPERM;
- }
+
info->si_signo = sig;
/* POSIX.1b doesn't mention process groups. */
@@ -3024,7 +3020,7 @@ COMPAT_SYSCALL_DEFINE3(rt_sigqueueinfo,
int, sig,
struct compat_siginfo __user *, uinfo)
{
- siginfo_t info;
+ siginfo_t info = {};
int ret = copy_siginfo_from_user32(&info, uinfo);
if (unlikely(ret))
return ret;
@@ -3041,12 +3037,10 @@ static int do_rt_tgsigqueueinfo(pid_t tgid, pid_t pid, int sig, siginfo_t *info)
/* Not even root can pretend to send signals from the kernel.
* Nor can they impersonate a kill()/tgkill(), which adds source info.
*/
- if (((info->si_code >= 0 || info->si_code == SI_TKILL)) &&
- (task_pid_vnr(current) != pid)) {
- /* We used to allow any < 0 si_code */
- WARN_ON_ONCE(info->si_code < 0);
+ if ((info->si_code >= 0 || info->si_code == SI_TKILL) &&
+ (task_pid_vnr(current) != pid))
return -EPERM;
- }
+
info->si_signo = sig;
return do_send_specific(tgid, pid, sig, info);
@@ -3070,7 +3064,7 @@ COMPAT_SYSCALL_DEFINE4(rt_tgsigqueueinfo,
int, sig,
struct compat_siginfo __user *, uinfo)
{
- siginfo_t info;
+ siginfo_t info = {};
if (copy_siginfo_from_user32(&info, uinfo))
return -EFAULT;
diff --git a/kernel/smp.c b/kernel/smp.c
index f38a1e692259..07854477c164 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -19,7 +19,7 @@
enum {
CSD_FLAG_LOCK = 0x01,
- CSD_FLAG_WAIT = 0x02,
+ CSD_FLAG_SYNCHRONOUS = 0x02,
};
struct call_function_data {
@@ -107,7 +107,7 @@ void __init call_function_init(void)
*/
static void csd_lock_wait(struct call_single_data *csd)
{
- while (csd->flags & CSD_FLAG_LOCK)
+ while (smp_load_acquire(&csd->flags) & CSD_FLAG_LOCK)
cpu_relax();
}
@@ -121,19 +121,17 @@ static void csd_lock(struct call_single_data *csd)
* to ->flags with any subsequent assignments to other
* fields of the specified call_single_data structure:
*/
- smp_mb();
+ smp_wmb();
}
static void csd_unlock(struct call_single_data *csd)
{
- WARN_ON((csd->flags & CSD_FLAG_WAIT) && !(csd->flags & CSD_FLAG_LOCK));
+ WARN_ON(!(csd->flags & CSD_FLAG_LOCK));
/*
* ensure we're all done before releasing data:
*/
- smp_mb();
-
- csd->flags &= ~CSD_FLAG_LOCK;
+ smp_store_release(&csd->flags, 0);
}
static DEFINE_PER_CPU_SHARED_ALIGNED(struct call_single_data, csd_data);
@@ -144,13 +142,16 @@ static DEFINE_PER_CPU_SHARED_ALIGNED(struct call_single_data, csd_data);
* ->func, ->info, and ->flags set.
*/
static int generic_exec_single(int cpu, struct call_single_data *csd,
- smp_call_func_t func, void *info, int wait)
+ smp_call_func_t func, void *info)
{
- struct call_single_data csd_stack = { .flags = 0 };
- unsigned long flags;
-
-
if (cpu == smp_processor_id()) {
+ unsigned long flags;
+
+ /*
+ * We can unlock early even for the synchronous on-stack case,
+ * since we're doing this from the same CPU..
+ */
+ csd_unlock(csd);
local_irq_save(flags);
func(info);
local_irq_restore(flags);
@@ -158,24 +159,14 @@ static int generic_exec_single(int cpu, struct call_single_data *csd,
}
- if ((unsigned)cpu >= nr_cpu_ids || !cpu_online(cpu))
+ if ((unsigned)cpu >= nr_cpu_ids || !cpu_online(cpu)) {
+ csd_unlock(csd);
return -ENXIO;
-
-
- if (!csd) {
- csd = &csd_stack;
- if (!wait)
- csd = this_cpu_ptr(&csd_data);
}
- csd_lock(csd);
-
csd->func = func;
csd->info = info;
- if (wait)
- csd->flags |= CSD_FLAG_WAIT;
-
/*
* The list addition should be visible before sending the IPI
* handler locks the list to pull the entry off it because of
@@ -190,9 +181,6 @@ static int generic_exec_single(int cpu, struct call_single_data *csd,
if (llist_add(&csd->llist, &per_cpu(call_single_queue, cpu)))
arch_send_call_function_single_ipi(cpu);
- if (wait)
- csd_lock_wait(csd);
-
return 0;
}
@@ -250,8 +238,17 @@ static void flush_smp_call_function_queue(bool warn_cpu_offline)
}
llist_for_each_entry_safe(csd, csd_next, entry, llist) {
- csd->func(csd->info);
- csd_unlock(csd);
+ smp_call_func_t func = csd->func;
+ void *info = csd->info;
+
+ /* Do we wait until *after* callback? */
+ if (csd->flags & CSD_FLAG_SYNCHRONOUS) {
+ func(info);
+ csd_unlock(csd);
+ } else {
+ csd_unlock(csd);
+ func(info);
+ }
}
/*
@@ -274,6 +271,8 @@ static void flush_smp_call_function_queue(bool warn_cpu_offline)
int smp_call_function_single(int cpu, smp_call_func_t func, void *info,
int wait)
{
+ struct call_single_data *csd;
+ struct call_single_data csd_stack = { .flags = CSD_FLAG_LOCK | CSD_FLAG_SYNCHRONOUS };
int this_cpu;
int err;
@@ -292,7 +291,16 @@ int smp_call_function_single(int cpu, smp_call_func_t func, void *info,
WARN_ON_ONCE(cpu_online(this_cpu) && irqs_disabled()
&& !oops_in_progress);
- err = generic_exec_single(cpu, NULL, func, info, wait);
+ csd = &csd_stack;
+ if (!wait) {
+ csd = this_cpu_ptr(&csd_data);
+ csd_lock(csd);
+ }
+
+ err = generic_exec_single(cpu, csd, func, info);
+
+ if (wait)
+ csd_lock_wait(csd);
put_cpu();
@@ -321,7 +329,15 @@ int smp_call_function_single_async(int cpu, struct call_single_data *csd)
int err = 0;
preempt_disable();
- err = generic_exec_single(cpu, csd, csd->func, csd->info, 0);
+
+ /* We could deadlock if we have to wait here with interrupts disabled! */
+ if (WARN_ON_ONCE(csd->flags & CSD_FLAG_LOCK))
+ csd_lock_wait(csd);
+
+ csd->flags = CSD_FLAG_LOCK;
+ smp_wmb();
+
+ err = generic_exec_single(cpu, csd, csd->func, csd->info);
preempt_enable();
return err;
@@ -433,6 +449,8 @@ void smp_call_function_many(const struct cpumask *mask,
struct call_single_data *csd = per_cpu_ptr(cfd->csd, cpu);
csd_lock(csd);
+ if (wait)
+ csd->flags |= CSD_FLAG_SYNCHRONOUS;
csd->func = func;
csd->info = info;
llist_add(&csd->llist, &per_cpu(call_single_queue, cpu));
diff --git a/kernel/smpboot.c b/kernel/smpboot.c
index 40190f28db35..a818cbc73e14 100644
--- a/kernel/smpboot.c
+++ b/kernel/smpboot.c
@@ -4,6 +4,7 @@
#include <linux/cpu.h>
#include <linux/err.h>
#include <linux/smp.h>
+#include <linux/delay.h>
#include <linux/init.h>
#include <linux/list.h>
#include <linux/slab.h>
@@ -112,7 +113,8 @@ static int smpboot_thread_fn(void *data)
if (kthread_should_stop()) {
__set_current_state(TASK_RUNNING);
preempt_enable();
- if (ht->cleanup)
+ /* cleanup must mirror setup */
+ if (ht->cleanup && td->status != HP_THREAD_NONE)
ht->cleanup(td->cpu, cpu_online(td->cpu));
kfree(td);
return 0;
@@ -231,7 +233,8 @@ void smpboot_unpark_threads(unsigned int cpu)
mutex_lock(&smpboot_threads_lock);
list_for_each_entry(cur, &hotplug_threads, list)
- smpboot_unpark_thread(cur, cpu);
+ if (cpumask_test_cpu(cpu, cur->cpumask))
+ smpboot_unpark_thread(cur, cpu);
mutex_unlock(&smpboot_threads_lock);
}
@@ -270,25 +273,34 @@ static void smpboot_destroy_threads(struct smp_hotplug_thread *ht)
}
/**
- * smpboot_register_percpu_thread - Register a per_cpu thread related to hotplug
+ * smpboot_register_percpu_thread_cpumask - Register a per_cpu thread related
+ * to hotplug
* @plug_thread: Hotplug thread descriptor
+ * @cpumask: The cpumask where threads run
*
* Creates and starts the threads on all online cpus.
*/
-int smpboot_register_percpu_thread(struct smp_hotplug_thread *plug_thread)
+int smpboot_register_percpu_thread_cpumask(struct smp_hotplug_thread *plug_thread,
+ const struct cpumask *cpumask)
{
unsigned int cpu;
int ret = 0;
+ if (!alloc_cpumask_var(&plug_thread->cpumask, GFP_KERNEL))
+ return -ENOMEM;
+ cpumask_copy(plug_thread->cpumask, cpumask);
+
get_online_cpus();
mutex_lock(&smpboot_threads_lock);
for_each_online_cpu(cpu) {
ret = __smpboot_create_thread(plug_thread, cpu);
if (ret) {
smpboot_destroy_threads(plug_thread);
+ free_cpumask_var(plug_thread->cpumask);
goto out;
}
- smpboot_unpark_thread(plug_thread, cpu);
+ if (cpumask_test_cpu(cpu, cpumask))
+ smpboot_unpark_thread(plug_thread, cpu);
}
list_add(&plug_thread->list, &hotplug_threads);
out:
@@ -296,7 +308,7 @@ out:
put_online_cpus();
return ret;
}
-EXPORT_SYMBOL_GPL(smpboot_register_percpu_thread);
+EXPORT_SYMBOL_GPL(smpboot_register_percpu_thread_cpumask);
/**
* smpboot_unregister_percpu_thread - Unregister a per_cpu thread related to hotplug
@@ -312,5 +324,204 @@ void smpboot_unregister_percpu_thread(struct smp_hotplug_thread *plug_thread)
smpboot_destroy_threads(plug_thread);
mutex_unlock(&smpboot_threads_lock);
put_online_cpus();
+ free_cpumask_var(plug_thread->cpumask);
}
EXPORT_SYMBOL_GPL(smpboot_unregister_percpu_thread);
+
+/**
+ * smpboot_update_cpumask_percpu_thread - Adjust which per_cpu hotplug threads stay parked
+ * @plug_thread: Hotplug thread descriptor
+ * @new: Revised mask to use
+ *
+ * The cpumask field in the smp_hotplug_thread must not be updated directly
+ * by the client, but only by calling this function.
+ * This function can only be called on a registered smp_hotplug_thread.
+ */
+int smpboot_update_cpumask_percpu_thread(struct smp_hotplug_thread *plug_thread,
+ const struct cpumask *new)
+{
+ struct cpumask *old = plug_thread->cpumask;
+ cpumask_var_t tmp;
+ unsigned int cpu;
+
+ if (!alloc_cpumask_var(&tmp, GFP_KERNEL))
+ return -ENOMEM;
+
+ get_online_cpus();
+ mutex_lock(&smpboot_threads_lock);
+
+ /* Park threads that were exclusively enabled on the old mask. */
+ cpumask_andnot(tmp, old, new);
+ for_each_cpu_and(cpu, tmp, cpu_online_mask)
+ smpboot_park_thread(plug_thread, cpu);
+
+ /* Unpark threads that are exclusively enabled on the new mask. */
+ cpumask_andnot(tmp, new, old);
+ for_each_cpu_and(cpu, tmp, cpu_online_mask)
+ smpboot_unpark_thread(plug_thread, cpu);
+
+ cpumask_copy(old, new);
+
+ mutex_unlock(&smpboot_threads_lock);
+ put_online_cpus();
+
+ free_cpumask_var(tmp);
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(smpboot_update_cpumask_percpu_thread);
+
+static DEFINE_PER_CPU(atomic_t, cpu_hotplug_state) = ATOMIC_INIT(CPU_POST_DEAD);
+
+/*
+ * Called to poll specified CPU's state, for example, when waiting for
+ * a CPU to come online.
+ */
+int cpu_report_state(int cpu)
+{
+ return atomic_read(&per_cpu(cpu_hotplug_state, cpu));
+}
+
+/*
+ * If CPU has died properly, set its state to CPU_UP_PREPARE and
+ * return success. Otherwise, return -EBUSY if the CPU died after
+ * cpu_wait_death() timed out. And yet otherwise again, return -EAGAIN
+ * if cpu_wait_death() timed out and the CPU still hasn't gotten around
+ * to dying. In the latter two cases, the CPU might not be set up
+ * properly, but it is up to the arch-specific code to decide.
+ * Finally, -EIO indicates an unanticipated problem.
+ *
+ * Note that it is permissible to omit this call entirely, as is
+ * done in architectures that do no CPU-hotplug error checking.
+ */
+int cpu_check_up_prepare(int cpu)
+{
+ if (!IS_ENABLED(CONFIG_HOTPLUG_CPU)) {
+ atomic_set(&per_cpu(cpu_hotplug_state, cpu), CPU_UP_PREPARE);
+ return 0;
+ }
+
+ switch (atomic_read(&per_cpu(cpu_hotplug_state, cpu))) {
+
+ case CPU_POST_DEAD:
+
+ /* The CPU died properly, so just start it up again. */
+ atomic_set(&per_cpu(cpu_hotplug_state, cpu), CPU_UP_PREPARE);
+ return 0;
+
+ case CPU_DEAD_FROZEN:
+
+ /*
+ * Timeout during CPU death, so let caller know.
+ * The outgoing CPU completed its processing, but after
+ * cpu_wait_death() timed out and reported the error. The
+ * caller is free to proceed, in which case the state
+ * will be reset properly by cpu_set_state_online().
+ * Proceeding despite this -EBUSY return makes sense
+ * for systems where the outgoing CPUs take themselves
+ * offline, with no post-death manipulation required from
+ * a surviving CPU.
+ */
+ return -EBUSY;
+
+ case CPU_BROKEN:
+
+ /*
+ * The most likely reason we got here is that there was
+ * a timeout during CPU death, and the outgoing CPU never
+ * did complete its processing. This could happen on
+ * a virtualized system if the outgoing VCPU gets preempted
+ * for more than five seconds, and the user attempts to
+ * immediately online that same CPU. Trying again later
+ * might return -EBUSY above, hence -EAGAIN.
+ */
+ return -EAGAIN;
+
+ default:
+
+ /* Should not happen. Famous last words. */
+ return -EIO;
+ }
+}
+
+/*
+ * Mark the specified CPU online.
+ *
+ * Note that it is permissible to omit this call entirely, as is
+ * done in architectures that do no CPU-hotplug error checking.
+ */
+void cpu_set_state_online(int cpu)
+{
+ (void)atomic_xchg(&per_cpu(cpu_hotplug_state, cpu), CPU_ONLINE);
+}
+
+#ifdef CONFIG_HOTPLUG_CPU
+
+/*
+ * Wait for the specified CPU to exit the idle loop and die.
+ */
+bool cpu_wait_death(unsigned int cpu, int seconds)
+{
+ int jf_left = seconds * HZ;
+ int oldstate;
+ bool ret = true;
+ int sleep_jf = 1;
+
+ might_sleep();
+
+ /* The outgoing CPU will normally get done quite quickly. */
+ if (atomic_read(&per_cpu(cpu_hotplug_state, cpu)) == CPU_DEAD)
+ goto update_state;
+ udelay(5);
+
+ /* But if the outgoing CPU dawdles, wait increasingly long times. */
+ while (atomic_read(&per_cpu(cpu_hotplug_state, cpu)) != CPU_DEAD) {
+ schedule_timeout_uninterruptible(sleep_jf);
+ jf_left -= sleep_jf;
+ if (jf_left <= 0)
+ break;
+ sleep_jf = DIV_ROUND_UP(sleep_jf * 11, 10);
+ }
+update_state:
+ oldstate = atomic_read(&per_cpu(cpu_hotplug_state, cpu));
+ if (oldstate == CPU_DEAD) {
+ /* Outgoing CPU died normally, update state. */
+ smp_mb(); /* atomic_read() before update. */
+ atomic_set(&per_cpu(cpu_hotplug_state, cpu), CPU_POST_DEAD);
+ } else {
+ /* Outgoing CPU still hasn't died, set state accordingly. */
+ if (atomic_cmpxchg(&per_cpu(cpu_hotplug_state, cpu),
+ oldstate, CPU_BROKEN) != oldstate)
+ goto update_state;
+ ret = false;
+ }
+ return ret;
+}
+
+/*
+ * Called by the outgoing CPU to report its successful death. Return
+ * false if this report follows the surviving CPU's timing out.
+ *
+ * A separate "CPU_DEAD_FROZEN" is used when the surviving CPU
+ * timed out. This approach allows architectures to omit calls to
+ * cpu_check_up_prepare() and cpu_set_state_online() without defeating
+ * the next cpu_wait_death()'s polling loop.
+ */
+bool cpu_report_death(void)
+{
+ int oldstate;
+ int newstate;
+ int cpu = smp_processor_id();
+
+ do {
+ oldstate = atomic_read(&per_cpu(cpu_hotplug_state, cpu));
+ if (oldstate != CPU_BROKEN)
+ newstate = CPU_DEAD;
+ else
+ newstate = CPU_DEAD_FROZEN;
+ } while (atomic_cmpxchg(&per_cpu(cpu_hotplug_state, cpu),
+ oldstate, newstate) != oldstate);
+ return newstate == CPU_DEAD;
+}
+
+#endif /* #ifdef CONFIG_HOTPLUG_CPU */
diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c
index 695f0c6cd169..12484e5d5c88 100644
--- a/kernel/stop_machine.c
+++ b/kernel/stop_machine.c
@@ -35,13 +35,16 @@ struct cpu_stop_done {
/* the actual stopper, one per every possible cpu, enabled on online cpus */
struct cpu_stopper {
+ struct task_struct *thread;
+
spinlock_t lock;
bool enabled; /* is this stopper enabled? */
struct list_head works; /* list of pending works */
+
+ struct cpu_stop_work stop_work; /* for stop_cpus */
};
static DEFINE_PER_CPU(struct cpu_stopper, cpu_stopper);
-static DEFINE_PER_CPU(struct task_struct *, cpu_stopper_task);
static bool stop_machine_initialized = false;
/*
@@ -74,7 +77,6 @@ static void cpu_stop_signal_done(struct cpu_stop_done *done, bool executed)
static void cpu_stop_queue_work(unsigned int cpu, struct cpu_stop_work *work)
{
struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu);
- struct task_struct *p = per_cpu(cpu_stopper_task, cpu);
unsigned long flags;
@@ -82,7 +84,7 @@ static void cpu_stop_queue_work(unsigned int cpu, struct cpu_stop_work *work)
if (stopper->enabled) {
list_add_tail(&work->list, &stopper->works);
- wake_up_process(p);
+ wake_up_process(stopper->thread);
} else
cpu_stop_signal_done(work->done, false);
@@ -139,7 +141,7 @@ enum multi_stop_state {
};
struct multi_stop_data {
- int (*fn)(void *);
+ cpu_stop_fn_t fn;
void *data;
/* Like num_online_cpus(), but hotplug cpu uses us, so we need this. */
unsigned int num_threads;
@@ -211,25 +213,6 @@ static int multi_cpu_stop(void *data)
return err;
}
-struct irq_cpu_stop_queue_work_info {
- int cpu1;
- int cpu2;
- struct cpu_stop_work *work1;
- struct cpu_stop_work *work2;
-};
-
-/*
- * This function is always run with irqs and preemption disabled.
- * This guarantees that both work1 and work2 get queued, before
- * our local migrate thread gets the chance to preempt us.
- */
-static void irq_cpu_stop_queue_work(void *arg)
-{
- struct irq_cpu_stop_queue_work_info *info = arg;
- cpu_stop_queue_work(info->cpu1, info->work1);
- cpu_stop_queue_work(info->cpu2, info->work2);
-}
-
/**
* stop_two_cpus - stops two cpus
* @cpu1: the cpu to stop
@@ -245,7 +228,6 @@ int stop_two_cpus(unsigned int cpu1, unsigned int cpu2, cpu_stop_fn_t fn, void *
{
struct cpu_stop_done done;
struct cpu_stop_work work1, work2;
- struct irq_cpu_stop_queue_work_info call_args;
struct multi_stop_data msdata;
preempt_disable();
@@ -262,13 +244,6 @@ int stop_two_cpus(unsigned int cpu1, unsigned int cpu2, cpu_stop_fn_t fn, void *
.done = &done
};
- call_args = (struct irq_cpu_stop_queue_work_info){
- .cpu1 = cpu1,
- .cpu2 = cpu2,
- .work1 = &work1,
- .work2 = &work2,
- };
-
cpu_stop_init_done(&done, 2);
set_state(&msdata, MULTI_STOP_PREPARE);
@@ -285,16 +260,11 @@ int stop_two_cpus(unsigned int cpu1, unsigned int cpu2, cpu_stop_fn_t fn, void *
return -ENOENT;
}
- lg_local_lock(&stop_cpus_lock);
- /*
- * Queuing needs to be done by the lowest numbered CPU, to ensure
- * that works are always queued in the same order on every CPU.
- * This prevents deadlocks.
- */
- smp_call_function_single(min(cpu1, cpu2),
- &irq_cpu_stop_queue_work,
- &call_args, 1);
- lg_local_unlock(&stop_cpus_lock);
+ lg_double_lock(&stop_cpus_lock, cpu1, cpu2);
+ cpu_stop_queue_work(cpu1, &work1);
+ cpu_stop_queue_work(cpu2, &work2);
+ lg_double_unlock(&stop_cpus_lock, cpu1, cpu2);
+
preempt_enable();
wait_for_completion(&done.completion);
@@ -325,7 +295,6 @@ void stop_one_cpu_nowait(unsigned int cpu, cpu_stop_fn_t fn, void *arg,
/* static data for stop_cpus */
static DEFINE_MUTEX(stop_cpus_mutex);
-static DEFINE_PER_CPU(struct cpu_stop_work, stop_cpus_work);
static void queue_stop_cpus_work(const struct cpumask *cpumask,
cpu_stop_fn_t fn, void *arg,
@@ -334,22 +303,19 @@ static void queue_stop_cpus_work(const struct cpumask *cpumask,
struct cpu_stop_work *work;
unsigned int cpu;
- /* initialize works and done */
- for_each_cpu(cpu, cpumask) {
- work = &per_cpu(stop_cpus_work, cpu);
- work->fn = fn;
- work->arg = arg;
- work->done = done;
- }
-
/*
* Disable preemption while queueing to avoid getting
* preempted by a stopper which might wait for other stoppers
* to enter @fn which can lead to deadlock.
*/
lg_global_lock(&stop_cpus_lock);
- for_each_cpu(cpu, cpumask)
- cpu_stop_queue_work(cpu, &per_cpu(stop_cpus_work, cpu));
+ for_each_cpu(cpu, cpumask) {
+ work = &per_cpu(cpu_stopper.stop_work, cpu);
+ work->fn = fn;
+ work->arg = arg;
+ work->done = done;
+ cpu_stop_queue_work(cpu, work);
+ }
lg_global_unlock(&stop_cpus_lock);
}
@@ -490,19 +456,21 @@ extern void sched_set_stop_task(int cpu, struct task_struct *stop);
static void cpu_stop_create(unsigned int cpu)
{
- sched_set_stop_task(cpu, per_cpu(cpu_stopper_task, cpu));
+ sched_set_stop_task(cpu, per_cpu(cpu_stopper.thread, cpu));
}
static void cpu_stop_park(unsigned int cpu)
{
struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu);
- struct cpu_stop_work *work;
+ struct cpu_stop_work *work, *tmp;
unsigned long flags;
/* drain remaining works */
spin_lock_irqsave(&stopper->lock, flags);
- list_for_each_entry(work, &stopper->works, list)
+ list_for_each_entry_safe(work, tmp, &stopper->works, list) {
+ list_del_init(&work->list);
cpu_stop_signal_done(work->done, false);
+ }
stopper->enabled = false;
spin_unlock_irqrestore(&stopper->lock, flags);
}
@@ -517,7 +485,7 @@ static void cpu_stop_unpark(unsigned int cpu)
}
static struct smp_hotplug_thread cpu_stop_threads = {
- .store = &cpu_stopper_task,
+ .store = &cpu_stopper.thread,
.thread_should_run = cpu_stop_should_run,
.thread_fn = cpu_stopper_thread,
.thread_comm = "migration/%u",
@@ -547,7 +515,7 @@ early_initcall(cpu_stop_init);
#ifdef CONFIG_STOP_MACHINE
-int __stop_machine(int (*fn)(void *), void *data, const struct cpumask *cpus)
+static int __stop_machine(cpu_stop_fn_t fn, void *data, const struct cpumask *cpus)
{
struct multi_stop_data msdata = {
.fn = fn,
@@ -580,7 +548,7 @@ int __stop_machine(int (*fn)(void *), void *data, const struct cpumask *cpus)
return stop_cpus(cpu_online_mask, multi_cpu_stop, &msdata);
}
-int stop_machine(int (*fn)(void *), void *data, const struct cpumask *cpus)
+int stop_machine(cpu_stop_fn_t fn, void *data, const struct cpumask *cpus)
{
int ret;
@@ -614,7 +582,7 @@ EXPORT_SYMBOL_GPL(stop_machine);
* 0 if all executions of @fn returned 0, any non zero return value if any
* returned non zero.
*/
-int stop_machine_from_inactive_cpu(int (*fn)(void *), void *data,
+int stop_machine_from_inactive_cpu(cpu_stop_fn_t fn, void *data,
const struct cpumask *cpus)
{
struct multi_stop_data msdata = { .fn = fn, .data = data,
diff --git a/kernel/sys.c b/kernel/sys.c
index a03d9cd23ed7..fa2f2f671a5c 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -92,10 +92,10 @@
# define SET_TSC_CTL(a) (-EINVAL)
#endif
#ifndef MPX_ENABLE_MANAGEMENT
-# define MPX_ENABLE_MANAGEMENT(a) (-EINVAL)
+# define MPX_ENABLE_MANAGEMENT() (-EINVAL)
#endif
#ifndef MPX_DISABLE_MANAGEMENT
-# define MPX_DISABLE_MANAGEMENT(a) (-EINVAL)
+# define MPX_DISABLE_MANAGEMENT() (-EINVAL)
#endif
#ifndef GET_FP_MODE
# define GET_FP_MODE(a) (-EINVAL)
@@ -325,6 +325,7 @@ out_unlock:
* SMP: There are not races, the GIDs are checked only by filesystem
* operations (as far as semantic preservation is concerned).
*/
+#ifdef CONFIG_MULTIUSER
SYSCALL_DEFINE2(setregid, gid_t, rgid, gid_t, egid)
{
struct user_namespace *ns = current_user_ns();
@@ -815,6 +816,7 @@ change_okay:
commit_creds(new);
return old_fsgid;
}
+#endif /* CONFIG_MULTIUSER */
/**
* sys_getpid - return the thread group id of the current process
@@ -1647,14 +1649,13 @@ SYSCALL_DEFINE1(umask, int, mask)
return mask;
}
-static int prctl_set_mm_exe_file_locked(struct mm_struct *mm, unsigned int fd)
+static int prctl_set_mm_exe_file(struct mm_struct *mm, unsigned int fd)
{
struct fd exe;
+ struct file *old_exe, *exe_file;
struct inode *inode;
int err;
- VM_BUG_ON_MM(!rwsem_is_locked(&mm->mmap_sem), mm);
-
exe = fdget(fd);
if (!exe.file)
return -EBADF;
@@ -1667,8 +1668,7 @@ static int prctl_set_mm_exe_file_locked(struct mm_struct *mm, unsigned int fd)
* overall picture.
*/
err = -EACCES;
- if (!S_ISREG(inode->i_mode) ||
- exe.file->f_path.mnt->mnt_flags & MNT_NOEXEC)
+ if (!S_ISREG(inode->i_mode) || path_noexec(&exe.file->f_path))
goto exit;
err = inode_permission(inode, MAY_EXEC);
@@ -1678,15 +1678,22 @@ static int prctl_set_mm_exe_file_locked(struct mm_struct *mm, unsigned int fd)
/*
* Forbid mm->exe_file change if old file still mapped.
*/
+ exe_file = get_mm_exe_file(mm);
err = -EBUSY;
- if (mm->exe_file) {
+ if (exe_file) {
struct vm_area_struct *vma;
- for (vma = mm->mmap; vma; vma = vma->vm_next)
- if (vma->vm_file &&
- path_equal(&vma->vm_file->f_path,
- &mm->exe_file->f_path))
- goto exit;
+ down_read(&mm->mmap_sem);
+ for (vma = mm->mmap; vma; vma = vma->vm_next) {
+ if (!vma->vm_file)
+ continue;
+ if (path_equal(&vma->vm_file->f_path,
+ &exe_file->f_path))
+ goto exit_err;
+ }
+
+ up_read(&mm->mmap_sem);
+ fput(exe_file);
}
/*
@@ -1700,13 +1707,20 @@ static int prctl_set_mm_exe_file_locked(struct mm_struct *mm, unsigned int fd)
goto exit;
err = 0;
- set_mm_exe_file(mm, exe.file); /* this grabs a reference to exe.file */
+ /* set the new file, lockless */
+ get_file(exe.file);
+ old_exe = xchg(&mm->exe_file, exe.file);
+ if (old_exe)
+ fput(old_exe);
exit:
fdput(exe);
return err;
+exit_err:
+ up_read(&mm->mmap_sem);
+ fput(exe_file);
+ goto exit;
}
-#ifdef CONFIG_CHECKPOINT_RESTORE
/*
* WARNING: we don't require any capability here so be very careful
* in what is allowed for modification from userspace.
@@ -1802,6 +1816,7 @@ out:
return error;
}
+#ifdef CONFIG_CHECKPOINT_RESTORE
static int prctl_set_mm_map(int opt, const void __user *addr, unsigned long data_size)
{
struct prctl_mm_map prctl_map = { .exe_fd = (u32)-1, };
@@ -1838,10 +1853,9 @@ static int prctl_set_mm_map(int opt, const void __user *addr, unsigned long data
user_auxv[AT_VECTOR_SIZE - 1] = AT_NULL;
}
- down_write(&mm->mmap_sem);
if (prctl_map.exe_fd != (u32)-1)
- error = prctl_set_mm_exe_file_locked(mm, prctl_map.exe_fd);
- downgrade_write(&mm->mmap_sem);
+ error = prctl_set_mm_exe_file(mm, prctl_map.exe_fd);
+ down_read(&mm->mmap_sem);
if (error)
goto out;
@@ -1887,10 +1901,41 @@ out:
}
#endif /* CONFIG_CHECKPOINT_RESTORE */
+static int prctl_set_auxv(struct mm_struct *mm, unsigned long addr,
+ unsigned long len)
+{
+ /*
+ * This doesn't move the auxiliary vector itself since it's pinned to
+ * mm_struct, but it permits filling the vector with new values. It's
+ * up to the caller to provide sane values here, otherwise userspace
+ * tools which use this vector might be unhappy.
+ */
+ unsigned long user_auxv[AT_VECTOR_SIZE];
+
+ if (len > sizeof(user_auxv))
+ return -EINVAL;
+
+ if (copy_from_user(user_auxv, (const void __user *)addr, len))
+ return -EFAULT;
+
+ /* Make sure the last entry is always AT_NULL */
+ user_auxv[AT_VECTOR_SIZE - 2] = 0;
+ user_auxv[AT_VECTOR_SIZE - 1] = 0;
+
+ BUILD_BUG_ON(sizeof(user_auxv) != sizeof(mm->saved_auxv));
+
+ task_lock(current);
+ memcpy(mm->saved_auxv, user_auxv, len);
+ task_unlock(current);
+
+ return 0;
+}
+
static int prctl_set_mm(int opt, unsigned long addr,
unsigned long arg4, unsigned long arg5)
{
struct mm_struct *mm = current->mm;
+ struct prctl_mm_map prctl_map;
struct vm_area_struct *vma;
int error;
@@ -1907,12 +1952,11 @@ static int prctl_set_mm(int opt, unsigned long addr,
if (!capable(CAP_SYS_RESOURCE))
return -EPERM;
- if (opt == PR_SET_MM_EXE_FILE) {
- down_write(&mm->mmap_sem);
- error = prctl_set_mm_exe_file_locked(mm, (unsigned int)addr);
- up_write(&mm->mmap_sem);
- return error;
- }
+ if (opt == PR_SET_MM_EXE_FILE)
+ return prctl_set_mm_exe_file(mm, (unsigned int)addr);
+
+ if (opt == PR_SET_MM_AUXV)
+ return prctl_set_auxv(mm, addr, arg4);
if (addr >= TASK_SIZE || addr < mmap_min_addr)
return -EINVAL;
@@ -1922,42 +1966,64 @@ static int prctl_set_mm(int opt, unsigned long addr,
down_read(&mm->mmap_sem);
vma = find_vma(mm, addr);
+ prctl_map.start_code = mm->start_code;
+ prctl_map.end_code = mm->end_code;
+ prctl_map.start_data = mm->start_data;
+ prctl_map.end_data = mm->end_data;
+ prctl_map.start_brk = mm->start_brk;
+ prctl_map.brk = mm->brk;
+ prctl_map.start_stack = mm->start_stack;
+ prctl_map.arg_start = mm->arg_start;
+ prctl_map.arg_end = mm->arg_end;
+ prctl_map.env_start = mm->env_start;
+ prctl_map.env_end = mm->env_end;
+ prctl_map.auxv = NULL;
+ prctl_map.auxv_size = 0;
+ prctl_map.exe_fd = -1;
+
switch (opt) {
case PR_SET_MM_START_CODE:
- mm->start_code = addr;
+ prctl_map.start_code = addr;
break;
case PR_SET_MM_END_CODE:
- mm->end_code = addr;
+ prctl_map.end_code = addr;
break;
case PR_SET_MM_START_DATA:
- mm->start_data = addr;
+ prctl_map.start_data = addr;
break;
case PR_SET_MM_END_DATA:
- mm->end_data = addr;
+ prctl_map.end_data = addr;
+ break;
+ case PR_SET_MM_START_STACK:
+ prctl_map.start_stack = addr;
break;
-
case PR_SET_MM_START_BRK:
- if (addr <= mm->end_data)
- goto out;
-
- if (check_data_rlimit(rlimit(RLIMIT_DATA), mm->brk, addr,
- mm->end_data, mm->start_data))
- goto out;
-
- mm->start_brk = addr;
+ prctl_map.start_brk = addr;
break;
-
case PR_SET_MM_BRK:
- if (addr <= mm->end_data)
- goto out;
-
- if (check_data_rlimit(rlimit(RLIMIT_DATA), addr, mm->start_brk,
- mm->end_data, mm->start_data))
- goto out;
-
- mm->brk = addr;
+ prctl_map.brk = addr;
break;
+ case PR_SET_MM_ARG_START:
+ prctl_map.arg_start = addr;
+ break;
+ case PR_SET_MM_ARG_END:
+ prctl_map.arg_end = addr;
+ break;
+ case PR_SET_MM_ENV_START:
+ prctl_map.env_start = addr;
+ break;
+ case PR_SET_MM_ENV_END:
+ prctl_map.env_end = addr;
+ break;
+ default:
+ goto out;
+ }
+ error = validate_prctl_map(&prctl_map);
+ if (error)
+ goto out;
+
+ switch (opt) {
/*
* If command line arguments and environment
* are placed somewhere else on stack, we can
@@ -1974,52 +2040,20 @@ static int prctl_set_mm(int opt, unsigned long addr,
error = -EFAULT;
goto out;
}
- if (opt == PR_SET_MM_START_STACK)
- mm->start_stack = addr;
- else if (opt == PR_SET_MM_ARG_START)
- mm->arg_start = addr;
- else if (opt == PR_SET_MM_ARG_END)
- mm->arg_end = addr;
- else if (opt == PR_SET_MM_ENV_START)
- mm->env_start = addr;
- else if (opt == PR_SET_MM_ENV_END)
- mm->env_end = addr;
- break;
-
- /*
- * This doesn't move auxiliary vector itself
- * since it's pinned to mm_struct, but allow
- * to fill vector with new values. It's up
- * to a caller to provide sane values here
- * otherwise user space tools which use this
- * vector might be unhappy.
- */
- case PR_SET_MM_AUXV: {
- unsigned long user_auxv[AT_VECTOR_SIZE];
-
- if (arg4 > sizeof(user_auxv))
- goto out;
- up_read(&mm->mmap_sem);
-
- if (copy_from_user(user_auxv, (const void __user *)addr, arg4))
- return -EFAULT;
-
- /* Make sure the last entry is always AT_NULL */
- user_auxv[AT_VECTOR_SIZE - 2] = 0;
- user_auxv[AT_VECTOR_SIZE - 1] = 0;
-
- BUILD_BUG_ON(sizeof(user_auxv) != sizeof(mm->saved_auxv));
-
- task_lock(current);
- memcpy(mm->saved_auxv, user_auxv, arg4);
- task_unlock(current);
-
- return 0;
- }
- default:
- goto out;
}
+ mm->start_code = prctl_map.start_code;
+ mm->end_code = prctl_map.end_code;
+ mm->start_data = prctl_map.start_data;
+ mm->end_data = prctl_map.end_data;
+ mm->start_brk = prctl_map.start_brk;
+ mm->brk = prctl_map.brk;
+ mm->start_stack = prctl_map.start_stack;
+ mm->arg_start = prctl_map.arg_start;
+ mm->arg_end = prctl_map.arg_end;
+ mm->env_start = prctl_map.env_start;
+ mm->env_end = prctl_map.env_end;
+
error = 0;
out:
up_read(&mm->mmap_sem);
@@ -2219,12 +2253,12 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
case PR_MPX_ENABLE_MANAGEMENT:
if (arg2 || arg3 || arg4 || arg5)
return -EINVAL;
- error = MPX_ENABLE_MANAGEMENT(me);
+ error = MPX_ENABLE_MANAGEMENT();
break;
case PR_MPX_DISABLE_MANAGEMENT:
if (arg2 || arg3 || arg4 || arg5)
return -EINVAL;
- error = MPX_DISABLE_MANAGEMENT(me);
+ error = MPX_DISABLE_MANAGEMENT();
break;
case PR_SET_FP_MODE:
error = SET_FP_MODE(me, arg2);
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index 5adcb0ae3a58..03c3875d9958 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -140,6 +140,7 @@ cond_syscall(sys_sgetmask);
cond_syscall(sys_ssetmask);
cond_syscall(sys_vm86old);
cond_syscall(sys_vm86);
+cond_syscall(sys_modify_ldt);
cond_syscall(sys_ipc);
cond_syscall(compat_sys_ipc);
cond_syscall(compat_sys_sysctl);
@@ -159,6 +160,20 @@ cond_syscall(sys_uselib);
cond_syscall(sys_fadvise64);
cond_syscall(sys_fadvise64_64);
cond_syscall(sys_madvise);
+cond_syscall(sys_setuid);
+cond_syscall(sys_setregid);
+cond_syscall(sys_setgid);
+cond_syscall(sys_setreuid);
+cond_syscall(sys_setresuid);
+cond_syscall(sys_getresuid);
+cond_syscall(sys_setresgid);
+cond_syscall(sys_getresgid);
+cond_syscall(sys_setgroups);
+cond_syscall(sys_getgroups);
+cond_syscall(sys_setfsuid);
+cond_syscall(sys_setfsgid);
+cond_syscall(sys_capget);
+cond_syscall(sys_capset);
/* arch-specific weak syscall entries */
cond_syscall(sys_pciconfig_read);
@@ -204,6 +219,7 @@ cond_syscall(compat_sys_timerfd_gettime);
cond_syscall(sys_eventfd);
cond_syscall(sys_eventfd2);
cond_syscall(sys_memfd_create);
+cond_syscall(sys_userfaultfd);
/* performance counters: */
cond_syscall(sys_perf_event_open);
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index ce410bb9f2e1..19b62b522158 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -19,6 +19,7 @@
*/
#include <linux/module.h>
+#include <linux/aio.h>
#include <linux/mm.h>
#include <linux/swap.h>
#include <linux/slab.h>
@@ -92,11 +93,9 @@
#include <linux/nmi.h>
#endif
-
#if defined(CONFIG_SYSCTL)
/* External variables not in a header file. */
-extern int max_threads;
extern int suid_dumpable;
#ifdef CONFIG_COREDUMP
extern int core_uses_pid;
@@ -350,15 +349,6 @@ static struct ctl_table kern_table[] = {
.mode = 0644,
.proc_handler = proc_dointvec,
},
- {
- .procname = "timer_migration",
- .data = &sysctl_timer_migration,
- .maxlen = sizeof(unsigned int),
- .mode = 0644,
- .proc_handler = proc_dointvec_minmax,
- .extra1 = &zero,
- .extra2 = &one,
- },
#endif /* CONFIG_SMP */
#ifdef CONFIG_NUMA_BALANCING
{
@@ -709,10 +699,10 @@ static struct ctl_table kern_table[] = {
#endif
{
.procname = "threads-max",
- .data = &max_threads,
+ .data = NULL,
.maxlen = sizeof(int),
.mode = 0644,
- .proc_handler = proc_dointvec,
+ .proc_handler = sysctl_max_threads,
},
{
.procname = "random",
@@ -846,7 +836,7 @@ static struct ctl_table kern_table[] = {
.data = &watchdog_user_enabled,
.maxlen = sizeof (int),
.mode = 0644,
- .proc_handler = proc_dowatchdog,
+ .proc_handler = proc_watchdog,
.extra1 = &zero,
.extra2 = &one,
},
@@ -855,11 +845,40 @@ static struct ctl_table kern_table[] = {
.data = &watchdog_thresh,
.maxlen = sizeof(int),
.mode = 0644,
- .proc_handler = proc_dowatchdog,
+ .proc_handler = proc_watchdog_thresh,
.extra1 = &zero,
.extra2 = &sixty,
},
{
+ .procname = "nmi_watchdog",
+ .data = &nmi_watchdog_enabled,
+ .maxlen = sizeof (int),
+ .mode = 0644,
+ .proc_handler = proc_nmi_watchdog,
+ .extra1 = &zero,
+#if defined(CONFIG_HAVE_NMI_WATCHDOG) || defined(CONFIG_HARDLOCKUP_DETECTOR)
+ .extra2 = &one,
+#else
+ .extra2 = &zero,
+#endif
+ },
+ {
+ .procname = "soft_watchdog",
+ .data = &soft_watchdog_enabled,
+ .maxlen = sizeof (int),
+ .mode = 0644,
+ .proc_handler = proc_soft_watchdog,
+ .extra1 = &zero,
+ .extra2 = &one,
+ },
+ {
+ .procname = "watchdog_cpumask",
+ .data = &watchdog_cpumask_bits,
+ .maxlen = NR_CPUS,
+ .mode = 0644,
+ .proc_handler = proc_watchdog_cpumask,
+ },
+ {
.procname = "softlockup_panic",
.data = &softlockup_panic,
.maxlen = sizeof(int),
@@ -879,15 +898,6 @@ static struct ctl_table kern_table[] = {
.extra2 = &one,
},
#endif /* CONFIG_SMP */
- {
- .procname = "nmi_watchdog",
- .data = &watchdog_user_enabled,
- .maxlen = sizeof (int),
- .mode = 0644,
- .proc_handler = proc_dowatchdog,
- .extra1 = &zero,
- .extra2 = &one,
- },
#endif
#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86)
{
@@ -1120,6 +1130,15 @@ static struct ctl_table kern_table[] = {
.extra1 = &zero,
.extra2 = &one,
},
+#if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON)
+ {
+ .procname = "timer_migration",
+ .data = &sysctl_timer_migration,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = timer_migration_handler,
+ },
+#endif
{ }
};
@@ -1321,6 +1340,15 @@ static struct ctl_table vm_table[] = {
.extra1 = &min_extfrag_threshold,
.extra2 = &max_extfrag_threshold,
},
+ {
+ .procname = "compact_unevictable_allowed",
+ .data = &sysctl_compact_unevictable_allowed,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ .extra1 = &zero,
+ .extra2 = &one,
+ },
#endif /* CONFIG_COMPACTION */
{
@@ -1510,12 +1538,6 @@ static struct ctl_table vm_table[] = {
{ }
};
-#if defined(CONFIG_BINFMT_MISC) || defined(CONFIG_BINFMT_MISC_MODULE)
-static struct ctl_table binfmt_misc_table[] = {
- { }
-};
-#endif
-
static struct ctl_table fs_table[] = {
{
.procname = "inode-nr",
@@ -1669,7 +1691,7 @@ static struct ctl_table fs_table[] = {
{
.procname = "binfmt_misc",
.mode = 0555,
- .child = binfmt_misc_table,
+ .child = sysctl_mount_point,
},
#endif
{
@@ -1960,7 +1982,15 @@ static int do_proc_dointvec_conv(bool *negp, unsigned long *lvalp,
int write, void *data)
{
if (write) {
- *valp = *negp ? -*lvalp : *lvalp;
+ if (*negp) {
+ if (*lvalp > (unsigned long) INT_MAX + 1)
+ return -EINVAL;
+ *valp = -*lvalp;
+ } else {
+ if (*lvalp > (unsigned long) INT_MAX)
+ return -EINVAL;
+ *valp = *lvalp;
+ }
} else {
int val = *valp;
if (val < 0) {
diff --git a/kernel/system_certificates.S b/kernel/system_certificates.S
deleted file mode 100644
index 3e9868d47535..000000000000
--- a/kernel/system_certificates.S
+++ /dev/null
@@ -1,20 +0,0 @@
-#include <linux/export.h>
-#include <linux/init.h>
-
- __INITRODATA
-
- .align 8
- .globl VMLINUX_SYMBOL(system_certificate_list)
-VMLINUX_SYMBOL(system_certificate_list):
-__cert_list_start:
- .incbin "kernel/x509_certificate_list"
-__cert_list_end:
-
- .align 8
- .globl VMLINUX_SYMBOL(system_certificate_list_size)
-VMLINUX_SYMBOL(system_certificate_list_size):
-#ifdef CONFIG_64BIT
- .quad __cert_list_end - __cert_list_start
-#else
- .long __cert_list_end - __cert_list_start
-#endif
diff --git a/kernel/system_keyring.c b/kernel/system_keyring.c
deleted file mode 100644
index 875f64e8935b..000000000000
--- a/kernel/system_keyring.c
+++ /dev/null
@@ -1,106 +0,0 @@
-/* System trusted keyring for trusted public keys
- *
- * Copyright (C) 2012 Red Hat, Inc. All Rights Reserved.
- * Written by David Howells (dhowells@redhat.com)
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public Licence
- * as published by the Free Software Foundation; either version
- * 2 of the Licence, or (at your option) any later version.
- */
-
-#include <linux/export.h>
-#include <linux/kernel.h>
-#include <linux/sched.h>
-#include <linux/cred.h>
-#include <linux/err.h>
-#include <keys/asymmetric-type.h>
-#include <keys/system_keyring.h>
-#include "module-internal.h"
-
-struct key *system_trusted_keyring;
-EXPORT_SYMBOL_GPL(system_trusted_keyring);
-
-extern __initconst const u8 system_certificate_list[];
-extern __initconst const unsigned long system_certificate_list_size;
-
-/*
- * Load the compiled-in keys
- */
-static __init int system_trusted_keyring_init(void)
-{
- pr_notice("Initialise system trusted keyring\n");
-
- system_trusted_keyring =
- keyring_alloc(".system_keyring",
- KUIDT_INIT(0), KGIDT_INIT(0), current_cred(),
- ((KEY_POS_ALL & ~KEY_POS_SETATTR) |
- KEY_USR_VIEW | KEY_USR_READ | KEY_USR_SEARCH),
- KEY_ALLOC_NOT_IN_QUOTA, NULL);
- if (IS_ERR(system_trusted_keyring))
- panic("Can't allocate system trusted keyring\n");
-
- set_bit(KEY_FLAG_TRUSTED_ONLY, &system_trusted_keyring->flags);
- return 0;
-}
-
-/*
- * Must be initialised before we try and load the keys into the keyring.
- */
-device_initcall(system_trusted_keyring_init);
-
-/*
- * Load the compiled-in list of X.509 certificates.
- */
-static __init int load_system_certificate_list(void)
-{
- key_ref_t key;
- const u8 *p, *end;
- size_t plen;
-
- pr_notice("Loading compiled-in X.509 certificates\n");
-
- p = system_certificate_list;
- end = p + system_certificate_list_size;
- while (p < end) {
- /* Each cert begins with an ASN.1 SEQUENCE tag and must be more
- * than 256 bytes in size.
- */
- if (end - p < 4)
- goto dodgy_cert;
- if (p[0] != 0x30 &&
- p[1] != 0x82)
- goto dodgy_cert;
- plen = (p[2] << 8) | p[3];
- plen += 4;
- if (plen > end - p)
- goto dodgy_cert;
-
- key = key_create_or_update(make_key_ref(system_trusted_keyring, 1),
- "asymmetric",
- NULL,
- p,
- plen,
- ((KEY_POS_ALL & ~KEY_POS_SETATTR) |
- KEY_USR_VIEW | KEY_USR_READ),
- KEY_ALLOC_NOT_IN_QUOTA |
- KEY_ALLOC_TRUSTED);
- if (IS_ERR(key)) {
- pr_err("Problem loading in-kernel X.509 certificate (%ld)\n",
- PTR_ERR(key));
- } else {
- set_bit(KEY_FLAG_BUILTIN, &key_ref_to_ptr(key)->flags);
- pr_notice("Loaded X.509 cert '%s'\n",
- key_ref_to_ptr(key)->description);
- key_ref_put(key);
- }
- p += plen;
- }
-
- return 0;
-
-dodgy_cert:
- pr_err("Problem parsing in-kernel X.509 certificate list\n");
- return 0;
-}
-late_initcall(load_system_certificate_list);
diff --git a/kernel/task_work.c b/kernel/task_work.c
index 8727032e3a6f..53fa971d000d 100644
--- a/kernel/task_work.c
+++ b/kernel/task_work.c
@@ -18,6 +18,8 @@ static struct callback_head work_exited; /* all we need is ->next == NULL */
* This is like the signal handler which runs in kernel mode, but it doesn't
* try to wake up the @task.
*
+ * Note: there is no ordering guarantee on works queued here.
+ *
* RETURNS:
* 0 if succeeds or -ESRCH.
*/
@@ -108,16 +110,6 @@ void task_work_run(void)
raw_spin_unlock_wait(&task->pi_lock);
smp_mb();
- /* Reverse the list to run the works in fifo order */
- head = NULL;
- do {
- next = work->next;
- work->next = head;
- head = work;
- work = next;
- } while (work);
-
- work = head;
do {
next = work->next;
work->func(work);
diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig
index d626dc98e8df..4008d9f95dd7 100644
--- a/kernel/time/Kconfig
+++ b/kernel/time/Kconfig
@@ -33,12 +33,6 @@ config ARCH_USES_GETTIMEOFFSET
config GENERIC_CLOCKEVENTS
bool
-# Migration helper. Builds, but does not invoke
-config GENERIC_CLOCKEVENTS_BUILD
- bool
- default y
- depends on GENERIC_CLOCKEVENTS
-
# Architecture can handle broadcast in a driver-agnostic way
config ARCH_HAS_TICK_BROADCAST
bool
@@ -98,12 +92,10 @@ config NO_HZ_FULL
depends on !ARCH_USES_GETTIMEOFFSET && GENERIC_CLOCKEVENTS
# We need at least one periodic CPU for timekeeping
depends on SMP
- # RCU_USER_QS dependency
depends on HAVE_CONTEXT_TRACKING
# VIRT_CPU_ACCOUNTING_GEN dependency
depends on HAVE_VIRT_CPU_ACCOUNTING_GEN
select NO_HZ_COMMON
- select RCU_USER_QS
select RCU_NOCB_CPU
select VIRT_CPU_ACCOUNTING_GEN
select IRQ_WORK
diff --git a/kernel/time/Makefile b/kernel/time/Makefile
index c09c07817d7a..49eca0beed32 100644
--- a/kernel/time/Makefile
+++ b/kernel/time/Makefile
@@ -2,32 +2,13 @@ obj-y += time.o timer.o hrtimer.o itimer.o posix-timers.o posix-cpu-timers.o
obj-y += timekeeping.o ntp.o clocksource.o jiffies.o timer_list.o
obj-y += timeconv.o timecounter.o posix-clock.o alarmtimer.o
-obj-$(CONFIG_GENERIC_CLOCKEVENTS_BUILD) += clockevents.o
-obj-$(CONFIG_GENERIC_CLOCKEVENTS) += tick-common.o
+obj-$(CONFIG_GENERIC_CLOCKEVENTS) += clockevents.o tick-common.o
ifeq ($(CONFIG_GENERIC_CLOCKEVENTS_BROADCAST),y)
obj-y += tick-broadcast.o
obj-$(CONFIG_TICK_ONESHOT) += tick-broadcast-hrtimer.o
endif
obj-$(CONFIG_GENERIC_SCHED_CLOCK) += sched_clock.o
-obj-$(CONFIG_TICK_ONESHOT) += tick-oneshot.o
-obj-$(CONFIG_TICK_ONESHOT) += tick-sched.o
+obj-$(CONFIG_TICK_ONESHOT) += tick-oneshot.o tick-sched.o
obj-$(CONFIG_TIMER_STATS) += timer_stats.o
obj-$(CONFIG_DEBUG_FS) += timekeeping_debug.o
obj-$(CONFIG_TEST_UDELAY) += test_udelay.o
-
-$(obj)/time.o: $(obj)/timeconst.h
-
-quiet_cmd_hzfile = HZFILE $@
- cmd_hzfile = echo "hz=$(CONFIG_HZ)" > $@
-
-targets += hz.bc
-$(obj)/hz.bc: $(objtree)/include/config/hz.h FORCE
- $(call if_changed,hzfile)
-
-quiet_cmd_bc = BC $@
- cmd_bc = bc -q $(filter-out FORCE,$^) > $@
-
-targets += timeconst.h
-$(obj)/timeconst.h: $(obj)/hz.bc $(src)/timeconst.bc FORCE
- $(call if_changed,bc)
-
diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c
index 1b001ed1edb9..7fbba635a549 100644
--- a/kernel/time/alarmtimer.c
+++ b/kernel/time/alarmtimer.c
@@ -317,19 +317,16 @@ EXPORT_SYMBOL_GPL(alarm_init);
* @alarm: ptr to alarm to set
* @start: time to run the alarm
*/
-int alarm_start(struct alarm *alarm, ktime_t start)
+void alarm_start(struct alarm *alarm, ktime_t start)
{
struct alarm_base *base = &alarm_bases[alarm->type];
unsigned long flags;
- int ret;
spin_lock_irqsave(&base->lock, flags);
alarm->node.expires = start;
alarmtimer_enqueue(base, alarm);
- ret = hrtimer_start(&alarm->timer, alarm->node.expires,
- HRTIMER_MODE_ABS);
+ hrtimer_start(&alarm->timer, alarm->node.expires, HRTIMER_MODE_ABS);
spin_unlock_irqrestore(&base->lock, flags);
- return ret;
}
EXPORT_SYMBOL_GPL(alarm_start);
@@ -338,12 +335,12 @@ EXPORT_SYMBOL_GPL(alarm_start);
* @alarm: ptr to alarm to set
* @start: time relative to now to run the alarm
*/
-int alarm_start_relative(struct alarm *alarm, ktime_t start)
+void alarm_start_relative(struct alarm *alarm, ktime_t start)
{
struct alarm_base *base = &alarm_bases[alarm->type];
start = ktime_add(start, base->gettime());
- return alarm_start(alarm, start);
+ alarm_start(alarm, start);
}
EXPORT_SYMBOL_GPL(alarm_start_relative);
@@ -495,12 +492,12 @@ static enum alarmtimer_restart alarm_handle_timer(struct alarm *alarm,
*/
static int alarm_clock_getres(const clockid_t which_clock, struct timespec *tp)
{
- clockid_t baseid = alarm_bases[clock2alarm(which_clock)].base_clockid;
-
if (!alarmtimer_get_rtcdev())
return -EINVAL;
- return hrtimer_get_res(baseid, tp);
+ tp->tv_sec = 0;
+ tp->tv_nsec = hrtimer_resolution;
+ return 0;
}
/**
diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c
index 55449909f114..50eb107f1198 100644
--- a/kernel/time/clockevents.c
+++ b/kernel/time/clockevents.c
@@ -94,25 +94,90 @@ u64 clockevent_delta2ns(unsigned long latch, struct clock_event_device *evt)
}
EXPORT_SYMBOL_GPL(clockevent_delta2ns);
+static int __clockevents_switch_state(struct clock_event_device *dev,
+ enum clock_event_state state)
+{
+ /* Transition with legacy set_mode() callback */
+ if (dev->set_mode) {
+ /* Legacy callback doesn't support new modes */
+ if (state > CLOCK_EVT_STATE_ONESHOT)
+ return -ENOSYS;
+ /*
+ * 'clock_event_state' and 'clock_event_mode' have 1-to-1
+ * mapping until *_ONESHOT, and so a simple cast will work.
+ */
+ dev->set_mode((enum clock_event_mode)state, dev);
+ dev->mode = (enum clock_event_mode)state;
+ return 0;
+ }
+
+ if (dev->features & CLOCK_EVT_FEAT_DUMMY)
+ return 0;
+
+ /* Transition with new state-specific callbacks */
+ switch (state) {
+ case CLOCK_EVT_STATE_DETACHED:
+ /* The clockevent device is getting replaced. Shut it down. */
+
+ case CLOCK_EVT_STATE_SHUTDOWN:
+ if (dev->set_state_shutdown)
+ return dev->set_state_shutdown(dev);
+ return 0;
+
+ case CLOCK_EVT_STATE_PERIODIC:
+ /* Core internal bug */
+ if (!(dev->features & CLOCK_EVT_FEAT_PERIODIC))
+ return -ENOSYS;
+ if (dev->set_state_periodic)
+ return dev->set_state_periodic(dev);
+ return 0;
+
+ case CLOCK_EVT_STATE_ONESHOT:
+ /* Core internal bug */
+ if (!(dev->features & CLOCK_EVT_FEAT_ONESHOT))
+ return -ENOSYS;
+ if (dev->set_state_oneshot)
+ return dev->set_state_oneshot(dev);
+ return 0;
+
+ case CLOCK_EVT_STATE_ONESHOT_STOPPED:
+ /* Core internal bug */
+ if (WARN_ONCE(!clockevent_state_oneshot(dev),
+ "Current state: %d\n",
+ clockevent_get_state(dev)))
+ return -EINVAL;
+
+ if (dev->set_state_oneshot_stopped)
+ return dev->set_state_oneshot_stopped(dev);
+ else
+ return -ENOSYS;
+
+ default:
+ return -ENOSYS;
+ }
+}
+
/**
- * clockevents_set_mode - set the operating mode of a clock event device
+ * clockevents_switch_state - set the operating state of a clock event device
* @dev: device to modify
- * @mode: new mode
+ * @state: new state
*
* Must be called with interrupts disabled !
*/
-void clockevents_set_mode(struct clock_event_device *dev,
- enum clock_event_mode mode)
+void clockevents_switch_state(struct clock_event_device *dev,
+ enum clock_event_state state)
{
- if (dev->mode != mode) {
- dev->set_mode(mode, dev);
- dev->mode = mode;
+ if (clockevent_get_state(dev) != state) {
+ if (__clockevents_switch_state(dev, state))
+ return;
+
+ clockevent_set_state(dev, state);
/*
* A nsec2cyc multiplicator of 0 is invalid and we'd crash
* on it, so fix it up and emit a warning:
*/
- if (mode == CLOCK_EVT_MODE_ONESHOT) {
+ if (clockevent_state_oneshot(dev)) {
if (unlikely(!dev->mult)) {
dev->mult = 1;
WARN_ON(1);
@@ -127,10 +192,28 @@ void clockevents_set_mode(struct clock_event_device *dev,
*/
void clockevents_shutdown(struct clock_event_device *dev)
{
- clockevents_set_mode(dev, CLOCK_EVT_MODE_SHUTDOWN);
+ clockevents_switch_state(dev, CLOCK_EVT_STATE_SHUTDOWN);
dev->next_event.tv64 = KTIME_MAX;
}
+/**
+ * clockevents_tick_resume - Resume the tick device before using it again
+ * @dev: device to resume
+ */
+int clockevents_tick_resume(struct clock_event_device *dev)
+{
+ int ret = 0;
+
+ if (dev->set_mode) {
+ dev->set_mode(CLOCK_EVT_MODE_RESUME, dev);
+ dev->mode = CLOCK_EVT_MODE_RESUME;
+ } else if (dev->tick_resume) {
+ ret = dev->tick_resume(dev);
+ }
+
+ return ret;
+}
+
#ifdef CONFIG_GENERIC_CLOCKEVENTS_MIN_ADJUST
/* Limit min_delta to a jiffie */
@@ -183,7 +266,7 @@ static int clockevents_program_min_delta(struct clock_event_device *dev)
delta = dev->min_delta_ns;
dev->next_event = ktime_add_ns(ktime_get(), delta);
- if (dev->mode == CLOCK_EVT_MODE_SHUTDOWN)
+ if (clockevent_state_shutdown(dev))
return 0;
dev->retries++;
@@ -220,7 +303,7 @@ static int clockevents_program_min_delta(struct clock_event_device *dev)
delta = dev->min_delta_ns;
dev->next_event = ktime_add_ns(ktime_get(), delta);
- if (dev->mode == CLOCK_EVT_MODE_SHUTDOWN)
+ if (clockevent_state_shutdown(dev))
return 0;
dev->retries++;
@@ -252,9 +335,13 @@ int clockevents_program_event(struct clock_event_device *dev, ktime_t expires,
dev->next_event = expires;
- if (dev->mode == CLOCK_EVT_MODE_SHUTDOWN)
+ if (clockevent_state_shutdown(dev))
return 0;
+ /* We must be in ONESHOT state here */
+ WARN_ONCE(!clockevent_state_oneshot(dev), "Current state: %d\n",
+ clockevent_get_state(dev));
+
/* Shortcut for clockevent devices that can deal with ktime. */
if (dev->features & CLOCK_EVT_FEAT_KTIME)
return dev->set_next_ktime(expires, dev);
@@ -297,7 +384,7 @@ static int clockevents_replace(struct clock_event_device *ced)
struct clock_event_device *dev, *newdev = NULL;
list_for_each_entry(dev, &clockevent_devices, list) {
- if (dev == ced || dev->mode != CLOCK_EVT_MODE_UNUSED)
+ if (dev == ced || !clockevent_state_detached(dev))
continue;
if (!tick_check_replacement(newdev, dev))
@@ -323,7 +410,7 @@ static int clockevents_replace(struct clock_event_device *ced)
static int __clockevents_try_unbind(struct clock_event_device *ced, int cpu)
{
/* Fast track. Device is unused */
- if (ced->mode == CLOCK_EVT_MODE_UNUSED) {
+ if (clockevent_state_detached(ced)) {
list_del_init(&ced->list);
return 0;
}
@@ -371,7 +458,27 @@ int clockevents_unbind_device(struct clock_event_device *ced, int cpu)
mutex_unlock(&clockevents_mutex);
return ret;
}
-EXPORT_SYMBOL_GPL(clockevents_unbind);
+EXPORT_SYMBOL_GPL(clockevents_unbind_device);
+
+/* Sanity check of state transition callbacks */
+static int clockevents_sanity_check(struct clock_event_device *dev)
+{
+ /* Legacy set_mode() callback */
+ if (dev->set_mode) {
+ /* We shouldn't be supporting new modes now */
+ WARN_ON(dev->set_state_periodic || dev->set_state_oneshot ||
+ dev->set_state_shutdown || dev->tick_resume ||
+ dev->set_state_oneshot_stopped);
+
+ BUG_ON(dev->mode != CLOCK_EVT_MODE_UNUSED);
+ return 0;
+ }
+
+ if (dev->features & CLOCK_EVT_FEAT_DUMMY)
+ return 0;
+
+ return 0;
+}
/**
* clockevents_register_device - register a clock event device
@@ -381,7 +488,11 @@ void clockevents_register_device(struct clock_event_device *dev)
{
unsigned long flags;
- BUG_ON(dev->mode != CLOCK_EVT_MODE_UNUSED);
+ BUG_ON(clockevents_sanity_check(dev));
+
+ /* Initialize state to DETACHED */
+ clockevent_set_state(dev, CLOCK_EVT_STATE_DETACHED);
+
if (!dev->cpumask) {
WARN_ON(num_possible_cpus() > 1);
dev->cpumask = cpumask_of(smp_processor_id());
@@ -445,11 +556,11 @@ int __clockevents_update_freq(struct clock_event_device *dev, u32 freq)
{
clockevents_config(dev, freq);
- if (dev->mode == CLOCK_EVT_MODE_ONESHOT)
+ if (clockevent_state_oneshot(dev))
return clockevents_program_event(dev, dev->next_event, false);
- if (dev->mode == CLOCK_EVT_MODE_PERIODIC)
- dev->set_mode(CLOCK_EVT_MODE_PERIODIC, dev);
+ if (clockevent_state_periodic(dev))
+ return __clockevents_switch_state(dev, CLOCK_EVT_STATE_PERIODIC);
return 0;
}
@@ -491,30 +602,27 @@ void clockevents_handle_noop(struct clock_event_device *dev)
* @old: device to release (can be NULL)
* @new: device to request (can be NULL)
*
- * Called from the notifier chain. clockevents_lock is held already
+ * Called from various tick functions with clockevents_lock held and
+ * interrupts disabled.
*/
void clockevents_exchange_device(struct clock_event_device *old,
struct clock_event_device *new)
{
- unsigned long flags;
-
- local_irq_save(flags);
/*
* Caller releases a clock event device. We queue it into the
* released list and do a notify add later.
*/
if (old) {
module_put(old->owner);
- clockevents_set_mode(old, CLOCK_EVT_MODE_UNUSED);
+ clockevents_switch_state(old, CLOCK_EVT_STATE_DETACHED);
list_del(&old->list);
list_add(&old->list, &clockevents_released);
}
if (new) {
- BUG_ON(new->mode != CLOCK_EVT_MODE_UNUSED);
+ BUG_ON(!clockevent_state_detached(new));
clockevents_shutdown(new);
}
- local_irq_restore(flags);
}
/**
@@ -525,7 +633,7 @@ void clockevents_suspend(void)
struct clock_event_device *dev;
list_for_each_entry_reverse(dev, &clockevent_devices, list)
- if (dev->suspend)
+ if (dev->suspend && !clockevent_state_detached(dev))
dev->suspend(dev);
}
@@ -537,78 +645,44 @@ void clockevents_resume(void)
struct clock_event_device *dev;
list_for_each_entry(dev, &clockevent_devices, list)
- if (dev->resume)
+ if (dev->resume && !clockevent_state_detached(dev))
dev->resume(dev);
}
-#ifdef CONFIG_GENERIC_CLOCKEVENTS
+#ifdef CONFIG_HOTPLUG_CPU
/**
- * clockevents_notify - notification about relevant events
- * Returns 0 on success, any other value on error
+ * tick_cleanup_dead_cpu - Cleanup the tick and clockevents of a dead cpu
*/
-int clockevents_notify(unsigned long reason, void *arg)
+void tick_cleanup_dead_cpu(int cpu)
{
struct clock_event_device *dev, *tmp;
unsigned long flags;
- int cpu, ret = 0;
raw_spin_lock_irqsave(&clockevents_lock, flags);
- switch (reason) {
- case CLOCK_EVT_NOTIFY_BROADCAST_ON:
- case CLOCK_EVT_NOTIFY_BROADCAST_OFF:
- case CLOCK_EVT_NOTIFY_BROADCAST_FORCE:
- tick_broadcast_on_off(reason, arg);
- break;
-
- case CLOCK_EVT_NOTIFY_BROADCAST_ENTER:
- case CLOCK_EVT_NOTIFY_BROADCAST_EXIT:
- ret = tick_broadcast_oneshot_control(reason);
- break;
-
- case CLOCK_EVT_NOTIFY_CPU_DYING:
- tick_handover_do_timer(arg);
- break;
-
- case CLOCK_EVT_NOTIFY_SUSPEND:
- tick_suspend();
- tick_suspend_broadcast();
- break;
-
- case CLOCK_EVT_NOTIFY_RESUME:
- tick_resume();
- break;
-
- case CLOCK_EVT_NOTIFY_CPU_DEAD:
- tick_shutdown_broadcast_oneshot(arg);
- tick_shutdown_broadcast(arg);
- tick_shutdown(arg);
- /*
- * Unregister the clock event devices which were
- * released from the users in the notify chain.
- */
- list_for_each_entry_safe(dev, tmp, &clockevents_released, list)
+ tick_shutdown_broadcast_oneshot(cpu);
+ tick_shutdown_broadcast(cpu);
+ tick_shutdown(cpu);
+ /*
+ * Unregister the clock event devices which were
+ * released from the users in the notify chain.
+ */
+ list_for_each_entry_safe(dev, tmp, &clockevents_released, list)
+ list_del(&dev->list);
+ /*
+ * Now check whether the CPU has left unused per cpu devices
+ */
+ list_for_each_entry_safe(dev, tmp, &clockevent_devices, list) {
+ if (cpumask_test_cpu(cpu, dev->cpumask) &&
+ cpumask_weight(dev->cpumask) == 1 &&
+ !tick_is_broadcast_device(dev)) {
+ BUG_ON(!clockevent_state_detached(dev));
list_del(&dev->list);
- /*
- * Now check whether the CPU has left unused per cpu devices
- */
- cpu = *((int *)arg);
- list_for_each_entry_safe(dev, tmp, &clockevent_devices, list) {
- if (cpumask_test_cpu(cpu, dev->cpumask) &&
- cpumask_weight(dev->cpumask) == 1 &&
- !tick_is_broadcast_device(dev)) {
- BUG_ON(dev->mode != CLOCK_EVT_MODE_UNUSED);
- list_del(&dev->list);
- }
}
- break;
- default:
- break;
}
raw_spin_unlock_irqrestore(&clockevents_lock, flags);
- return ret;
}
-EXPORT_SYMBOL_GPL(clockevents_notify);
+#endif
#ifdef CONFIG_SYSFS
struct bus_type clockevents_subsys = {
@@ -727,5 +801,3 @@ static int __init clockevents_init_sysfs(void)
}
device_initcall(clockevents_init_sysfs);
#endif /* SYSFS */
-
-#endif /* GENERIC_CLOCK_EVENTS */
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index 4892352f0e49..841b72f720e8 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -23,6 +23,8 @@
* o Allow clocksource drivers to be unregistered
*/
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
#include <linux/device.h>
#include <linux/clocksource.h>
#include <linux/init.h>
@@ -142,13 +144,6 @@ static void __clocksource_unstable(struct clocksource *cs)
schedule_work(&watchdog_work);
}
-static void clocksource_unstable(struct clocksource *cs, int64_t delta)
-{
- printk(KERN_WARNING "Clocksource %s unstable (delta = %Ld ns)\n",
- cs->name, delta);
- __clocksource_unstable(cs);
-}
-
/**
* clocksource_mark_unstable - mark clocksource unstable via watchdog
* @cs: clocksource to be marked unstable
@@ -174,7 +169,7 @@ void clocksource_mark_unstable(struct clocksource *cs)
static void clocksource_watchdog(unsigned long data)
{
struct clocksource *cs;
- cycle_t csnow, wdnow, delta;
+ cycle_t csnow, wdnow, cslast, wdlast, delta;
int64_t wd_nsec, cs_nsec;
int next_cpu, reset_pending;
@@ -213,6 +208,8 @@ static void clocksource_watchdog(unsigned long data)
delta = clocksource_delta(csnow, cs->cs_last, cs->mask);
cs_nsec = clocksource_cyc2ns(delta, cs->mult, cs->shift);
+ wdlast = cs->wd_last; /* save these in case we print them */
+ cslast = cs->cs_last;
cs->cs_last = csnow;
cs->wd_last = wdnow;
@@ -221,7 +218,13 @@ static void clocksource_watchdog(unsigned long data)
/* Check the deviation from the watchdog clocksource. */
if ((abs(cs_nsec - wd_nsec) > WATCHDOG_THRESHOLD)) {
- clocksource_unstable(cs, cs_nsec - wd_nsec);
+ pr_warn("timekeeping watchdog: Marking clocksource '%s' as unstable because the skew is too large:\n",
+ cs->name);
+ pr_warn(" '%s' wd_now: %llx wd_last: %llx mask: %llx\n",
+ watchdog->name, wdnow, wdlast, watchdog->mask);
+ pr_warn(" '%s' cs_now: %llx cs_last: %llx mask: %llx\n",
+ cs->name, csnow, cslast, cs->mask);
+ __clocksource_unstable(cs);
continue;
}
@@ -469,26 +472,25 @@ static u32 clocksource_max_adjustment(struct clocksource *cs)
* @shift: cycle to nanosecond divisor (power of two)
* @maxadj: maximum adjustment value to mult (~11%)
* @mask: bitmask for two's complement subtraction of non 64 bit counters
+ * @max_cyc: maximum cycle value before potential overflow (does not include
+ * any safety margin)
+ *
+ * NOTE: This function includes a safety margin of 50%, in other words, we
+ * return half the number of nanoseconds the hardware counter can technically
+ * cover. This is done so that we can potentially detect problems caused by
+ * delayed timers or bad hardware, which might result in time intervals that
+ * are larger then what the math used can handle without overflows.
*/
-u64 clocks_calc_max_nsecs(u32 mult, u32 shift, u32 maxadj, u64 mask)
+u64 clocks_calc_max_nsecs(u32 mult, u32 shift, u32 maxadj, u64 mask, u64 *max_cyc)
{
u64 max_nsecs, max_cycles;
/*
* Calculate the maximum number of cycles that we can pass to the
- * cyc2ns function without overflowing a 64-bit signed result. The
- * maximum number of cycles is equal to ULLONG_MAX/(mult+maxadj)
- * which is equivalent to the below.
- * max_cycles < (2^63)/(mult + maxadj)
- * max_cycles < 2^(log2((2^63)/(mult + maxadj)))
- * max_cycles < 2^(log2(2^63) - log2(mult + maxadj))
- * max_cycles < 2^(63 - log2(mult + maxadj))
- * max_cycles < 1 << (63 - log2(mult + maxadj))
- * Please note that we add 1 to the result of the log2 to account for
- * any rounding errors, ensure the above inequality is satisfied and
- * no overflow will occur.
+ * cyc2ns() function without overflowing a 64-bit result.
*/
- max_cycles = 1ULL << (63 - (ilog2(mult + maxadj) + 1));
+ max_cycles = ULLONG_MAX;
+ do_div(max_cycles, mult+maxadj);
/*
* The actual maximum number of cycles we can defer the clocksource is
@@ -499,27 +501,26 @@ u64 clocks_calc_max_nsecs(u32 mult, u32 shift, u32 maxadj, u64 mask)
max_cycles = min(max_cycles, mask);
max_nsecs = clocksource_cyc2ns(max_cycles, mult - maxadj, shift);
+ /* return the max_cycles value as well if requested */
+ if (max_cyc)
+ *max_cyc = max_cycles;
+
+ /* Return 50% of the actual maximum, so we can detect bad values */
+ max_nsecs >>= 1;
+
return max_nsecs;
}
/**
- * clocksource_max_deferment - Returns max time the clocksource can be deferred
- * @cs: Pointer to clocksource
+ * clocksource_update_max_deferment - Updates the clocksource max_idle_ns & max_cycles
+ * @cs: Pointer to clocksource to be updated
*
*/
-static u64 clocksource_max_deferment(struct clocksource *cs)
+static inline void clocksource_update_max_deferment(struct clocksource *cs)
{
- u64 max_nsecs;
-
- max_nsecs = clocks_calc_max_nsecs(cs->mult, cs->shift, cs->maxadj,
- cs->mask);
- /*
- * To ensure that the clocksource does not wrap whilst we are idle,
- * limit the time the clocksource can be deferred by 12.5%. Please
- * note a margin of 12.5% is used because this can be computed with
- * a shift, versus say 10% which would require division.
- */
- return max_nsecs - (max_nsecs >> 3);
+ cs->max_idle_ns = clocks_calc_max_nsecs(cs->mult, cs->shift,
+ cs->maxadj, cs->mask,
+ &cs->max_cycles);
}
#ifndef CONFIG_ARCH_USES_GETTIMEOFFSET
@@ -569,9 +570,8 @@ static void __clocksource_select(bool skipcur)
*/
if (!(cs->flags & CLOCK_SOURCE_VALID_FOR_HRES) && oneshot) {
/* Override clocksource cannot be used. */
- printk(KERN_WARNING "Override clocksource %s is not "
- "HRT compatible. Cannot switch while in "
- "HRT/NOHZ mode\n", cs->name);
+ pr_warn("Override clocksource %s is not HRT compatible - cannot switch while in HRT/NOHZ mode\n",
+ cs->name);
override_name[0] = 0;
} else
/* Override clocksource can be used. */
@@ -648,7 +648,7 @@ static void clocksource_enqueue(struct clocksource *cs)
}
/**
- * __clocksource_updatefreq_scale - Used update clocksource with new freq
+ * __clocksource_update_freq_scale - Used update clocksource with new freq
* @cs: clocksource to be registered
* @scale: Scale factor multiplied against freq to get clocksource hz
* @freq: clocksource frequency (cycles per second) divided by scale
@@ -656,48 +656,64 @@ static void clocksource_enqueue(struct clocksource *cs)
* This should only be called from the clocksource->enable() method.
*
* This *SHOULD NOT* be called directly! Please use the
- * clocksource_updatefreq_hz() or clocksource_updatefreq_khz helper functions.
+ * __clocksource_update_freq_hz() or __clocksource_update_freq_khz() helper
+ * functions.
*/
-void __clocksource_updatefreq_scale(struct clocksource *cs, u32 scale, u32 freq)
+void __clocksource_update_freq_scale(struct clocksource *cs, u32 scale, u32 freq)
{
u64 sec;
+
/*
- * Calc the maximum number of seconds which we can run before
- * wrapping around. For clocksources which have a mask > 32bit
- * we need to limit the max sleep time to have a good
- * conversion precision. 10 minutes is still a reasonable
- * amount. That results in a shift value of 24 for a
- * clocksource with mask >= 40bit and f >= 4GHz. That maps to
- * ~ 0.06ppm granularity for NTP. We apply the same 12.5%
- * margin as we do in clocksource_max_deferment()
+ * Default clocksources are *special* and self-define their mult/shift.
+ * But, you're not special, so you should specify a freq value.
*/
- sec = (cs->mask - (cs->mask >> 3));
- do_div(sec, freq);
- do_div(sec, scale);
- if (!sec)
- sec = 1;
- else if (sec > 600 && cs->mask > UINT_MAX)
- sec = 600;
-
- clocks_calc_mult_shift(&cs->mult, &cs->shift, freq,
- NSEC_PER_SEC / scale, sec * scale);
-
+ if (freq) {
+ /*
+ * Calc the maximum number of seconds which we can run before
+ * wrapping around. For clocksources which have a mask > 32-bit
+ * we need to limit the max sleep time to have a good
+ * conversion precision. 10 minutes is still a reasonable
+ * amount. That results in a shift value of 24 for a
+ * clocksource with mask >= 40-bit and f >= 4GHz. That maps to
+ * ~ 0.06ppm granularity for NTP.
+ */
+ sec = cs->mask;
+ do_div(sec, freq);
+ do_div(sec, scale);
+ if (!sec)
+ sec = 1;
+ else if (sec > 600 && cs->mask > UINT_MAX)
+ sec = 600;
+
+ clocks_calc_mult_shift(&cs->mult, &cs->shift, freq,
+ NSEC_PER_SEC / scale, sec * scale);
+ }
/*
- * for clocksources that have large mults, to avoid overflow.
- * Since mult may be adjusted by ntp, add an safety extra margin
- *
+ * Ensure clocksources that have large 'mult' values don't overflow
+ * when adjusted.
*/
cs->maxadj = clocksource_max_adjustment(cs);
- while ((cs->mult + cs->maxadj < cs->mult)
- || (cs->mult - cs->maxadj > cs->mult)) {
+ while (freq && ((cs->mult + cs->maxadj < cs->mult)
+ || (cs->mult - cs->maxadj > cs->mult))) {
cs->mult >>= 1;
cs->shift--;
cs->maxadj = clocksource_max_adjustment(cs);
}
- cs->max_idle_ns = clocksource_max_deferment(cs);
+ /*
+ * Only warn for *special* clocksources that self-define
+ * their mult/shift values and don't specify a freq.
+ */
+ WARN_ONCE(cs->mult + cs->maxadj < cs->mult,
+ "timekeeping: Clocksource %s might overflow on 11%% adjustment\n",
+ cs->name);
+
+ clocksource_update_max_deferment(cs);
+
+ pr_info("%s: mask: 0x%llx max_cycles: 0x%llx, max_idle_ns: %lld ns\n",
+ cs->name, cs->mask, cs->max_cycles, cs->max_idle_ns);
}
-EXPORT_SYMBOL_GPL(__clocksource_updatefreq_scale);
+EXPORT_SYMBOL_GPL(__clocksource_update_freq_scale);
/**
* __clocksource_register_scale - Used to install new clocksources
@@ -714,7 +730,7 @@ int __clocksource_register_scale(struct clocksource *cs, u32 scale, u32 freq)
{
/* Initialize mult/shift and max_idle_ns */
- __clocksource_updatefreq_scale(cs, scale, freq);
+ __clocksource_update_freq_scale(cs, scale, freq);
/* Add clocksource to the clocksource list */
mutex_lock(&clocksource_mutex);
@@ -726,33 +742,6 @@ int __clocksource_register_scale(struct clocksource *cs, u32 scale, u32 freq)
}
EXPORT_SYMBOL_GPL(__clocksource_register_scale);
-
-/**
- * clocksource_register - Used to install new clocksources
- * @cs: clocksource to be registered
- *
- * Returns -EBUSY if registration fails, zero otherwise.
- */
-int clocksource_register(struct clocksource *cs)
-{
- /* calculate max adjustment for given mult/shift */
- cs->maxadj = clocksource_max_adjustment(cs);
- WARN_ONCE(cs->mult + cs->maxadj < cs->mult,
- "Clocksource %s might overflow on 11%% adjustment\n",
- cs->name);
-
- /* calculate max idle time permitted for this clocksource */
- cs->max_idle_ns = clocksource_max_deferment(cs);
-
- mutex_lock(&clocksource_mutex);
- clocksource_enqueue(cs);
- clocksource_enqueue_watchdog(cs);
- clocksource_select();
- mutex_unlock(&clocksource_mutex);
- return 0;
-}
-EXPORT_SYMBOL(clocksource_register);
-
static void __clocksource_change_rating(struct clocksource *cs, int rating)
{
list_del(&cs->list);
@@ -1021,12 +1010,10 @@ __setup("clocksource=", boot_override_clocksource);
static int __init boot_override_clock(char* str)
{
if (!strcmp(str, "pmtmr")) {
- printk("Warning: clock=pmtmr is deprecated. "
- "Use clocksource=acpi_pm.\n");
+ pr_warn("clock=pmtmr is deprecated - use clocksource=acpi_pm\n");
return boot_override_clocksource("acpi_pm");
}
- printk("Warning! clock= boot option is deprecated. "
- "Use clocksource=xyz\n");
+ pr_warn("clock= boot option is deprecated - use clocksource=xyz\n");
return boot_override_clocksource(str);
}
diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c
index bee0c1f78091..457a373e2181 100644
--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -54,7 +54,7 @@
#include <trace/events/timer.h>
-#include "timekeeping.h"
+#include "tick-internal.h"
/*
* The timer bases:
@@ -66,33 +66,29 @@
*/
DEFINE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases) =
{
-
.lock = __RAW_SPIN_LOCK_UNLOCKED(hrtimer_bases.lock),
+ .seq = SEQCNT_ZERO(hrtimer_bases.seq),
.clock_base =
{
{
.index = HRTIMER_BASE_MONOTONIC,
.clockid = CLOCK_MONOTONIC,
.get_time = &ktime_get,
- .resolution = KTIME_LOW_RES,
},
{
.index = HRTIMER_BASE_REALTIME,
.clockid = CLOCK_REALTIME,
.get_time = &ktime_get_real,
- .resolution = KTIME_LOW_RES,
},
{
.index = HRTIMER_BASE_BOOTTIME,
.clockid = CLOCK_BOOTTIME,
.get_time = &ktime_get_boottime,
- .resolution = KTIME_LOW_RES,
},
{
.index = HRTIMER_BASE_TAI,
.clockid = CLOCK_TAI,
.get_time = &ktime_get_clocktai,
- .resolution = KTIME_LOW_RES,
},
}
};
@@ -109,27 +105,6 @@ static inline int hrtimer_clockid_to_base(clockid_t clock_id)
return hrtimer_clock_to_base_table[clock_id];
}
-
-/*
- * Get the coarse grained time at the softirq based on xtime and
- * wall_to_monotonic.
- */
-static void hrtimer_get_softirq_time(struct hrtimer_cpu_base *base)
-{
- ktime_t xtim, mono, boot, tai;
- ktime_t off_real, off_boot, off_tai;
-
- mono = ktime_get_update_offsets_tick(&off_real, &off_boot, &off_tai);
- boot = ktime_add(mono, off_boot);
- xtim = ktime_add(mono, off_real);
- tai = ktime_add(mono, off_tai);
-
- base->clock_base[HRTIMER_BASE_REALTIME].softirq_time = xtim;
- base->clock_base[HRTIMER_BASE_MONOTONIC].softirq_time = mono;
- base->clock_base[HRTIMER_BASE_BOOTTIME].softirq_time = boot;
- base->clock_base[HRTIMER_BASE_TAI].softirq_time = tai;
-}
-
/*
* Functions and macros which are different for UP/SMP systems are kept in a
* single place
@@ -137,6 +112,18 @@ static void hrtimer_get_softirq_time(struct hrtimer_cpu_base *base)
#ifdef CONFIG_SMP
/*
+ * We require the migration_base for lock_hrtimer_base()/switch_hrtimer_base()
+ * such that hrtimer_callback_running() can unconditionally dereference
+ * timer->base->cpu_base
+ */
+static struct hrtimer_cpu_base migration_cpu_base = {
+ .seq = SEQCNT_ZERO(migration_cpu_base),
+ .clock_base = { { .cpu_base = &migration_cpu_base, }, },
+};
+
+#define migration_base migration_cpu_base.clock_base[0]
+
+/*
* We are using hashed locking: holding per_cpu(hrtimer_bases)[n].lock
* means that all timers which are tied to this base via timer->base are
* locked, and the base itself is locked too.
@@ -145,8 +132,8 @@ static void hrtimer_get_softirq_time(struct hrtimer_cpu_base *base)
* be found on the lists/queues.
*
* When the timer's base is locked, and the timer removed from list, it is
- * possible to set timer->base = NULL and drop the lock: the timer remains
- * locked.
+ * possible to set timer->base = &migration_base and drop the lock: the timer
+ * remains locked.
*/
static
struct hrtimer_clock_base *lock_hrtimer_base(const struct hrtimer *timer,
@@ -156,7 +143,7 @@ struct hrtimer_clock_base *lock_hrtimer_base(const struct hrtimer *timer,
for (;;) {
base = timer->base;
- if (likely(base != NULL)) {
+ if (likely(base != &migration_base)) {
raw_spin_lock_irqsave(&base->cpu_base->lock, *flags);
if (likely(base == timer->base))
return base;
@@ -190,21 +177,47 @@ hrtimer_check_target(struct hrtimer *timer, struct hrtimer_clock_base *new_base)
#endif
}
+#if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON)
+static inline
+struct hrtimer_cpu_base *get_target_base(struct hrtimer_cpu_base *base,
+ int pinned)
+{
+ if (pinned || !base->migration_enabled)
+ return base;
+ return &per_cpu(hrtimer_bases, get_nohz_timer_target());
+}
+#else
+static inline
+struct hrtimer_cpu_base *get_target_base(struct hrtimer_cpu_base *base,
+ int pinned)
+{
+ return base;
+}
+#endif
+
/*
- * Switch the timer base to the current CPU when possible.
+ * We switch the timer base to a power-optimized selected CPU target,
+ * if:
+ * - NO_HZ_COMMON is enabled
+ * - timer migration is enabled
+ * - the timer callback is not running
+ * - the timer is not the first expiring timer on the new target
+ *
+ * If one of the above requirements is not fulfilled we move the timer
+ * to the current CPU or leave it on the previously assigned CPU if
+ * the timer callback is currently running.
*/
static inline struct hrtimer_clock_base *
switch_hrtimer_base(struct hrtimer *timer, struct hrtimer_clock_base *base,
int pinned)
{
+ struct hrtimer_cpu_base *new_cpu_base, *this_cpu_base;
struct hrtimer_clock_base *new_base;
- struct hrtimer_cpu_base *new_cpu_base;
- int this_cpu = smp_processor_id();
- int cpu = get_nohz_timer_target(pinned);
int basenum = base->index;
+ this_cpu_base = this_cpu_ptr(&hrtimer_bases);
+ new_cpu_base = get_target_base(this_cpu_base, pinned);
again:
- new_cpu_base = &per_cpu(hrtimer_bases, cpu);
new_base = &new_cpu_base->clock_base[basenum];
if (base != new_base) {
@@ -220,22 +233,24 @@ again:
if (unlikely(hrtimer_callback_running(timer)))
return base;
- /* See the comment in lock_timer_base() */
- timer->base = NULL;
+ /* See the comment in lock_hrtimer_base() */
+ timer->base = &migration_base;
raw_spin_unlock(&base->cpu_base->lock);
raw_spin_lock(&new_base->cpu_base->lock);
- if (cpu != this_cpu && hrtimer_check_target(timer, new_base)) {
- cpu = this_cpu;
+ if (new_cpu_base != this_cpu_base &&
+ hrtimer_check_target(timer, new_base)) {
raw_spin_unlock(&new_base->cpu_base->lock);
raw_spin_lock(&base->cpu_base->lock);
+ new_cpu_base = this_cpu_base;
timer->base = base;
goto again;
}
timer->base = new_base;
} else {
- if (cpu != this_cpu && hrtimer_check_target(timer, new_base)) {
- cpu = this_cpu;
+ if (new_cpu_base != this_cpu_base &&
+ hrtimer_check_target(timer, new_base)) {
+ new_cpu_base = this_cpu_base;
goto again;
}
}
@@ -266,21 +281,23 @@ lock_hrtimer_base(const struct hrtimer *timer, unsigned long *flags)
/*
* Divide a ktime value by a nanosecond value
*/
-u64 __ktime_divns(const ktime_t kt, s64 div)
+s64 __ktime_divns(const ktime_t kt, s64 div)
{
- u64 dclc;
int sft = 0;
+ s64 dclc;
+ u64 tmp;
dclc = ktime_to_ns(kt);
+ tmp = dclc < 0 ? -dclc : dclc;
+
/* Make sure the divisor is less than 2^32: */
while (div >> 32) {
sft++;
div >>= 1;
}
- dclc >>= sft;
- do_div(dclc, (unsigned long) div);
-
- return dclc;
+ tmp >>= sft;
+ do_div(tmp, (unsigned long) div);
+ return dclc < 0 ? -tmp : tmp;
}
EXPORT_SYMBOL_GPL(__ktime_divns);
#endif /* BITS_PER_LONG >= 64 */
@@ -441,24 +458,35 @@ static inline void debug_deactivate(struct hrtimer *timer)
}
#if defined(CONFIG_NO_HZ_COMMON) || defined(CONFIG_HIGH_RES_TIMERS)
+static inline void hrtimer_update_next_timer(struct hrtimer_cpu_base *cpu_base,
+ struct hrtimer *timer)
+{
+#ifdef CONFIG_HIGH_RES_TIMERS
+ cpu_base->next_timer = timer;
+#endif
+}
+
static ktime_t __hrtimer_get_next_event(struct hrtimer_cpu_base *cpu_base)
{
struct hrtimer_clock_base *base = cpu_base->clock_base;
ktime_t expires, expires_next = { .tv64 = KTIME_MAX };
- int i;
+ unsigned int active = cpu_base->active_bases;
- for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++, base++) {
+ hrtimer_update_next_timer(cpu_base, NULL);
+ for (; active; base++, active >>= 1) {
struct timerqueue_node *next;
struct hrtimer *timer;
- next = timerqueue_getnext(&base->active);
- if (!next)
+ if (!(active & 0x01))
continue;
+ next = timerqueue_getnext(&base->active);
timer = container_of(next, struct hrtimer, node);
expires = ktime_sub(hrtimer_get_expires(timer), base->offset);
- if (expires.tv64 < expires_next.tv64)
+ if (expires.tv64 < expires_next.tv64) {
expires_next = expires;
+ hrtimer_update_next_timer(cpu_base, timer);
+ }
}
/*
* clock_was_set() might have changed base->offset of any of
@@ -471,6 +499,16 @@ static ktime_t __hrtimer_get_next_event(struct hrtimer_cpu_base *cpu_base)
}
#endif
+static inline ktime_t hrtimer_update_base(struct hrtimer_cpu_base *base)
+{
+ ktime_t *offs_real = &base->clock_base[HRTIMER_BASE_REALTIME].offset;
+ ktime_t *offs_boot = &base->clock_base[HRTIMER_BASE_BOOTTIME].offset;
+ ktime_t *offs_tai = &base->clock_base[HRTIMER_BASE_TAI].offset;
+
+ return ktime_get_update_offsets_now(&base->clock_was_set_seq,
+ offs_real, offs_boot, offs_tai);
+}
+
/* High resolution timer related functions */
#ifdef CONFIG_HIGH_RES_TIMERS
@@ -478,6 +516,8 @@ static ktime_t __hrtimer_get_next_event(struct hrtimer_cpu_base *cpu_base)
* High resolution timer enabled ?
*/
static int hrtimer_hres_enabled __read_mostly = 1;
+unsigned int hrtimer_resolution __read_mostly = LOW_RES_NSEC;
+EXPORT_SYMBOL_GPL(hrtimer_resolution);
/*
* Enable / Disable high resolution mode
@@ -506,9 +546,14 @@ static inline int hrtimer_is_hres_enabled(void)
/*
* Is the high resolution mode active ?
*/
+static inline int __hrtimer_hres_active(struct hrtimer_cpu_base *cpu_base)
+{
+ return cpu_base->hres_active;
+}
+
static inline int hrtimer_hres_active(void)
{
- return __this_cpu_read(hrtimer_bases.hres_active);
+ return __hrtimer_hres_active(this_cpu_ptr(&hrtimer_bases));
}
/*
@@ -519,7 +564,12 @@ static inline int hrtimer_hres_active(void)
static void
hrtimer_force_reprogram(struct hrtimer_cpu_base *cpu_base, int skip_equal)
{
- ktime_t expires_next = __hrtimer_get_next_event(cpu_base);
+ ktime_t expires_next;
+
+ if (!cpu_base->hres_active)
+ return;
+
+ expires_next = __hrtimer_get_next_event(cpu_base);
if (skip_equal && expires_next.tv64 == cpu_base->expires_next.tv64)
return;
@@ -543,63 +593,53 @@ hrtimer_force_reprogram(struct hrtimer_cpu_base *cpu_base, int skip_equal)
if (cpu_base->hang_detected)
return;
- if (cpu_base->expires_next.tv64 != KTIME_MAX)
- tick_program_event(cpu_base->expires_next, 1);
+ tick_program_event(cpu_base->expires_next, 1);
}
/*
- * Shared reprogramming for clock_realtime and clock_monotonic
- *
* When a timer is enqueued and expires earlier than the already enqueued
* timers, we have to check, whether it expires earlier than the timer for
* which the clock event device was armed.
*
- * Note, that in case the state has HRTIMER_STATE_CALLBACK set, no reprogramming
- * and no expiry check happens. The timer gets enqueued into the rbtree. The
- * reprogramming and expiry check is done in the hrtimer_interrupt or in the
- * softirq.
- *
* Called with interrupts disabled and base->cpu_base.lock held
*/
-static int hrtimer_reprogram(struct hrtimer *timer,
- struct hrtimer_clock_base *base)
+static void hrtimer_reprogram(struct hrtimer *timer,
+ struct hrtimer_clock_base *base)
{
struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases);
ktime_t expires = ktime_sub(hrtimer_get_expires(timer), base->offset);
- int res;
WARN_ON_ONCE(hrtimer_get_expires_tv64(timer) < 0);
/*
- * When the callback is running, we do not reprogram the clock event
- * device. The timer callback is either running on a different CPU or
- * the callback is executed in the hrtimer_interrupt context. The
- * reprogramming is handled either by the softirq, which called the
- * callback or at the end of the hrtimer_interrupt.
+ * If the timer is not on the current cpu, we cannot reprogram
+ * the other cpus clock event device.
*/
- if (hrtimer_callback_running(timer))
- return 0;
+ if (base->cpu_base != cpu_base)
+ return;
+
+ /*
+ * If the hrtimer interrupt is running, then it will
+ * reevaluate the clock bases and reprogram the clock event
+ * device. The callbacks are always executed in hard interrupt
+ * context so we don't need an extra check for a running
+ * callback.
+ */
+ if (cpu_base->in_hrtirq)
+ return;
/*
* CLOCK_REALTIME timer might be requested with an absolute
- * expiry time which is less than base->offset. Nothing wrong
- * about that, just avoid to call into the tick code, which
- * has now objections against negative expiry values.
+ * expiry time which is less than base->offset. Set it to 0.
*/
if (expires.tv64 < 0)
- return -ETIME;
+ expires.tv64 = 0;
if (expires.tv64 >= cpu_base->expires_next.tv64)
- return 0;
+ return;
- /*
- * When the target cpu of the timer is currently executing
- * hrtimer_interrupt(), then we do not touch the clock event
- * device. hrtimer_interrupt() will reevaluate all clock bases
- * before reprogramming the device.
- */
- if (cpu_base->in_hrtirq)
- return 0;
+ /* Update the pointer to the next expiring timer */
+ cpu_base->next_timer = timer;
/*
* If a hang was detected in the last timer interrupt then we
@@ -608,15 +648,14 @@ static int hrtimer_reprogram(struct hrtimer *timer,
* to make progress.
*/
if (cpu_base->hang_detected)
- return 0;
+ return;
/*
- * Clockevents returns -ETIME, when the event was in the past.
+ * Program the timer hardware. We enforce the expiry for
+ * events which are already in the past.
*/
- res = tick_program_event(expires, 0);
- if (!IS_ERR_VALUE(res))
- cpu_base->expires_next = expires;
- return res;
+ cpu_base->expires_next = expires;
+ tick_program_event(expires, 1);
}
/*
@@ -628,15 +667,6 @@ static inline void hrtimer_init_hres(struct hrtimer_cpu_base *base)
base->hres_active = 0;
}
-static inline ktime_t hrtimer_update_base(struct hrtimer_cpu_base *base)
-{
- ktime_t *offs_real = &base->clock_base[HRTIMER_BASE_REALTIME].offset;
- ktime_t *offs_boot = &base->clock_base[HRTIMER_BASE_BOOTTIME].offset;
- ktime_t *offs_tai = &base->clock_base[HRTIMER_BASE_TAI].offset;
-
- return ktime_get_update_offsets_now(offs_real, offs_boot, offs_tai);
-}
-
/*
* Retrigger next event is called after clock was set
*
@@ -646,7 +676,7 @@ static void retrigger_next_event(void *arg)
{
struct hrtimer_cpu_base *base = this_cpu_ptr(&hrtimer_bases);
- if (!hrtimer_hres_active())
+ if (!base->hres_active)
return;
raw_spin_lock(&base->lock);
@@ -658,32 +688,21 @@ static void retrigger_next_event(void *arg)
/*
* Switch to high resolution mode
*/
-static int hrtimer_switch_to_hres(void)
+static void hrtimer_switch_to_hres(void)
{
- int i, cpu = smp_processor_id();
- struct hrtimer_cpu_base *base = &per_cpu(hrtimer_bases, cpu);
- unsigned long flags;
-
- if (base->hres_active)
- return 1;
-
- local_irq_save(flags);
+ struct hrtimer_cpu_base *base = this_cpu_ptr(&hrtimer_bases);
if (tick_init_highres()) {
- local_irq_restore(flags);
printk(KERN_WARNING "Could not switch to high resolution "
- "mode on CPU %d\n", cpu);
- return 0;
+ "mode on CPU %d\n", base->cpu);
+ return;
}
base->hres_active = 1;
- for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++)
- base->clock_base[i].resolution = KTIME_HIGH_RES;
+ hrtimer_resolution = HIGH_RES_NSEC;
tick_setup_sched_timer();
/* "Retrigger" the interrupt to get things going */
retrigger_next_event(NULL);
- local_irq_restore(flags);
- return 1;
}
static void clock_was_set_work(struct work_struct *work)
@@ -704,9 +723,10 @@ void clock_was_set_delayed(void)
#else
+static inline int __hrtimer_hres_active(struct hrtimer_cpu_base *b) { return 0; }
static inline int hrtimer_hres_active(void) { return 0; }
static inline int hrtimer_is_hres_enabled(void) { return 0; }
-static inline int hrtimer_switch_to_hres(void) { return 0; }
+static inline void hrtimer_switch_to_hres(void) { }
static inline void
hrtimer_force_reprogram(struct hrtimer_cpu_base *base, int skip_equal) { }
static inline int hrtimer_reprogram(struct hrtimer *timer,
@@ -801,6 +821,14 @@ void unlock_hrtimer_base(const struct hrtimer *timer, unsigned long *flags)
*
* Forward the timer expiry so it will expire in the future.
* Returns the number of overruns.
+ *
+ * Can be safely called from the callback function of @timer. If
+ * called from other contexts @timer must neither be enqueued nor
+ * running the callback and the caller needs to take care of
+ * serialization.
+ *
+ * Note: This only updates the timer expiry value and does not requeue
+ * the timer.
*/
u64 hrtimer_forward(struct hrtimer *timer, ktime_t now, ktime_t interval)
{
@@ -812,8 +840,11 @@ u64 hrtimer_forward(struct hrtimer *timer, ktime_t now, ktime_t interval)
if (delta.tv64 < 0)
return 0;
- if (interval.tv64 < timer->base->resolution.tv64)
- interval.tv64 = timer->base->resolution.tv64;
+ if (WARN_ON(timer->state & HRTIMER_STATE_ENQUEUED))
+ return 0;
+
+ if (interval.tv64 < hrtimer_resolution)
+ interval.tv64 = hrtimer_resolution;
if (unlikely(delta.tv64 >= interval.tv64)) {
s64 incr = ktime_to_ns(interval);
@@ -847,16 +878,11 @@ static int enqueue_hrtimer(struct hrtimer *timer,
{
debug_activate(timer);
- timerqueue_add(&base->active, &timer->node);
base->cpu_base->active_bases |= 1 << base->index;
- /*
- * HRTIMER_STATE_ENQUEUED is or'ed to the current state to preserve the
- * state of a possibly running callback.
- */
- timer->state |= HRTIMER_STATE_ENQUEUED;
+ timer->state = HRTIMER_STATE_ENQUEUED;
- return (&timer->node == base->active.next);
+ return timerqueue_add(&base->active, &timer->node);
}
/*
@@ -873,39 +899,38 @@ static void __remove_hrtimer(struct hrtimer *timer,
struct hrtimer_clock_base *base,
unsigned long newstate, int reprogram)
{
- struct timerqueue_node *next_timer;
- if (!(timer->state & HRTIMER_STATE_ENQUEUED))
- goto out;
+ struct hrtimer_cpu_base *cpu_base = base->cpu_base;
+ unsigned int state = timer->state;
+
+ timer->state = newstate;
+ if (!(state & HRTIMER_STATE_ENQUEUED))
+ return;
+
+ if (!timerqueue_del(&base->active, &timer->node))
+ cpu_base->active_bases &= ~(1 << base->index);
- next_timer = timerqueue_getnext(&base->active);
- timerqueue_del(&base->active, &timer->node);
- if (&timer->node == next_timer) {
#ifdef CONFIG_HIGH_RES_TIMERS
- /* Reprogram the clock event device. if enabled */
- if (reprogram && hrtimer_hres_active()) {
- ktime_t expires;
-
- expires = ktime_sub(hrtimer_get_expires(timer),
- base->offset);
- if (base->cpu_base->expires_next.tv64 == expires.tv64)
- hrtimer_force_reprogram(base->cpu_base, 1);
- }
+ /*
+ * Note: If reprogram is false we do not update
+ * cpu_base->next_timer. This happens when we remove the first
+ * timer on a remote cpu. No harm as we never dereference
+ * cpu_base->next_timer. So the worst thing what can happen is
+ * an superflous call to hrtimer_force_reprogram() on the
+ * remote cpu later on if the same timer gets enqueued again.
+ */
+ if (reprogram && timer == cpu_base->next_timer)
+ hrtimer_force_reprogram(cpu_base, 1);
#endif
- }
- if (!timerqueue_getnext(&base->active))
- base->cpu_base->active_bases &= ~(1 << base->index);
-out:
- timer->state = newstate;
}
/*
* remove hrtimer, called with base lock held
*/
static inline int
-remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base)
+remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base, bool restart)
{
if (hrtimer_is_queued(timer)) {
- unsigned long state;
+ unsigned long state = timer->state;
int reprogram;
/*
@@ -919,30 +944,35 @@ remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base)
debug_deactivate(timer);
timer_stats_hrtimer_clear_start_info(timer);
reprogram = base->cpu_base == this_cpu_ptr(&hrtimer_bases);
- /*
- * We must preserve the CALLBACK state flag here,
- * otherwise we could move the timer base in
- * switch_hrtimer_base.
- */
- state = timer->state & HRTIMER_STATE_CALLBACK;
+
+ if (!restart)
+ state = HRTIMER_STATE_INACTIVE;
+
__remove_hrtimer(timer, base, state, reprogram);
return 1;
}
return 0;
}
-int __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim,
- unsigned long delta_ns, const enum hrtimer_mode mode,
- int wakeup)
+/**
+ * hrtimer_start_range_ns - (re)start an hrtimer on the current CPU
+ * @timer: the timer to be added
+ * @tim: expiry time
+ * @delta_ns: "slack" range for the timer
+ * @mode: expiry mode: absolute (HRTIMER_MODE_ABS) or
+ * relative (HRTIMER_MODE_REL)
+ */
+void hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim,
+ unsigned long delta_ns, const enum hrtimer_mode mode)
{
struct hrtimer_clock_base *base, *new_base;
unsigned long flags;
- int ret, leftmost;
+ int leftmost;
base = lock_hrtimer_base(timer, &flags);
/* Remove an active timer from the queue: */
- ret = remove_hrtimer(timer, base);
+ remove_hrtimer(timer, base, true);
if (mode & HRTIMER_MODE_REL) {
tim = ktime_add_safe(tim, base->get_time());
@@ -954,7 +984,7 @@ int __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim,
* timeouts. This will go away with the GTOD framework.
*/
#ifdef CONFIG_TIME_LOW_RES
- tim = ktime_add_safe(tim, base->resolution);
+ tim = ktime_add_safe(tim, ktime_set(0, hrtimer_resolution));
#endif
}
@@ -966,85 +996,25 @@ int __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim,
timer_stats_hrtimer_set_start_info(timer);
leftmost = enqueue_hrtimer(timer, new_base);
-
- if (!leftmost) {
- unlock_hrtimer_base(timer, &flags);
- return ret;
- }
+ if (!leftmost)
+ goto unlock;
if (!hrtimer_is_hres_active(timer)) {
/*
* Kick to reschedule the next tick to handle the new timer
* on dynticks target.
*/
- wake_up_nohz_cpu(new_base->cpu_base->cpu);
- } else if (new_base->cpu_base == this_cpu_ptr(&hrtimer_bases) &&
- hrtimer_reprogram(timer, new_base)) {
- /*
- * Only allow reprogramming if the new base is on this CPU.
- * (it might still be on another CPU if the timer was pending)
- *
- * XXX send_remote_softirq() ?
- */
- if (wakeup) {
- /*
- * We need to drop cpu_base->lock to avoid a
- * lock ordering issue vs. rq->lock.
- */
- raw_spin_unlock(&new_base->cpu_base->lock);
- raise_softirq_irqoff(HRTIMER_SOFTIRQ);
- local_irq_restore(flags);
- return ret;
- } else {
- __raise_softirq_irqoff(HRTIMER_SOFTIRQ);
- }
+ if (new_base->cpu_base->nohz_active)
+ wake_up_nohz_cpu(new_base->cpu_base->cpu);
+ } else {
+ hrtimer_reprogram(timer, new_base);
}
-
+unlock:
unlock_hrtimer_base(timer, &flags);
-
- return ret;
-}
-EXPORT_SYMBOL_GPL(__hrtimer_start_range_ns);
-
-/**
- * hrtimer_start_range_ns - (re)start an hrtimer on the current CPU
- * @timer: the timer to be added
- * @tim: expiry time
- * @delta_ns: "slack" range for the timer
- * @mode: expiry mode: absolute (HRTIMER_MODE_ABS) or
- * relative (HRTIMER_MODE_REL)
- *
- * Returns:
- * 0 on success
- * 1 when the timer was active
- */
-int hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim,
- unsigned long delta_ns, const enum hrtimer_mode mode)
-{
- return __hrtimer_start_range_ns(timer, tim, delta_ns, mode, 1);
}
EXPORT_SYMBOL_GPL(hrtimer_start_range_ns);
/**
- * hrtimer_start - (re)start an hrtimer on the current CPU
- * @timer: the timer to be added
- * @tim: expiry time
- * @mode: expiry mode: absolute (HRTIMER_MODE_ABS) or
- * relative (HRTIMER_MODE_REL)
- *
- * Returns:
- * 0 on success
- * 1 when the timer was active
- */
-int
-hrtimer_start(struct hrtimer *timer, ktime_t tim, const enum hrtimer_mode mode)
-{
- return __hrtimer_start_range_ns(timer, tim, 0, mode, 1);
-}
-EXPORT_SYMBOL_GPL(hrtimer_start);
-
-
-/**
* hrtimer_try_to_cancel - try to deactivate a timer
* @timer: hrtimer to stop
*
@@ -1060,10 +1030,19 @@ int hrtimer_try_to_cancel(struct hrtimer *timer)
unsigned long flags;
int ret = -1;
+ /*
+ * Check lockless first. If the timer is not active (neither
+ * enqueued nor running the callback, nothing to do here. The
+ * base lock does not serialize against a concurrent enqueue,
+ * so we can avoid taking it.
+ */
+ if (!hrtimer_active(timer))
+ return 0;
+
base = lock_hrtimer_base(timer, &flags);
if (!hrtimer_callback_running(timer))
- ret = remove_hrtimer(timer, base);
+ ret = remove_hrtimer(timer, base, false);
unlock_hrtimer_base(timer, &flags);
@@ -1113,26 +1092,22 @@ EXPORT_SYMBOL_GPL(hrtimer_get_remaining);
/**
* hrtimer_get_next_event - get the time until next expiry event
*
- * Returns the delta to the next expiry event or KTIME_MAX if no timer
- * is pending.
+ * Returns the next expiry time or KTIME_MAX if no timer is pending.
*/
-ktime_t hrtimer_get_next_event(void)
+u64 hrtimer_get_next_event(void)
{
struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases);
- ktime_t mindelta = { .tv64 = KTIME_MAX };
+ u64 expires = KTIME_MAX;
unsigned long flags;
raw_spin_lock_irqsave(&cpu_base->lock, flags);
- if (!hrtimer_hres_active())
- mindelta = ktime_sub(__hrtimer_get_next_event(cpu_base),
- ktime_get());
+ if (!__hrtimer_hres_active(cpu_base))
+ expires = __hrtimer_get_next_event(cpu_base).tv64;
raw_spin_unlock_irqrestore(&cpu_base->lock, flags);
- if (mindelta.tv64 < 0)
- mindelta.tv64 = 0;
- return mindelta;
+ return expires;
}
#endif
@@ -1174,37 +1149,73 @@ void hrtimer_init(struct hrtimer *timer, clockid_t clock_id,
}
EXPORT_SYMBOL_GPL(hrtimer_init);
-/**
- * hrtimer_get_res - get the timer resolution for a clock
- * @which_clock: which clock to query
- * @tp: pointer to timespec variable to store the resolution
+/*
+ * A timer is active, when it is enqueued into the rbtree or the
+ * callback function is running or it's in the state of being migrated
+ * to another cpu.
*
- * Store the resolution of the clock selected by @which_clock in the
- * variable pointed to by @tp.
+ * It is important for this function to not return a false negative.
*/
-int hrtimer_get_res(const clockid_t which_clock, struct timespec *tp)
+bool hrtimer_active(const struct hrtimer *timer)
{
struct hrtimer_cpu_base *cpu_base;
- int base = hrtimer_clockid_to_base(which_clock);
+ unsigned int seq;
- cpu_base = raw_cpu_ptr(&hrtimer_bases);
- *tp = ktime_to_timespec(cpu_base->clock_base[base].resolution);
+ do {
+ cpu_base = READ_ONCE(timer->base->cpu_base);
+ seq = raw_read_seqcount_begin(&cpu_base->seq);
- return 0;
+ if (timer->state != HRTIMER_STATE_INACTIVE ||
+ cpu_base->running == timer)
+ return true;
+
+ } while (read_seqcount_retry(&cpu_base->seq, seq) ||
+ cpu_base != READ_ONCE(timer->base->cpu_base));
+
+ return false;
}
-EXPORT_SYMBOL_GPL(hrtimer_get_res);
+EXPORT_SYMBOL_GPL(hrtimer_active);
-static void __run_hrtimer(struct hrtimer *timer, ktime_t *now)
+/*
+ * The write_seqcount_barrier()s in __run_hrtimer() split the thing into 3
+ * distinct sections:
+ *
+ * - queued: the timer is queued
+ * - callback: the timer is being ran
+ * - post: the timer is inactive or (re)queued
+ *
+ * On the read side we ensure we observe timer->state and cpu_base->running
+ * from the same section, if anything changed while we looked at it, we retry.
+ * This includes timer->base changing because sequence numbers alone are
+ * insufficient for that.
+ *
+ * The sequence numbers are required because otherwise we could still observe
+ * a false negative if the read side got smeared over multiple consequtive
+ * __run_hrtimer() invocations.
+ */
+
+static void __run_hrtimer(struct hrtimer_cpu_base *cpu_base,
+ struct hrtimer_clock_base *base,
+ struct hrtimer *timer, ktime_t *now)
{
- struct hrtimer_clock_base *base = timer->base;
- struct hrtimer_cpu_base *cpu_base = base->cpu_base;
enum hrtimer_restart (*fn)(struct hrtimer *);
int restart;
- WARN_ON(!irqs_disabled());
+ lockdep_assert_held(&cpu_base->lock);
debug_deactivate(timer);
- __remove_hrtimer(timer, base, HRTIMER_STATE_CALLBACK, 0);
+ cpu_base->running = timer;
+
+ /*
+ * Separate the ->running assignment from the ->state assignment.
+ *
+ * As with a regular write barrier, this ensures the read side in
+ * hrtimer_active() cannot observe cpu_base->running == NULL &&
+ * timer->state == INACTIVE.
+ */
+ raw_write_seqcount_barrier(&cpu_base->seq);
+
+ __remove_hrtimer(timer, base, HRTIMER_STATE_INACTIVE, 0);
timer_stats_account_hrtimer(timer);
fn = timer->function;
@@ -1220,58 +1231,43 @@ static void __run_hrtimer(struct hrtimer *timer, ktime_t *now)
raw_spin_lock(&cpu_base->lock);
/*
- * Note: We clear the CALLBACK bit after enqueue_hrtimer and
+ * Note: We clear the running state after enqueue_hrtimer and
* we do not reprogramm the event hardware. Happens either in
* hrtimer_start_range_ns() or in hrtimer_interrupt()
+ *
+ * Note: Because we dropped the cpu_base->lock above,
+ * hrtimer_start_range_ns() can have popped in and enqueued the timer
+ * for us already.
*/
- if (restart != HRTIMER_NORESTART) {
- BUG_ON(timer->state != HRTIMER_STATE_CALLBACK);
+ if (restart != HRTIMER_NORESTART &&
+ !(timer->state & HRTIMER_STATE_ENQUEUED))
enqueue_hrtimer(timer, base);
- }
- WARN_ON_ONCE(!(timer->state & HRTIMER_STATE_CALLBACK));
+ /*
+ * Separate the ->running assignment from the ->state assignment.
+ *
+ * As with a regular write barrier, this ensures the read side in
+ * hrtimer_active() cannot observe cpu_base->running == NULL &&
+ * timer->state == INACTIVE.
+ */
+ raw_write_seqcount_barrier(&cpu_base->seq);
- timer->state &= ~HRTIMER_STATE_CALLBACK;
+ WARN_ON_ONCE(cpu_base->running != timer);
+ cpu_base->running = NULL;
}
-#ifdef CONFIG_HIGH_RES_TIMERS
-
-/*
- * High resolution timer interrupt
- * Called with interrupts disabled
- */
-void hrtimer_interrupt(struct clock_event_device *dev)
+static void __hrtimer_run_queues(struct hrtimer_cpu_base *cpu_base, ktime_t now)
{
- struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases);
- ktime_t expires_next, now, entry_time, delta;
- int i, retries = 0;
-
- BUG_ON(!cpu_base->hres_active);
- cpu_base->nr_events++;
- dev->next_event.tv64 = KTIME_MAX;
-
- raw_spin_lock(&cpu_base->lock);
- entry_time = now = hrtimer_update_base(cpu_base);
-retry:
- cpu_base->in_hrtirq = 1;
- /*
- * We set expires_next to KTIME_MAX here with cpu_base->lock
- * held to prevent that a timer is enqueued in our queue via
- * the migration code. This does not affect enqueueing of
- * timers which run their callback and need to be requeued on
- * this CPU.
- */
- cpu_base->expires_next.tv64 = KTIME_MAX;
+ struct hrtimer_clock_base *base = cpu_base->clock_base;
+ unsigned int active = cpu_base->active_bases;
- for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) {
- struct hrtimer_clock_base *base;
+ for (; active; base++, active >>= 1) {
struct timerqueue_node *node;
ktime_t basenow;
- if (!(cpu_base->active_bases & (1 << i)))
+ if (!(active & 0x01))
continue;
- base = cpu_base->clock_base + i;
basenow = ktime_add(now, base->offset);
while ((node = timerqueue_getnext(&base->active))) {
@@ -1294,9 +1290,42 @@ retry:
if (basenow.tv64 < hrtimer_get_softexpires_tv64(timer))
break;
- __run_hrtimer(timer, &basenow);
+ __run_hrtimer(cpu_base, base, timer, &basenow);
}
}
+}
+
+#ifdef CONFIG_HIGH_RES_TIMERS
+
+/*
+ * High resolution timer interrupt
+ * Called with interrupts disabled
+ */
+void hrtimer_interrupt(struct clock_event_device *dev)
+{
+ struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases);
+ ktime_t expires_next, now, entry_time, delta;
+ int retries = 0;
+
+ BUG_ON(!cpu_base->hres_active);
+ cpu_base->nr_events++;
+ dev->next_event.tv64 = KTIME_MAX;
+
+ raw_spin_lock(&cpu_base->lock);
+ entry_time = now = hrtimer_update_base(cpu_base);
+retry:
+ cpu_base->in_hrtirq = 1;
+ /*
+ * We set expires_next to KTIME_MAX here with cpu_base->lock
+ * held to prevent that a timer is enqueued in our queue via
+ * the migration code. This does not affect enqueueing of
+ * timers which run their callback and need to be requeued on
+ * this CPU.
+ */
+ cpu_base->expires_next.tv64 = KTIME_MAX;
+
+ __hrtimer_run_queues(cpu_base, now);
+
/* Reevaluate the clock bases for the next expiry */
expires_next = __hrtimer_get_next_event(cpu_base);
/*
@@ -1308,8 +1337,7 @@ retry:
raw_spin_unlock(&cpu_base->lock);
/* Reprogramming necessary ? */
- if (expires_next.tv64 == KTIME_MAX ||
- !tick_program_event(expires_next, 0)) {
+ if (!tick_program_event(expires_next, 0)) {
cpu_base->hang_detected = 0;
return;
}
@@ -1342,8 +1370,8 @@ retry:
cpu_base->hang_detected = 1;
raw_spin_unlock(&cpu_base->lock);
delta = ktime_sub(now, entry_time);
- if (delta.tv64 > cpu_base->max_hang_time.tv64)
- cpu_base->max_hang_time = delta;
+ if ((unsigned int)delta.tv64 > cpu_base->max_hang_time)
+ cpu_base->max_hang_time = (unsigned int) delta.tv64;
/*
* Limit it to a sensible value as we enforce a longer
* delay. Give the CPU at least 100ms to catch up.
@@ -1361,7 +1389,7 @@ retry:
* local version of hrtimer_peek_ahead_timers() called with interrupts
* disabled.
*/
-static void __hrtimer_peek_ahead_timers(void)
+static inline void __hrtimer_peek_ahead_timers(void)
{
struct tick_device *td;
@@ -1373,29 +1401,6 @@ static void __hrtimer_peek_ahead_timers(void)
hrtimer_interrupt(td->evtdev);
}
-/**
- * hrtimer_peek_ahead_timers -- run soft-expired timers now
- *
- * hrtimer_peek_ahead_timers will peek at the timer queue of
- * the current cpu and check if there are any timers for which
- * the soft expires time has passed. If any such timers exist,
- * they are run immediately and then removed from the timer queue.
- *
- */
-void hrtimer_peek_ahead_timers(void)
-{
- unsigned long flags;
-
- local_irq_save(flags);
- __hrtimer_peek_ahead_timers();
- local_irq_restore(flags);
-}
-
-static void run_hrtimer_softirq(struct softirq_action *h)
-{
- hrtimer_peek_ahead_timers();
-}
-
#else /* CONFIG_HIGH_RES_TIMERS */
static inline void __hrtimer_peek_ahead_timers(void) { }
@@ -1403,66 +1408,32 @@ static inline void __hrtimer_peek_ahead_timers(void) { }
#endif /* !CONFIG_HIGH_RES_TIMERS */
/*
- * Called from timer softirq every jiffy, expire hrtimers:
- *
- * For HRT its the fall back code to run the softirq in the timer
- * softirq context in case the hrtimer initialization failed or has
- * not been done yet.
+ * Called from run_local_timers in hardirq context every jiffy
*/
-void hrtimer_run_pending(void)
+void hrtimer_run_queues(void)
{
- if (hrtimer_hres_active())
+ struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases);
+ ktime_t now;
+
+ if (__hrtimer_hres_active(cpu_base))
return;
/*
- * This _is_ ugly: We have to check in the softirq context,
- * whether we can switch to highres and / or nohz mode. The
- * clocksource switch happens in the timer interrupt with
- * xtime_lock held. Notification from there only sets the
- * check bit in the tick_oneshot code, otherwise we might
- * deadlock vs. xtime_lock.
+ * This _is_ ugly: We have to check periodically, whether we
+ * can switch to highres and / or nohz mode. The clocksource
+ * switch happens with xtime_lock held. Notification from
+ * there only sets the check bit in the tick_oneshot code,
+ * otherwise we might deadlock vs. xtime_lock.
*/
- if (tick_check_oneshot_change(!hrtimer_is_hres_enabled()))
+ if (tick_check_oneshot_change(!hrtimer_is_hres_enabled())) {
hrtimer_switch_to_hres();
-}
-
-/*
- * Called from hardirq context every jiffy
- */
-void hrtimer_run_queues(void)
-{
- struct timerqueue_node *node;
- struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases);
- struct hrtimer_clock_base *base;
- int index, gettime = 1;
-
- if (hrtimer_hres_active())
return;
-
- for (index = 0; index < HRTIMER_MAX_CLOCK_BASES; index++) {
- base = &cpu_base->clock_base[index];
- if (!timerqueue_getnext(&base->active))
- continue;
-
- if (gettime) {
- hrtimer_get_softirq_time(cpu_base);
- gettime = 0;
- }
-
- raw_spin_lock(&cpu_base->lock);
-
- while ((node = timerqueue_getnext(&base->active))) {
- struct hrtimer *timer;
-
- timer = container_of(node, struct hrtimer, node);
- if (base->softirq_time.tv64 <=
- hrtimer_get_expires_tv64(timer))
- break;
-
- __run_hrtimer(timer, &base->softirq_time);
- }
- raw_spin_unlock(&cpu_base->lock);
}
+
+ raw_spin_lock(&cpu_base->lock);
+ now = hrtimer_update_base(cpu_base);
+ __hrtimer_run_queues(cpu_base, now);
+ raw_spin_unlock(&cpu_base->lock);
}
/*
@@ -1495,8 +1466,6 @@ static int __sched do_nanosleep(struct hrtimer_sleeper *t, enum hrtimer_mode mod
do {
set_current_state(TASK_INTERRUPTIBLE);
hrtimer_start_expires(&t->timer, mode);
- if (!hrtimer_active(&t->timer))
- t->task = NULL;
if (likely(t->task))
freezable_schedule();
@@ -1640,11 +1609,11 @@ static void migrate_hrtimer_list(struct hrtimer_clock_base *old_base,
debug_deactivate(timer);
/*
- * Mark it as STATE_MIGRATE not INACTIVE otherwise the
+ * Mark it as ENQUEUED not INACTIVE otherwise the
* timer could be seen as !active and just vanish away
* under us on another CPU
*/
- __remove_hrtimer(timer, old_base, HRTIMER_STATE_MIGRATE, 0);
+ __remove_hrtimer(timer, old_base, HRTIMER_STATE_ENQUEUED, 0);
timer->base = new_base;
/*
* Enqueue the timers on the new cpu. This does not
@@ -1655,9 +1624,6 @@ static void migrate_hrtimer_list(struct hrtimer_clock_base *old_base,
* event device.
*/
enqueue_hrtimer(timer, new_base);
-
- /* Clear the migration state bit */
- timer->state &= ~HRTIMER_STATE_MIGRATE;
}
}
@@ -1707,17 +1673,10 @@ static int hrtimer_cpu_notify(struct notifier_block *self,
break;
#ifdef CONFIG_HOTPLUG_CPU
- case CPU_DYING:
- case CPU_DYING_FROZEN:
- clockevents_notify(CLOCK_EVT_NOTIFY_CPU_DYING, &scpu);
- break;
case CPU_DEAD:
case CPU_DEAD_FROZEN:
- {
- clockevents_notify(CLOCK_EVT_NOTIFY_CPU_DEAD, &scpu);
migrate_hrtimers(scpu);
break;
- }
#endif
default:
@@ -1736,9 +1695,6 @@ void __init hrtimers_init(void)
hrtimer_cpu_notify(&hrtimers_nb, (unsigned long)CPU_UP_PREPARE,
(void *)(long)smp_processor_id());
register_cpu_notifier(&hrtimers_nb);
-#ifdef CONFIG_HIGH_RES_TIMERS
- open_softirq(HRTIMER_SOFTIRQ, run_hrtimer_softirq);
-#endif
}
/**
@@ -1777,8 +1733,6 @@ schedule_hrtimeout_range_clock(ktime_t *expires, unsigned long delta,
hrtimer_init_sleeper(&t, current);
hrtimer_start_expires(&t.timer, mode);
- if (!hrtimer_active(&t.timer))
- t.task = NULL;
if (likely(t.task))
schedule();
diff --git a/kernel/time/jiffies.c b/kernel/time/jiffies.c
index a6a5bf53e86d..347fecf86a3f 100644
--- a/kernel/time/jiffies.c
+++ b/kernel/time/jiffies.c
@@ -25,7 +25,7 @@
#include <linux/module.h>
#include <linux/init.h>
-#include "tick-internal.h"
+#include "timekeeping.h"
/* The Jiffies based clocksource is the lowest common
* denominator clock source which should function on
@@ -71,6 +71,7 @@ static struct clocksource clocksource_jiffies = {
.mask = 0xffffffff, /*32bits*/
.mult = NSEC_PER_JIFFY << JIFFIES_SHIFT, /* details above */
.shift = JIFFIES_SHIFT,
+ .max_cycles = 10,
};
__cacheline_aligned_in_smp DEFINE_SEQLOCK(jiffies_lock);
@@ -94,7 +95,7 @@ EXPORT_SYMBOL(jiffies);
static int __init init_jiffies_clocksource(void)
{
- return clocksource_register(&clocksource_jiffies);
+ return __clocksource_register(&clocksource_jiffies);
}
core_initcall(init_jiffies_clocksource);
@@ -130,6 +131,6 @@ int register_refined_jiffies(long cycles_per_second)
refined_jiffies.mult = ((u32)nsec_per_tick) << JIFFIES_SHIFT;
- clocksource_register(&refined_jiffies);
+ __clocksource_register(&refined_jiffies);
return 0;
}
diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c
index 0f60b08a4f07..df68cb875248 100644
--- a/kernel/time/ntp.c
+++ b/kernel/time/ntp.c
@@ -17,7 +17,6 @@
#include <linux/module.h>
#include <linux/rtc.h>
-#include "tick-internal.h"
#include "ntp_internal.h"
/*
@@ -36,6 +35,7 @@ unsigned long tick_nsec;
static u64 tick_length;
static u64 tick_length_base;
+#define SECS_PER_DAY 86400
#define MAX_TICKADJ 500LL /* usecs */
#define MAX_TICKADJ_SCALED \
(((MAX_TICKADJ * NSEC_PER_USEC) << NTP_SCALE_SHIFT) / NTP_INTERVAL_FREQ)
@@ -77,6 +77,9 @@ static long time_adjust;
/* constant (boot-param configurable) NTP tick adjustment (upscaled) */
static s64 ntp_tick_adj;
+/* second value of the next pending leapsecond, or TIME64_MAX if no leap */
+static time64_t ntp_next_leap_sec = TIME64_MAX;
+
#ifdef CONFIG_NTP_PPS
/*
@@ -350,6 +353,7 @@ void ntp_clear(void)
tick_length = tick_length_base;
time_offset = 0;
+ ntp_next_leap_sec = TIME64_MAX;
/* Clear PPS state variables */
pps_clear();
}
@@ -360,6 +364,21 @@ u64 ntp_tick_length(void)
return tick_length;
}
+/**
+ * ntp_get_next_leap - Returns the next leapsecond in CLOCK_REALTIME ktime_t
+ *
+ * Provides the time of the next leapsecond against CLOCK_REALTIME in
+ * a ktime_t format. Returns KTIME_MAX if no leapsecond is pending.
+ */
+ktime_t ntp_get_next_leap(void)
+{
+ ktime_t ret;
+
+ if ((time_state == TIME_INS) && (time_status & STA_INS))
+ return ktime_set(ntp_next_leap_sec, 0);
+ ret.tv64 = KTIME_MAX;
+ return ret;
+}
/*
* this routine handles the overflow of the microsecond field
@@ -383,15 +402,21 @@ int second_overflow(unsigned long secs)
*/
switch (time_state) {
case TIME_OK:
- if (time_status & STA_INS)
+ if (time_status & STA_INS) {
time_state = TIME_INS;
- else if (time_status & STA_DEL)
+ ntp_next_leap_sec = secs + SECS_PER_DAY -
+ (secs % SECS_PER_DAY);
+ } else if (time_status & STA_DEL) {
time_state = TIME_DEL;
+ ntp_next_leap_sec = secs + SECS_PER_DAY -
+ ((secs+1) % SECS_PER_DAY);
+ }
break;
case TIME_INS:
- if (!(time_status & STA_INS))
+ if (!(time_status & STA_INS)) {
+ ntp_next_leap_sec = TIME64_MAX;
time_state = TIME_OK;
- else if (secs % 86400 == 0) {
+ } else if (secs % SECS_PER_DAY == 0) {
leap = -1;
time_state = TIME_OOP;
printk(KERN_NOTICE
@@ -399,19 +424,21 @@ int second_overflow(unsigned long secs)
}
break;
case TIME_DEL:
- if (!(time_status & STA_DEL))
+ if (!(time_status & STA_DEL)) {
+ ntp_next_leap_sec = TIME64_MAX;
time_state = TIME_OK;
- else if ((secs + 1) % 86400 == 0) {
+ } else if ((secs + 1) % SECS_PER_DAY == 0) {
leap = 1;
+ ntp_next_leap_sec = TIME64_MAX;
time_state = TIME_WAIT;
printk(KERN_NOTICE
"Clock: deleting leap second 23:59:59 UTC\n");
}
break;
case TIME_OOP:
+ ntp_next_leap_sec = TIME64_MAX;
time_state = TIME_WAIT;
break;
-
case TIME_WAIT:
if (!(time_status & (STA_INS | STA_DEL)))
time_state = TIME_OK;
@@ -459,6 +486,21 @@ out:
return leap;
}
+#ifdef CONFIG_GENERIC_CMOS_UPDATE
+int __weak update_persistent_clock(struct timespec now)
+{
+ return -ENODEV;
+}
+
+int __weak update_persistent_clock64(struct timespec64 now64)
+{
+ struct timespec now;
+
+ now = timespec64_to_timespec(now64);
+ return update_persistent_clock(now);
+}
+#endif
+
#if defined(CONFIG_GENERIC_CMOS_UPDATE) || defined(CONFIG_RTC_SYSTOHC)
static void sync_cmos_clock(struct work_struct *work);
@@ -494,8 +536,9 @@ static void sync_cmos_clock(struct work_struct *work)
if (persistent_clock_is_local)
adjust.tv_sec -= (sys_tz.tz_minuteswest * 60);
#ifdef CONFIG_GENERIC_CMOS_UPDATE
- fail = update_persistent_clock(timespec64_to_timespec(adjust));
+ fail = update_persistent_clock64(adjust);
#endif
+
#ifdef CONFIG_RTC_SYSTOHC
if (fail == -ENODEV)
fail = rtc_set_ntp_time(adjust);
@@ -537,6 +580,7 @@ static inline void process_adj_status(struct timex *txc, struct timespec64 *ts)
if ((time_status & STA_PLL) && !(txc->status & STA_PLL)) {
time_state = TIME_OK;
time_status = STA_UNSYNC;
+ ntp_next_leap_sec = TIME64_MAX;
/* restart PPS frequency calibration */
pps_reset_freq_interval();
}
@@ -701,6 +745,24 @@ int __do_adjtimex(struct timex *txc, struct timespec64 *ts, s32 *time_tai)
if (!(time_status & STA_NANO))
txc->time.tv_usec /= NSEC_PER_USEC;
+ /* Handle leapsec adjustments */
+ if (unlikely(ts->tv_sec >= ntp_next_leap_sec)) {
+ if ((time_state == TIME_INS) && (time_status & STA_INS)) {
+ result = TIME_OOP;
+ txc->tai++;
+ txc->time.tv_sec--;
+ }
+ if ((time_state == TIME_DEL) && (time_status & STA_DEL)) {
+ result = TIME_WAIT;
+ txc->tai--;
+ txc->time.tv_sec++;
+ }
+ if ((time_state == TIME_OOP) &&
+ (ts->tv_sec == ntp_next_leap_sec)) {
+ result = TIME_WAIT;
+ }
+ }
+
return result;
}
diff --git a/kernel/time/ntp_internal.h b/kernel/time/ntp_internal.h
index bbd102ad9df7..65430504ca26 100644
--- a/kernel/time/ntp_internal.h
+++ b/kernel/time/ntp_internal.h
@@ -5,6 +5,7 @@ extern void ntp_init(void);
extern void ntp_clear(void);
/* Returns how long ticks are at present, in ns / 2^NTP_SCALE_SHIFT. */
extern u64 ntp_tick_length(void);
+extern ktime_t ntp_get_next_leap(void);
extern int second_overflow(unsigned long secs);
extern int ntp_validate_timex(struct timex *);
extern int __do_adjtimex(struct timex *, struct timespec64 *, s32 *);
diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c
index 0075da74abf0..892e3dae0aac 100644
--- a/kernel/time/posix-cpu-timers.c
+++ b/kernel/time/posix-cpu-timers.c
@@ -196,39 +196,62 @@ static int cpu_clock_sample(const clockid_t which_clock, struct task_struct *p,
return 0;
}
-static void update_gt_cputime(struct task_cputime *a, struct task_cputime *b)
+/*
+ * Set cputime to sum_cputime if sum_cputime > cputime. Use cmpxchg
+ * to avoid race conditions with concurrent updates to cputime.
+ */
+static inline void __update_gt_cputime(atomic64_t *cputime, u64 sum_cputime)
{
- if (b->utime > a->utime)
- a->utime = b->utime;
+ u64 curr_cputime;
+retry:
+ curr_cputime = atomic64_read(cputime);
+ if (sum_cputime > curr_cputime) {
+ if (atomic64_cmpxchg(cputime, curr_cputime, sum_cputime) != curr_cputime)
+ goto retry;
+ }
+}
- if (b->stime > a->stime)
- a->stime = b->stime;
+static void update_gt_cputime(struct task_cputime_atomic *cputime_atomic, struct task_cputime *sum)
+{
+ __update_gt_cputime(&cputime_atomic->utime, sum->utime);
+ __update_gt_cputime(&cputime_atomic->stime, sum->stime);
+ __update_gt_cputime(&cputime_atomic->sum_exec_runtime, sum->sum_exec_runtime);
+}
- if (b->sum_exec_runtime > a->sum_exec_runtime)
- a->sum_exec_runtime = b->sum_exec_runtime;
+/* Sample task_cputime_atomic values in "atomic_timers", store results in "times". */
+static inline void sample_cputime_atomic(struct task_cputime *times,
+ struct task_cputime_atomic *atomic_times)
+{
+ times->utime = atomic64_read(&atomic_times->utime);
+ times->stime = atomic64_read(&atomic_times->stime);
+ times->sum_exec_runtime = atomic64_read(&atomic_times->sum_exec_runtime);
}
void thread_group_cputimer(struct task_struct *tsk, struct task_cputime *times)
{
struct thread_group_cputimer *cputimer = &tsk->signal->cputimer;
struct task_cputime sum;
- unsigned long flags;
- if (!cputimer->running) {
+ /* Check if cputimer isn't running. This is accessed without locking. */
+ if (!READ_ONCE(cputimer->running)) {
/*
* The POSIX timer interface allows for absolute time expiry
* values through the TIMER_ABSTIME flag, therefore we have
- * to synchronize the timer to the clock every time we start
- * it.
+ * to synchronize the timer to the clock every time we start it.
*/
thread_group_cputime(tsk, &sum);
- raw_spin_lock_irqsave(&cputimer->lock, flags);
- cputimer->running = 1;
- update_gt_cputime(&cputimer->cputime, &sum);
- } else
- raw_spin_lock_irqsave(&cputimer->lock, flags);
- *times = cputimer->cputime;
- raw_spin_unlock_irqrestore(&cputimer->lock, flags);
+ update_gt_cputime(&cputimer->cputime_atomic, &sum);
+
+ /*
+ * We're setting cputimer->running without a lock. Ensure
+ * this only gets written to in one operation. We set
+ * running after update_gt_cputime() as a small optimization,
+ * but barriers are not required because update_gt_cputime()
+ * can handle concurrent updates.
+ */
+ WRITE_ONCE(cputimer->running, 1);
+ }
+ sample_cputime_atomic(times, &cputimer->cputime_atomic);
}
/*
@@ -582,7 +605,8 @@ bool posix_cpu_timers_can_stop_tick(struct task_struct *tsk)
if (!task_cputime_zero(&tsk->cputime_expires))
return false;
- if (tsk->signal->cputimer.running)
+ /* Check if cputimer is running. This is accessed without locking. */
+ if (READ_ONCE(tsk->signal->cputimer.running))
return false;
return true;
@@ -852,10 +876,10 @@ static void check_thread_timers(struct task_struct *tsk,
/*
* Check for the special case thread timers.
*/
- soft = ACCESS_ONCE(sig->rlim[RLIMIT_RTTIME].rlim_cur);
+ soft = READ_ONCE(sig->rlim[RLIMIT_RTTIME].rlim_cur);
if (soft != RLIM_INFINITY) {
unsigned long hard =
- ACCESS_ONCE(sig->rlim[RLIMIT_RTTIME].rlim_max);
+ READ_ONCE(sig->rlim[RLIMIT_RTTIME].rlim_max);
if (hard != RLIM_INFINITY &&
tsk->rt.timeout > DIV_ROUND_UP(hard, USEC_PER_SEC/HZ)) {
@@ -882,14 +906,12 @@ static void check_thread_timers(struct task_struct *tsk,
}
}
-static void stop_process_timers(struct signal_struct *sig)
+static inline void stop_process_timers(struct signal_struct *sig)
{
struct thread_group_cputimer *cputimer = &sig->cputimer;
- unsigned long flags;
- raw_spin_lock_irqsave(&cputimer->lock, flags);
- cputimer->running = 0;
- raw_spin_unlock_irqrestore(&cputimer->lock, flags);
+ /* Turn off cputimer->running. This is done without locking. */
+ WRITE_ONCE(cputimer->running, 0);
}
static u32 onecputick;
@@ -958,11 +980,11 @@ static void check_process_timers(struct task_struct *tsk,
SIGPROF);
check_cpu_itimer(tsk, &sig->it[CPUCLOCK_VIRT], &virt_expires, utime,
SIGVTALRM);
- soft = ACCESS_ONCE(sig->rlim[RLIMIT_CPU].rlim_cur);
+ soft = READ_ONCE(sig->rlim[RLIMIT_CPU].rlim_cur);
if (soft != RLIM_INFINITY) {
unsigned long psecs = cputime_to_secs(ptime);
unsigned long hard =
- ACCESS_ONCE(sig->rlim[RLIMIT_CPU].rlim_max);
+ READ_ONCE(sig->rlim[RLIMIT_CPU].rlim_max);
cputime_t x;
if (psecs >= hard) {
/*
@@ -1111,12 +1133,11 @@ static inline int fastpath_timer_check(struct task_struct *tsk)
}
sig = tsk->signal;
- if (sig->cputimer.running) {
+ /* Check if cputimer is running. This is accessed without locking. */
+ if (READ_ONCE(sig->cputimer.running)) {
struct task_cputime group_sample;
- raw_spin_lock(&sig->cputimer.lock);
- group_sample = sig->cputimer.cputime;
- raw_spin_unlock(&sig->cputimer.lock);
+ sample_cputime_atomic(&group_sample, &sig->cputimer.cputime_atomic);
if (task_cputime_expired(&group_sample, &sig->cputime_expires))
return 1;
@@ -1157,7 +1178,7 @@ void run_posix_cpu_timers(struct task_struct *tsk)
* If there are any active process wide timers (POSIX 1.b, itimers,
* RLIMIT_CPU) cputimer must be running.
*/
- if (tsk->signal->cputimer.running)
+ if (READ_ONCE(tsk->signal->cputimer.running))
check_process_timers(tsk, &firing);
/*
diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c
index 31ea01f42e1f..31d11ac9fa47 100644
--- a/kernel/time/posix-timers.c
+++ b/kernel/time/posix-timers.c
@@ -272,13 +272,20 @@ static int posix_get_tai(clockid_t which_clock, struct timespec *tp)
return 0;
}
+static int posix_get_hrtimer_res(clockid_t which_clock, struct timespec *tp)
+{
+ tp->tv_sec = 0;
+ tp->tv_nsec = hrtimer_resolution;
+ return 0;
+}
+
/*
* Initialize everything, well, just everything in Posix clocks/timers ;)
*/
static __init int init_posix_timers(void)
{
struct k_clock clock_realtime = {
- .clock_getres = hrtimer_get_res,
+ .clock_getres = posix_get_hrtimer_res,
.clock_get = posix_clock_realtime_get,
.clock_set = posix_clock_realtime_set,
.clock_adj = posix_clock_realtime_adj,
@@ -290,7 +297,7 @@ static __init int init_posix_timers(void)
.timer_del = common_timer_del,
};
struct k_clock clock_monotonic = {
- .clock_getres = hrtimer_get_res,
+ .clock_getres = posix_get_hrtimer_res,
.clock_get = posix_ktime_get_ts,
.nsleep = common_nsleep,
.nsleep_restart = hrtimer_nanosleep_restart,
@@ -300,7 +307,7 @@ static __init int init_posix_timers(void)
.timer_del = common_timer_del,
};
struct k_clock clock_monotonic_raw = {
- .clock_getres = hrtimer_get_res,
+ .clock_getres = posix_get_hrtimer_res,
.clock_get = posix_get_monotonic_raw,
};
struct k_clock clock_realtime_coarse = {
@@ -312,7 +319,7 @@ static __init int init_posix_timers(void)
.clock_get = posix_get_monotonic_coarse,
};
struct k_clock clock_tai = {
- .clock_getres = hrtimer_get_res,
+ .clock_getres = posix_get_hrtimer_res,
.clock_get = posix_get_tai,
.nsleep = common_nsleep,
.nsleep_restart = hrtimer_nanosleep_restart,
@@ -322,7 +329,7 @@ static __init int init_posix_timers(void)
.timer_del = common_timer_del,
};
struct k_clock clock_boottime = {
- .clock_getres = hrtimer_get_res,
+ .clock_getres = posix_get_hrtimer_res,
.clock_get = posix_get_boottime,
.nsleep = common_nsleep,
.nsleep_restart = hrtimer_nanosleep_restart,
diff --git a/kernel/time/sched_clock.c b/kernel/time/sched_clock.c
index 01d2d15aa662..a26036d37a38 100644
--- a/kernel/time/sched_clock.c
+++ b/kernel/time/sched_clock.c
@@ -1,5 +1,6 @@
/*
- * sched_clock.c: support for extending counters to full 64-bit ns counter
+ * sched_clock.c: Generic sched_clock() support, to extend low level
+ * hardware time counters to full 64-bit ns values.
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 as
@@ -18,15 +19,53 @@
#include <linux/seqlock.h>
#include <linux/bitops.h>
-struct clock_data {
- ktime_t wrap_kt;
+/**
+ * struct clock_read_data - data required to read from sched_clock()
+ *
+ * @epoch_ns: sched_clock() value at last update
+ * @epoch_cyc: Clock cycle value at last update.
+ * @sched_clock_mask: Bitmask for two's complement subtraction of non 64bit
+ * clocks.
+ * @read_sched_clock: Current clock source (or dummy source when suspended).
+ * @mult: Multipler for scaled math conversion.
+ * @shift: Shift value for scaled math conversion.
+ *
+ * Care must be taken when updating this structure; it is read by
+ * some very hot code paths. It occupies <=40 bytes and, when combined
+ * with the seqcount used to synchronize access, comfortably fits into
+ * a 64 byte cache line.
+ */
+struct clock_read_data {
u64 epoch_ns;
u64 epoch_cyc;
- seqcount_t seq;
- unsigned long rate;
+ u64 sched_clock_mask;
+ u64 (*read_sched_clock)(void);
u32 mult;
u32 shift;
- bool suspended;
+};
+
+/**
+ * struct clock_data - all data needed for sched_clock() (including
+ * registration of a new clock source)
+ *
+ * @seq: Sequence counter for protecting updates. The lowest
+ * bit is the index for @read_data.
+ * @read_data: Data required to read from sched_clock.
+ * @wrap_kt: Duration for which clock can run before wrapping.
+ * @rate: Tick rate of the registered clock.
+ * @actual_read_sched_clock: Registered hardware level clock read function.
+ *
+ * The ordering of this structure has been chosen to optimize cache
+ * performance. In particular 'seq' and 'read_data[0]' (combined) should fit
+ * into a single 64-byte cache line.
+ */
+struct clock_data {
+ seqcount_t seq;
+ struct clock_read_data read_data[2];
+ ktime_t wrap_kt;
+ unsigned long rate;
+
+ u64 (*actual_read_sched_clock)(void);
};
static struct hrtimer sched_clock_timer;
@@ -34,12 +73,6 @@ static int irqtime = -1;
core_param(irqtime, irqtime, int, 0400);
-static struct clock_data cd = {
- .mult = NSEC_PER_SEC / HZ,
-};
-
-static u64 __read_mostly sched_clock_mask;
-
static u64 notrace jiffy_sched_clock_read(void)
{
/*
@@ -49,7 +82,11 @@ static u64 notrace jiffy_sched_clock_read(void)
return (u64)(jiffies - INITIAL_JIFFIES);
}
-static u64 __read_mostly (*read_sched_clock)(void) = jiffy_sched_clock_read;
+static struct clock_data cd ____cacheline_aligned = {
+ .read_data[0] = { .mult = NSEC_PER_SEC / HZ,
+ .read_sched_clock = jiffy_sched_clock_read, },
+ .actual_read_sched_clock = jiffy_sched_clock_read,
+};
static inline u64 notrace cyc_to_ns(u64 cyc, u32 mult, u32 shift)
{
@@ -58,111 +95,136 @@ static inline u64 notrace cyc_to_ns(u64 cyc, u32 mult, u32 shift)
unsigned long long notrace sched_clock(void)
{
- u64 epoch_ns;
- u64 epoch_cyc;
- u64 cyc;
+ u64 cyc, res;
unsigned long seq;
-
- if (cd.suspended)
- return cd.epoch_ns;
+ struct clock_read_data *rd;
do {
- seq = raw_read_seqcount_begin(&cd.seq);
- epoch_cyc = cd.epoch_cyc;
- epoch_ns = cd.epoch_ns;
+ seq = raw_read_seqcount(&cd.seq);
+ rd = cd.read_data + (seq & 1);
+
+ cyc = (rd->read_sched_clock() - rd->epoch_cyc) &
+ rd->sched_clock_mask;
+ res = rd->epoch_ns + cyc_to_ns(cyc, rd->mult, rd->shift);
} while (read_seqcount_retry(&cd.seq, seq));
- cyc = read_sched_clock();
- cyc = (cyc - epoch_cyc) & sched_clock_mask;
- return epoch_ns + cyc_to_ns(cyc, cd.mult, cd.shift);
+ return res;
+}
+
+/*
+ * Updating the data required to read the clock.
+ *
+ * sched_clock() will never observe mis-matched data even if called from
+ * an NMI. We do this by maintaining an odd/even copy of the data and
+ * steering sched_clock() to one or the other using a sequence counter.
+ * In order to preserve the data cache profile of sched_clock() as much
+ * as possible the system reverts back to the even copy when the update
+ * completes; the odd copy is used *only* during an update.
+ */
+static void update_clock_read_data(struct clock_read_data *rd)
+{
+ /* update the backup (odd) copy with the new data */
+ cd.read_data[1] = *rd;
+
+ /* steer readers towards the odd copy */
+ raw_write_seqcount_latch(&cd.seq);
+
+ /* now its safe for us to update the normal (even) copy */
+ cd.read_data[0] = *rd;
+
+ /* switch readers back to the even copy */
+ raw_write_seqcount_latch(&cd.seq);
}
/*
- * Atomically update the sched_clock epoch.
+ * Atomically update the sched_clock() epoch.
*/
-static void notrace update_sched_clock(void)
+static void update_sched_clock(void)
{
- unsigned long flags;
u64 cyc;
u64 ns;
+ struct clock_read_data rd;
+
+ rd = cd.read_data[0];
+
+ cyc = cd.actual_read_sched_clock();
+ ns = rd.epoch_ns + cyc_to_ns((cyc - rd.epoch_cyc) & rd.sched_clock_mask, rd.mult, rd.shift);
+
+ rd.epoch_ns = ns;
+ rd.epoch_cyc = cyc;
- cyc = read_sched_clock();
- ns = cd.epoch_ns +
- cyc_to_ns((cyc - cd.epoch_cyc) & sched_clock_mask,
- cd.mult, cd.shift);
-
- raw_local_irq_save(flags);
- raw_write_seqcount_begin(&cd.seq);
- cd.epoch_ns = ns;
- cd.epoch_cyc = cyc;
- raw_write_seqcount_end(&cd.seq);
- raw_local_irq_restore(flags);
+ update_clock_read_data(&rd);
}
static enum hrtimer_restart sched_clock_poll(struct hrtimer *hrt)
{
update_sched_clock();
hrtimer_forward_now(hrt, cd.wrap_kt);
+
return HRTIMER_RESTART;
}
-void __init sched_clock_register(u64 (*read)(void), int bits,
- unsigned long rate)
+void __init
+sched_clock_register(u64 (*read)(void), int bits, unsigned long rate)
{
u64 res, wrap, new_mask, new_epoch, cyc, ns;
u32 new_mult, new_shift;
- ktime_t new_wrap_kt;
unsigned long r;
char r_unit;
+ struct clock_read_data rd;
if (cd.rate > rate)
return;
WARN_ON(!irqs_disabled());
- /* calculate the mult/shift to convert counter ticks to ns. */
+ /* Calculate the mult/shift to convert counter ticks to ns. */
clocks_calc_mult_shift(&new_mult, &new_shift, rate, NSEC_PER_SEC, 3600);
new_mask = CLOCKSOURCE_MASK(bits);
+ cd.rate = rate;
+
+ /* Calculate how many nanosecs until we risk wrapping */
+ wrap = clocks_calc_max_nsecs(new_mult, new_shift, 0, new_mask, NULL);
+ cd.wrap_kt = ns_to_ktime(wrap);
- /* calculate how many ns until we wrap */
- wrap = clocks_calc_max_nsecs(new_mult, new_shift, 0, new_mask);
- new_wrap_kt = ns_to_ktime(wrap - (wrap >> 3));
+ rd = cd.read_data[0];
- /* update epoch for new counter and update epoch_ns from old counter*/
+ /* Update epoch for new counter and update 'epoch_ns' from old counter*/
new_epoch = read();
- cyc = read_sched_clock();
- ns = cd.epoch_ns + cyc_to_ns((cyc - cd.epoch_cyc) & sched_clock_mask,
- cd.mult, cd.shift);
+ cyc = cd.actual_read_sched_clock();
+ ns = rd.epoch_ns + cyc_to_ns((cyc - rd.epoch_cyc) & rd.sched_clock_mask, rd.mult, rd.shift);
+ cd.actual_read_sched_clock = read;
- raw_write_seqcount_begin(&cd.seq);
- read_sched_clock = read;
- sched_clock_mask = new_mask;
- cd.rate = rate;
- cd.wrap_kt = new_wrap_kt;
- cd.mult = new_mult;
- cd.shift = new_shift;
- cd.epoch_cyc = new_epoch;
- cd.epoch_ns = ns;
- raw_write_seqcount_end(&cd.seq);
+ rd.read_sched_clock = read;
+ rd.sched_clock_mask = new_mask;
+ rd.mult = new_mult;
+ rd.shift = new_shift;
+ rd.epoch_cyc = new_epoch;
+ rd.epoch_ns = ns;
+
+ update_clock_read_data(&rd);
r = rate;
if (r >= 4000000) {
r /= 1000000;
r_unit = 'M';
- } else if (r >= 1000) {
- r /= 1000;
- r_unit = 'k';
- } else
- r_unit = ' ';
-
- /* calculate the ns resolution of this counter */
+ } else {
+ if (r >= 1000) {
+ r /= 1000;
+ r_unit = 'k';
+ } else {
+ r_unit = ' ';
+ }
+ }
+
+ /* Calculate the ns resolution of this counter */
res = cyc_to_ns(1ULL, new_mult, new_shift);
pr_info("sched_clock: %u bits at %lu%cHz, resolution %lluns, wraps every %lluns\n",
bits, r, r_unit, res, wrap);
- /* Enable IRQ time accounting if we have a fast enough sched_clock */
+ /* Enable IRQ time accounting if we have a fast enough sched_clock() */
if (irqtime > 0 || (irqtime == -1 && rate >= 1000000))
enable_sched_clock_irqtime();
@@ -172,10 +234,10 @@ void __init sched_clock_register(u64 (*read)(void), int bits,
void __init sched_clock_postinit(void)
{
/*
- * If no sched_clock function has been provided at that point,
+ * If no sched_clock() function has been provided at that point,
* make it the final one one.
*/
- if (read_sched_clock == jiffy_sched_clock_read)
+ if (cd.actual_read_sched_clock == jiffy_sched_clock_read)
sched_clock_register(jiffy_sched_clock_read, BITS_PER_LONG, HZ);
update_sched_clock();
@@ -189,29 +251,53 @@ void __init sched_clock_postinit(void)
hrtimer_start(&sched_clock_timer, cd.wrap_kt, HRTIMER_MODE_REL);
}
+/*
+ * Clock read function for use when the clock is suspended.
+ *
+ * This function makes it appear to sched_clock() as if the clock
+ * stopped counting at its last update.
+ *
+ * This function must only be called from the critical
+ * section in sched_clock(). It relies on the read_seqcount_retry()
+ * at the end of the critical section to be sure we observe the
+ * correct copy of 'epoch_cyc'.
+ */
+static u64 notrace suspended_sched_clock_read(void)
+{
+ unsigned long seq = raw_read_seqcount(&cd.seq);
+
+ return cd.read_data[seq & 1].epoch_cyc;
+}
+
static int sched_clock_suspend(void)
{
+ struct clock_read_data *rd = &cd.read_data[0];
+
update_sched_clock();
hrtimer_cancel(&sched_clock_timer);
- cd.suspended = true;
+ rd->read_sched_clock = suspended_sched_clock_read;
+
return 0;
}
static void sched_clock_resume(void)
{
- cd.epoch_cyc = read_sched_clock();
+ struct clock_read_data *rd = &cd.read_data[0];
+
+ rd->epoch_cyc = cd.actual_read_sched_clock();
hrtimer_start(&sched_clock_timer, cd.wrap_kt, HRTIMER_MODE_REL);
- cd.suspended = false;
+ rd->read_sched_clock = cd.actual_read_sched_clock;
}
static struct syscore_ops sched_clock_ops = {
- .suspend = sched_clock_suspend,
- .resume = sched_clock_resume,
+ .suspend = sched_clock_suspend,
+ .resume = sched_clock_resume,
};
static int __init sched_clock_syscore_init(void)
{
register_syscore_ops(&sched_clock_ops);
+
return 0;
}
device_initcall(sched_clock_syscore_init);
diff --git a/kernel/time/tick-broadcast-hrtimer.c b/kernel/time/tick-broadcast-hrtimer.c
index 6aac4beedbbe..53d7184da0be 100644
--- a/kernel/time/tick-broadcast-hrtimer.c
+++ b/kernel/time/tick-broadcast-hrtimer.c
@@ -18,29 +18,23 @@
static struct hrtimer bctimer;
-static void bc_set_mode(enum clock_event_mode mode,
- struct clock_event_device *bc)
+static int bc_shutdown(struct clock_event_device *evt)
{
- switch (mode) {
- case CLOCK_EVT_MODE_SHUTDOWN:
- /*
- * Note, we cannot cancel the timer here as we might
- * run into the following live lock scenario:
- *
- * cpu 0 cpu1
- * lock(broadcast_lock);
- * hrtimer_interrupt()
- * bc_handler()
- * tick_handle_oneshot_broadcast();
- * lock(broadcast_lock);
- * hrtimer_cancel()
- * wait_for_callback()
- */
- hrtimer_try_to_cancel(&bctimer);
- break;
- default:
- break;
- }
+ /*
+ * Note, we cannot cancel the timer here as we might
+ * run into the following live lock scenario:
+ *
+ * cpu 0 cpu1
+ * lock(broadcast_lock);
+ * hrtimer_interrupt()
+ * bc_handler()
+ * tick_handle_oneshot_broadcast();
+ * lock(broadcast_lock);
+ * hrtimer_cancel()
+ * wait_for_callback()
+ */
+ hrtimer_try_to_cancel(&bctimer);
+ return 0;
}
/*
@@ -66,9 +60,11 @@ static int bc_set_next(ktime_t expires, struct clock_event_device *bc)
* hrtimer_{start/cancel} functions call into tracing,
* calls to these functions must be bound within RCU_NONIDLE.
*/
- RCU_NONIDLE(bc_moved = (hrtimer_try_to_cancel(&bctimer) >= 0) ?
- !hrtimer_start(&bctimer, expires, HRTIMER_MODE_ABS_PINNED) :
- 0);
+ RCU_NONIDLE({
+ bc_moved = hrtimer_try_to_cancel(&bctimer) >= 0;
+ if (bc_moved)
+ hrtimer_start(&bctimer, expires,
+ HRTIMER_MODE_ABS_PINNED);});
if (bc_moved) {
/* Bind the "device" to the cpu */
bc->bound_on = smp_processor_id();
@@ -79,7 +75,7 @@ static int bc_set_next(ktime_t expires, struct clock_event_device *bc)
}
static struct clock_event_device ce_broadcast_hrtimer = {
- .set_mode = bc_set_mode,
+ .set_state_shutdown = bc_shutdown,
.set_next_ktime = bc_set_next,
.features = CLOCK_EVT_FEAT_ONESHOT |
CLOCK_EVT_FEAT_KTIME |
@@ -99,10 +95,11 @@ static enum hrtimer_restart bc_handler(struct hrtimer *t)
{
ce_broadcast_hrtimer.event_handler(&ce_broadcast_hrtimer);
- if (ce_broadcast_hrtimer.next_event.tv64 == KTIME_MAX)
- return HRTIMER_NORESTART;
+ if (clockevent_state_oneshot(&ce_broadcast_hrtimer))
+ if (ce_broadcast_hrtimer.next_event.tv64 != KTIME_MAX)
+ return HRTIMER_RESTART;
- return HRTIMER_RESTART;
+ return HRTIMER_NORESTART;
}
void tick_setup_hrtimer_broadcast(void)
diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c
index 066f0ec05e48..f6aae7977824 100644
--- a/kernel/time/tick-broadcast.c
+++ b/kernel/time/tick-broadcast.c
@@ -33,12 +33,14 @@ static cpumask_var_t tick_broadcast_mask;
static cpumask_var_t tick_broadcast_on;
static cpumask_var_t tmpmask;
static DEFINE_RAW_SPINLOCK(tick_broadcast_lock);
-static int tick_broadcast_force;
+static int tick_broadcast_forced;
#ifdef CONFIG_TICK_ONESHOT
static void tick_broadcast_clear_oneshot(int cpu);
+static void tick_resume_broadcast_oneshot(struct clock_event_device *bc);
#else
static inline void tick_broadcast_clear_oneshot(int cpu) { }
+static inline void tick_resume_broadcast_oneshot(struct clock_event_device *bc) { }
#endif
/*
@@ -157,7 +159,7 @@ int tick_device_uses_broadcast(struct clock_event_device *dev, int cpu)
{
struct clock_event_device *bc = tick_broadcast_device.evtdev;
unsigned long flags;
- int ret;
+ int ret = 0;
raw_spin_lock_irqsave(&tick_broadcast_lock, flags);
@@ -219,13 +221,14 @@ int tick_device_uses_broadcast(struct clock_event_device *dev, int cpu)
* If we kept the cpu in the broadcast mask,
* tell the caller to leave the per cpu device
* in shutdown state. The periodic interrupt
- * is delivered by the broadcast device.
+ * is delivered by the broadcast device, if
+ * the broadcast device exists and is not
+ * hrtimer based.
*/
- ret = cpumask_test_cpu(cpu, tick_broadcast_mask);
+ if (bc && !(bc->features & CLOCK_EVT_FEAT_HRTIMER))
+ ret = cpumask_test_cpu(cpu, tick_broadcast_mask);
break;
default:
- /* Nothing to do */
- ret = 0;
break;
}
}
@@ -253,18 +256,32 @@ int tick_receive_broadcast(void)
/*
* Broadcast the event to the cpus, which are set in the mask (mangled).
*/
-static void tick_do_broadcast(struct cpumask *mask)
+static bool tick_do_broadcast(struct cpumask *mask)
{
int cpu = smp_processor_id();
struct tick_device *td;
+ bool local = false;
/*
* Check, if the current cpu is in the mask
*/
if (cpumask_test_cpu(cpu, mask)) {
+ struct clock_event_device *bc = tick_broadcast_device.evtdev;
+
cpumask_clear_cpu(cpu, mask);
- td = &per_cpu(tick_cpu_device, cpu);
- td->evtdev->event_handler(td->evtdev);
+ /*
+ * We only run the local handler, if the broadcast
+ * device is not hrtimer based. Otherwise we run into
+ * a hrtimer recursion.
+ *
+ * local timer_interrupt()
+ * local_handler()
+ * expire_hrtimers()
+ * bc_handler()
+ * local_handler()
+ * expire_hrtimers()
+ */
+ local = !(bc->features & CLOCK_EVT_FEAT_HRTIMER);
}
if (!cpumask_empty(mask)) {
@@ -277,16 +294,17 @@ static void tick_do_broadcast(struct cpumask *mask)
td = &per_cpu(tick_cpu_device, cpumask_first(mask));
td->evtdev->broadcast(mask);
}
+ return local;
}
/*
* Periodic broadcast:
* - invoke the broadcast handlers
*/
-static void tick_do_periodic_broadcast(void)
+static bool tick_do_periodic_broadcast(void)
{
cpumask_and(tmpmask, cpu_online_mask, tick_broadcast_mask);
- tick_do_broadcast(tmpmask);
+ return tick_do_broadcast(tmpmask);
}
/*
@@ -294,79 +312,91 @@ static void tick_do_periodic_broadcast(void)
*/
static void tick_handle_periodic_broadcast(struct clock_event_device *dev)
{
- ktime_t next;
+ struct tick_device *td = this_cpu_ptr(&tick_cpu_device);
+ bool bc_local;
raw_spin_lock(&tick_broadcast_lock);
- tick_do_periodic_broadcast();
+ /* Handle spurious interrupts gracefully */
+ if (clockevent_state_shutdown(tick_broadcast_device.evtdev)) {
+ raw_spin_unlock(&tick_broadcast_lock);
+ return;
+ }
- /*
- * The device is in periodic mode. No reprogramming necessary:
- */
- if (dev->mode == CLOCK_EVT_MODE_PERIODIC)
- goto unlock;
+ bc_local = tick_do_periodic_broadcast();
- /*
- * Setup the next period for devices, which do not have
- * periodic mode. We read dev->next_event first and add to it
- * when the event already expired. clockevents_program_event()
- * sets dev->next_event only when the event is really
- * programmed to the device.
- */
- for (next = dev->next_event; ;) {
- next = ktime_add(next, tick_period);
+ if (clockevent_state_oneshot(dev)) {
+ ktime_t next = ktime_add(dev->next_event, tick_period);
- if (!clockevents_program_event(dev, next, false))
- goto unlock;
- tick_do_periodic_broadcast();
+ clockevents_program_event(dev, next, true);
}
-unlock:
raw_spin_unlock(&tick_broadcast_lock);
+
+ /*
+ * We run the handler of the local cpu after dropping
+ * tick_broadcast_lock because the handler might deadlock when
+ * trying to switch to oneshot mode.
+ */
+ if (bc_local)
+ td->evtdev->event_handler(td->evtdev);
}
-/*
- * Powerstate information: The system enters/leaves a state, where
- * affected devices might stop
+/**
+ * tick_broadcast_control - Enable/disable or force broadcast mode
+ * @mode: The selected broadcast mode
+ *
+ * Called when the system enters a state where affected tick devices
+ * might stop. Note: TICK_BROADCAST_FORCE cannot be undone.
+ *
+ * Called with interrupts disabled, so clockevents_lock is not
+ * required here because the local clock event device cannot go away
+ * under us.
*/
-static void tick_do_broadcast_on_off(unsigned long *reason)
+void tick_broadcast_control(enum tick_broadcast_mode mode)
{
struct clock_event_device *bc, *dev;
struct tick_device *td;
- unsigned long flags;
int cpu, bc_stopped;
- raw_spin_lock_irqsave(&tick_broadcast_lock, flags);
-
- cpu = smp_processor_id();
- td = &per_cpu(tick_cpu_device, cpu);
+ td = this_cpu_ptr(&tick_cpu_device);
dev = td->evtdev;
- bc = tick_broadcast_device.evtdev;
/*
* Is the device not affected by the powerstate ?
*/
if (!dev || !(dev->features & CLOCK_EVT_FEAT_C3STOP))
- goto out;
+ return;
if (!tick_device_is_functional(dev))
- goto out;
+ return;
+ raw_spin_lock(&tick_broadcast_lock);
+ cpu = smp_processor_id();
+ bc = tick_broadcast_device.evtdev;
bc_stopped = cpumask_empty(tick_broadcast_mask);
- switch (*reason) {
- case CLOCK_EVT_NOTIFY_BROADCAST_ON:
- case CLOCK_EVT_NOTIFY_BROADCAST_FORCE:
+ switch (mode) {
+ case TICK_BROADCAST_FORCE:
+ tick_broadcast_forced = 1;
+ case TICK_BROADCAST_ON:
cpumask_set_cpu(cpu, tick_broadcast_on);
if (!cpumask_test_and_set_cpu(cpu, tick_broadcast_mask)) {
- if (tick_broadcast_device.mode ==
- TICKDEV_MODE_PERIODIC)
+ /*
+ * Only shutdown the cpu local device, if:
+ *
+ * - the broadcast device exists
+ * - the broadcast device is not a hrtimer based one
+ * - the broadcast device is in periodic mode to
+ * avoid a hickup during switch to oneshot mode
+ */
+ if (bc && !(bc->features & CLOCK_EVT_FEAT_HRTIMER) &&
+ tick_broadcast_device.mode == TICKDEV_MODE_PERIODIC)
clockevents_shutdown(dev);
}
- if (*reason == CLOCK_EVT_NOTIFY_BROADCAST_FORCE)
- tick_broadcast_force = 1;
break;
- case CLOCK_EVT_NOTIFY_BROADCAST_OFF:
- if (tick_broadcast_force)
+
+ case TICK_BROADCAST_OFF:
+ if (tick_broadcast_forced)
break;
cpumask_clear_cpu(cpu, tick_broadcast_on);
if (!tick_device_is_functional(dev))
@@ -379,31 +409,20 @@ static void tick_do_broadcast_on_off(unsigned long *reason)
break;
}
- if (cpumask_empty(tick_broadcast_mask)) {
- if (!bc_stopped)
- clockevents_shutdown(bc);
- } else if (bc_stopped) {
- if (tick_broadcast_device.mode == TICKDEV_MODE_PERIODIC)
- tick_broadcast_start_periodic(bc);
- else
- tick_broadcast_setup_oneshot(bc);
+ if (bc) {
+ if (cpumask_empty(tick_broadcast_mask)) {
+ if (!bc_stopped)
+ clockevents_shutdown(bc);
+ } else if (bc_stopped) {
+ if (tick_broadcast_device.mode == TICKDEV_MODE_PERIODIC)
+ tick_broadcast_start_periodic(bc);
+ else
+ tick_broadcast_setup_oneshot(bc);
+ }
}
-out:
- raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags);
-}
-
-/*
- * Powerstate information: The system enters/leaves a state, where
- * affected devices might stop.
- */
-void tick_broadcast_on_off(unsigned long reason, int *oncpu)
-{
- if (!cpumask_test_cpu(*oncpu, cpu_online_mask))
- printk(KERN_ERR "tick-broadcast: ignoring broadcast for "
- "offline CPU #%d\n", *oncpu);
- else
- tick_do_broadcast_on_off(&reason);
+ raw_spin_unlock(&tick_broadcast_lock);
}
+EXPORT_SYMBOL_GPL(tick_broadcast_control);
/*
* Set the periodic handler depending on broadcast on/off
@@ -416,14 +435,14 @@ void tick_set_periodic_handler(struct clock_event_device *dev, int broadcast)
dev->event_handler = tick_handle_periodic_broadcast;
}
+#ifdef CONFIG_HOTPLUG_CPU
/*
* Remove a CPU from broadcasting
*/
-void tick_shutdown_broadcast(unsigned int *cpup)
+void tick_shutdown_broadcast(unsigned int cpu)
{
struct clock_event_device *bc;
unsigned long flags;
- unsigned int cpu = *cpup;
raw_spin_lock_irqsave(&tick_broadcast_lock, flags);
@@ -438,6 +457,7 @@ void tick_shutdown_broadcast(unsigned int *cpup)
raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags);
}
+#endif
void tick_suspend_broadcast(void)
{
@@ -453,38 +473,48 @@ void tick_suspend_broadcast(void)
raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags);
}
-int tick_resume_broadcast(void)
+/*
+ * This is called from tick_resume_local() on a resuming CPU. That's
+ * called from the core resume function, tick_unfreeze() and the magic XEN
+ * resume hackery.
+ *
+ * In none of these cases the broadcast device mode can change and the
+ * bit of the resuming CPU in the broadcast mask is safe as well.
+ */
+bool tick_resume_check_broadcast(void)
+{
+ if (tick_broadcast_device.mode == TICKDEV_MODE_ONESHOT)
+ return false;
+ else
+ return cpumask_test_cpu(smp_processor_id(), tick_broadcast_mask);
+}
+
+void tick_resume_broadcast(void)
{
struct clock_event_device *bc;
unsigned long flags;
- int broadcast = 0;
raw_spin_lock_irqsave(&tick_broadcast_lock, flags);
bc = tick_broadcast_device.evtdev;
if (bc) {
- clockevents_set_mode(bc, CLOCK_EVT_MODE_RESUME);
+ clockevents_tick_resume(bc);
switch (tick_broadcast_device.mode) {
case TICKDEV_MODE_PERIODIC:
if (!cpumask_empty(tick_broadcast_mask))
tick_broadcast_start_periodic(bc);
- broadcast = cpumask_test_cpu(smp_processor_id(),
- tick_broadcast_mask);
break;
case TICKDEV_MODE_ONESHOT:
if (!cpumask_empty(tick_broadcast_mask))
- broadcast = tick_resume_broadcast_oneshot(bc);
+ tick_resume_broadcast_oneshot(bc);
break;
}
}
raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags);
-
- return broadcast;
}
-
#ifdef CONFIG_TICK_ONESHOT
static cpumask_var_t tick_broadcast_oneshot_mask;
@@ -527,24 +557,19 @@ static void tick_broadcast_set_affinity(struct clock_event_device *bc,
irq_set_affinity(bc->irq, bc->cpumask);
}
-static int tick_broadcast_set_event(struct clock_event_device *bc, int cpu,
- ktime_t expires, int force)
+static void tick_broadcast_set_event(struct clock_event_device *bc, int cpu,
+ ktime_t expires)
{
- int ret;
-
- if (bc->mode != CLOCK_EVT_MODE_ONESHOT)
- clockevents_set_mode(bc, CLOCK_EVT_MODE_ONESHOT);
+ if (!clockevent_state_oneshot(bc))
+ clockevents_switch_state(bc, CLOCK_EVT_STATE_ONESHOT);
- ret = clockevents_program_event(bc, expires, force);
- if (!ret)
- tick_broadcast_set_affinity(bc, cpumask_of(cpu));
- return ret;
+ clockevents_program_event(bc, expires, 1);
+ tick_broadcast_set_affinity(bc, cpumask_of(cpu));
}
-int tick_resume_broadcast_oneshot(struct clock_event_device *bc)
+static void tick_resume_broadcast_oneshot(struct clock_event_device *bc)
{
- clockevents_set_mode(bc, CLOCK_EVT_MODE_ONESHOT);
- return 0;
+ clockevents_switch_state(bc, CLOCK_EVT_STATE_ONESHOT);
}
/*
@@ -562,8 +587,8 @@ void tick_check_oneshot_broadcast_this_cpu(void)
* switched over, leave the device alone.
*/
if (td->mode == TICKDEV_MODE_ONESHOT) {
- clockevents_set_mode(td->evtdev,
- CLOCK_EVT_MODE_ONESHOT);
+ clockevents_switch_state(td->evtdev,
+ CLOCK_EVT_STATE_ONESHOT);
}
}
}
@@ -576,9 +601,9 @@ static void tick_handle_oneshot_broadcast(struct clock_event_device *dev)
struct tick_device *td;
ktime_t now, next_event;
int cpu, next_cpu = 0;
+ bool bc_local;
raw_spin_lock(&tick_broadcast_lock);
-again:
dev->next_event.tv64 = KTIME_MAX;
next_event.tv64 = KTIME_MAX;
cpumask_clear(tmpmask);
@@ -620,7 +645,7 @@ again:
/*
* Wakeup the cpus which have an expired event.
*/
- tick_do_broadcast(tmpmask);
+ bc_local = tick_do_broadcast(tmpmask);
/*
* Two reasons for reprogram:
@@ -632,15 +657,15 @@ again:
* - There are pending events on sleeping CPUs which were not
* in the event mask
*/
- if (next_event.tv64 != KTIME_MAX) {
- /*
- * Rearm the broadcast device. If event expired,
- * repeat the above
- */
- if (tick_broadcast_set_event(dev, next_cpu, next_event, 0))
- goto again;
- }
+ if (next_event.tv64 != KTIME_MAX)
+ tick_broadcast_set_event(dev, next_cpu, next_event);
+
raw_spin_unlock(&tick_broadcast_lock);
+
+ if (bc_local) {
+ td = this_cpu_ptr(&tick_cpu_device);
+ td->evtdev->event_handler(td->evtdev);
+ }
}
static int broadcast_needs_cpu(struct clock_event_device *bc, int cpu)
@@ -666,82 +691,88 @@ static void broadcast_shutdown_local(struct clock_event_device *bc,
if (dev->next_event.tv64 < bc->next_event.tv64)
return;
}
- clockevents_set_mode(dev, CLOCK_EVT_MODE_SHUTDOWN);
-}
-
-static void broadcast_move_bc(int deadcpu)
-{
- struct clock_event_device *bc = tick_broadcast_device.evtdev;
-
- if (!bc || !broadcast_needs_cpu(bc, deadcpu))
- return;
- /* This moves the broadcast assignment to this cpu */
- clockevents_program_event(bc, bc->next_event, 1);
+ clockevents_switch_state(dev, CLOCK_EVT_STATE_SHUTDOWN);
}
-/*
- * Powerstate information: The system enters/leaves a state, where
- * affected devices might stop
- * Returns 0 on success, -EBUSY if the cpu is used to broadcast wakeups.
- */
-int tick_broadcast_oneshot_control(unsigned long reason)
+int __tick_broadcast_oneshot_control(enum tick_broadcast_state state)
{
struct clock_event_device *bc, *dev;
- struct tick_device *td;
- unsigned long flags;
- ktime_t now;
int cpu, ret = 0;
+ ktime_t now;
/*
- * Periodic mode does not care about the enter/exit of power
- * states
+ * If there is no broadcast device, tell the caller not to go
+ * into deep idle.
*/
- if (tick_broadcast_device.mode == TICKDEV_MODE_PERIODIC)
- return 0;
+ if (!tick_broadcast_device.evtdev)
+ return -EBUSY;
- /*
- * We are called with preemtion disabled from the depth of the
- * idle code, so we can't be moved away.
- */
+ dev = this_cpu_ptr(&tick_cpu_device)->evtdev;
+
+ raw_spin_lock(&tick_broadcast_lock);
+ bc = tick_broadcast_device.evtdev;
cpu = smp_processor_id();
- td = &per_cpu(tick_cpu_device, cpu);
- dev = td->evtdev;
- if (!(dev->features & CLOCK_EVT_FEAT_C3STOP))
- return 0;
+ if (state == TICK_BROADCAST_ENTER) {
+ /*
+ * If the current CPU owns the hrtimer broadcast
+ * mechanism, it cannot go deep idle and we do not add
+ * the CPU to the broadcast mask. We don't have to go
+ * through the EXIT path as the local timer is not
+ * shutdown.
+ */
+ ret = broadcast_needs_cpu(bc, cpu);
+ if (ret)
+ goto out;
- bc = tick_broadcast_device.evtdev;
+ /*
+ * If the broadcast device is in periodic mode, we
+ * return.
+ */
+ if (tick_broadcast_device.mode == TICKDEV_MODE_PERIODIC) {
+ /* If it is a hrtimer based broadcast, return busy */
+ if (bc->features & CLOCK_EVT_FEAT_HRTIMER)
+ ret = -EBUSY;
+ goto out;
+ }
- raw_spin_lock_irqsave(&tick_broadcast_lock, flags);
- if (reason == CLOCK_EVT_NOTIFY_BROADCAST_ENTER) {
if (!cpumask_test_and_set_cpu(cpu, tick_broadcast_oneshot_mask)) {
WARN_ON_ONCE(cpumask_test_cpu(cpu, tick_broadcast_pending_mask));
+
+ /* Conditionally shut down the local timer. */
broadcast_shutdown_local(bc, dev);
+
/*
* We only reprogram the broadcast timer if we
* did not mark ourself in the force mask and
* if the cpu local event is earlier than the
* broadcast event. If the current CPU is in
* the force mask, then we are going to be
- * woken by the IPI right away.
+ * woken by the IPI right away; we return
+ * busy, so the CPU does not try to go deep
+ * idle.
*/
- if (!cpumask_test_cpu(cpu, tick_broadcast_force_mask) &&
- dev->next_event.tv64 < bc->next_event.tv64)
- tick_broadcast_set_event(bc, cpu, dev->next_event, 1);
+ if (cpumask_test_cpu(cpu, tick_broadcast_force_mask)) {
+ ret = -EBUSY;
+ } else if (dev->next_event.tv64 < bc->next_event.tv64) {
+ tick_broadcast_set_event(bc, cpu, dev->next_event);
+ /*
+ * In case of hrtimer broadcasts the
+ * programming might have moved the
+ * timer to this cpu. If yes, remove
+ * us from the broadcast mask and
+ * return busy.
+ */
+ ret = broadcast_needs_cpu(bc, cpu);
+ if (ret) {
+ cpumask_clear_cpu(cpu,
+ tick_broadcast_oneshot_mask);
+ }
+ }
}
- /*
- * If the current CPU owns the hrtimer broadcast
- * mechanism, it cannot go deep idle and we remove the
- * CPU from the broadcast mask. We don't have to go
- * through the EXIT path as the local timer is not
- * shutdown.
- */
- ret = broadcast_needs_cpu(bc, cpu);
- if (ret)
- cpumask_clear_cpu(cpu, tick_broadcast_oneshot_mask);
} else {
if (cpumask_test_and_clear_cpu(cpu, tick_broadcast_oneshot_mask)) {
- clockevents_set_mode(dev, CLOCK_EVT_MODE_ONESHOT);
+ clockevents_switch_state(dev, CLOCK_EVT_STATE_ONESHOT);
/*
* The cpu which was handling the broadcast
* timer marked this cpu in the broadcast
@@ -805,7 +836,7 @@ int tick_broadcast_oneshot_control(unsigned long reason)
}
}
out:
- raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags);
+ raw_spin_unlock(&tick_broadcast_lock);
return ret;
}
@@ -842,7 +873,7 @@ void tick_broadcast_setup_oneshot(struct clock_event_device *bc)
/* Set it up only once ! */
if (bc->event_handler != tick_handle_oneshot_broadcast) {
- int was_periodic = bc->mode == CLOCK_EVT_MODE_PERIODIC;
+ int was_periodic = clockevent_state_periodic(bc);
bc->event_handler = tick_handle_oneshot_broadcast;
@@ -858,10 +889,10 @@ void tick_broadcast_setup_oneshot(struct clock_event_device *bc)
tick_broadcast_oneshot_mask, tmpmask);
if (was_periodic && !cpumask_empty(tmpmask)) {
- clockevents_set_mode(bc, CLOCK_EVT_MODE_ONESHOT);
+ clockevents_switch_state(bc, CLOCK_EVT_STATE_ONESHOT);
tick_broadcast_init_next_event(tmpmask,
tick_next_period);
- tick_broadcast_set_event(bc, cpu, tick_next_period, 1);
+ tick_broadcast_set_event(bc, cpu, tick_next_period);
} else
bc->next_event.tv64 = KTIME_MAX;
} else {
@@ -894,14 +925,28 @@ void tick_broadcast_switch_to_oneshot(void)
raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags);
}
+#ifdef CONFIG_HOTPLUG_CPU
+void hotplug_cpu__broadcast_tick_pull(int deadcpu)
+{
+ struct clock_event_device *bc;
+ unsigned long flags;
+
+ raw_spin_lock_irqsave(&tick_broadcast_lock, flags);
+ bc = tick_broadcast_device.evtdev;
+
+ if (bc && broadcast_needs_cpu(bc, deadcpu)) {
+ /* This moves the broadcast assignment to this CPU: */
+ clockevents_program_event(bc, bc->next_event, 1);
+ }
+ raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags);
+}
/*
* Remove a dead CPU from broadcasting
*/
-void tick_shutdown_broadcast_oneshot(unsigned int *cpup)
+void tick_shutdown_broadcast_oneshot(unsigned int cpu)
{
unsigned long flags;
- unsigned int cpu = *cpup;
raw_spin_lock_irqsave(&tick_broadcast_lock, flags);
@@ -913,10 +958,9 @@ void tick_shutdown_broadcast_oneshot(unsigned int *cpup)
cpumask_clear_cpu(cpu, tick_broadcast_pending_mask);
cpumask_clear_cpu(cpu, tick_broadcast_force_mask);
- broadcast_move_bc(cpu);
-
raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags);
}
+#endif
/*
* Check, whether the broadcast device is in one shot mode
@@ -936,6 +980,16 @@ bool tick_broadcast_oneshot_available(void)
return bc ? bc->features & CLOCK_EVT_FEAT_ONESHOT : false;
}
+#else
+int __tick_broadcast_oneshot_control(enum tick_broadcast_state state)
+{
+ struct clock_event_device *bc = tick_broadcast_device.evtdev;
+
+ if (!bc || (bc->features & CLOCK_EVT_FEAT_HRTIMER))
+ return -EBUSY;
+
+ return 0;
+}
#endif
void __init tick_broadcast_init(void)
diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c
index f7c515595b42..d11c55b6ab7d 100644
--- a/kernel/time/tick-common.c
+++ b/kernel/time/tick-common.c
@@ -19,6 +19,7 @@
#include <linux/profile.h>
#include <linux/sched.h>
#include <linux/module.h>
+#include <trace/events/power.h>
#include <asm/irq_regs.h>
@@ -102,7 +103,17 @@ void tick_handle_periodic(struct clock_event_device *dev)
tick_periodic(cpu);
- if (dev->mode != CLOCK_EVT_MODE_ONESHOT)
+#if defined(CONFIG_HIGH_RES_TIMERS) || defined(CONFIG_NO_HZ_COMMON)
+ /*
+ * The cpu might have transitioned to HIGHRES or NOHZ mode via
+ * update_process_times() -> run_local_timers() ->
+ * hrtimer_run_queues().
+ */
+ if (dev->event_handler != tick_handle_periodic)
+ return;
+#endif
+
+ if (!clockevent_state_oneshot(dev))
return;
for (;;) {
/*
@@ -140,7 +151,7 @@ void tick_setup_periodic(struct clock_event_device *dev, int broadcast)
if ((dev->features & CLOCK_EVT_FEAT_PERIODIC) &&
!tick_broadcast_oneshot_active()) {
- clockevents_set_mode(dev, CLOCK_EVT_MODE_PERIODIC);
+ clockevents_switch_state(dev, CLOCK_EVT_STATE_PERIODIC);
} else {
unsigned long seq;
ktime_t next;
@@ -150,7 +161,7 @@ void tick_setup_periodic(struct clock_event_device *dev, int broadcast)
next = tick_next_period;
} while (read_seqretry(&jiffies_lock, seq));
- clockevents_set_mode(dev, CLOCK_EVT_MODE_ONESHOT);
+ clockevents_switch_state(dev, CLOCK_EVT_STATE_ONESHOT);
for (;;) {
if (!clockevents_program_event(dev, next, false))
@@ -293,9 +304,6 @@ void tick_check_new_device(struct clock_event_device *newdev)
int cpu;
cpu = smp_processor_id();
- if (!cpumask_test_cpu(cpu, newdev->cpumask))
- goto out_bc;
-
td = &per_cpu(tick_cpu_device, cpu);
curdev = td->evtdev;
@@ -332,14 +340,38 @@ out_bc:
tick_install_broadcast_device(newdev);
}
+/**
+ * tick_broadcast_oneshot_control - Enter/exit broadcast oneshot mode
+ * @state: The target state (enter/exit)
+ *
+ * The system enters/leaves a state, where affected devices might stop
+ * Returns 0 on success, -EBUSY if the cpu is used to broadcast wakeups.
+ *
+ * Called with interrupts disabled, so clockevents_lock is not
+ * required here because the local clock event device cannot go away
+ * under us.
+ */
+int tick_broadcast_oneshot_control(enum tick_broadcast_state state)
+{
+ struct tick_device *td = this_cpu_ptr(&tick_cpu_device);
+
+ if (!(td->evtdev->features & CLOCK_EVT_FEAT_C3STOP))
+ return 0;
+
+ return __tick_broadcast_oneshot_control(state);
+}
+EXPORT_SYMBOL_GPL(tick_broadcast_oneshot_control);
+
+#ifdef CONFIG_HOTPLUG_CPU
/*
* Transfer the do_timer job away from a dying cpu.
*
- * Called with interrupts disabled.
+ * Called with interrupts disabled. Not locking required. If
+ * tick_do_timer_cpu is owned by this cpu, nothing can change it.
*/
-void tick_handover_do_timer(int *cpup)
+void tick_handover_do_timer(void)
{
- if (*cpup == tick_do_timer_cpu) {
+ if (tick_do_timer_cpu == smp_processor_id()) {
int cpu = cpumask_first(cpu_online_mask);
tick_do_timer_cpu = (cpu < nr_cpu_ids) ? cpu :
@@ -354,9 +386,9 @@ void tick_handover_do_timer(int *cpup)
* access the hardware device itself.
* We just set the mode and remove it from the lists.
*/
-void tick_shutdown(unsigned int *cpup)
+void tick_shutdown(unsigned int cpu)
{
- struct tick_device *td = &per_cpu(tick_cpu_device, *cpup);
+ struct tick_device *td = &per_cpu(tick_cpu_device, cpu);
struct clock_event_device *dev = td->evtdev;
td->mode = TICKDEV_MODE_PERIODIC;
@@ -365,27 +397,42 @@ void tick_shutdown(unsigned int *cpup)
* Prevent that the clock events layer tries to call
* the set mode function!
*/
+ clockevent_set_state(dev, CLOCK_EVT_STATE_DETACHED);
dev->mode = CLOCK_EVT_MODE_UNUSED;
clockevents_exchange_device(dev, NULL);
dev->event_handler = clockevents_handle_noop;
td->evtdev = NULL;
}
}
+#endif
-void tick_suspend(void)
+/**
+ * tick_suspend_local - Suspend the local tick device
+ *
+ * Called from the local cpu for freeze with interrupts disabled.
+ *
+ * No locks required. Nothing can change the per cpu device.
+ */
+void tick_suspend_local(void)
{
struct tick_device *td = this_cpu_ptr(&tick_cpu_device);
clockevents_shutdown(td->evtdev);
}
-void tick_resume(void)
+/**
+ * tick_resume_local - Resume the local tick device
+ *
+ * Called from the local CPU for unfreeze or XEN resume magic.
+ *
+ * No locks required. Nothing can change the per cpu device.
+ */
+void tick_resume_local(void)
{
struct tick_device *td = this_cpu_ptr(&tick_cpu_device);
- int broadcast = tick_resume_broadcast();
-
- clockevents_set_mode(td->evtdev, CLOCK_EVT_MODE_RESUME);
+ bool broadcast = tick_resume_check_broadcast();
+ clockevents_tick_resume(td->evtdev);
if (!broadcast) {
if (td->mode == TICKDEV_MODE_PERIODIC)
tick_setup_periodic(td->evtdev, 0);
@@ -394,6 +441,36 @@ void tick_resume(void)
}
}
+/**
+ * tick_suspend - Suspend the tick and the broadcast device
+ *
+ * Called from syscore_suspend() via timekeeping_suspend with only one
+ * CPU online and interrupts disabled or from tick_unfreeze() under
+ * tick_freeze_lock.
+ *
+ * No locks required. Nothing can change the per cpu device.
+ */
+void tick_suspend(void)
+{
+ tick_suspend_local();
+ tick_suspend_broadcast();
+}
+
+/**
+ * tick_resume - Resume the tick and the broadcast device
+ *
+ * Called from syscore_resume() via timekeeping_resume with only one
+ * CPU online and interrupts disabled.
+ *
+ * No locks required. Nothing can change the per cpu device.
+ */
+void tick_resume(void)
+{
+ tick_resume_broadcast();
+ tick_resume_local();
+}
+
+#ifdef CONFIG_SUSPEND
static DEFINE_RAW_SPINLOCK(tick_freeze_lock);
static unsigned int tick_freeze_depth;
@@ -412,10 +489,11 @@ void tick_freeze(void)
tick_freeze_depth++;
if (tick_freeze_depth == num_online_cpus()) {
+ trace_suspend_resume(TPS("timekeeping_freeze"),
+ smp_processor_id(), true);
timekeeping_suspend();
} else {
- tick_suspend();
- tick_suspend_broadcast();
+ tick_suspend_local();
}
raw_spin_unlock(&tick_freeze_lock);
@@ -434,15 +512,19 @@ void tick_unfreeze(void)
{
raw_spin_lock(&tick_freeze_lock);
- if (tick_freeze_depth == num_online_cpus())
+ if (tick_freeze_depth == num_online_cpus()) {
timekeeping_resume();
- else
- tick_resume();
+ trace_suspend_resume(TPS("timekeeping_freeze"),
+ smp_processor_id(), false);
+ } else {
+ tick_resume_local();
+ }
tick_freeze_depth--;
raw_spin_unlock(&tick_freeze_lock);
}
+#endif /* CONFIG_SUSPEND */
/**
* tick_init - initialize the tick control
diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h
index 366aeb4f2c66..966a5a6fdd0a 100644
--- a/kernel/time/tick-internal.h
+++ b/kernel/time/tick-internal.h
@@ -5,15 +5,12 @@
#include <linux/tick.h>
#include "timekeeping.h"
+#include "tick-sched.h"
-extern seqlock_t jiffies_lock;
+#ifdef CONFIG_GENERIC_CLOCKEVENTS
-#define CS_NAME_LEN 32
-
-#ifdef CONFIG_GENERIC_CLOCKEVENTS_BUILD
-
-#define TICK_DO_TIMER_NONE -1
-#define TICK_DO_TIMER_BOOT -2
+# define TICK_DO_TIMER_NONE -1
+# define TICK_DO_TIMER_BOOT -2
DECLARE_PER_CPU(struct tick_device, tick_cpu_device);
extern ktime_t tick_next_period;
@@ -23,21 +20,83 @@ extern int tick_do_timer_cpu __read_mostly;
extern void tick_setup_periodic(struct clock_event_device *dev, int broadcast);
extern void tick_handle_periodic(struct clock_event_device *dev);
extern void tick_check_new_device(struct clock_event_device *dev);
-extern void tick_handover_do_timer(int *cpup);
-extern void tick_shutdown(unsigned int *cpup);
+extern void tick_shutdown(unsigned int cpu);
extern void tick_suspend(void);
extern void tick_resume(void);
extern bool tick_check_replacement(struct clock_event_device *curdev,
struct clock_event_device *newdev);
extern void tick_install_replacement(struct clock_event_device *dev);
+extern int tick_is_oneshot_available(void);
+extern struct tick_device *tick_get_device(int cpu);
-extern void clockevents_shutdown(struct clock_event_device *dev);
+extern int clockevents_tick_resume(struct clock_event_device *dev);
+/* Check, if the device is functional or a dummy for broadcast */
+static inline int tick_device_is_functional(struct clock_event_device *dev)
+{
+ return !(dev->features & CLOCK_EVT_FEAT_DUMMY);
+}
+
+static inline enum clock_event_state clockevent_get_state(struct clock_event_device *dev)
+{
+ return dev->state_use_accessors;
+}
+static inline void clockevent_set_state(struct clock_event_device *dev,
+ enum clock_event_state state)
+{
+ dev->state_use_accessors = state;
+}
+
+extern void clockevents_shutdown(struct clock_event_device *dev);
+extern void clockevents_exchange_device(struct clock_event_device *old,
+ struct clock_event_device *new);
+extern void clockevents_switch_state(struct clock_event_device *dev,
+ enum clock_event_state state);
+extern int clockevents_program_event(struct clock_event_device *dev,
+ ktime_t expires, bool force);
+extern void clockevents_handle_noop(struct clock_event_device *dev);
+extern int __clockevents_update_freq(struct clock_event_device *dev, u32 freq);
extern ssize_t sysfs_get_uname(const char *buf, char *dst, size_t cnt);
-/*
- * NO_HZ / high resolution timer shared code
- */
+/* Broadcasting support */
+# ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST
+extern int tick_device_uses_broadcast(struct clock_event_device *dev, int cpu);
+extern void tick_install_broadcast_device(struct clock_event_device *dev);
+extern int tick_is_broadcast_device(struct clock_event_device *dev);
+extern void tick_shutdown_broadcast(unsigned int cpu);
+extern void tick_suspend_broadcast(void);
+extern void tick_resume_broadcast(void);
+extern bool tick_resume_check_broadcast(void);
+extern void tick_broadcast_init(void);
+extern void tick_set_periodic_handler(struct clock_event_device *dev, int broadcast);
+extern int tick_broadcast_update_freq(struct clock_event_device *dev, u32 freq);
+extern struct tick_device *tick_get_broadcast_device(void);
+extern struct cpumask *tick_get_broadcast_mask(void);
+# else /* !CONFIG_GENERIC_CLOCKEVENTS_BROADCAST: */
+static inline void tick_install_broadcast_device(struct clock_event_device *dev) { }
+static inline int tick_is_broadcast_device(struct clock_event_device *dev) { return 0; }
+static inline int tick_device_uses_broadcast(struct clock_event_device *dev, int cpu) { return 0; }
+static inline void tick_do_periodic_broadcast(struct clock_event_device *d) { }
+static inline void tick_shutdown_broadcast(unsigned int cpu) { }
+static inline void tick_suspend_broadcast(void) { }
+static inline void tick_resume_broadcast(void) { }
+static inline bool tick_resume_check_broadcast(void) { return false; }
+static inline void tick_broadcast_init(void) { }
+static inline int tick_broadcast_update_freq(struct clock_event_device *dev, u32 freq) { return -ENODEV; }
+
+/* Set the periodic handler in non broadcast mode */
+static inline void tick_set_periodic_handler(struct clock_event_device *dev, int broadcast)
+{
+ dev->event_handler = tick_handle_periodic;
+}
+# endif /* !CONFIG_GENERIC_CLOCKEVENTS_BROADCAST */
+
+#else /* !GENERIC_CLOCKEVENTS: */
+static inline void tick_suspend(void) { }
+static inline void tick_resume(void) { }
+#endif /* !GENERIC_CLOCKEVENTS */
+
+/* Oneshot related functions */
#ifdef CONFIG_TICK_ONESHOT
extern void tick_setup_oneshot(struct clock_event_device *newdev,
void (*handler)(struct clock_event_device *),
@@ -46,58 +105,42 @@ extern int tick_program_event(ktime_t expires, int force);
extern void tick_oneshot_notify(void);
extern int tick_switch_to_oneshot(void (*handler)(struct clock_event_device *));
extern void tick_resume_oneshot(void);
-# ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST
+static inline bool tick_oneshot_possible(void) { return true; }
+extern int tick_oneshot_mode_active(void);
+extern void tick_clock_notify(void);
+extern int tick_check_oneshot_change(int allow_nohz);
+extern int tick_init_highres(void);
+#else /* !CONFIG_TICK_ONESHOT: */
+static inline
+void tick_setup_oneshot(struct clock_event_device *newdev,
+ void (*handler)(struct clock_event_device *),
+ ktime_t nextevt) { BUG(); }
+static inline void tick_resume_oneshot(void) { BUG(); }
+static inline int tick_program_event(ktime_t expires, int force) { return 0; }
+static inline void tick_oneshot_notify(void) { }
+static inline bool tick_oneshot_possible(void) { return false; }
+static inline int tick_oneshot_mode_active(void) { return 0; }
+static inline void tick_clock_notify(void) { }
+static inline int tick_check_oneshot_change(int allow_nohz) { return 0; }
+#endif /* !CONFIG_TICK_ONESHOT */
+
+/* Functions related to oneshot broadcasting */
+#if defined(CONFIG_GENERIC_CLOCKEVENTS_BROADCAST) && defined(CONFIG_TICK_ONESHOT)
extern void tick_broadcast_setup_oneshot(struct clock_event_device *bc);
-extern int tick_broadcast_oneshot_control(unsigned long reason);
extern void tick_broadcast_switch_to_oneshot(void);
-extern void tick_shutdown_broadcast_oneshot(unsigned int *cpup);
-extern int tick_resume_broadcast_oneshot(struct clock_event_device *bc);
+extern void tick_shutdown_broadcast_oneshot(unsigned int cpu);
extern int tick_broadcast_oneshot_active(void);
extern void tick_check_oneshot_broadcast_this_cpu(void);
bool tick_broadcast_oneshot_available(void);
-# else /* BROADCAST */
-static inline void tick_broadcast_setup_oneshot(struct clock_event_device *bc)
-{
- BUG();
-}
-static inline int tick_broadcast_oneshot_control(unsigned long reason) { return 0; }
+extern struct cpumask *tick_get_broadcast_oneshot_mask(void);
+#else /* !(BROADCAST && ONESHOT): */
+static inline void tick_broadcast_setup_oneshot(struct clock_event_device *bc) { BUG(); }
static inline void tick_broadcast_switch_to_oneshot(void) { }
-static inline void tick_shutdown_broadcast_oneshot(unsigned int *cpup) { }
+static inline void tick_shutdown_broadcast_oneshot(unsigned int cpu) { }
static inline int tick_broadcast_oneshot_active(void) { return 0; }
static inline void tick_check_oneshot_broadcast_this_cpu(void) { }
-static inline bool tick_broadcast_oneshot_available(void) { return true; }
-# endif /* !BROADCAST */
-
-#else /* !ONESHOT */
-static inline
-void tick_setup_oneshot(struct clock_event_device *newdev,
- void (*handler)(struct clock_event_device *),
- ktime_t nextevt)
-{
- BUG();
-}
-static inline void tick_resume_oneshot(void)
-{
- BUG();
-}
-static inline int tick_program_event(ktime_t expires, int force)
-{
- return 0;
-}
-static inline void tick_oneshot_notify(void) { }
-static inline void tick_broadcast_setup_oneshot(struct clock_event_device *bc)
-{
- BUG();
-}
-static inline int tick_broadcast_oneshot_control(unsigned long reason) { return 0; }
-static inline void tick_shutdown_broadcast_oneshot(unsigned int *cpup) { }
-static inline int tick_resume_broadcast_oneshot(struct clock_event_device *bc)
-{
- return 0;
-}
-static inline int tick_broadcast_oneshot_active(void) { return 0; }
-static inline bool tick_broadcast_oneshot_available(void) { return false; }
-#endif /* !TICK_ONESHOT */
+static inline bool tick_broadcast_oneshot_available(void) { return tick_oneshot_possible(); }
+#endif /* !(BROADCAST && ONESHOT) */
/* NO_HZ_FULL internal */
#ifdef CONFIG_NO_HZ_FULL
@@ -106,67 +149,18 @@ extern void tick_nohz_init(void);
static inline void tick_nohz_init(void) { }
#endif
-/*
- * Broadcasting support
- */
-#ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST
-extern int tick_device_uses_broadcast(struct clock_event_device *dev, int cpu);
-extern void tick_install_broadcast_device(struct clock_event_device *dev);
-extern int tick_is_broadcast_device(struct clock_event_device *dev);
-extern void tick_broadcast_on_off(unsigned long reason, int *oncpu);
-extern void tick_shutdown_broadcast(unsigned int *cpup);
-extern void tick_suspend_broadcast(void);
-extern int tick_resume_broadcast(void);
-extern void tick_broadcast_init(void);
-extern void
-tick_set_periodic_handler(struct clock_event_device *dev, int broadcast);
-int tick_broadcast_update_freq(struct clock_event_device *dev, u32 freq);
-
-#else /* !BROADCAST */
-
-static inline void tick_install_broadcast_device(struct clock_event_device *dev)
-{
-}
-
-static inline int tick_is_broadcast_device(struct clock_event_device *dev)
-{
- return 0;
-}
-static inline int tick_device_uses_broadcast(struct clock_event_device *dev,
- int cpu)
-{
- return 0;
-}
-static inline void tick_do_periodic_broadcast(struct clock_event_device *d) { }
-static inline void tick_broadcast_on_off(unsigned long reason, int *oncpu) { }
-static inline void tick_shutdown_broadcast(unsigned int *cpup) { }
-static inline void tick_suspend_broadcast(void) { }
-static inline int tick_resume_broadcast(void) { return 0; }
-static inline void tick_broadcast_init(void) { }
-static inline int tick_broadcast_update_freq(struct clock_event_device *dev,
- u32 freq) { return -ENODEV; }
-
-/*
- * Set the periodic handler in non broadcast mode
- */
-static inline void tick_set_periodic_handler(struct clock_event_device *dev,
- int broadcast)
-{
- dev->event_handler = tick_handle_periodic;
-}
-#endif /* !BROADCAST */
-
-/*
- * Check, if the device is functional or a dummy for broadcast
- */
-static inline int tick_device_is_functional(struct clock_event_device *dev)
-{
- return !(dev->features & CLOCK_EVT_FEAT_DUMMY);
-}
-
-int __clockevents_update_freq(struct clock_event_device *dev, u32 freq);
+#ifdef CONFIG_NO_HZ_COMMON
+extern unsigned long tick_nohz_active;
+#else
+#define tick_nohz_active (0)
+#endif
+#if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON)
+extern void timers_update_migration(bool update_nohz);
+#else
+static inline void timers_update_migration(bool update_nohz) { }
#endif
-extern void do_timer(unsigned long ticks);
-extern void update_wall_time(void);
+DECLARE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases);
+
+extern u64 get_next_timer_interrupt(unsigned long basej, u64 basem);
diff --git a/kernel/time/tick-oneshot.c b/kernel/time/tick-oneshot.c
index 7ce740e78e1b..b51344652330 100644
--- a/kernel/time/tick-oneshot.c
+++ b/kernel/time/tick-oneshot.c
@@ -28,6 +28,22 @@ int tick_program_event(ktime_t expires, int force)
{
struct clock_event_device *dev = __this_cpu_read(tick_cpu_device.evtdev);
+ if (unlikely(expires.tv64 == KTIME_MAX)) {
+ /*
+ * We don't need the clock event device any more, stop it.
+ */
+ clockevents_switch_state(dev, CLOCK_EVT_STATE_ONESHOT_STOPPED);
+ return 0;
+ }
+
+ if (unlikely(clockevent_state_oneshot_stopped(dev))) {
+ /*
+ * We need the clock event again, configure it in ONESHOT mode
+ * before using it.
+ */
+ clockevents_switch_state(dev, CLOCK_EVT_STATE_ONESHOT);
+ }
+
return clockevents_program_event(dev, expires, force);
}
@@ -38,7 +54,7 @@ void tick_resume_oneshot(void)
{
struct clock_event_device *dev = __this_cpu_read(tick_cpu_device.evtdev);
- clockevents_set_mode(dev, CLOCK_EVT_MODE_ONESHOT);
+ clockevents_switch_state(dev, CLOCK_EVT_STATE_ONESHOT);
clockevents_program_event(dev, ktime_get(), true);
}
@@ -50,7 +66,7 @@ void tick_setup_oneshot(struct clock_event_device *newdev,
ktime_t next_event)
{
newdev->event_handler = handler;
- clockevents_set_mode(newdev, CLOCK_EVT_MODE_ONESHOT);
+ clockevents_switch_state(newdev, CLOCK_EVT_STATE_ONESHOT);
clockevents_program_event(newdev, next_event, true);
}
@@ -81,7 +97,7 @@ int tick_switch_to_oneshot(void (*handler)(struct clock_event_device *))
td->mode = TICKDEV_MODE_ONESHOT;
dev->event_handler = handler;
- clockevents_set_mode(dev, CLOCK_EVT_MODE_ONESHOT);
+ clockevents_switch_state(dev, CLOCK_EVT_STATE_ONESHOT);
tick_broadcast_switch_to_oneshot();
return 0;
}
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index a4c4edac4528..3319e16f31e5 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -34,7 +34,7 @@
/*
* Per cpu nohz control structure
*/
-DEFINE_PER_CPU(struct tick_sched, tick_cpu_sched);
+static DEFINE_PER_CPU(struct tick_sched, tick_cpu_sched);
/*
* The time, when the last jiffy update happened. Protected by jiffies_lock.
@@ -197,27 +197,9 @@ static bool can_stop_full_tick(void)
return true;
}
-static void tick_nohz_restart_sched_tick(struct tick_sched *ts, ktime_t now);
-
-/*
- * Re-evaluate the need for the tick on the current CPU
- * and restart it if necessary.
- */
-void __tick_nohz_full_check(void)
-{
- struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched);
-
- if (tick_nohz_full_cpu(smp_processor_id())) {
- if (ts->tick_stopped && !is_idle_task(current)) {
- if (!can_stop_full_tick())
- tick_nohz_restart_sched_tick(ts, ktime_get());
- }
- }
-}
-
static void nohz_full_kick_work_func(struct irq_work *work)
{
- __tick_nohz_full_check();
+ /* Empty, the tick restart happens on tick_nohz_irq_exit() */
}
static DEFINE_PER_CPU(struct irq_work, nohz_full_kick_work) = {
@@ -252,7 +234,7 @@ void tick_nohz_full_kick_cpu(int cpu)
static void nohz_full_kick_ipi(void *info)
{
- __tick_nohz_full_check();
+ /* Empty, the tick restart happens on tick_nohz_irq_exit() */
}
/*
@@ -276,7 +258,7 @@ void tick_nohz_full_kick_all(void)
* It might need the tick due to per task/process properties:
* perf events, posix cpu timers, ...
*/
-void __tick_nohz_task_switch(struct task_struct *tsk)
+void __tick_nohz_task_switch(void)
{
unsigned long flags;
@@ -399,7 +381,7 @@ void __init tick_nohz_init(void)
* NO HZ enabled ?
*/
static int tick_nohz_enabled __read_mostly = 1;
-int tick_nohz_active __read_mostly;
+unsigned long tick_nohz_active __read_mostly;
/*
* Enable / Disable tickless mode
*/
@@ -416,6 +398,11 @@ static int __init setup_tick_nohz(char *str)
__setup("nohz=", setup_tick_nohz);
+int tick_nohz_tick_stopped(void)
+{
+ return __this_cpu_read(tick_cpu_sched.tick_stopped);
+}
+
/**
* tick_nohz_update_jiffies - update jiffies when idle was interrupted
*
@@ -560,173 +547,178 @@ u64 get_cpu_iowait_time_us(int cpu, u64 *last_update_time)
}
EXPORT_SYMBOL_GPL(get_cpu_iowait_time_us);
+static void tick_nohz_restart(struct tick_sched *ts, ktime_t now)
+{
+ hrtimer_cancel(&ts->sched_timer);
+ hrtimer_set_expires(&ts->sched_timer, ts->last_tick);
+
+ /* Forward the time to expire in the future */
+ hrtimer_forward(&ts->sched_timer, now, tick_period);
+
+ if (ts->nohz_mode == NOHZ_MODE_HIGHRES)
+ hrtimer_start_expires(&ts->sched_timer, HRTIMER_MODE_ABS_PINNED);
+ else
+ tick_program_event(hrtimer_get_expires(&ts->sched_timer), 1);
+}
+
static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts,
ktime_t now, int cpu)
{
- unsigned long seq, last_jiffies, next_jiffies, delta_jiffies;
- ktime_t last_update, expires, ret = { .tv64 = 0 };
- unsigned long rcu_delta_jiffies;
struct clock_event_device *dev = __this_cpu_read(tick_cpu_device.evtdev);
- u64 time_delta;
-
- time_delta = timekeeping_max_deferment();
+ u64 basemono, next_tick, next_tmr, next_rcu, delta, expires;
+ unsigned long seq, basejiff;
+ ktime_t tick;
/* Read jiffies and the time when jiffies were updated last */
do {
seq = read_seqbegin(&jiffies_lock);
- last_update = last_jiffies_update;
- last_jiffies = jiffies;
+ basemono = last_jiffies_update.tv64;
+ basejiff = jiffies;
} while (read_seqretry(&jiffies_lock, seq));
+ ts->last_jiffies = basejiff;
- if (rcu_needs_cpu(&rcu_delta_jiffies) ||
+ if (rcu_needs_cpu(basemono, &next_rcu) ||
arch_needs_cpu() || irq_work_needs_cpu()) {
- next_jiffies = last_jiffies + 1;
- delta_jiffies = 1;
+ next_tick = basemono + TICK_NSEC;
} else {
- /* Get the next timer wheel timer */
- next_jiffies = get_next_timer_interrupt(last_jiffies);
- delta_jiffies = next_jiffies - last_jiffies;
- if (rcu_delta_jiffies < delta_jiffies) {
- next_jiffies = last_jiffies + rcu_delta_jiffies;
- delta_jiffies = rcu_delta_jiffies;
- }
+ /*
+ * Get the next pending timer. If high resolution
+ * timers are enabled this only takes the timer wheel
+ * timers into account. If high resolution timers are
+ * disabled this also looks at the next expiring
+ * hrtimer.
+ */
+ next_tmr = get_next_timer_interrupt(basejiff, basemono);
+ ts->next_timer = next_tmr;
+ /* Take the next rcu event into account */
+ next_tick = next_rcu < next_tmr ? next_rcu : next_tmr;
}
/*
- * Do not stop the tick, if we are only one off (or less)
- * or if the cpu is required for RCU:
+ * If the tick is due in the next period, keep it ticking or
+ * restart it proper.
*/
- if (!ts->tick_stopped && delta_jiffies <= 1)
- goto out;
-
- /* Schedule the tick, if we are at least one jiffie off */
- if ((long)delta_jiffies >= 1) {
-
- /*
- * If this cpu is the one which updates jiffies, then
- * give up the assignment and let it be taken by the
- * cpu which runs the tick timer next, which might be
- * this cpu as well. If we don't drop this here the
- * jiffies might be stale and do_timer() never
- * invoked. Keep track of the fact that it was the one
- * which had the do_timer() duty last. If this cpu is
- * the one which had the do_timer() duty last, we
- * limit the sleep time to the timekeeping
- * max_deferement value which we retrieved
- * above. Otherwise we can sleep as long as we want.
- */
- if (cpu == tick_do_timer_cpu) {
- tick_do_timer_cpu = TICK_DO_TIMER_NONE;
- ts->do_timer_last = 1;
- } else if (tick_do_timer_cpu != TICK_DO_TIMER_NONE) {
- time_delta = KTIME_MAX;
- ts->do_timer_last = 0;
- } else if (!ts->do_timer_last) {
- time_delta = KTIME_MAX;
+ delta = next_tick - basemono;
+ if (delta <= (u64)TICK_NSEC) {
+ tick.tv64 = 0;
+ if (!ts->tick_stopped)
+ goto out;
+ if (delta == 0) {
+ /* Tick is stopped, but required now. Enforce it */
+ tick_nohz_restart(ts, now);
+ goto out;
}
+ }
+
+ /*
+ * If this cpu is the one which updates jiffies, then give up
+ * the assignment and let it be taken by the cpu which runs
+ * the tick timer next, which might be this cpu as well. If we
+ * don't drop this here the jiffies might be stale and
+ * do_timer() never invoked. Keep track of the fact that it
+ * was the one which had the do_timer() duty last. If this cpu
+ * is the one which had the do_timer() duty last, we limit the
+ * sleep time to the timekeeping max_deferement value.
+ * Otherwise we can sleep as long as we want.
+ */
+ delta = timekeeping_max_deferment();
+ if (cpu == tick_do_timer_cpu) {
+ tick_do_timer_cpu = TICK_DO_TIMER_NONE;
+ ts->do_timer_last = 1;
+ } else if (tick_do_timer_cpu != TICK_DO_TIMER_NONE) {
+ delta = KTIME_MAX;
+ ts->do_timer_last = 0;
+ } else if (!ts->do_timer_last) {
+ delta = KTIME_MAX;
+ }
#ifdef CONFIG_NO_HZ_FULL
- if (!ts->inidle) {
- time_delta = min(time_delta,
- scheduler_tick_max_deferment());
- }
+ /* Limit the tick delta to the maximum scheduler deferment */
+ if (!ts->inidle)
+ delta = min(delta, scheduler_tick_max_deferment());
#endif
- /*
- * calculate the expiry time for the next timer wheel
- * timer. delta_jiffies >= NEXT_TIMER_MAX_DELTA signals
- * that there is no timer pending or at least extremely
- * far into the future (12 days for HZ=1000). In this
- * case we set the expiry to the end of time.
- */
- if (likely(delta_jiffies < NEXT_TIMER_MAX_DELTA)) {
- /*
- * Calculate the time delta for the next timer event.
- * If the time delta exceeds the maximum time delta
- * permitted by the current clocksource then adjust
- * the time delta accordingly to ensure the
- * clocksource does not wrap.
- */
- time_delta = min_t(u64, time_delta,
- tick_period.tv64 * delta_jiffies);
- }
-
- if (time_delta < KTIME_MAX)
- expires = ktime_add_ns(last_update, time_delta);
- else
- expires.tv64 = KTIME_MAX;
-
- /* Skip reprogram of event if its not changed */
- if (ts->tick_stopped && ktime_equal(expires, dev->next_event))
- goto out;
+ /* Calculate the next expiry time */
+ if (delta < (KTIME_MAX - basemono))
+ expires = basemono + delta;
+ else
+ expires = KTIME_MAX;
- ret = expires;
+ expires = min_t(u64, expires, next_tick);
+ tick.tv64 = expires;
- /*
- * nohz_stop_sched_tick can be called several times before
- * the nohz_restart_sched_tick is called. This happens when
- * interrupts arrive which do not cause a reschedule. In the
- * first call we save the current tick time, so we can restart
- * the scheduler tick in nohz_restart_sched_tick.
- */
- if (!ts->tick_stopped) {
- nohz_balance_enter_idle(cpu);
- calc_load_enter_idle();
+ /* Skip reprogram of event if its not changed */
+ if (ts->tick_stopped && (expires == dev->next_event.tv64))
+ goto out;
- ts->last_tick = hrtimer_get_expires(&ts->sched_timer);
- ts->tick_stopped = 1;
- trace_tick_stop(1, " ");
- }
+ /*
+ * nohz_stop_sched_tick can be called several times before
+ * the nohz_restart_sched_tick is called. This happens when
+ * interrupts arrive which do not cause a reschedule. In the
+ * first call we save the current tick time, so we can restart
+ * the scheduler tick in nohz_restart_sched_tick.
+ */
+ if (!ts->tick_stopped) {
+ nohz_balance_enter_idle(cpu);
+ calc_load_enter_idle();
- /*
- * If the expiration time == KTIME_MAX, then
- * in this case we simply stop the tick timer.
- */
- if (unlikely(expires.tv64 == KTIME_MAX)) {
- if (ts->nohz_mode == NOHZ_MODE_HIGHRES)
- hrtimer_cancel(&ts->sched_timer);
- goto out;
- }
+ ts->last_tick = hrtimer_get_expires(&ts->sched_timer);
+ ts->tick_stopped = 1;
+ trace_tick_stop(1, " ");
+ }
- if (ts->nohz_mode == NOHZ_MODE_HIGHRES) {
- hrtimer_start(&ts->sched_timer, expires,
- HRTIMER_MODE_ABS_PINNED);
- /* Check, if the timer was already in the past */
- if (hrtimer_active(&ts->sched_timer))
- goto out;
- } else if (!tick_program_event(expires, 0))
- goto out;
- /*
- * We are past the event already. So we crossed a
- * jiffie boundary. Update jiffies and raise the
- * softirq.
- */
- tick_do_update_jiffies64(ktime_get());
+ /*
+ * If the expiration time == KTIME_MAX, then we simply stop
+ * the tick timer.
+ */
+ if (unlikely(expires == KTIME_MAX)) {
+ if (ts->nohz_mode == NOHZ_MODE_HIGHRES)
+ hrtimer_cancel(&ts->sched_timer);
+ goto out;
}
- raise_softirq_irqoff(TIMER_SOFTIRQ);
+
+ if (ts->nohz_mode == NOHZ_MODE_HIGHRES)
+ hrtimer_start(&ts->sched_timer, tick, HRTIMER_MODE_ABS_PINNED);
+ else
+ tick_program_event(tick, 1);
out:
- ts->next_jiffies = next_jiffies;
- ts->last_jiffies = last_jiffies;
+ /* Update the estimated sleep length */
ts->sleep_length = ktime_sub(dev->next_event, now);
+ return tick;
+}
+
+static void tick_nohz_restart_sched_tick(struct tick_sched *ts, ktime_t now)
+{
+ /* Update jiffies first */
+ tick_do_update_jiffies64(now);
+ update_cpu_load_nohz();
+
+ calc_load_exit_idle();
+ touch_softlockup_watchdog();
+ /*
+ * Cancel the scheduled timer and restore the tick
+ */
+ ts->tick_stopped = 0;
+ ts->idle_exittime = now;
- return ret;
+ tick_nohz_restart(ts, now);
}
-static void tick_nohz_full_stop_tick(struct tick_sched *ts)
+static void tick_nohz_full_update_tick(struct tick_sched *ts)
{
#ifdef CONFIG_NO_HZ_FULL
int cpu = smp_processor_id();
- if (!tick_nohz_full_cpu(cpu) || is_idle_task(current))
+ if (!tick_nohz_full_cpu(cpu))
return;
if (!ts->tick_stopped && ts->nohz_mode == NOHZ_MODE_INACTIVE)
return;
- if (!can_stop_full_tick())
- return;
-
- tick_nohz_stop_sched_tick(ts, ktime_get(), cpu);
+ if (can_stop_full_tick())
+ tick_nohz_stop_sched_tick(ts, ktime_get(), cpu);
+ else if (ts->tick_stopped)
+ tick_nohz_restart_sched_tick(ts, ktime_get());
#endif
}
@@ -856,7 +848,7 @@ void tick_nohz_irq_exit(void)
if (ts->inidle)
__tick_nohz_idle_enter(ts);
else
- tick_nohz_full_stop_tick(ts);
+ tick_nohz_full_update_tick(ts);
}
/**
@@ -871,49 +863,6 @@ ktime_t tick_nohz_get_sleep_length(void)
return ts->sleep_length;
}
-static void tick_nohz_restart(struct tick_sched *ts, ktime_t now)
-{
- hrtimer_cancel(&ts->sched_timer);
- hrtimer_set_expires(&ts->sched_timer, ts->last_tick);
-
- while (1) {
- /* Forward the time to expire in the future */
- hrtimer_forward(&ts->sched_timer, now, tick_period);
-
- if (ts->nohz_mode == NOHZ_MODE_HIGHRES) {
- hrtimer_start_expires(&ts->sched_timer,
- HRTIMER_MODE_ABS_PINNED);
- /* Check, if the timer was already in the past */
- if (hrtimer_active(&ts->sched_timer))
- break;
- } else {
- if (!tick_program_event(
- hrtimer_get_expires(&ts->sched_timer), 0))
- break;
- }
- /* Reread time and update jiffies */
- now = ktime_get();
- tick_do_update_jiffies64(now);
- }
-}
-
-static void tick_nohz_restart_sched_tick(struct tick_sched *ts, ktime_t now)
-{
- /* Update jiffies first */
- tick_do_update_jiffies64(now);
- update_cpu_load_nohz();
-
- calc_load_exit_idle();
- touch_softlockup_watchdog();
- /*
- * Cancel the scheduled timer and restore the tick
- */
- ts->tick_stopped = 0;
- ts->idle_exittime = now;
-
- tick_nohz_restart(ts, now);
-}
-
static void tick_nohz_account_idle_ticks(struct tick_sched *ts)
{
#ifndef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
@@ -967,12 +916,6 @@ void tick_nohz_idle_exit(void)
local_irq_enable();
}
-static int tick_nohz_reprogram(struct tick_sched *ts, ktime_t now)
-{
- hrtimer_forward(&ts->sched_timer, now, tick_period);
- return tick_program_event(hrtimer_get_expires(&ts->sched_timer), 0);
-}
-
/*
* The nohz low res interrupt handler
*/
@@ -991,10 +934,18 @@ static void tick_nohz_handler(struct clock_event_device *dev)
if (unlikely(ts->tick_stopped))
return;
- while (tick_nohz_reprogram(ts, now)) {
- now = ktime_get();
- tick_do_update_jiffies64(now);
- }
+ hrtimer_forward(&ts->sched_timer, now, tick_period);
+ tick_program_event(hrtimer_get_expires(&ts->sched_timer), 1);
+}
+
+static inline void tick_nohz_activate(struct tick_sched *ts, int mode)
+{
+ if (!tick_nohz_enabled)
+ return;
+ ts->nohz_mode = mode;
+ /* One update is enough */
+ if (!test_and_set_bit(0, &tick_nohz_active))
+ timers_update_migration(true);
}
/**
@@ -1008,13 +959,8 @@ static void tick_nohz_switch_to_nohz(void)
if (!tick_nohz_enabled)
return;
- local_irq_disable();
- if (tick_switch_to_oneshot(tick_nohz_handler)) {
- local_irq_enable();
+ if (tick_switch_to_oneshot(tick_nohz_handler))
return;
- }
- tick_nohz_active = 1;
- ts->nohz_mode = NOHZ_MODE_LOWRES;
/*
* Recycle the hrtimer in ts, so we can share the
@@ -1024,13 +970,10 @@ static void tick_nohz_switch_to_nohz(void)
/* Get the next period */
next = tick_init_jiffy_update();
- for (;;) {
- hrtimer_set_expires(&ts->sched_timer, next);
- if (!tick_program_event(next, 0))
- break;
- next = ktime_add(next, tick_period);
- }
- local_irq_enable();
+ hrtimer_forward_now(&ts->sched_timer, tick_period);
+ hrtimer_set_expires(&ts->sched_timer, next);
+ tick_program_event(next, 1);
+ tick_nohz_activate(ts, NOHZ_MODE_LOWRES);
}
/*
@@ -1082,6 +1025,7 @@ static inline void tick_nohz_irq_enter(void)
static inline void tick_nohz_switch_to_nohz(void) { }
static inline void tick_nohz_irq_enter(void) { }
+static inline void tick_nohz_activate(struct tick_sched *ts, int mode) { }
#endif /* CONFIG_NO_HZ_COMMON */
@@ -1162,22 +1106,9 @@ void tick_setup_sched_timer(void)
hrtimer_add_expires_ns(&ts->sched_timer, offset);
}
- for (;;) {
- hrtimer_forward(&ts->sched_timer, now, tick_period);
- hrtimer_start_expires(&ts->sched_timer,
- HRTIMER_MODE_ABS_PINNED);
- /* Check, if the timer was already in the past */
- if (hrtimer_active(&ts->sched_timer))
- break;
- now = ktime_get();
- }
-
-#ifdef CONFIG_NO_HZ_COMMON
- if (tick_nohz_enabled) {
- ts->nohz_mode = NOHZ_MODE_HIGHRES;
- tick_nohz_active = 1;
- }
-#endif
+ hrtimer_forward(&ts->sched_timer, now, tick_period);
+ hrtimer_start_expires(&ts->sched_timer, HRTIMER_MODE_ABS_PINNED);
+ tick_nohz_activate(ts, NOHZ_MODE_HIGHRES);
}
#endif /* HIGH_RES_TIMERS */
@@ -1222,7 +1153,7 @@ void tick_oneshot_notify(void)
* Called cyclic from the hrtimer softirq (driven by the timer
* softirq) allow_nohz signals, that we can switch into low-res nohz
* mode, because high resolution timers are disabled (either compile
- * or runtime).
+ * or runtime). Called with interrupts disabled.
*/
int tick_check_oneshot_change(int allow_nohz)
{
diff --git a/kernel/time/tick-sched.h b/kernel/time/tick-sched.h
new file mode 100644
index 000000000000..a4a8d4e9baa1
--- /dev/null
+++ b/kernel/time/tick-sched.h
@@ -0,0 +1,84 @@
+#ifndef _TICK_SCHED_H
+#define _TICK_SCHED_H
+
+#include <linux/hrtimer.h>
+
+enum tick_device_mode {
+ TICKDEV_MODE_PERIODIC,
+ TICKDEV_MODE_ONESHOT,
+};
+
+struct tick_device {
+ struct clock_event_device *evtdev;
+ enum tick_device_mode mode;
+};
+
+enum tick_nohz_mode {
+ NOHZ_MODE_INACTIVE,
+ NOHZ_MODE_LOWRES,
+ NOHZ_MODE_HIGHRES,
+};
+
+/**
+ * struct tick_sched - sched tick emulation and no idle tick control/stats
+ * @sched_timer: hrtimer to schedule the periodic tick in high
+ * resolution mode
+ * @last_tick: Store the last tick expiry time when the tick
+ * timer is modified for nohz sleeps. This is necessary
+ * to resume the tick timer operation in the timeline
+ * when the CPU returns from nohz sleep.
+ * @tick_stopped: Indicator that the idle tick has been stopped
+ * @idle_jiffies: jiffies at the entry to idle for idle time accounting
+ * @idle_calls: Total number of idle calls
+ * @idle_sleeps: Number of idle calls, where the sched tick was stopped
+ * @idle_entrytime: Time when the idle call was entered
+ * @idle_waketime: Time when the idle was interrupted
+ * @idle_exittime: Time when the idle state was left
+ * @idle_sleeptime: Sum of the time slept in idle with sched tick stopped
+ * @iowait_sleeptime: Sum of the time slept in idle with sched tick stopped, with IO outstanding
+ * @sleep_length: Duration of the current idle sleep
+ * @do_timer_lst: CPU was the last one doing do_timer before going idle
+ */
+struct tick_sched {
+ struct hrtimer sched_timer;
+ unsigned long check_clocks;
+ enum tick_nohz_mode nohz_mode;
+ ktime_t last_tick;
+ int inidle;
+ int tick_stopped;
+ unsigned long idle_jiffies;
+ unsigned long idle_calls;
+ unsigned long idle_sleeps;
+ int idle_active;
+ ktime_t idle_entrytime;
+ ktime_t idle_waketime;
+ ktime_t idle_exittime;
+ ktime_t idle_sleeptime;
+ ktime_t iowait_sleeptime;
+ ktime_t sleep_length;
+ unsigned long last_jiffies;
+ u64 next_timer;
+ ktime_t idle_expires;
+ int do_timer_last;
+};
+
+extern struct tick_sched *tick_get_tick_sched(int cpu);
+
+extern void tick_setup_sched_timer(void);
+#if defined CONFIG_NO_HZ_COMMON || defined CONFIG_HIGH_RES_TIMERS
+extern void tick_cancel_sched_timer(int cpu);
+#else
+static inline void tick_cancel_sched_timer(int cpu) { }
+#endif
+
+#ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST
+extern int __tick_broadcast_oneshot_control(enum tick_broadcast_state state);
+#else
+static inline int
+__tick_broadcast_oneshot_control(enum tick_broadcast_state state)
+{
+ return -EBUSY;
+}
+#endif
+
+#endif
diff --git a/kernel/time/time.c b/kernel/time/time.c
index 2c85b7724af4..86751c68e08d 100644
--- a/kernel/time/time.c
+++ b/kernel/time/time.c
@@ -41,7 +41,7 @@
#include <asm/uaccess.h>
#include <asm/unistd.h>
-#include "timeconst.h"
+#include <generated/timeconst.h>
#include "timekeeping.h"
/*
@@ -173,6 +173,10 @@ int do_sys_settimeofday(const struct timespec *tv, const struct timezone *tz)
return error;
if (tz) {
+ /* Verify we're witin the +-15 hrs range */
+ if (tz->tz_minuteswest > 15*60 || tz->tz_minuteswest < -15*60)
+ return -EINVAL;
+
sys_tz = *tz;
update_vsyscall_tz();
if (firsttime) {
@@ -264,10 +268,14 @@ EXPORT_SYMBOL(jiffies_to_msecs);
unsigned int jiffies_to_usecs(const unsigned long j)
{
-#if HZ <= USEC_PER_SEC && !(USEC_PER_SEC % HZ)
+ /*
+ * Hz usually doesn't go much further MSEC_PER_SEC.
+ * jiffies_to_usecs() and usecs_to_jiffies() depend on that.
+ */
+ BUILD_BUG_ON(HZ > USEC_PER_SEC);
+
+#if !(USEC_PER_SEC % HZ)
return (USEC_PER_SEC / HZ) * j;
-#elif HZ > USEC_PER_SEC && !(HZ % USEC_PER_SEC)
- return (j + (HZ / USEC_PER_SEC) - 1)/(HZ / USEC_PER_SEC);
#else
# if BITS_PER_LONG == 32
return (HZ_TO_USEC_MUL32 * j) >> HZ_TO_USEC_SHR32;
@@ -283,26 +291,20 @@ EXPORT_SYMBOL(jiffies_to_usecs);
* @t: Timespec
* @gran: Granularity in ns.
*
- * Truncate a timespec to a granularity. gran must be smaller than a second.
- * Always rounds down.
- *
- * This function should be only used for timestamps returned by
- * current_kernel_time() or CURRENT_TIME, not with do_gettimeofday() because
- * it doesn't handle the better resolution of the latter.
+ * Truncate a timespec to a granularity. Always rounds down. gran must
+ * not be 0 nor greater than a second (NSEC_PER_SEC, or 10^9 ns).
*/
struct timespec timespec_trunc(struct timespec t, unsigned gran)
{
- /*
- * Division is pretty slow so avoid it for common cases.
- * Currently current_kernel_time() never returns better than
- * jiffies resolution. Exploit that.
- */
- if (gran <= jiffies_to_usecs(1) * 1000) {
+ /* Avoid division in the common cases 1 ns and 1 s. */
+ if (gran == 1) {
/* nothing */
- } else if (gran == 1000000000) {
+ } else if (gran == NSEC_PER_SEC) {
t.tv_nsec = 0;
- } else {
+ } else if (gran > 1 && gran < NSEC_PER_SEC) {
t.tv_nsec -= t.tv_nsec % gran;
+ } else {
+ WARN(1, "illegal file time granularity: %u", gran);
}
return t;
}
@@ -483,9 +485,11 @@ struct timespec64 ns_to_timespec64(const s64 nsec)
}
EXPORT_SYMBOL(ns_to_timespec64);
#endif
-/*
- * When we convert to jiffies then we interpret incoming values
- * the following way:
+/**
+ * msecs_to_jiffies: - convert milliseconds to jiffies
+ * @m: time in milliseconds
+ *
+ * conversion is done as follows:
*
* - negative values mean 'infinite timeout' (MAX_JIFFY_OFFSET)
*
@@ -493,66 +497,36 @@ EXPORT_SYMBOL(ns_to_timespec64);
* MAX_JIFFY_OFFSET values] mean 'infinite timeout' too.
*
* - all other values are converted to jiffies by either multiplying
- * the input value by a factor or dividing it with a factor
- *
- * We must also be careful about 32-bit overflows.
+ * the input value by a factor or dividing it with a factor and
+ * handling any 32-bit overflows.
+ * for the details see __msecs_to_jiffies()
+ *
+ * msecs_to_jiffies() checks for the passed in value being a constant
+ * via __builtin_constant_p() allowing gcc to eliminate most of the
+ * code, __msecs_to_jiffies() is called if the value passed does not
+ * allow constant folding and the actual conversion must be done at
+ * runtime.
+ * the _msecs_to_jiffies helpers are the HZ dependent conversion
+ * routines found in include/linux/jiffies.h
*/
-unsigned long msecs_to_jiffies(const unsigned int m)
+unsigned long __msecs_to_jiffies(const unsigned int m)
{
/*
* Negative value, means infinite timeout:
*/
if ((int)m < 0)
return MAX_JIFFY_OFFSET;
-
-#if HZ <= MSEC_PER_SEC && !(MSEC_PER_SEC % HZ)
- /*
- * HZ is equal to or smaller than 1000, and 1000 is a nice
- * round multiple of HZ, divide with the factor between them,
- * but round upwards:
- */
- return (m + (MSEC_PER_SEC / HZ) - 1) / (MSEC_PER_SEC / HZ);
-#elif HZ > MSEC_PER_SEC && !(HZ % MSEC_PER_SEC)
- /*
- * HZ is larger than 1000, and HZ is a nice round multiple of
- * 1000 - simply multiply with the factor between them.
- *
- * But first make sure the multiplication result cannot
- * overflow:
- */
- if (m > jiffies_to_msecs(MAX_JIFFY_OFFSET))
- return MAX_JIFFY_OFFSET;
-
- return m * (HZ / MSEC_PER_SEC);
-#else
- /*
- * Generic case - multiply, round and divide. But first
- * check that if we are doing a net multiplication, that
- * we wouldn't overflow:
- */
- if (HZ > MSEC_PER_SEC && m > jiffies_to_msecs(MAX_JIFFY_OFFSET))
- return MAX_JIFFY_OFFSET;
-
- return (MSEC_TO_HZ_MUL32 * m + MSEC_TO_HZ_ADJ32)
- >> MSEC_TO_HZ_SHR32;
-#endif
+ return _msecs_to_jiffies(m);
}
-EXPORT_SYMBOL(msecs_to_jiffies);
+EXPORT_SYMBOL(__msecs_to_jiffies);
-unsigned long usecs_to_jiffies(const unsigned int u)
+unsigned long __usecs_to_jiffies(const unsigned int u)
{
if (u > jiffies_to_usecs(MAX_JIFFY_OFFSET))
return MAX_JIFFY_OFFSET;
-#if HZ <= USEC_PER_SEC && !(USEC_PER_SEC % HZ)
- return (u + (USEC_PER_SEC / HZ) - 1) / (USEC_PER_SEC / HZ);
-#elif HZ > USEC_PER_SEC && !(HZ % USEC_PER_SEC)
- return u * (HZ / USEC_PER_SEC);
-#else
- return (USEC_TO_HZ_MUL32 * u + USEC_TO_HZ_ADJ32)
- >> USEC_TO_HZ_SHR32;
-#endif
+ return _usecs_to_jiffies(u);
}
-EXPORT_SYMBOL(usecs_to_jiffies);
+EXPORT_SYMBOL(__usecs_to_jiffies);
/*
* The TICK_NSEC - 1 rounds up the value to the next resolution. Note
@@ -570,7 +544,7 @@ EXPORT_SYMBOL(usecs_to_jiffies);
* value to a scaled second value.
*/
static unsigned long
-__timespec_to_jiffies(unsigned long sec, long nsec)
+__timespec64_to_jiffies(u64 sec, long nsec)
{
nsec = nsec + TICK_NSEC - 1;
@@ -578,22 +552,27 @@ __timespec_to_jiffies(unsigned long sec, long nsec)
sec = MAX_SEC_IN_JIFFIES;
nsec = 0;
}
- return (((u64)sec * SEC_CONVERSION) +
+ return ((sec * SEC_CONVERSION) +
(((u64)nsec * NSEC_CONVERSION) >>
(NSEC_JIFFIE_SC - SEC_JIFFIE_SC))) >> SEC_JIFFIE_SC;
}
-unsigned long
-timespec_to_jiffies(const struct timespec *value)
+static unsigned long
+__timespec_to_jiffies(unsigned long sec, long nsec)
{
- return __timespec_to_jiffies(value->tv_sec, value->tv_nsec);
+ return __timespec64_to_jiffies((u64)sec, nsec);
}
-EXPORT_SYMBOL(timespec_to_jiffies);
+unsigned long
+timespec64_to_jiffies(const struct timespec64 *value)
+{
+ return __timespec64_to_jiffies(value->tv_sec, value->tv_nsec);
+}
+EXPORT_SYMBOL(timespec64_to_jiffies);
void
-jiffies_to_timespec(const unsigned long jiffies, struct timespec *value)
+jiffies_to_timespec64(const unsigned long jiffies, struct timespec64 *value)
{
/*
* Convert jiffies to nanoseconds and separate with
@@ -604,7 +583,7 @@ jiffies_to_timespec(const unsigned long jiffies, struct timespec *value)
NSEC_PER_SEC, &rem);
value->tv_nsec = rem;
}
-EXPORT_SYMBOL(jiffies_to_timespec);
+EXPORT_SYMBOL(jiffies_to_timespec64);
/*
* We could use a similar algorithm to timespec_to_jiffies (with a
diff --git a/kernel/time/timeconst.bc b/kernel/time/timeconst.bc
index 511bdf2cafda..c7388dee8635 100644
--- a/kernel/time/timeconst.bc
+++ b/kernel/time/timeconst.bc
@@ -50,7 +50,7 @@ define timeconst(hz) {
print "#include <linux/types.h>\n\n"
print "#if HZ != ", hz, "\n"
- print "#error \qkernel/timeconst.h has the wrong HZ value!\q\n"
+ print "#error \qinclude/generated/timeconst.h has the wrong HZ value!\q\n"
print "#endif\n\n"
if (hz < 2) {
@@ -105,4 +105,5 @@ define timeconst(hz) {
halt
}
+hz = read();
timeconst(hz)
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 91db94136c10..f6ee2e6b6f5d 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -59,17 +59,15 @@ struct tk_fast {
};
static struct tk_fast tk_fast_mono ____cacheline_aligned;
+static struct tk_fast tk_fast_raw ____cacheline_aligned;
/* flag for if timekeeping is suspended */
int __read_mostly timekeeping_suspended;
-/* Flag for if there is a persistent clock on this platform */
-bool __read_mostly persistent_clock_exist = false;
-
static inline void tk_normalize_xtime(struct timekeeper *tk)
{
- while (tk->tkr.xtime_nsec >= ((u64)NSEC_PER_SEC << tk->tkr.shift)) {
- tk->tkr.xtime_nsec -= (u64)NSEC_PER_SEC << tk->tkr.shift;
+ while (tk->tkr_mono.xtime_nsec >= ((u64)NSEC_PER_SEC << tk->tkr_mono.shift)) {
+ tk->tkr_mono.xtime_nsec -= (u64)NSEC_PER_SEC << tk->tkr_mono.shift;
tk->xtime_sec++;
}
}
@@ -79,20 +77,20 @@ static inline struct timespec64 tk_xtime(struct timekeeper *tk)
struct timespec64 ts;
ts.tv_sec = tk->xtime_sec;
- ts.tv_nsec = (long)(tk->tkr.xtime_nsec >> tk->tkr.shift);
+ ts.tv_nsec = (long)(tk->tkr_mono.xtime_nsec >> tk->tkr_mono.shift);
return ts;
}
static void tk_set_xtime(struct timekeeper *tk, const struct timespec64 *ts)
{
tk->xtime_sec = ts->tv_sec;
- tk->tkr.xtime_nsec = (u64)ts->tv_nsec << tk->tkr.shift;
+ tk->tkr_mono.xtime_nsec = (u64)ts->tv_nsec << tk->tkr_mono.shift;
}
static void tk_xtime_add(struct timekeeper *tk, const struct timespec64 *ts)
{
tk->xtime_sec += ts->tv_sec;
- tk->tkr.xtime_nsec += (u64)ts->tv_nsec << tk->tkr.shift;
+ tk->tkr_mono.xtime_nsec += (u64)ts->tv_nsec << tk->tkr_mono.shift;
tk_normalize_xtime(tk);
}
@@ -118,6 +116,106 @@ static inline void tk_update_sleep_time(struct timekeeper *tk, ktime_t delta)
tk->offs_boot = ktime_add(tk->offs_boot, delta);
}
+#ifdef CONFIG_DEBUG_TIMEKEEPING
+#define WARNING_FREQ (HZ*300) /* 5 minute rate-limiting */
+
+static void timekeeping_check_update(struct timekeeper *tk, cycle_t offset)
+{
+
+ cycle_t max_cycles = tk->tkr_mono.clock->max_cycles;
+ const char *name = tk->tkr_mono.clock->name;
+
+ if (offset > max_cycles) {
+ printk_deferred("WARNING: timekeeping: Cycle offset (%lld) is larger than allowed by the '%s' clock's max_cycles value (%lld): time overflow danger\n",
+ offset, name, max_cycles);
+ printk_deferred(" timekeeping: Your kernel is sick, but tries to cope by capping time updates\n");
+ } else {
+ if (offset > (max_cycles >> 1)) {
+ printk_deferred("INFO: timekeeping: Cycle offset (%lld) is larger than the the '%s' clock's 50%% safety margin (%lld)\n",
+ offset, name, max_cycles >> 1);
+ printk_deferred(" timekeeping: Your kernel is still fine, but is feeling a bit nervous\n");
+ }
+ }
+
+ if (tk->underflow_seen) {
+ if (jiffies - tk->last_warning > WARNING_FREQ) {
+ printk_deferred("WARNING: Underflow in clocksource '%s' observed, time update ignored.\n", name);
+ printk_deferred(" Please report this, consider using a different clocksource, if possible.\n");
+ printk_deferred(" Your kernel is probably still fine.\n");
+ tk->last_warning = jiffies;
+ }
+ tk->underflow_seen = 0;
+ }
+
+ if (tk->overflow_seen) {
+ if (jiffies - tk->last_warning > WARNING_FREQ) {
+ printk_deferred("WARNING: Overflow in clocksource '%s' observed, time update capped.\n", name);
+ printk_deferred(" Please report this, consider using a different clocksource, if possible.\n");
+ printk_deferred(" Your kernel is probably still fine.\n");
+ tk->last_warning = jiffies;
+ }
+ tk->overflow_seen = 0;
+ }
+}
+
+static inline cycle_t timekeeping_get_delta(struct tk_read_base *tkr)
+{
+ struct timekeeper *tk = &tk_core.timekeeper;
+ cycle_t now, last, mask, max, delta;
+ unsigned int seq;
+
+ /*
+ * Since we're called holding a seqlock, the data may shift
+ * under us while we're doing the calculation. This can cause
+ * false positives, since we'd note a problem but throw the
+ * results away. So nest another seqlock here to atomically
+ * grab the points we are checking with.
+ */
+ do {
+ seq = read_seqcount_begin(&tk_core.seq);
+ now = tkr->read(tkr->clock);
+ last = tkr->cycle_last;
+ mask = tkr->mask;
+ max = tkr->clock->max_cycles;
+ } while (read_seqcount_retry(&tk_core.seq, seq));
+
+ delta = clocksource_delta(now, last, mask);
+
+ /*
+ * Try to catch underflows by checking if we are seeing small
+ * mask-relative negative values.
+ */
+ if (unlikely((~delta & mask) < (mask >> 3))) {
+ tk->underflow_seen = 1;
+ delta = 0;
+ }
+
+ /* Cap delta value to the max_cycles values to avoid mult overflows */
+ if (unlikely(delta > max)) {
+ tk->overflow_seen = 1;
+ delta = tkr->clock->max_cycles;
+ }
+
+ return delta;
+}
+#else
+static inline void timekeeping_check_update(struct timekeeper *tk, cycle_t offset)
+{
+}
+static inline cycle_t timekeeping_get_delta(struct tk_read_base *tkr)
+{
+ cycle_t cycle_now, delta;
+
+ /* read clocksource */
+ cycle_now = tkr->read(tkr->clock);
+
+ /* calculate the delta since the last update_wall_time */
+ delta = clocksource_delta(cycle_now, tkr->cycle_last, tkr->mask);
+
+ return delta;
+}
+#endif
+
/**
* tk_setup_internals - Set up internals to use clocksource clock.
*
@@ -135,11 +233,16 @@ static void tk_setup_internals(struct timekeeper *tk, struct clocksource *clock)
u64 tmp, ntpinterval;
struct clocksource *old_clock;
- old_clock = tk->tkr.clock;
- tk->tkr.clock = clock;
- tk->tkr.read = clock->read;
- tk->tkr.mask = clock->mask;
- tk->tkr.cycle_last = tk->tkr.read(clock);
+ old_clock = tk->tkr_mono.clock;
+ tk->tkr_mono.clock = clock;
+ tk->tkr_mono.read = clock->read;
+ tk->tkr_mono.mask = clock->mask;
+ tk->tkr_mono.cycle_last = tk->tkr_mono.read(clock);
+
+ tk->tkr_raw.clock = clock;
+ tk->tkr_raw.read = clock->read;
+ tk->tkr_raw.mask = clock->mask;
+ tk->tkr_raw.cycle_last = tk->tkr_mono.cycle_last;
/* Do the ns -> cycle conversion first, using original mult */
tmp = NTP_INTERVAL_LENGTH;
@@ -163,11 +266,14 @@ static void tk_setup_internals(struct timekeeper *tk, struct clocksource *clock)
if (old_clock) {
int shift_change = clock->shift - old_clock->shift;
if (shift_change < 0)
- tk->tkr.xtime_nsec >>= -shift_change;
+ tk->tkr_mono.xtime_nsec >>= -shift_change;
else
- tk->tkr.xtime_nsec <<= shift_change;
+ tk->tkr_mono.xtime_nsec <<= shift_change;
}
- tk->tkr.shift = clock->shift;
+ tk->tkr_raw.xtime_nsec = 0;
+
+ tk->tkr_mono.shift = clock->shift;
+ tk->tkr_raw.shift = clock->shift;
tk->ntp_error = 0;
tk->ntp_error_shift = NTP_SCALE_SHIFT - clock->shift;
@@ -178,7 +284,8 @@ static void tk_setup_internals(struct timekeeper *tk, struct clocksource *clock)
* active clocksource. These value will be adjusted via NTP
* to counteract clock drifting.
*/
- tk->tkr.mult = clock->mult;
+ tk->tkr_mono.mult = clock->mult;
+ tk->tkr_raw.mult = clock->mult;
tk->ntp_err_mult = 0;
}
@@ -193,14 +300,10 @@ static inline u32 arch_gettimeoffset(void) { return 0; }
static inline s64 timekeeping_get_ns(struct tk_read_base *tkr)
{
- cycle_t cycle_now, delta;
+ cycle_t delta;
s64 nsec;
- /* read clocksource: */
- cycle_now = tkr->read(tkr->clock);
-
- /* calculate the delta since the last update_wall_time: */
- delta = clocksource_delta(cycle_now, tkr->cycle_last, tkr->mask);
+ delta = timekeeping_get_delta(tkr);
nsec = delta * tkr->mult + tkr->xtime_nsec;
nsec >>= tkr->shift;
@@ -209,25 +312,6 @@ static inline s64 timekeeping_get_ns(struct tk_read_base *tkr)
return nsec + arch_gettimeoffset();
}
-static inline s64 timekeeping_get_ns_raw(struct timekeeper *tk)
-{
- struct clocksource *clock = tk->tkr.clock;
- cycle_t cycle_now, delta;
- s64 nsec;
-
- /* read clocksource: */
- cycle_now = tk->tkr.read(clock);
-
- /* calculate the delta since the last update_wall_time: */
- delta = clocksource_delta(cycle_now, tk->tkr.cycle_last, tk->tkr.mask);
-
- /* convert delta to nanoseconds. */
- nsec = clocksource_cyc2ns(delta, clock->mult, clock->shift);
-
- /* If arch requires, add in get_arch_timeoffset() */
- return nsec + arch_gettimeoffset();
-}
-
/**
* update_fast_timekeeper - Update the fast and NMI safe monotonic timekeeper.
* @tkr: Timekeeping readout base from which we take the update
@@ -235,50 +319,25 @@ static inline s64 timekeeping_get_ns_raw(struct timekeeper *tk)
* We want to use this from any context including NMI and tracing /
* instrumenting the timekeeping code itself.
*
- * So we handle this differently than the other timekeeping accessor
- * functions which retry when the sequence count has changed. The
- * update side does:
- *
- * smp_wmb(); <- Ensure that the last base[1] update is visible
- * tkf->seq++;
- * smp_wmb(); <- Ensure that the seqcount update is visible
- * update(tkf->base[0], tkr);
- * smp_wmb(); <- Ensure that the base[0] update is visible
- * tkf->seq++;
- * smp_wmb(); <- Ensure that the seqcount update is visible
- * update(tkf->base[1], tkr);
- *
- * The reader side does:
- *
- * do {
- * seq = tkf->seq;
- * smp_rmb();
- * idx = seq & 0x01;
- * now = now(tkf->base[idx]);
- * smp_rmb();
- * } while (seq != tkf->seq)
- *
- * As long as we update base[0] readers are forced off to
- * base[1]. Once base[0] is updated readers are redirected to base[0]
- * and the base[1] update takes place.
+ * Employ the latch technique; see @raw_write_seqcount_latch.
*
* So if a NMI hits the update of base[0] then it will use base[1]
* which is still consistent. In the worst case this can result is a
* slightly wrong timestamp (a few nanoseconds). See
* @ktime_get_mono_fast_ns.
*/
-static void update_fast_timekeeper(struct tk_read_base *tkr)
+static void update_fast_timekeeper(struct tk_read_base *tkr, struct tk_fast *tkf)
{
- struct tk_read_base *base = tk_fast_mono.base;
+ struct tk_read_base *base = tkf->base;
/* Force readers off to base[1] */
- raw_write_seqcount_latch(&tk_fast_mono.seq);
+ raw_write_seqcount_latch(&tkf->seq);
/* Update base[0] */
memcpy(base, tkr, sizeof(*base));
/* Force readers back to base[0] */
- raw_write_seqcount_latch(&tk_fast_mono.seq);
+ raw_write_seqcount_latch(&tkf->seq);
/* Update base[1] */
memcpy(base + 1, base, sizeof(*base));
@@ -316,22 +375,33 @@ static void update_fast_timekeeper(struct tk_read_base *tkr)
* of the following timestamps. Callers need to be aware of that and
* deal with it.
*/
-u64 notrace ktime_get_mono_fast_ns(void)
+static __always_inline u64 __ktime_get_fast_ns(struct tk_fast *tkf)
{
struct tk_read_base *tkr;
unsigned int seq;
u64 now;
do {
- seq = raw_read_seqcount(&tk_fast_mono.seq);
- tkr = tk_fast_mono.base + (seq & 0x01);
- now = ktime_to_ns(tkr->base_mono) + timekeeping_get_ns(tkr);
+ seq = raw_read_seqcount_latch(&tkf->seq);
+ tkr = tkf->base + (seq & 0x01);
+ now = ktime_to_ns(tkr->base) + timekeeping_get_ns(tkr);
+ } while (read_seqcount_retry(&tkf->seq, seq));
- } while (read_seqcount_retry(&tk_fast_mono.seq, seq));
return now;
}
+
+u64 ktime_get_mono_fast_ns(void)
+{
+ return __ktime_get_fast_ns(&tk_fast_mono);
+}
EXPORT_SYMBOL_GPL(ktime_get_mono_fast_ns);
+u64 ktime_get_raw_fast_ns(void)
+{
+ return __ktime_get_fast_ns(&tk_fast_raw);
+}
+EXPORT_SYMBOL_GPL(ktime_get_raw_fast_ns);
+
/* Suspend-time cycles value for halted fast timekeeper. */
static cycle_t cycles_at_suspend;
@@ -353,12 +423,17 @@ static cycle_t dummy_clock_read(struct clocksource *cs)
static void halt_fast_timekeeper(struct timekeeper *tk)
{
static struct tk_read_base tkr_dummy;
- struct tk_read_base *tkr = &tk->tkr;
+ struct tk_read_base *tkr = &tk->tkr_mono;
memcpy(&tkr_dummy, tkr, sizeof(tkr_dummy));
cycles_at_suspend = tkr->read(tkr->clock);
tkr_dummy.read = dummy_clock_read;
- update_fast_timekeeper(&tkr_dummy);
+ update_fast_timekeeper(&tkr_dummy, &tk_fast_mono);
+
+ tkr = &tk->tkr_raw;
+ memcpy(&tkr_dummy, tkr, sizeof(tkr_dummy));
+ tkr_dummy.read = dummy_clock_read;
+ update_fast_timekeeper(&tkr_dummy, &tk_fast_raw);
}
#ifdef CONFIG_GENERIC_TIME_VSYSCALL_OLD
@@ -369,8 +444,8 @@ static inline void update_vsyscall(struct timekeeper *tk)
xt = timespec64_to_timespec(tk_xtime(tk));
wm = timespec64_to_timespec(tk->wall_to_monotonic);
- update_vsyscall_old(&xt, &wm, tk->tkr.clock, tk->tkr.mult,
- tk->tkr.cycle_last);
+ update_vsyscall_old(&xt, &wm, tk->tkr_mono.clock, tk->tkr_mono.mult,
+ tk->tkr_mono.cycle_last);
}
static inline void old_vsyscall_fixup(struct timekeeper *tk)
@@ -387,11 +462,11 @@ static inline void old_vsyscall_fixup(struct timekeeper *tk)
* (shifted nanoseconds), and CONFIG_GENERIC_TIME_VSYSCALL_OLD
* users are removed, this can be killed.
*/
- remainder = tk->tkr.xtime_nsec & ((1ULL << tk->tkr.shift) - 1);
- tk->tkr.xtime_nsec -= remainder;
- tk->tkr.xtime_nsec += 1ULL << tk->tkr.shift;
+ remainder = tk->tkr_mono.xtime_nsec & ((1ULL << tk->tkr_mono.shift) - 1);
+ tk->tkr_mono.xtime_nsec -= remainder;
+ tk->tkr_mono.xtime_nsec += 1ULL << tk->tkr_mono.shift;
tk->ntp_error += remainder << tk->ntp_error_shift;
- tk->ntp_error -= (1ULL << tk->tkr.shift) << tk->ntp_error_shift;
+ tk->ntp_error -= (1ULL << tk->tkr_mono.shift) << tk->ntp_error_shift;
}
#else
#define old_vsyscall_fixup(tk)
@@ -440,6 +515,17 @@ int pvclock_gtod_unregister_notifier(struct notifier_block *nb)
EXPORT_SYMBOL_GPL(pvclock_gtod_unregister_notifier);
/*
+ * tk_update_leap_state - helper to update the next_leap_ktime
+ */
+static inline void tk_update_leap_state(struct timekeeper *tk)
+{
+ tk->next_leap_ktime = ntp_get_next_leap();
+ if (tk->next_leap_ktime.tv64 != KTIME_MAX)
+ /* Convert to monotonic time */
+ tk->next_leap_ktime = ktime_sub(tk->next_leap_ktime, tk->offs_real);
+}
+
+/*
* Update the ktime_t based scalar nsec members of the timekeeper
*/
static inline void tk_update_ktime_data(struct timekeeper *tk)
@@ -456,17 +542,17 @@ static inline void tk_update_ktime_data(struct timekeeper *tk)
*/
seconds = (u64)(tk->xtime_sec + tk->wall_to_monotonic.tv_sec);
nsec = (u32) tk->wall_to_monotonic.tv_nsec;
- tk->tkr.base_mono = ns_to_ktime(seconds * NSEC_PER_SEC + nsec);
+ tk->tkr_mono.base = ns_to_ktime(seconds * NSEC_PER_SEC + nsec);
/* Update the monotonic raw base */
- tk->base_raw = timespec64_to_ktime(tk->raw_time);
+ tk->tkr_raw.base = timespec64_to_ktime(tk->raw_time);
/*
* The sum of the nanoseconds portions of xtime and
* wall_to_monotonic can be greater/equal one second. Take
* this into account before updating tk->ktime_sec.
*/
- nsec += (u32)(tk->tkr.xtime_nsec >> tk->tkr.shift);
+ nsec += (u32)(tk->tkr_mono.xtime_nsec >> tk->tkr_mono.shift);
if (nsec >= NSEC_PER_SEC)
seconds++;
tk->ktime_sec = seconds;
@@ -480,16 +566,25 @@ static void timekeeping_update(struct timekeeper *tk, unsigned int action)
ntp_clear();
}
+ tk_update_leap_state(tk);
tk_update_ktime_data(tk);
update_vsyscall(tk);
update_pvclock_gtod(tk, action & TK_CLOCK_WAS_SET);
+ update_fast_timekeeper(&tk->tkr_mono, &tk_fast_mono);
+ update_fast_timekeeper(&tk->tkr_raw, &tk_fast_raw);
+
+ if (action & TK_CLOCK_WAS_SET)
+ tk->clock_was_set_seq++;
+ /*
+ * The mirroring of the data to the shadow-timekeeper needs
+ * to happen last here to ensure we don't over-write the
+ * timekeeper structure on the next update with stale data
+ */
if (action & TK_MIRROR)
memcpy(&shadow_timekeeper, &tk_core.timekeeper,
sizeof(tk_core.timekeeper));
-
- update_fast_timekeeper(&tk->tkr);
}
/**
@@ -501,22 +596,23 @@ static void timekeeping_update(struct timekeeper *tk, unsigned int action)
*/
static void timekeeping_forward_now(struct timekeeper *tk)
{
- struct clocksource *clock = tk->tkr.clock;
+ struct clocksource *clock = tk->tkr_mono.clock;
cycle_t cycle_now, delta;
s64 nsec;
- cycle_now = tk->tkr.read(clock);
- delta = clocksource_delta(cycle_now, tk->tkr.cycle_last, tk->tkr.mask);
- tk->tkr.cycle_last = cycle_now;
+ cycle_now = tk->tkr_mono.read(clock);
+ delta = clocksource_delta(cycle_now, tk->tkr_mono.cycle_last, tk->tkr_mono.mask);
+ tk->tkr_mono.cycle_last = cycle_now;
+ tk->tkr_raw.cycle_last = cycle_now;
- tk->tkr.xtime_nsec += delta * tk->tkr.mult;
+ tk->tkr_mono.xtime_nsec += delta * tk->tkr_mono.mult;
/* If arch requires, add in get_arch_timeoffset() */
- tk->tkr.xtime_nsec += (u64)arch_gettimeoffset() << tk->tkr.shift;
+ tk->tkr_mono.xtime_nsec += (u64)arch_gettimeoffset() << tk->tkr_mono.shift;
tk_normalize_xtime(tk);
- nsec = clocksource_cyc2ns(delta, clock->mult, clock->shift);
+ nsec = clocksource_cyc2ns(delta, tk->tkr_raw.mult, tk->tkr_raw.shift);
timespec64_add_ns(&tk->raw_time, nsec);
}
@@ -537,7 +633,7 @@ int __getnstimeofday64(struct timespec64 *ts)
seq = read_seqcount_begin(&tk_core.seq);
ts->tv_sec = tk->xtime_sec;
- nsecs = timekeeping_get_ns(&tk->tkr);
+ nsecs = timekeeping_get_ns(&tk->tkr_mono);
} while (read_seqcount_retry(&tk_core.seq, seq));
@@ -577,8 +673,8 @@ ktime_t ktime_get(void)
do {
seq = read_seqcount_begin(&tk_core.seq);
- base = tk->tkr.base_mono;
- nsecs = timekeeping_get_ns(&tk->tkr);
+ base = tk->tkr_mono.base;
+ nsecs = timekeeping_get_ns(&tk->tkr_mono);
} while (read_seqcount_retry(&tk_core.seq, seq));
@@ -586,6 +682,23 @@ ktime_t ktime_get(void)
}
EXPORT_SYMBOL_GPL(ktime_get);
+u32 ktime_get_resolution_ns(void)
+{
+ struct timekeeper *tk = &tk_core.timekeeper;
+ unsigned int seq;
+ u32 nsecs;
+
+ WARN_ON(timekeeping_suspended);
+
+ do {
+ seq = read_seqcount_begin(&tk_core.seq);
+ nsecs = tk->tkr_mono.mult >> tk->tkr_mono.shift;
+ } while (read_seqcount_retry(&tk_core.seq, seq));
+
+ return nsecs;
+}
+EXPORT_SYMBOL_GPL(ktime_get_resolution_ns);
+
static ktime_t *offsets[TK_OFFS_MAX] = {
[TK_OFFS_REAL] = &tk_core.timekeeper.offs_real,
[TK_OFFS_BOOT] = &tk_core.timekeeper.offs_boot,
@@ -603,8 +716,8 @@ ktime_t ktime_get_with_offset(enum tk_offsets offs)
do {
seq = read_seqcount_begin(&tk_core.seq);
- base = ktime_add(tk->tkr.base_mono, *offset);
- nsecs = timekeeping_get_ns(&tk->tkr);
+ base = ktime_add(tk->tkr_mono.base, *offset);
+ nsecs = timekeeping_get_ns(&tk->tkr_mono);
} while (read_seqcount_retry(&tk_core.seq, seq));
@@ -645,8 +758,8 @@ ktime_t ktime_get_raw(void)
do {
seq = read_seqcount_begin(&tk_core.seq);
- base = tk->base_raw;
- nsecs = timekeeping_get_ns_raw(tk);
+ base = tk->tkr_raw.base;
+ nsecs = timekeeping_get_ns(&tk->tkr_raw);
} while (read_seqcount_retry(&tk_core.seq, seq));
@@ -674,7 +787,7 @@ void ktime_get_ts64(struct timespec64 *ts)
do {
seq = read_seqcount_begin(&tk_core.seq);
ts->tv_sec = tk->xtime_sec;
- nsec = timekeeping_get_ns(&tk->tkr);
+ nsec = timekeeping_get_ns(&tk->tkr_mono);
tomono = tk->wall_to_monotonic;
} while (read_seqcount_retry(&tk_core.seq, seq));
@@ -759,8 +872,8 @@ void getnstime_raw_and_real(struct timespec *ts_raw, struct timespec *ts_real)
ts_real->tv_sec = tk->xtime_sec;
ts_real->tv_nsec = 0;
- nsecs_raw = timekeeping_get_ns_raw(tk);
- nsecs_real = timekeeping_get_ns(&tk->tkr);
+ nsecs_raw = timekeeping_get_ns(&tk->tkr_raw);
+ nsecs_real = timekeeping_get_ns(&tk->tkr_mono);
} while (read_seqcount_retry(&tk_core.seq, seq));
@@ -798,6 +911,7 @@ int do_settimeofday64(const struct timespec64 *ts)
struct timekeeper *tk = &tk_core.timekeeper;
struct timespec64 ts_delta, xt;
unsigned long flags;
+ int ret = 0;
if (!timespec64_valid_strict(ts))
return -EINVAL;
@@ -811,10 +925,15 @@ int do_settimeofday64(const struct timespec64 *ts)
ts_delta.tv_sec = ts->tv_sec - xt.tv_sec;
ts_delta.tv_nsec = ts->tv_nsec - xt.tv_nsec;
+ if (timespec64_compare(&tk->wall_to_monotonic, &ts_delta) > 0) {
+ ret = -EINVAL;
+ goto out;
+ }
+
tk_set_wall_to_mono(tk, timespec64_sub(tk->wall_to_monotonic, ts_delta));
tk_set_xtime(tk, ts);
-
+out:
timekeeping_update(tk, TK_CLEAR_NTP | TK_MIRROR | TK_CLOCK_WAS_SET);
write_seqcount_end(&tk_core.seq);
@@ -823,7 +942,7 @@ int do_settimeofday64(const struct timespec64 *ts)
/* signal hrtimers about time change */
clock_was_set();
- return 0;
+ return ret;
}
EXPORT_SYMBOL(do_settimeofday64);
@@ -852,7 +971,8 @@ int timekeeping_inject_offset(struct timespec *ts)
/* Make sure the proposed value is valid */
tmp = timespec64_add(tk_xtime(tk), ts64);
- if (!timespec64_valid_strict(&tmp)) {
+ if (timespec64_compare(&tk->wall_to_monotonic, &ts64) > 0 ||
+ !timespec64_valid_strict(&tmp)) {
ret = -EINVAL;
goto error;
}
@@ -943,7 +1063,7 @@ static int change_clocksource(void *data)
*/
if (try_module_get(new->owner)) {
if (!new->enable || new->enable(new) == 0) {
- old = tk->tkr.clock;
+ old = tk->tkr_mono.clock;
tk_setup_internals(tk, new);
if (old->disable)
old->disable(old);
@@ -971,11 +1091,11 @@ int timekeeping_notify(struct clocksource *clock)
{
struct timekeeper *tk = &tk_core.timekeeper;
- if (tk->tkr.clock == clock)
+ if (tk->tkr_mono.clock == clock)
return 0;
stop_machine(change_clocksource, clock, NULL);
tick_clock_notify();
- return tk->tkr.clock == clock ? 0 : -1;
+ return tk->tkr_mono.clock == clock ? 0 : -1;
}
/**
@@ -993,7 +1113,7 @@ void getrawmonotonic64(struct timespec64 *ts)
do {
seq = read_seqcount_begin(&tk_core.seq);
- nsecs = timekeeping_get_ns_raw(tk);
+ nsecs = timekeeping_get_ns(&tk->tkr_raw);
ts64 = tk->raw_time;
} while (read_seqcount_retry(&tk_core.seq, seq));
@@ -1016,7 +1136,7 @@ int timekeeping_valid_for_hres(void)
do {
seq = read_seqcount_begin(&tk_core.seq);
- ret = tk->tkr.clock->flags & CLOCK_SOURCE_VALID_FOR_HRES;
+ ret = tk->tkr_mono.clock->flags & CLOCK_SOURCE_VALID_FOR_HRES;
} while (read_seqcount_retry(&tk_core.seq, seq));
@@ -1035,7 +1155,7 @@ u64 timekeeping_max_deferment(void)
do {
seq = read_seqcount_begin(&tk_core.seq);
- ret = tk->tkr.clock->max_idle_ns;
+ ret = tk->tkr_mono.clock->max_idle_ns;
} while (read_seqcount_retry(&tk_core.seq, seq));
@@ -1057,21 +1177,35 @@ void __weak read_persistent_clock(struct timespec *ts)
ts->tv_nsec = 0;
}
+void __weak read_persistent_clock64(struct timespec64 *ts64)
+{
+ struct timespec ts;
+
+ read_persistent_clock(&ts);
+ *ts64 = timespec_to_timespec64(ts);
+}
+
/**
- * read_boot_clock - Return time of the system start.
+ * read_boot_clock64 - Return time of the system start.
*
* Weak dummy function for arches that do not yet support it.
* Function to read the exact time the system has been started.
- * Returns a timespec with tv_sec=0 and tv_nsec=0 if unsupported.
+ * Returns a timespec64 with tv_sec=0 and tv_nsec=0 if unsupported.
*
* XXX - Do be sure to remove it once all arches implement it.
*/
-void __weak read_boot_clock(struct timespec *ts)
+void __weak read_boot_clock64(struct timespec64 *ts)
{
ts->tv_sec = 0;
ts->tv_nsec = 0;
}
+/* Flag for if timekeeping_resume() has injected sleeptime */
+static bool sleeptime_injected;
+
+/* Flag for if there is a persistent clock on this platform */
+static bool persistent_clock_exists;
+
/*
* timekeeping_init - Initializes the clocksource and common timekeeping values
*/
@@ -1081,20 +1215,17 @@ void __init timekeeping_init(void)
struct clocksource *clock;
unsigned long flags;
struct timespec64 now, boot, tmp;
- struct timespec ts;
- read_persistent_clock(&ts);
- now = timespec_to_timespec64(ts);
+ read_persistent_clock64(&now);
if (!timespec64_valid_strict(&now)) {
pr_warn("WARNING: Persistent clock returned invalid value!\n"
" Check your CMOS/BIOS settings.\n");
now.tv_sec = 0;
now.tv_nsec = 0;
} else if (now.tv_sec || now.tv_nsec)
- persistent_clock_exist = true;
+ persistent_clock_exists = true;
- read_boot_clock(&ts);
- boot = timespec_to_timespec64(ts);
+ read_boot_clock64(&boot);
if (!timespec64_valid_strict(&boot)) {
pr_warn("WARNING: Boot clock returned invalid value!\n"
" Check your CMOS/BIOS settings.\n");
@@ -1114,7 +1245,6 @@ void __init timekeeping_init(void)
tk_set_xtime(tk, &now);
tk->raw_time.tv_sec = 0;
tk->raw_time.tv_nsec = 0;
- tk->base_raw.tv64 = 0;
if (boot.tv_sec == 0 && boot.tv_nsec == 0)
boot = tk_xtime(tk);
@@ -1127,7 +1257,7 @@ void __init timekeeping_init(void)
raw_spin_unlock_irqrestore(&timekeeper_lock, flags);
}
-/* time in seconds when suspend began */
+/* time in seconds when suspend began for persistent clock */
static struct timespec64 timekeeping_suspend_time;
/**
@@ -1152,12 +1282,49 @@ static void __timekeeping_inject_sleeptime(struct timekeeper *tk,
tk_debug_account_sleep_time(delta);
}
+#if defined(CONFIG_PM_SLEEP) && defined(CONFIG_RTC_HCTOSYS_DEVICE)
+/**
+ * We have three kinds of time sources to use for sleep time
+ * injection, the preference order is:
+ * 1) non-stop clocksource
+ * 2) persistent clock (ie: RTC accessible when irqs are off)
+ * 3) RTC
+ *
+ * 1) and 2) are used by timekeeping, 3) by RTC subsystem.
+ * If system has neither 1) nor 2), 3) will be used finally.
+ *
+ *
+ * If timekeeping has injected sleeptime via either 1) or 2),
+ * 3) becomes needless, so in this case we don't need to call
+ * rtc_resume(), and this is what timekeeping_rtc_skipresume()
+ * means.
+ */
+bool timekeeping_rtc_skipresume(void)
+{
+ return sleeptime_injected;
+}
+
+/**
+ * 1) can be determined whether to use or not only when doing
+ * timekeeping_resume() which is invoked after rtc_suspend(),
+ * so we can't skip rtc_suspend() surely if system has 1).
+ *
+ * But if system has 2), 2) will definitely be used, so in this
+ * case we don't need to call rtc_suspend(), and this is what
+ * timekeeping_rtc_skipsuspend() means.
+ */
+bool timekeeping_rtc_skipsuspend(void)
+{
+ return persistent_clock_exists;
+}
+
/**
* timekeeping_inject_sleeptime64 - Adds suspend interval to timeekeeping values
* @delta: pointer to a timespec64 delta value
*
- * This hook is for architectures that cannot support read_persistent_clock
+ * This hook is for architectures that cannot support read_persistent_clock64
* because their RTC/persistent clock is only accessible when irqs are enabled.
+ * and also don't have an effective nonstop clocksource.
*
* This function should only be called by rtc_resume(), and allows
* a suspend offset to be injected into the timekeeping values.
@@ -1167,13 +1334,6 @@ void timekeeping_inject_sleeptime64(struct timespec64 *delta)
struct timekeeper *tk = &tk_core.timekeeper;
unsigned long flags;
- /*
- * Make sure we don't set the clock twice, as timekeeping_resume()
- * already did it
- */
- if (has_persistent_clock())
- return;
-
raw_spin_lock_irqsave(&timekeeper_lock, flags);
write_seqcount_begin(&tk_core.seq);
@@ -1189,26 +1349,21 @@ void timekeeping_inject_sleeptime64(struct timespec64 *delta)
/* signal hrtimers about time change */
clock_was_set();
}
+#endif
/**
* timekeeping_resume - Resumes the generic timekeeping subsystem.
- *
- * This is for the generic clocksource timekeeping.
- * xtime/wall_to_monotonic/jiffies/etc are
- * still managed by arch specific suspend/resume code.
*/
void timekeeping_resume(void)
{
struct timekeeper *tk = &tk_core.timekeeper;
- struct clocksource *clock = tk->tkr.clock;
+ struct clocksource *clock = tk->tkr_mono.clock;
unsigned long flags;
struct timespec64 ts_new, ts_delta;
- struct timespec tmp;
cycle_t cycle_now, cycle_delta;
- bool suspendtime_found = false;
- read_persistent_clock(&tmp);
- ts_new = timespec_to_timespec64(tmp);
+ sleeptime_injected = false;
+ read_persistent_clock64(&ts_new);
clockevents_resume();
clocksource_resume();
@@ -1228,16 +1383,16 @@ void timekeeping_resume(void)
* The less preferred source will only be tried if there is no better
* usable source. The rtc part is handled separately in rtc core code.
*/
- cycle_now = tk->tkr.read(clock);
+ cycle_now = tk->tkr_mono.read(clock);
if ((clock->flags & CLOCK_SOURCE_SUSPEND_NONSTOP) &&
- cycle_now > tk->tkr.cycle_last) {
+ cycle_now > tk->tkr_mono.cycle_last) {
u64 num, max = ULLONG_MAX;
u32 mult = clock->mult;
u32 shift = clock->shift;
s64 nsec = 0;
- cycle_delta = clocksource_delta(cycle_now, tk->tkr.cycle_last,
- tk->tkr.mask);
+ cycle_delta = clocksource_delta(cycle_now, tk->tkr_mono.cycle_last,
+ tk->tkr_mono.mask);
/*
* "cycle_delta * mutl" may cause 64 bits overflow, if the
@@ -1253,17 +1408,19 @@ void timekeeping_resume(void)
nsec += ((u64) cycle_delta * mult) >> shift;
ts_delta = ns_to_timespec64(nsec);
- suspendtime_found = true;
+ sleeptime_injected = true;
} else if (timespec64_compare(&ts_new, &timekeeping_suspend_time) > 0) {
ts_delta = timespec64_sub(ts_new, timekeeping_suspend_time);
- suspendtime_found = true;
+ sleeptime_injected = true;
}
- if (suspendtime_found)
+ if (sleeptime_injected)
__timekeeping_inject_sleeptime(tk, &ts_delta);
/* Re-base the last cycle value */
- tk->tkr.cycle_last = cycle_now;
+ tk->tkr_mono.cycle_last = cycle_now;
+ tk->tkr_raw.cycle_last = cycle_now;
+
tk->ntp_error = 0;
timekeeping_suspended = 0;
timekeeping_update(tk, TK_MIRROR | TK_CLOCK_WAS_SET);
@@ -1272,9 +1429,7 @@ void timekeeping_resume(void)
touch_softlockup_watchdog();
- clockevents_notify(CLOCK_EVT_NOTIFY_RESUME, NULL);
-
- /* Resume hrtimers */
+ tick_resume();
hrtimers_resume();
}
@@ -1284,10 +1439,8 @@ int timekeeping_suspend(void)
unsigned long flags;
struct timespec64 delta, delta_delta;
static struct timespec64 old_delta;
- struct timespec tmp;
- read_persistent_clock(&tmp);
- timekeeping_suspend_time = timespec_to_timespec64(tmp);
+ read_persistent_clock64(&timekeeping_suspend_time);
/*
* On some systems the persistent_clock can not be detected at
@@ -1295,31 +1448,33 @@ int timekeeping_suspend(void)
* value returned, update the persistent_clock_exists flag.
*/
if (timekeeping_suspend_time.tv_sec || timekeeping_suspend_time.tv_nsec)
- persistent_clock_exist = true;
+ persistent_clock_exists = true;
raw_spin_lock_irqsave(&timekeeper_lock, flags);
write_seqcount_begin(&tk_core.seq);
timekeeping_forward_now(tk);
timekeeping_suspended = 1;
- /*
- * To avoid drift caused by repeated suspend/resumes,
- * which each can add ~1 second drift error,
- * try to compensate so the difference in system time
- * and persistent_clock time stays close to constant.
- */
- delta = timespec64_sub(tk_xtime(tk), timekeeping_suspend_time);
- delta_delta = timespec64_sub(delta, old_delta);
- if (abs(delta_delta.tv_sec) >= 2) {
+ if (persistent_clock_exists) {
/*
- * if delta_delta is too large, assume time correction
- * has occured and set old_delta to the current delta.
+ * To avoid drift caused by repeated suspend/resumes,
+ * which each can add ~1 second drift error,
+ * try to compensate so the difference in system time
+ * and persistent_clock time stays close to constant.
*/
- old_delta = delta;
- } else {
- /* Otherwise try to adjust old_system to compensate */
- timekeeping_suspend_time =
- timespec64_add(timekeeping_suspend_time, delta_delta);
+ delta = timespec64_sub(tk_xtime(tk), timekeeping_suspend_time);
+ delta_delta = timespec64_sub(delta, old_delta);
+ if (abs(delta_delta.tv_sec) >= 2) {
+ /*
+ * if delta_delta is too large, assume time correction
+ * has occurred and set old_delta to the current delta.
+ */
+ old_delta = delta;
+ } else {
+ /* Otherwise try to adjust old_system to compensate */
+ timekeeping_suspend_time =
+ timespec64_add(timekeeping_suspend_time, delta_delta);
+ }
}
timekeeping_update(tk, TK_MIRROR);
@@ -1327,7 +1482,7 @@ int timekeeping_suspend(void)
write_seqcount_end(&tk_core.seq);
raw_spin_unlock_irqrestore(&timekeeper_lock, flags);
- clockevents_notify(CLOCK_EVT_NOTIFY_SUSPEND, NULL);
+ tick_suspend();
clocksource_suspend();
clockevents_suspend();
@@ -1416,15 +1571,15 @@ static __always_inline void timekeeping_apply_adjustment(struct timekeeper *tk,
*
* XXX - TODO: Doc ntp_error calculation.
*/
- if ((mult_adj > 0) && (tk->tkr.mult + mult_adj < mult_adj)) {
+ if ((mult_adj > 0) && (tk->tkr_mono.mult + mult_adj < mult_adj)) {
/* NTP adjustment caused clocksource mult overflow */
WARN_ON_ONCE(1);
return;
}
- tk->tkr.mult += mult_adj;
+ tk->tkr_mono.mult += mult_adj;
tk->xtime_interval += interval;
- tk->tkr.xtime_nsec -= offset;
+ tk->tkr_mono.xtime_nsec -= offset;
tk->ntp_error -= (interval - offset) << tk->ntp_error_shift;
}
@@ -1486,13 +1641,13 @@ static void timekeeping_adjust(struct timekeeper *tk, s64 offset)
tk->ntp_err_mult = 0;
}
- if (unlikely(tk->tkr.clock->maxadj &&
- (abs(tk->tkr.mult - tk->tkr.clock->mult)
- > tk->tkr.clock->maxadj))) {
+ if (unlikely(tk->tkr_mono.clock->maxadj &&
+ (abs(tk->tkr_mono.mult - tk->tkr_mono.clock->mult)
+ > tk->tkr_mono.clock->maxadj))) {
printk_once(KERN_WARNING
"Adjusting %s more than 11%% (%ld vs %ld)\n",
- tk->tkr.clock->name, (long)tk->tkr.mult,
- (long)tk->tkr.clock->mult + tk->tkr.clock->maxadj);
+ tk->tkr_mono.clock->name, (long)tk->tkr_mono.mult,
+ (long)tk->tkr_mono.clock->mult + tk->tkr_mono.clock->maxadj);
}
/*
@@ -1509,9 +1664,9 @@ static void timekeeping_adjust(struct timekeeper *tk, s64 offset)
* We'll correct this error next time through this function, when
* xtime_nsec is not as small.
*/
- if (unlikely((s64)tk->tkr.xtime_nsec < 0)) {
- s64 neg = -(s64)tk->tkr.xtime_nsec;
- tk->tkr.xtime_nsec = 0;
+ if (unlikely((s64)tk->tkr_mono.xtime_nsec < 0)) {
+ s64 neg = -(s64)tk->tkr_mono.xtime_nsec;
+ tk->tkr_mono.xtime_nsec = 0;
tk->ntp_error += neg << tk->ntp_error_shift;
}
}
@@ -1526,13 +1681,13 @@ static void timekeeping_adjust(struct timekeeper *tk, s64 offset)
*/
static inline unsigned int accumulate_nsecs_to_secs(struct timekeeper *tk)
{
- u64 nsecps = (u64)NSEC_PER_SEC << tk->tkr.shift;
+ u64 nsecps = (u64)NSEC_PER_SEC << tk->tkr_mono.shift;
unsigned int clock_set = 0;
- while (tk->tkr.xtime_nsec >= nsecps) {
+ while (tk->tkr_mono.xtime_nsec >= nsecps) {
int leap;
- tk->tkr.xtime_nsec -= nsecps;
+ tk->tkr_mono.xtime_nsec -= nsecps;
tk->xtime_sec++;
/* Figure out if its a leap sec and apply if needed */
@@ -1577,9 +1732,10 @@ static cycle_t logarithmic_accumulation(struct timekeeper *tk, cycle_t offset,
/* Accumulate one shifted interval */
offset -= interval;
- tk->tkr.cycle_last += interval;
+ tk->tkr_mono.cycle_last += interval;
+ tk->tkr_raw.cycle_last += interval;
- tk->tkr.xtime_nsec += tk->xtime_interval << shift;
+ tk->tkr_mono.xtime_nsec += tk->xtime_interval << shift;
*clock_set |= accumulate_nsecs_to_secs(tk);
/* Accumulate raw time */
@@ -1622,14 +1778,17 @@ void update_wall_time(void)
#ifdef CONFIG_ARCH_USES_GETTIMEOFFSET
offset = real_tk->cycle_interval;
#else
- offset = clocksource_delta(tk->tkr.read(tk->tkr.clock),
- tk->tkr.cycle_last, tk->tkr.mask);
+ offset = clocksource_delta(tk->tkr_mono.read(tk->tkr_mono.clock),
+ tk->tkr_mono.cycle_last, tk->tkr_mono.mask);
#endif
/* Check if there's really nothing to do */
if (offset < real_tk->cycle_interval)
goto out;
+ /* Do some additional sanity checking */
+ timekeeping_check_update(real_tk, offset);
+
/*
* With NO_HZ we may have to accumulate many cycle_intervals
* (think "ticks") worth of time at once. To do this efficiently,
@@ -1676,8 +1835,9 @@ void update_wall_time(void)
* memcpy under the tk_core.seq against one before we start
* updating.
*/
+ timekeeping_update(tk, clock_set);
memcpy(real_tk, tk, sizeof(*tk));
- timekeeping_update(real_tk, clock_set);
+ /* The memcpy must come last. Do not put anything here! */
write_seqcount_end(&tk_core.seq);
out:
raw_spin_unlock_irqrestore(&timekeeper_lock, flags);
@@ -1721,7 +1881,7 @@ struct timespec __current_kernel_time(void)
return timespec64_to_timespec(tk_xtime(tk));
}
-struct timespec current_kernel_time(void)
+struct timespec64 current_kernel_time64(void)
{
struct timekeeper *tk = &tk_core.timekeeper;
struct timespec64 now;
@@ -1733,9 +1893,9 @@ struct timespec current_kernel_time(void)
now = tk_xtime(tk);
} while (read_seqcount_retry(&tk_core.seq, seq));
- return timespec64_to_timespec(now);
+ return now;
}
-EXPORT_SYMBOL(current_kernel_time);
+EXPORT_SYMBOL(current_kernel_time64);
struct timespec64 get_monotonic_coarse64(void)
{
@@ -1766,47 +1926,20 @@ void do_timer(unsigned long ticks)
}
/**
- * ktime_get_update_offsets_tick - hrtimer helper
- * @offs_real: pointer to storage for monotonic -> realtime offset
- * @offs_boot: pointer to storage for monotonic -> boottime offset
- * @offs_tai: pointer to storage for monotonic -> clock tai offset
- *
- * Returns monotonic time at last tick and various offsets
- */
-ktime_t ktime_get_update_offsets_tick(ktime_t *offs_real, ktime_t *offs_boot,
- ktime_t *offs_tai)
-{
- struct timekeeper *tk = &tk_core.timekeeper;
- unsigned int seq;
- ktime_t base;
- u64 nsecs;
-
- do {
- seq = read_seqcount_begin(&tk_core.seq);
-
- base = tk->tkr.base_mono;
- nsecs = tk->tkr.xtime_nsec >> tk->tkr.shift;
-
- *offs_real = tk->offs_real;
- *offs_boot = tk->offs_boot;
- *offs_tai = tk->offs_tai;
- } while (read_seqcount_retry(&tk_core.seq, seq));
-
- return ktime_add_ns(base, nsecs);
-}
-
-#ifdef CONFIG_HIGH_RES_TIMERS
-/**
* ktime_get_update_offsets_now - hrtimer helper
+ * @cwsseq: pointer to check and store the clock was set sequence number
* @offs_real: pointer to storage for monotonic -> realtime offset
* @offs_boot: pointer to storage for monotonic -> boottime offset
* @offs_tai: pointer to storage for monotonic -> clock tai offset
*
- * Returns current monotonic time and updates the offsets
+ * Returns current monotonic time and updates the offsets if the
+ * sequence number in @cwsseq and timekeeper.clock_was_set_seq are
+ * different.
+ *
* Called from hrtimer_interrupt() or retrigger_next_event()
*/
-ktime_t ktime_get_update_offsets_now(ktime_t *offs_real, ktime_t *offs_boot,
- ktime_t *offs_tai)
+ktime_t ktime_get_update_offsets_now(unsigned int *cwsseq, ktime_t *offs_real,
+ ktime_t *offs_boot, ktime_t *offs_tai)
{
struct timekeeper *tk = &tk_core.timekeeper;
unsigned int seq;
@@ -1816,17 +1949,25 @@ ktime_t ktime_get_update_offsets_now(ktime_t *offs_real, ktime_t *offs_boot,
do {
seq = read_seqcount_begin(&tk_core.seq);
- base = tk->tkr.base_mono;
- nsecs = timekeeping_get_ns(&tk->tkr);
+ base = tk->tkr_mono.base;
+ nsecs = timekeeping_get_ns(&tk->tkr_mono);
+ base = ktime_add_ns(base, nsecs);
+
+ if (*cwsseq != tk->clock_was_set_seq) {
+ *cwsseq = tk->clock_was_set_seq;
+ *offs_real = tk->offs_real;
+ *offs_boot = tk->offs_boot;
+ *offs_tai = tk->offs_tai;
+ }
+
+ /* Handle leapsecond insertion adjustments */
+ if (unlikely(base.tv64 >= tk->next_leap_ktime.tv64))
+ *offs_real = ktime_sub(tk->offs_real, ktime_set(1, 0));
- *offs_real = tk->offs_real;
- *offs_boot = tk->offs_boot;
- *offs_tai = tk->offs_tai;
} while (read_seqcount_retry(&tk_core.seq, seq));
- return ktime_add_ns(base, nsecs);
+ return base;
}
-#endif
/**
* do_adjtimex() - Accessor function to NTP __do_adjtimex function
@@ -1867,6 +2008,8 @@ int do_adjtimex(struct timex *txc)
__timekeeping_set_tai_offset(tk, tai);
timekeeping_update(tk, TK_MIRROR | TK_CLOCK_WAS_SET);
}
+ tk_update_leap_state(tk);
+
write_seqcount_end(&tk_core.seq);
raw_spin_unlock_irqrestore(&timekeeper_lock, flags);
diff --git a/kernel/time/timekeeping.h b/kernel/time/timekeeping.h
index 1d91416055d5..704f595ce83f 100644
--- a/kernel/time/timekeeping.h
+++ b/kernel/time/timekeeping.h
@@ -3,20 +3,24 @@
/*
* Internal interfaces for kernel/time/
*/
-extern ktime_t ktime_get_update_offsets_tick(ktime_t *offs_real,
- ktime_t *offs_boot,
- ktime_t *offs_tai);
-extern ktime_t ktime_get_update_offsets_now(ktime_t *offs_real,
- ktime_t *offs_boot,
- ktime_t *offs_tai);
+extern ktime_t ktime_get_update_offsets_now(unsigned int *cwsseq,
+ ktime_t *offs_real,
+ ktime_t *offs_boot,
+ ktime_t *offs_tai);
extern int timekeeping_valid_for_hres(void);
extern u64 timekeeping_max_deferment(void);
extern int timekeeping_inject_offset(struct timespec *ts);
extern s32 timekeeping_get_tai_offset(void);
extern void timekeeping_set_tai_offset(s32 tai_offset);
-extern void timekeeping_clocktai(struct timespec *ts);
extern int timekeeping_suspend(void);
extern void timekeeping_resume(void);
+extern void do_timer(unsigned long ticks);
+extern void update_wall_time(void);
+
+extern seqlock_t jiffies_lock;
+
+#define CS_NAME_LEN 32
+
#endif
diff --git a/kernel/time/timer.c b/kernel/time/timer.c
index 2d3f5c504939..84190f02b521 100644
--- a/kernel/time/timer.c
+++ b/kernel/time/timer.c
@@ -49,6 +49,8 @@
#include <asm/timex.h>
#include <asm/io.h>
+#include "tick-internal.h"
+
#define CREATE_TRACE_POINTS
#include <trace/events/timer.h>
@@ -68,11 +70,11 @@ EXPORT_SYMBOL(jiffies_64);
#define MAX_TVAL ((unsigned long)((1ULL << (TVR_BITS + 4*TVN_BITS)) - 1))
struct tvec {
- struct list_head vec[TVN_SIZE];
+ struct hlist_head vec[TVN_SIZE];
};
struct tvec_root {
- struct list_head vec[TVR_SIZE];
+ struct hlist_head vec[TVR_SIZE];
};
struct tvec_base {
@@ -83,6 +85,8 @@ struct tvec_base {
unsigned long active_timers;
unsigned long all_timers;
int cpu;
+ bool migration_enabled;
+ bool nohz_active;
struct tvec_root tv1;
struct tvec tv2;
struct tvec tv3;
@@ -90,33 +94,60 @@ struct tvec_base {
struct tvec tv5;
} ____cacheline_aligned;
-struct tvec_base boot_tvec_bases;
-EXPORT_SYMBOL(boot_tvec_bases);
-static DEFINE_PER_CPU(struct tvec_base *, tvec_bases) = &boot_tvec_bases;
-/* Functions below help us manage 'deferrable' flag */
-static inline unsigned int tbase_get_deferrable(struct tvec_base *base)
+static DEFINE_PER_CPU(struct tvec_base, tvec_bases);
+
+#if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON)
+unsigned int sysctl_timer_migration = 1;
+
+void timers_update_migration(bool update_nohz)
{
- return ((unsigned int)(unsigned long)base & TIMER_DEFERRABLE);
+ bool on = sysctl_timer_migration && tick_nohz_active;
+ unsigned int cpu;
+
+ /* Avoid the loop, if nothing to update */
+ if (this_cpu_read(tvec_bases.migration_enabled) == on)
+ return;
+
+ for_each_possible_cpu(cpu) {
+ per_cpu(tvec_bases.migration_enabled, cpu) = on;
+ per_cpu(hrtimer_bases.migration_enabled, cpu) = on;
+ if (!update_nohz)
+ continue;
+ per_cpu(tvec_bases.nohz_active, cpu) = true;
+ per_cpu(hrtimer_bases.nohz_active, cpu) = true;
+ }
}
-static inline unsigned int tbase_get_irqsafe(struct tvec_base *base)
+int timer_migration_handler(struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp,
+ loff_t *ppos)
{
- return ((unsigned int)(unsigned long)base & TIMER_IRQSAFE);
+ static DEFINE_MUTEX(mutex);
+ int ret;
+
+ mutex_lock(&mutex);
+ ret = proc_dointvec(table, write, buffer, lenp, ppos);
+ if (!ret && write)
+ timers_update_migration(false);
+ mutex_unlock(&mutex);
+ return ret;
}
-static inline struct tvec_base *tbase_get_base(struct tvec_base *base)
+static inline struct tvec_base *get_target_base(struct tvec_base *base,
+ int pinned)
{
- return ((struct tvec_base *)((unsigned long)base & ~TIMER_FLAG_MASK));
+ if (pinned || !base->migration_enabled)
+ return this_cpu_ptr(&tvec_bases);
+ return per_cpu_ptr(&tvec_bases, get_nohz_timer_target());
}
-
-static inline void
-timer_set_base(struct timer_list *timer, struct tvec_base *new_base)
+#else
+static inline struct tvec_base *get_target_base(struct tvec_base *base,
+ int pinned)
{
- unsigned long flags = (unsigned long)timer->base & TIMER_FLAG_MASK;
-
- timer->base = (struct tvec_base *)((unsigned long)(new_base) | flags);
+ return this_cpu_ptr(&tvec_bases);
}
+#endif
static unsigned long round_jiffies_common(unsigned long j, int cpu,
bool force_up)
@@ -339,26 +370,12 @@ void set_timer_slack(struct timer_list *timer, int slack_hz)
}
EXPORT_SYMBOL_GPL(set_timer_slack);
-/*
- * If the list is empty, catch up ->timer_jiffies to the current time.
- * The caller must hold the tvec_base lock. Returns true if the list
- * was empty and therefore ->timer_jiffies was updated.
- */
-static bool catchup_timer_jiffies(struct tvec_base *base)
-{
- if (!base->all_timers) {
- base->timer_jiffies = jiffies;
- return true;
- }
- return false;
-}
-
static void
__internal_add_timer(struct tvec_base *base, struct timer_list *timer)
{
unsigned long expires = timer->expires;
unsigned long idx = expires - base->timer_jiffies;
- struct list_head *vec;
+ struct hlist_head *vec;
if (idx < TVR_SIZE) {
int i = expires & TVR_MASK;
@@ -391,25 +408,25 @@ __internal_add_timer(struct tvec_base *base, struct timer_list *timer)
i = (expires >> (TVR_BITS + 3 * TVN_BITS)) & TVN_MASK;
vec = base->tv5.vec + i;
}
- /*
- * Timers are FIFO:
- */
- list_add_tail(&timer->entry, vec);
+
+ hlist_add_head(&timer->entry, vec);
}
static void internal_add_timer(struct tvec_base *base, struct timer_list *timer)
{
- (void)catchup_timer_jiffies(base);
+ /* Advance base->jiffies, if the base is empty */
+ if (!base->all_timers++)
+ base->timer_jiffies = jiffies;
+
__internal_add_timer(base, timer);
/*
* Update base->active_timers and base->next_timer
*/
- if (!tbase_get_deferrable(timer->base)) {
+ if (!(timer->flags & TIMER_DEFERRABLE)) {
if (!base->active_timers++ ||
time_before(timer->expires, base->next_timer))
base->next_timer = timer->expires;
}
- base->all_timers++;
/*
* Check whether the other CPU is in dynticks mode and needs
@@ -424,8 +441,11 @@ static void internal_add_timer(struct tvec_base *base, struct timer_list *timer)
* require special care against races with idle_cpu(), lets deal
* with that later.
*/
- if (!tbase_get_deferrable(base) || tick_nohz_full_cpu(base->cpu))
- wake_up_nohz_cpu(base->cpu);
+ if (base->nohz_active) {
+ if (!(timer->flags & TIMER_DEFERRABLE) ||
+ tick_nohz_full_cpu(base->cpu))
+ wake_up_nohz_cpu(base->cpu);
+ }
}
#ifdef CONFIG_TIMER_STATS
@@ -441,15 +461,12 @@ void __timer_stats_timer_set_start_info(struct timer_list *timer, void *addr)
static void timer_stats_account_timer(struct timer_list *timer)
{
- unsigned int flag = 0;
-
if (likely(!timer->start_site))
return;
- if (unlikely(tbase_get_deferrable(timer->base)))
- flag |= TIMER_STATS_FLAG_DEFERRABLE;
timer_stats_update_stats(timer, timer->start_pid, timer->start_site,
- timer->function, timer->start_comm, flag);
+ timer->function, timer->start_comm,
+ timer->flags);
}
#else
@@ -506,8 +523,8 @@ static int timer_fixup_activate(void *addr, enum debug_obj_state state)
* statically initialized. We just make sure that it
* is tracked in the object tracker.
*/
- if (timer->entry.next == NULL &&
- timer->entry.prev == TIMER_ENTRY_STATIC) {
+ if (timer->entry.pprev == NULL &&
+ timer->entry.next == TIMER_ENTRY_STATIC) {
debug_object_init(timer, &timer_debug_descr);
debug_object_activate(timer, &timer_debug_descr);
return 0;
@@ -553,7 +570,7 @@ static int timer_fixup_assert_init(void *addr, enum debug_obj_state state)
switch (state) {
case ODEBUG_STATE_NOTAVAILABLE:
- if (timer->entry.prev == TIMER_ENTRY_STATIC) {
+ if (timer->entry.next == TIMER_ENTRY_STATIC) {
/*
* This is not really a fixup. The timer was
* statically initialized. We just make sure that it
@@ -638,7 +655,7 @@ static inline void
debug_activate(struct timer_list *timer, unsigned long expires)
{
debug_timer_activate(timer);
- trace_timer_start(timer, expires);
+ trace_timer_start(timer, expires, timer->flags);
}
static inline void debug_deactivate(struct timer_list *timer)
@@ -655,10 +672,8 @@ static inline void debug_assert_init(struct timer_list *timer)
static void do_init_timer(struct timer_list *timer, unsigned int flags,
const char *name, struct lock_class_key *key)
{
- struct tvec_base *base = raw_cpu_read(tvec_bases);
-
- timer->entry.next = NULL;
- timer->base = (void *)((unsigned long)base | flags);
+ timer->entry.pprev = NULL;
+ timer->flags = flags | raw_smp_processor_id();
timer->slack = -1;
#ifdef CONFIG_TIMER_STATS
timer->start_site = NULL;
@@ -689,24 +704,23 @@ EXPORT_SYMBOL(init_timer_key);
static inline void detach_timer(struct timer_list *timer, bool clear_pending)
{
- struct list_head *entry = &timer->entry;
+ struct hlist_node *entry = &timer->entry;
debug_deactivate(timer);
- __list_del(entry->prev, entry->next);
+ __hlist_del(entry);
if (clear_pending)
- entry->next = NULL;
- entry->prev = LIST_POISON2;
+ entry->pprev = NULL;
+ entry->next = LIST_POISON2;
}
static inline void
detach_expired_timer(struct timer_list *timer, struct tvec_base *base)
{
detach_timer(timer, true);
- if (!tbase_get_deferrable(timer->base))
+ if (!(timer->flags & TIMER_DEFERRABLE))
base->active_timers--;
base->all_timers--;
- (void)catchup_timer_jiffies(base);
}
static int detach_if_pending(struct timer_list *timer, struct tvec_base *base,
@@ -716,13 +730,14 @@ static int detach_if_pending(struct timer_list *timer, struct tvec_base *base,
return 0;
detach_timer(timer, clear_pending);
- if (!tbase_get_deferrable(timer->base)) {
+ if (!(timer->flags & TIMER_DEFERRABLE)) {
base->active_timers--;
if (timer->expires == base->next_timer)
base->next_timer = base->timer_jiffies;
}
- base->all_timers--;
- (void)catchup_timer_jiffies(base);
+ /* If this was the last timer, advance base->jiffies */
+ if (!--base->all_timers)
+ base->timer_jiffies = jiffies;
return 1;
}
@@ -734,24 +749,22 @@ static int detach_if_pending(struct timer_list *timer, struct tvec_base *base,
* So __run_timers/migrate_timers can safely modify all timers which could
* be found on ->tvX lists.
*
- * When the timer's base is locked, and the timer removed from list, it is
- * possible to set timer->base = NULL and drop the lock: the timer remains
- * locked.
+ * When the timer's base is locked and removed from the list, the
+ * TIMER_MIGRATING flag is set, FIXME
*/
static struct tvec_base *lock_timer_base(struct timer_list *timer,
unsigned long *flags)
__acquires(timer->base->lock)
{
- struct tvec_base *base;
-
for (;;) {
- struct tvec_base *prelock_base = timer->base;
- base = tbase_get_base(prelock_base);
- if (likely(base != NULL)) {
+ u32 tf = timer->flags;
+ struct tvec_base *base;
+
+ if (!(tf & TIMER_MIGRATING)) {
+ base = per_cpu_ptr(&tvec_bases, tf & TIMER_CPUMASK);
spin_lock_irqsave(&base->lock, *flags);
- if (likely(prelock_base == timer->base))
+ if (timer->flags == tf)
return base;
- /* The timer has migrated to another CPU */
spin_unlock_irqrestore(&base->lock, *flags);
}
cpu_relax();
@@ -760,11 +773,11 @@ static struct tvec_base *lock_timer_base(struct timer_list *timer,
static inline int
__mod_timer(struct timer_list *timer, unsigned long expires,
- bool pending_only, int pinned)
+ bool pending_only, int pinned)
{
struct tvec_base *base, *new_base;
unsigned long flags;
- int ret = 0 , cpu;
+ int ret = 0;
timer_stats_timer_set_start_info(timer);
BUG_ON(!timer->function);
@@ -777,8 +790,7 @@ __mod_timer(struct timer_list *timer, unsigned long expires,
debug_activate(timer, expires);
- cpu = get_nohz_timer_target(pinned);
- new_base = per_cpu(tvec_bases, cpu);
+ new_base = get_target_base(base, pinned);
if (base != new_base) {
/*
@@ -790,11 +802,13 @@ __mod_timer(struct timer_list *timer, unsigned long expires,
*/
if (likely(base->running_timer != timer)) {
/* See the comment in lock_timer_base() */
- timer_set_base(timer, NULL);
+ timer->flags |= TIMER_MIGRATING;
+
spin_unlock(&base->lock);
base = new_base;
spin_lock(&base->lock);
- timer_set_base(timer, base);
+ WRITE_ONCE(timer->flags,
+ (timer->flags & ~TIMER_BASEMASK) | base->cpu);
}
}
@@ -956,13 +970,13 @@ EXPORT_SYMBOL(add_timer);
*/
void add_timer_on(struct timer_list *timer, int cpu)
{
- struct tvec_base *base = per_cpu(tvec_bases, cpu);
+ struct tvec_base *base = per_cpu_ptr(&tvec_bases, cpu);
unsigned long flags;
timer_stats_timer_set_start_info(timer);
BUG_ON(timer_pending(timer) || !timer->function);
spin_lock_irqsave(&base->lock, flags);
- timer_set_base(timer, base);
+ timer->flags = (timer->flags & ~TIMER_BASEMASK) | cpu;
debug_activate(timer, timer->expires);
internal_add_timer(base, timer);
spin_unlock_irqrestore(&base->lock, flags);
@@ -1081,7 +1095,7 @@ int del_timer_sync(struct timer_list *timer)
* don't use it in hardirq context, because it
* could lead to deadlock.
*/
- WARN_ON(in_irq() && !tbase_get_irqsafe(timer->base));
+ WARN_ON(in_irq() && !(timer->flags & TIMER_IRQSAFE));
for (;;) {
int ret = try_to_del_timer_sync(timer);
if (ret >= 0)
@@ -1095,17 +1109,17 @@ EXPORT_SYMBOL(del_timer_sync);
static int cascade(struct tvec_base *base, struct tvec *tv, int index)
{
/* cascade all the timers from tv up one level */
- struct timer_list *timer, *tmp;
- struct list_head tv_list;
+ struct timer_list *timer;
+ struct hlist_node *tmp;
+ struct hlist_head tv_list;
- list_replace_init(tv->vec + index, &tv_list);
+ hlist_move_list(tv->vec + index, &tv_list);
/*
* We are removing _all_ timers from the list, so we
* don't have to detach them individually.
*/
- list_for_each_entry_safe(timer, tmp, &tv_list, entry) {
- BUG_ON(tbase_get_base(timer->base) != base);
+ hlist_for_each_entry_safe(timer, tmp, &tv_list, entry) {
/* No accounting, while moving them */
__internal_add_timer(base, timer);
}
@@ -1170,14 +1184,18 @@ static inline void __run_timers(struct tvec_base *base)
struct timer_list *timer;
spin_lock_irq(&base->lock);
- if (catchup_timer_jiffies(base)) {
- spin_unlock_irq(&base->lock);
- return;
- }
+
while (time_after_eq(jiffies, base->timer_jiffies)) {
- struct list_head work_list;
- struct list_head *head = &work_list;
- int index = base->timer_jiffies & TVR_MASK;
+ struct hlist_head work_list;
+ struct hlist_head *head = &work_list;
+ int index;
+
+ if (!base->all_timers) {
+ base->timer_jiffies = jiffies;
+ break;
+ }
+
+ index = base->timer_jiffies & TVR_MASK;
/*
* Cascade timers:
@@ -1188,16 +1206,16 @@ static inline void __run_timers(struct tvec_base *base)
!cascade(base, &base->tv4, INDEX(2)))
cascade(base, &base->tv5, INDEX(3));
++base->timer_jiffies;
- list_replace_init(base->tv1.vec + index, head);
- while (!list_empty(head)) {
+ hlist_move_list(base->tv1.vec + index, head);
+ while (!hlist_empty(head)) {
void (*fn)(unsigned long);
unsigned long data;
bool irqsafe;
- timer = list_first_entry(head, struct timer_list,entry);
+ timer = hlist_entry(head->first, struct timer_list, entry);
fn = timer->function;
data = timer->data;
- irqsafe = tbase_get_irqsafe(timer->base);
+ irqsafe = timer->flags & TIMER_IRQSAFE;
timer_stats_account_timer(timer);
@@ -1236,8 +1254,8 @@ static unsigned long __next_timer_interrupt(struct tvec_base *base)
/* Look for timer events in tv1. */
index = slot = timer_jiffies & TVR_MASK;
do {
- list_for_each_entry(nte, base->tv1.vec + slot, entry) {
- if (tbase_get_deferrable(nte->base))
+ hlist_for_each_entry(nte, base->tv1.vec + slot, entry) {
+ if (nte->flags & TIMER_DEFERRABLE)
continue;
found = 1;
@@ -1267,8 +1285,8 @@ cascade:
index = slot = timer_jiffies & TVN_MASK;
do {
- list_for_each_entry(nte, varp->vec + slot, entry) {
- if (tbase_get_deferrable(nte->base))
+ hlist_for_each_entry(nte, varp->vec + slot, entry) {
+ if (nte->flags & TIMER_DEFERRABLE)
continue;
found = 1;
@@ -1299,54 +1317,48 @@ cascade:
* Check, if the next hrtimer event is before the next timer wheel
* event:
*/
-static unsigned long cmp_next_hrtimer_event(unsigned long now,
- unsigned long expires)
+static u64 cmp_next_hrtimer_event(u64 basem, u64 expires)
{
- ktime_t hr_delta = hrtimer_get_next_event();
- struct timespec tsdelta;
- unsigned long delta;
-
- if (hr_delta.tv64 == KTIME_MAX)
- return expires;
+ u64 nextevt = hrtimer_get_next_event();
/*
- * Expired timer available, let it expire in the next tick
+ * If high resolution timers are enabled
+ * hrtimer_get_next_event() returns KTIME_MAX.
*/
- if (hr_delta.tv64 <= 0)
- return now + 1;
-
- tsdelta = ktime_to_timespec(hr_delta);
- delta = timespec_to_jiffies(&tsdelta);
+ if (expires <= nextevt)
+ return expires;
/*
- * Limit the delta to the max value, which is checked in
- * tick_nohz_stop_sched_tick():
+ * If the next timer is already expired, return the tick base
+ * time so the tick is fired immediately.
*/
- if (delta > NEXT_TIMER_MAX_DELTA)
- delta = NEXT_TIMER_MAX_DELTA;
+ if (nextevt <= basem)
+ return basem;
/*
- * Take rounding errors in to account and make sure, that it
- * expires in the next tick. Otherwise we go into an endless
- * ping pong due to tick_nohz_stop_sched_tick() retriggering
- * the timer softirq
+ * Round up to the next jiffie. High resolution timers are
+ * off, so the hrtimers are expired in the tick and we need to
+ * make sure that this tick really expires the timer to avoid
+ * a ping pong of the nohz stop code.
+ *
+ * Use DIV_ROUND_UP_ULL to prevent gcc calling __divdi3
*/
- if (delta < 1)
- delta = 1;
- now += delta;
- if (time_before(now, expires))
- return now;
- return expires;
+ return DIV_ROUND_UP_ULL(nextevt, TICK_NSEC) * TICK_NSEC;
}
/**
- * get_next_timer_interrupt - return the jiffy of the next pending timer
- * @now: current time (in jiffies)
+ * get_next_timer_interrupt - return the time (clock mono) of the next timer
+ * @basej: base time jiffies
+ * @basem: base time clock monotonic
+ *
+ * Returns the tick aligned clock monotonic time of the next pending
+ * timer or KTIME_MAX if no timer is pending.
*/
-unsigned long get_next_timer_interrupt(unsigned long now)
+u64 get_next_timer_interrupt(unsigned long basej, u64 basem)
{
- struct tvec_base *base = __this_cpu_read(tvec_bases);
- unsigned long expires = now + NEXT_TIMER_MAX_DELTA;
+ struct tvec_base *base = this_cpu_ptr(&tvec_bases);
+ u64 expires = KTIME_MAX;
+ unsigned long nextevt;
/*
* Pretend that there is no timer pending if the cpu is offline.
@@ -1359,14 +1371,15 @@ unsigned long get_next_timer_interrupt(unsigned long now)
if (base->active_timers) {
if (time_before_eq(base->next_timer, base->timer_jiffies))
base->next_timer = __next_timer_interrupt(base);
- expires = base->next_timer;
+ nextevt = base->next_timer;
+ if (time_before_eq(nextevt, basej))
+ expires = basem;
+ else
+ expires = basem + (nextevt - basej) * TICK_NSEC;
}
spin_unlock(&base->lock);
- if (time_before_eq(expires, now))
- return now;
-
- return cmp_next_hrtimer_event(now, expires);
+ return cmp_next_hrtimer_event(basem, expires);
}
#endif
@@ -1395,9 +1408,7 @@ void update_process_times(int user_tick)
*/
static void run_timer_softirq(struct softirq_action *h)
{
- struct tvec_base *base = __this_cpu_read(tvec_bases);
-
- hrtimer_run_pending();
+ struct tvec_base *base = this_cpu_ptr(&tvec_bases);
if (time_after_eq(jiffies, base->timer_jiffies))
__run_timers(base);
@@ -1532,74 +1543,17 @@ signed long __sched schedule_timeout_uninterruptible(signed long timeout)
}
EXPORT_SYMBOL(schedule_timeout_uninterruptible);
-static int init_timers_cpu(int cpu)
-{
- int j;
- struct tvec_base *base;
- static char tvec_base_done[NR_CPUS];
-
- if (!tvec_base_done[cpu]) {
- static char boot_done;
-
- if (boot_done) {
- /*
- * The APs use this path later in boot
- */
- base = kzalloc_node(sizeof(*base), GFP_KERNEL,
- cpu_to_node(cpu));
- if (!base)
- return -ENOMEM;
-
- /* Make sure tvec_base has TIMER_FLAG_MASK bits free */
- if (WARN_ON(base != tbase_get_base(base))) {
- kfree(base);
- return -ENOMEM;
- }
- per_cpu(tvec_bases, cpu) = base;
- } else {
- /*
- * This is for the boot CPU - we use compile-time
- * static initialisation because per-cpu memory isn't
- * ready yet and because the memory allocators are not
- * initialised either.
- */
- boot_done = 1;
- base = &boot_tvec_bases;
- }
- spin_lock_init(&base->lock);
- tvec_base_done[cpu] = 1;
- base->cpu = cpu;
- } else {
- base = per_cpu(tvec_bases, cpu);
- }
-
-
- for (j = 0; j < TVN_SIZE; j++) {
- INIT_LIST_HEAD(base->tv5.vec + j);
- INIT_LIST_HEAD(base->tv4.vec + j);
- INIT_LIST_HEAD(base->tv3.vec + j);
- INIT_LIST_HEAD(base->tv2.vec + j);
- }
- for (j = 0; j < TVR_SIZE; j++)
- INIT_LIST_HEAD(base->tv1.vec + j);
-
- base->timer_jiffies = jiffies;
- base->next_timer = base->timer_jiffies;
- base->active_timers = 0;
- base->all_timers = 0;
- return 0;
-}
-
#ifdef CONFIG_HOTPLUG_CPU
-static void migrate_timer_list(struct tvec_base *new_base, struct list_head *head)
+static void migrate_timer_list(struct tvec_base *new_base, struct hlist_head *head)
{
struct timer_list *timer;
+ int cpu = new_base->cpu;
- while (!list_empty(head)) {
- timer = list_first_entry(head, struct timer_list, entry);
+ while (!hlist_empty(head)) {
+ timer = hlist_entry(head->first, struct timer_list, entry);
/* We ignore the accounting on the dying cpu */
detach_timer(timer, false);
- timer_set_base(timer, new_base);
+ timer->flags = (timer->flags & ~TIMER_BASEMASK) | cpu;
internal_add_timer(new_base, timer);
}
}
@@ -1611,8 +1565,8 @@ static void migrate_timers(int cpu)
int i;
BUG_ON(cpu_online(cpu));
- old_base = per_cpu(tvec_bases, cpu);
- new_base = get_cpu_var(tvec_bases);
+ old_base = per_cpu_ptr(&tvec_bases, cpu);
+ new_base = get_cpu_ptr(&tvec_bases);
/*
* The caller is globally serialized and nobody else
* takes two locks at once, deadlock is not possible.
@@ -1631,55 +1585,61 @@ static void migrate_timers(int cpu)
migrate_timer_list(new_base, old_base->tv5.vec + i);
}
+ old_base->active_timers = 0;
+ old_base->all_timers = 0;
+
spin_unlock(&old_base->lock);
spin_unlock_irq(&new_base->lock);
- put_cpu_var(tvec_bases);
+ put_cpu_ptr(&tvec_bases);
}
-#endif /* CONFIG_HOTPLUG_CPU */
static int timer_cpu_notify(struct notifier_block *self,
unsigned long action, void *hcpu)
{
- long cpu = (long)hcpu;
- int err;
-
- switch(action) {
- case CPU_UP_PREPARE:
- case CPU_UP_PREPARE_FROZEN:
- err = init_timers_cpu(cpu);
- if (err < 0)
- return notifier_from_errno(err);
- break;
-#ifdef CONFIG_HOTPLUG_CPU
+ switch (action) {
case CPU_DEAD:
case CPU_DEAD_FROZEN:
- migrate_timers(cpu);
+ migrate_timers((long)hcpu);
break;
-#endif
default:
break;
}
+
return NOTIFY_OK;
}
-static struct notifier_block timers_nb = {
- .notifier_call = timer_cpu_notify,
-};
-
+static inline void timer_register_cpu_notifier(void)
+{
+ cpu_notifier(timer_cpu_notify, 0);
+}
+#else
+static inline void timer_register_cpu_notifier(void) { }
+#endif /* CONFIG_HOTPLUG_CPU */
-void __init init_timers(void)
+static void __init init_timer_cpu(int cpu)
{
- int err;
+ struct tvec_base *base = per_cpu_ptr(&tvec_bases, cpu);
+
+ base->cpu = cpu;
+ spin_lock_init(&base->lock);
+
+ base->timer_jiffies = jiffies;
+ base->next_timer = base->timer_jiffies;
+}
- /* ensure there are enough low bits for flags in timer->base pointer */
- BUILD_BUG_ON(__alignof__(struct tvec_base) & TIMER_FLAG_MASK);
+static void __init init_timer_cpus(void)
+{
+ int cpu;
- err = timer_cpu_notify(&timers_nb, (unsigned long)CPU_UP_PREPARE,
- (void *)(long)smp_processor_id());
- BUG_ON(err != NOTIFY_OK);
+ for_each_possible_cpu(cpu)
+ init_timer_cpu(cpu);
+}
+void __init init_timers(void)
+{
+ init_timer_cpus();
init_timer_stats();
- register_cpu_notifier(&timers_nb);
+ timer_register_cpu_notifier();
open_softirq(TIMER_SOFTIRQ, run_timer_softirq);
}
@@ -1712,14 +1672,14 @@ unsigned long msleep_interruptible(unsigned int msecs)
EXPORT_SYMBOL(msleep_interruptible);
-static int __sched do_usleep_range(unsigned long min, unsigned long max)
+static void __sched do_usleep_range(unsigned long min, unsigned long max)
{
ktime_t kmin;
unsigned long delta;
kmin = ktime_set(0, min * NSEC_PER_USEC);
delta = (max - min) * NSEC_PER_USEC;
- return schedule_hrtimeout_range(&kmin, delta, HRTIMER_MODE_REL);
+ schedule_hrtimeout_range(&kmin, delta, HRTIMER_MODE_REL);
}
/**
@@ -1727,7 +1687,7 @@ static int __sched do_usleep_range(unsigned long min, unsigned long max)
* @min: Minimum time in usecs to sleep
* @max: Maximum time in usecs to sleep
*/
-void usleep_range(unsigned long min, unsigned long max)
+void __sched usleep_range(unsigned long min, unsigned long max)
{
__set_current_state(TASK_UNINTERRUPTIBLE);
do_usleep_range(min, max);
diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c
index 61ed862cdd37..129c96033e46 100644
--- a/kernel/time/timer_list.c
+++ b/kernel/time/timer_list.c
@@ -16,10 +16,10 @@
#include <linux/sched.h>
#include <linux/seq_file.h>
#include <linux/kallsyms.h>
-#include <linux/tick.h>
#include <asm/uaccess.h>
+#include "tick-internal.h"
struct timer_list_iter {
int cpu;
@@ -29,19 +29,24 @@ struct timer_list_iter {
typedef void (*print_fn_t)(struct seq_file *m, unsigned int *classes);
-DECLARE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases);
-
/*
* This allows printing both to /proc/timer_list and
* to the console (on SysRq-Q):
*/
-#define SEQ_printf(m, x...) \
- do { \
- if (m) \
- seq_printf(m, x); \
- else \
- printk(x); \
- } while (0)
+__printf(2, 3)
+static void SEQ_printf(struct seq_file *m, const char *fmt, ...)
+{
+ va_list args;
+
+ va_start(args, fmt);
+
+ if (m)
+ seq_vprintf(m, fmt, args);
+ else
+ vprintk(fmt, args);
+
+ va_end(args);
+}
static void print_name_offset(struct seq_file *m, void *sym)
{
@@ -120,10 +125,10 @@ static void
print_base(struct seq_file *m, struct hrtimer_clock_base *base, u64 now)
{
SEQ_printf(m, " .base: %pK\n", base);
- SEQ_printf(m, " .index: %d\n",
- base->index);
- SEQ_printf(m, " .resolution: %Lu nsecs\n",
- (unsigned long long)ktime_to_ns(base->resolution));
+ SEQ_printf(m, " .index: %d\n", base->index);
+
+ SEQ_printf(m, " .resolution: %u nsecs\n", (unsigned) hrtimer_resolution);
+
SEQ_printf(m, " .get_time: ");
print_name_offset(m, base->get_time);
SEQ_printf(m, "\n");
@@ -132,7 +137,7 @@ print_base(struct seq_file *m, struct hrtimer_clock_base *base, u64 now)
(unsigned long long) ktime_to_ns(base->offset));
#endif
SEQ_printf(m, "active timers:\n");
- print_active_timers(m, base, now);
+ print_active_timers(m, base, now + ktime_to_ns(base->offset));
}
static void print_cpu(struct seq_file *m, int cpu, u64 now)
@@ -158,7 +163,7 @@ static void print_cpu(struct seq_file *m, int cpu, u64 now)
P(nr_events);
P(nr_retries);
P(nr_hangs);
- P_ns(max_hang_time);
+ P(max_hang_time);
#endif
#undef P
#undef P_ns
@@ -184,7 +189,7 @@ static void print_cpu(struct seq_file *m, int cpu, u64 now)
P_ns(idle_sleeptime);
P_ns(iowait_sleeptime);
P(last_jiffies);
- P(next_jiffies);
+ P(next_timer);
P_ns(idle_expires);
SEQ_printf(m, "jiffies: %Lu\n",
(unsigned long long)jiffies);
@@ -228,9 +233,41 @@ print_tickdevice(struct seq_file *m, struct tick_device *td, int cpu)
print_name_offset(m, dev->set_next_event);
SEQ_printf(m, "\n");
- SEQ_printf(m, " set_mode: ");
- print_name_offset(m, dev->set_mode);
- SEQ_printf(m, "\n");
+ if (dev->set_mode) {
+ SEQ_printf(m, " set_mode: ");
+ print_name_offset(m, dev->set_mode);
+ SEQ_printf(m, "\n");
+ } else {
+ if (dev->set_state_shutdown) {
+ SEQ_printf(m, " shutdown: ");
+ print_name_offset(m, dev->set_state_shutdown);
+ SEQ_printf(m, "\n");
+ }
+
+ if (dev->set_state_periodic) {
+ SEQ_printf(m, " periodic: ");
+ print_name_offset(m, dev->set_state_periodic);
+ SEQ_printf(m, "\n");
+ }
+
+ if (dev->set_state_oneshot) {
+ SEQ_printf(m, " oneshot: ");
+ print_name_offset(m, dev->set_state_oneshot);
+ SEQ_printf(m, "\n");
+ }
+
+ if (dev->set_state_oneshot_stopped) {
+ SEQ_printf(m, " oneshot stopped: ");
+ print_name_offset(m, dev->set_state_oneshot_stopped);
+ SEQ_printf(m, "\n");
+ }
+
+ if (dev->tick_resume) {
+ SEQ_printf(m, " resume: ");
+ print_name_offset(m, dev->tick_resume);
+ SEQ_printf(m, "\n");
+ }
+ }
SEQ_printf(m, " event_handler: ");
print_name_offset(m, dev->event_handler);
@@ -243,11 +280,11 @@ static void timer_list_show_tickdevices_header(struct seq_file *m)
{
#ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST
print_tickdevice(m, tick_get_broadcast_device(), -1);
- SEQ_printf(m, "tick_broadcast_mask: %08lx\n",
- cpumask_bits(tick_get_broadcast_mask())[0]);
+ SEQ_printf(m, "tick_broadcast_mask: %*pb\n",
+ cpumask_pr_args(tick_get_broadcast_mask()));
#ifdef CONFIG_TICK_ONESHOT
- SEQ_printf(m, "tick_broadcast_oneshot_mask: %08lx\n",
- cpumask_bits(tick_get_broadcast_oneshot_mask())[0]);
+ SEQ_printf(m, "tick_broadcast_oneshot_mask: %*pb\n",
+ cpumask_pr_args(tick_get_broadcast_oneshot_mask()));
#endif
SEQ_printf(m, "\n");
#endif
@@ -256,7 +293,7 @@ static void timer_list_show_tickdevices_header(struct seq_file *m)
static inline void timer_list_header(struct seq_file *m, u64 now)
{
- SEQ_printf(m, "Timer List Version: v0.7\n");
+ SEQ_printf(m, "Timer List Version: v0.8\n");
SEQ_printf(m, "HRTIMER_MAX_CLOCK_BASES: %d\n", HRTIMER_MAX_CLOCK_BASES);
SEQ_printf(m, "now at %Ld nsecs\n", (unsigned long long)now);
SEQ_printf(m, "\n");
diff --git a/kernel/time/timer_stats.c b/kernel/time/timer_stats.c
index 1fb08f21302e..1adecb4b87c8 100644
--- a/kernel/time/timer_stats.c
+++ b/kernel/time/timer_stats.c
@@ -68,7 +68,7 @@ struct entry {
* Number of timeout events:
*/
unsigned long count;
- unsigned int timer_flag;
+ u32 flags;
/*
* We save the command-line string to preserve
@@ -227,13 +227,13 @@ static struct entry *tstat_lookup(struct entry *entry, char *comm)
* @startf: pointer to the function which did the timer setup
* @timerf: pointer to the timer callback function of the timer
* @comm: name of the process which set up the timer
+ * @tflags: The flags field of the timer
*
* When the timer is already registered, then the event counter is
* incremented. Otherwise the timer is registered in a free slot.
*/
void timer_stats_update_stats(void *timer, pid_t pid, void *startf,
- void *timerf, char *comm,
- unsigned int timer_flag)
+ void *timerf, char *comm, u32 tflags)
{
/*
* It doesn't matter which lock we take:
@@ -251,7 +251,7 @@ void timer_stats_update_stats(void *timer, pid_t pid, void *startf,
input.start_func = startf;
input.expire_func = timerf;
input.pid = pid;
- input.timer_flag = timer_flag;
+ input.flags = tflags;
raw_spin_lock_irqsave(lock, flags);
if (!timer_stats_active)
@@ -306,7 +306,7 @@ static int tstats_show(struct seq_file *m, void *v)
for (i = 0; i < nr_entries; i++) {
entry = entries + i;
- if (entry->timer_flag & TIMER_STATS_FLAG_DEFERRABLE) {
+ if (entry->flags & TIMER_DEFERRABLE) {
seq_printf(m, "%4luD, %5d %-16s ",
entry->count, entry->pid, entry->comm);
} else {
diff --git a/kernel/torture.c b/kernel/torture.c
index dd70993c266c..3e4840633d3e 100644
--- a/kernel/torture.c
+++ b/kernel/torture.c
@@ -409,7 +409,7 @@ static void (*torture_shutdown_hook)(void);
*/
void torture_shutdown_absorb(const char *title)
{
- while (ACCESS_ONCE(fullstop) == FULLSTOP_SHUTDOWN) {
+ while (READ_ONCE(fullstop) == FULLSTOP_SHUTDOWN) {
pr_notice("torture thread %s parking due to system shutdown\n",
title);
schedule_timeout_uninterruptible(MAX_SCHEDULE_TIMEOUT);
@@ -480,9 +480,9 @@ static int torture_shutdown_notify(struct notifier_block *unused1,
unsigned long unused2, void *unused3)
{
mutex_lock(&fullstop_mutex);
- if (ACCESS_ONCE(fullstop) == FULLSTOP_DONTSTOP) {
+ if (READ_ONCE(fullstop) == FULLSTOP_DONTSTOP) {
VERBOSE_TOROUT_STRING("Unscheduled system shutdown detected");
- ACCESS_ONCE(fullstop) = FULLSTOP_SHUTDOWN;
+ WRITE_ONCE(fullstop, FULLSTOP_SHUTDOWN);
} else {
pr_warn("Concurrent rmmod and shutdown illegal!\n");
}
@@ -523,13 +523,13 @@ static int stutter;
*/
void stutter_wait(const char *title)
{
- while (ACCESS_ONCE(stutter_pause_test) ||
- (torture_runnable && !ACCESS_ONCE(*torture_runnable))) {
+ while (READ_ONCE(stutter_pause_test) ||
+ (torture_runnable && !READ_ONCE(*torture_runnable))) {
if (stutter_pause_test)
- if (ACCESS_ONCE(stutter_pause_test) == 1)
+ if (READ_ONCE(stutter_pause_test) == 1)
schedule_timeout_interruptible(1);
else
- while (ACCESS_ONCE(stutter_pause_test))
+ while (READ_ONCE(stutter_pause_test))
cond_resched();
else
schedule_timeout_interruptible(round_jiffies_relative(HZ));
@@ -549,14 +549,14 @@ static int torture_stutter(void *arg)
if (!torture_must_stop()) {
if (stutter > 1) {
schedule_timeout_interruptible(stutter - 1);
- ACCESS_ONCE(stutter_pause_test) = 2;
+ WRITE_ONCE(stutter_pause_test, 2);
}
schedule_timeout_interruptible(1);
- ACCESS_ONCE(stutter_pause_test) = 1;
+ WRITE_ONCE(stutter_pause_test, 1);
}
if (!torture_must_stop())
schedule_timeout_interruptible(stutter);
- ACCESS_ONCE(stutter_pause_test) = 0;
+ WRITE_ONCE(stutter_pause_test, 0);
torture_shutdown_absorb("torture_stutter");
} while (!torture_must_stop());
torture_kthread_stopping("torture_stutter");
@@ -642,13 +642,13 @@ EXPORT_SYMBOL_GPL(torture_init_end);
bool torture_cleanup_begin(void)
{
mutex_lock(&fullstop_mutex);
- if (ACCESS_ONCE(fullstop) == FULLSTOP_SHUTDOWN) {
+ if (READ_ONCE(fullstop) == FULLSTOP_SHUTDOWN) {
pr_warn("Concurrent rmmod and shutdown illegal!\n");
mutex_unlock(&fullstop_mutex);
schedule_timeout_uninterruptible(10);
return true;
}
- ACCESS_ONCE(fullstop) = FULLSTOP_RMMOD;
+ WRITE_ONCE(fullstop, FULLSTOP_RMMOD);
mutex_unlock(&fullstop_mutex);
torture_shutdown_cleanup();
torture_shuffle_cleanup();
@@ -681,7 +681,7 @@ EXPORT_SYMBOL_GPL(torture_must_stop);
*/
bool torture_must_stop_irq(void)
{
- return ACCESS_ONCE(fullstop) != FULLSTOP_DONTSTOP;
+ return READ_ONCE(fullstop) != FULLSTOP_DONTSTOP;
}
EXPORT_SYMBOL_GPL(torture_must_stop_irq);
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index a5da09c899dd..1153c43428f3 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -432,6 +432,14 @@ config UPROBE_EVENT
This option is required if you plan to use perf-probe subcommand
of perf tools on user space applications.
+config BPF_EVENTS
+ depends on BPF_SYSCALL
+ depends on KPROBE_EVENT || UPROBE_EVENT
+ bool
+ default y
+ help
+ This allows the user to attach BPF programs to kprobe events.
+
config PROBE_EVENTS
def_bool n
@@ -599,6 +607,34 @@ config RING_BUFFER_STARTUP_TEST
If unsure, say N
+config TRACE_ENUM_MAP_FILE
+ bool "Show enum mappings for trace events"
+ depends on TRACING
+ help
+ The "print fmt" of the trace events will show the enum names instead
+ of their values. This can cause problems for user space tools that
+ use this string to parse the raw data as user space does not know
+ how to convert the string to its value.
+
+ To fix this, there's a special macro in the kernel that can be used
+ to convert the enum into its value. If this macro is used, then the
+ print fmt strings will have the enums converted to their values.
+
+ If something does not get converted properly, this option can be
+ used to show what enums the kernel tried to convert.
+
+ This option is for debugging the enum conversions. A file is created
+ in the tracing directory called "enum_map" that will show the enum
+ names matched with their values and what trace event system they
+ belong too.
+
+ Normally, the mapping of the strings to values will be freed after
+ boot up or module load. With this option, they will not be freed, as
+ they are needed for the "enum_map" file. Enabling this option will
+ increase the memory footprint of the running kernel.
+
+ If unsure, say N
+
endif # FTRACE
endif # TRACING_SUPPORT
diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
index 98f26588255e..9b1044e936a6 100644
--- a/kernel/trace/Makefile
+++ b/kernel/trace/Makefile
@@ -53,6 +53,7 @@ obj-$(CONFIG_EVENT_TRACING) += trace_event_perf.o
endif
obj-$(CONFIG_EVENT_TRACING) += trace_events_filter.o
obj-$(CONFIG_EVENT_TRACING) += trace_events_trigger.o
+obj-$(CONFIG_BPF_EVENTS) += bpf_trace.o
obj-$(CONFIG_KPROBE_EVENT) += trace_kprobe.o
obj-$(CONFIG_TRACEPOINTS) += power-traces.o
ifeq ($(CONFIG_PM),y)
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index 483cecfa5c17..90e72a0c3047 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -439,7 +439,7 @@ int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev,
{
struct blk_trace *old_bt, *bt = NULL;
struct dentry *dir = NULL;
- int ret, i;
+ int ret;
if (!buts->buf_size || !buts->buf_nr)
return -EINVAL;
@@ -451,9 +451,7 @@ int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev,
* some device names have larger paths - convert the slashes
* to underscores for this to work as expected
*/
- for (i = 0; i < strlen(buts->name); i++)
- if (buts->name[i] == '/')
- buts->name[i] = '_';
+ strreplace(buts->name, '/', '_');
bt = kzalloc(sizeof(*bt), GFP_KERNEL);
if (!bt)
@@ -780,9 +778,6 @@ static void blk_add_trace_bio(struct request_queue *q, struct bio *bio,
if (likely(!bt))
return;
- if (!error && !bio_flagged(bio, BIO_UPTODATE))
- error = EIO;
-
__blk_add_trace(bt, bio->bi_iter.bi_sector, bio->bi_iter.bi_size,
bio->bi_rw, what, error, 0, NULL);
}
@@ -889,8 +884,7 @@ static void blk_add_trace_split(void *ignore,
__blk_add_trace(bt, bio->bi_iter.bi_sector,
bio->bi_iter.bi_size, bio->bi_rw, BLK_TA_SPLIT,
- !bio_flagged(bio, BIO_UPTODATE),
- sizeof(rpdu), &rpdu);
+ bio->bi_error, sizeof(rpdu), &rpdu);
}
}
@@ -922,8 +916,8 @@ static void blk_add_trace_bio_remap(void *ignore,
r.sector_from = cpu_to_be64(from);
__blk_add_trace(bt, bio->bi_iter.bi_sector, bio->bi_iter.bi_size,
- bio->bi_rw, BLK_TA_REMAP,
- !bio_flagged(bio, BIO_UPTODATE), sizeof(r), &r);
+ bio->bi_rw, BLK_TA_REMAP, bio->bi_error,
+ sizeof(r), &r);
}
/**
@@ -1450,14 +1444,14 @@ static struct trace_event trace_blk_event = {
static int __init init_blk_tracer(void)
{
- if (!register_ftrace_event(&trace_blk_event)) {
+ if (!register_trace_event(&trace_blk_event)) {
pr_warning("Warning: could not register block events\n");
return 1;
}
if (register_tracer(&blk_tracer) != 0) {
pr_warning("Warning: could not register the block tracer\n");
- unregister_ftrace_event(&trace_blk_event);
+ unregister_trace_event(&trace_blk_event);
return 1;
}
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
new file mode 100644
index 000000000000..0fe96c7c8803
--- /dev/null
+++ b/kernel/trace/bpf_trace.c
@@ -0,0 +1,283 @@
+/* Copyright (c) 2011-2015 PLUMgrid, http://plumgrid.com
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ */
+#include <linux/kernel.h>
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/bpf.h>
+#include <linux/filter.h>
+#include <linux/uaccess.h>
+#include <linux/ctype.h>
+#include "trace.h"
+
+static DEFINE_PER_CPU(int, bpf_prog_active);
+
+/**
+ * trace_call_bpf - invoke BPF program
+ * @prog: BPF program
+ * @ctx: opaque context pointer
+ *
+ * kprobe handlers execute BPF programs via this helper.
+ * Can be used from static tracepoints in the future.
+ *
+ * Return: BPF programs always return an integer which is interpreted by
+ * kprobe handler as:
+ * 0 - return from kprobe (event is filtered out)
+ * 1 - store kprobe event into ring buffer
+ * Other values are reserved and currently alias to 1
+ */
+unsigned int trace_call_bpf(struct bpf_prog *prog, void *ctx)
+{
+ unsigned int ret;
+
+ if (in_nmi()) /* not supported yet */
+ return 1;
+
+ preempt_disable();
+
+ if (unlikely(__this_cpu_inc_return(bpf_prog_active) != 1)) {
+ /*
+ * since some bpf program is already running on this cpu,
+ * don't call into another bpf program (same or different)
+ * and don't send kprobe event into ring-buffer,
+ * so return zero here
+ */
+ ret = 0;
+ goto out;
+ }
+
+ rcu_read_lock();
+ ret = BPF_PROG_RUN(prog, ctx);
+ rcu_read_unlock();
+
+ out:
+ __this_cpu_dec(bpf_prog_active);
+ preempt_enable();
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(trace_call_bpf);
+
+static u64 bpf_probe_read(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
+{
+ void *dst = (void *) (long) r1;
+ int size = (int) r2;
+ void *unsafe_ptr = (void *) (long) r3;
+
+ return probe_kernel_read(dst, unsafe_ptr, size);
+}
+
+static const struct bpf_func_proto bpf_probe_read_proto = {
+ .func = bpf_probe_read,
+ .gpl_only = true,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_STACK,
+ .arg2_type = ARG_CONST_STACK_SIZE,
+ .arg3_type = ARG_ANYTHING,
+};
+
+/*
+ * limited trace_printk()
+ * only %d %u %x %ld %lu %lx %lld %llu %llx %p %s conversion specifiers allowed
+ */
+static u64 bpf_trace_printk(u64 r1, u64 fmt_size, u64 r3, u64 r4, u64 r5)
+{
+ char *fmt = (char *) (long) r1;
+ bool str_seen = false;
+ int mod[3] = {};
+ int fmt_cnt = 0;
+ u64 unsafe_addr;
+ char buf[64];
+ int i;
+
+ /*
+ * bpf_check()->check_func_arg()->check_stack_boundary()
+ * guarantees that fmt points to bpf program stack,
+ * fmt_size bytes of it were initialized and fmt_size > 0
+ */
+ if (fmt[--fmt_size] != 0)
+ return -EINVAL;
+
+ /* check format string for allowed specifiers */
+ for (i = 0; i < fmt_size; i++) {
+ if ((!isprint(fmt[i]) && !isspace(fmt[i])) || !isascii(fmt[i]))
+ return -EINVAL;
+
+ if (fmt[i] != '%')
+ continue;
+
+ if (fmt_cnt >= 3)
+ return -EINVAL;
+
+ /* fmt[i] != 0 && fmt[last] == 0, so we can access fmt[i + 1] */
+ i++;
+ if (fmt[i] == 'l') {
+ mod[fmt_cnt]++;
+ i++;
+ } else if (fmt[i] == 'p' || fmt[i] == 's') {
+ mod[fmt_cnt]++;
+ i++;
+ if (!isspace(fmt[i]) && !ispunct(fmt[i]) && fmt[i] != 0)
+ return -EINVAL;
+ fmt_cnt++;
+ if (fmt[i - 1] == 's') {
+ if (str_seen)
+ /* allow only one '%s' per fmt string */
+ return -EINVAL;
+ str_seen = true;
+
+ switch (fmt_cnt) {
+ case 1:
+ unsafe_addr = r3;
+ r3 = (long) buf;
+ break;
+ case 2:
+ unsafe_addr = r4;
+ r4 = (long) buf;
+ break;
+ case 3:
+ unsafe_addr = r5;
+ r5 = (long) buf;
+ break;
+ }
+ buf[0] = 0;
+ strncpy_from_unsafe(buf,
+ (void *) (long) unsafe_addr,
+ sizeof(buf));
+ }
+ continue;
+ }
+
+ if (fmt[i] == 'l') {
+ mod[fmt_cnt]++;
+ i++;
+ }
+
+ if (fmt[i] != 'd' && fmt[i] != 'u' && fmt[i] != 'x')
+ return -EINVAL;
+ fmt_cnt++;
+ }
+
+ return __trace_printk(1/* fake ip will not be printed */, fmt,
+ mod[0] == 2 ? r3 : mod[0] == 1 ? (long) r3 : (u32) r3,
+ mod[1] == 2 ? r4 : mod[1] == 1 ? (long) r4 : (u32) r4,
+ mod[2] == 2 ? r5 : mod[2] == 1 ? (long) r5 : (u32) r5);
+}
+
+static const struct bpf_func_proto bpf_trace_printk_proto = {
+ .func = bpf_trace_printk,
+ .gpl_only = true,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_STACK,
+ .arg2_type = ARG_CONST_STACK_SIZE,
+};
+
+const struct bpf_func_proto *bpf_get_trace_printk_proto(void)
+{
+ /*
+ * this program might be calling bpf_trace_printk,
+ * so allocate per-cpu printk buffers
+ */
+ trace_printk_init_buffers();
+
+ return &bpf_trace_printk_proto;
+}
+
+static u64 bpf_perf_event_read(u64 r1, u64 index, u64 r3, u64 r4, u64 r5)
+{
+ struct bpf_map *map = (struct bpf_map *) (unsigned long) r1;
+ struct bpf_array *array = container_of(map, struct bpf_array, map);
+ struct perf_event *event;
+
+ if (unlikely(index >= array->map.max_entries))
+ return -E2BIG;
+
+ event = (struct perf_event *)array->ptrs[index];
+ if (!event)
+ return -ENOENT;
+
+ /*
+ * we don't know if the function is run successfully by the
+ * return value. It can be judged in other places, such as
+ * eBPF programs.
+ */
+ return perf_event_read_local(event);
+}
+
+const struct bpf_func_proto bpf_perf_event_read_proto = {
+ .func = bpf_perf_event_read,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_CONST_MAP_PTR,
+ .arg2_type = ARG_ANYTHING,
+};
+
+static const struct bpf_func_proto *kprobe_prog_func_proto(enum bpf_func_id func_id)
+{
+ switch (func_id) {
+ case BPF_FUNC_map_lookup_elem:
+ return &bpf_map_lookup_elem_proto;
+ case BPF_FUNC_map_update_elem:
+ return &bpf_map_update_elem_proto;
+ case BPF_FUNC_map_delete_elem:
+ return &bpf_map_delete_elem_proto;
+ case BPF_FUNC_probe_read:
+ return &bpf_probe_read_proto;
+ case BPF_FUNC_ktime_get_ns:
+ return &bpf_ktime_get_ns_proto;
+ case BPF_FUNC_tail_call:
+ return &bpf_tail_call_proto;
+ case BPF_FUNC_get_current_pid_tgid:
+ return &bpf_get_current_pid_tgid_proto;
+ case BPF_FUNC_get_current_uid_gid:
+ return &bpf_get_current_uid_gid_proto;
+ case BPF_FUNC_get_current_comm:
+ return &bpf_get_current_comm_proto;
+ case BPF_FUNC_trace_printk:
+ return bpf_get_trace_printk_proto();
+ case BPF_FUNC_get_smp_processor_id:
+ return &bpf_get_smp_processor_id_proto;
+ case BPF_FUNC_perf_event_read:
+ return &bpf_perf_event_read_proto;
+ default:
+ return NULL;
+ }
+}
+
+/* bpf+kprobe programs can access fields of 'struct pt_regs' */
+static bool kprobe_prog_is_valid_access(int off, int size, enum bpf_access_type type)
+{
+ /* check bounds */
+ if (off < 0 || off >= sizeof(struct pt_regs))
+ return false;
+
+ /* only read is allowed */
+ if (type != BPF_READ)
+ return false;
+
+ /* disallow misaligned access */
+ if (off % size != 0)
+ return false;
+
+ return true;
+}
+
+static struct bpf_verifier_ops kprobe_prog_ops = {
+ .get_func_proto = kprobe_prog_func_proto,
+ .is_valid_access = kprobe_prog_is_valid_access,
+};
+
+static struct bpf_prog_type_list kprobe_tl = {
+ .ops = &kprobe_prog_ops,
+ .type = BPF_PROG_TYPE_KPROBE,
+};
+
+static int __init register_kprobe_prog_ops(void)
+{
+ bpf_register_prog_type(&kprobe_tl);
+ return 0;
+}
+late_initcall(register_kprobe_prog_ops);
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 4f228024055b..eb11011b5292 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -18,7 +18,7 @@
#include <linux/kallsyms.h>
#include <linux/seq_file.h>
#include <linux/suspend.h>
-#include <linux/debugfs.h>
+#include <linux/tracefs.h>
#include <linux/hardirq.h>
#include <linux/kthread.h>
#include <linux/uaccess.h>
@@ -98,6 +98,13 @@ struct ftrace_pid {
struct pid *pid;
};
+static bool ftrace_pids_enabled(void)
+{
+ return !list_empty(&ftrace_pids);
+}
+
+static void ftrace_update_trampoline(struct ftrace_ops *ops);
+
/*
* ftrace_disabled is set when an anomaly is discovered.
* ftrace_disabled is much stronger than ftrace_enabled.
@@ -109,7 +116,6 @@ static DEFINE_MUTEX(ftrace_lock);
static struct ftrace_ops *ftrace_control_list __read_mostly = &ftrace_list_end;
static struct ftrace_ops *ftrace_ops_list __read_mostly = &ftrace_list_end;
ftrace_func_t ftrace_trace_function __read_mostly = ftrace_stub;
-ftrace_func_t ftrace_pid_function __read_mostly = ftrace_stub;
static struct ftrace_ops global_ops;
static struct ftrace_ops control_ops;
@@ -183,14 +189,7 @@ static void ftrace_pid_func(unsigned long ip, unsigned long parent_ip,
if (!test_tsk_trace_trace(current))
return;
- ftrace_pid_function(ip, parent_ip, op, regs);
-}
-
-static void set_ftrace_pid_function(ftrace_func_t func)
-{
- /* do not set ftrace_pid_function to itself! */
- if (func != ftrace_pid_func)
- ftrace_pid_function = func;
+ op->saved_func(ip, parent_ip, op, regs);
}
/**
@@ -202,7 +201,6 @@ static void set_ftrace_pid_function(ftrace_func_t func)
void clear_ftrace_function(void)
{
ftrace_trace_function = ftrace_stub;
- ftrace_pid_function = ftrace_stub;
}
static void control_ops_disable_all(struct ftrace_ops *ops)
@@ -249,6 +247,19 @@ static void update_function_graph_func(void);
static inline void update_function_graph_func(void) { }
#endif
+
+static ftrace_func_t ftrace_ops_get_list_func(struct ftrace_ops *ops)
+{
+ /*
+ * If this is a dynamic ops or we force list func,
+ * then it needs to call the list anyway.
+ */
+ if (ops->flags & FTRACE_OPS_FL_DYNAMIC || FTRACE_FORCE_LIST_FUNC)
+ return ftrace_ops_list_func;
+
+ return ftrace_ops_get_func(ops);
+}
+
static void update_ftrace_function(void)
{
ftrace_func_t func;
@@ -270,7 +281,7 @@ static void update_ftrace_function(void)
* then have the mcount trampoline call the function directly.
*/
} else if (ftrace_ops_list->next == &ftrace_list_end) {
- func = ftrace_ops_get_func(ftrace_ops_list);
+ func = ftrace_ops_get_list_func(ftrace_ops_list);
} else {
/* Just use the default ftrace_ops */
@@ -423,6 +434,12 @@ static int __register_ftrace_function(struct ftrace_ops *ops)
} else
add_ftrace_ops(&ftrace_ops_list, ops);
+ /* Always save the function, and reset at unregistering */
+ ops->saved_func = ops->func;
+
+ if (ops->flags & FTRACE_OPS_FL_PID && ftrace_pids_enabled())
+ ops->func = ftrace_pid_func;
+
ftrace_update_trampoline(ops);
if (ftrace_enabled)
@@ -450,15 +467,28 @@ static int __unregister_ftrace_function(struct ftrace_ops *ops)
if (ftrace_enabled)
update_ftrace_function();
+ ops->func = ops->saved_func;
+
return 0;
}
static void ftrace_update_pid_func(void)
{
+ bool enabled = ftrace_pids_enabled();
+ struct ftrace_ops *op;
+
/* Only do something if we are tracing something */
if (ftrace_trace_function == ftrace_stub)
return;
+ do_for_each_ftrace_op(op, ftrace_ops_list) {
+ if (op->flags & FTRACE_OPS_FL_PID) {
+ op->func = enabled ? ftrace_pid_func :
+ op->saved_func;
+ ftrace_update_trampoline(op);
+ }
+ } while_for_each_ftrace_op(op);
+
update_ftrace_function();
}
@@ -1008,7 +1038,7 @@ static struct tracer_stat function_stats __initdata = {
.stat_show = function_stat_show
};
-static __init void ftrace_profile_debugfs(struct dentry *d_tracer)
+static __init void ftrace_profile_tracefs(struct dentry *d_tracer)
{
struct ftrace_profile_stat *stat;
struct dentry *entry;
@@ -1044,15 +1074,15 @@ static __init void ftrace_profile_debugfs(struct dentry *d_tracer)
}
}
- entry = debugfs_create_file("function_profile_enabled", 0644,
+ entry = tracefs_create_file("function_profile_enabled", 0644,
d_tracer, NULL, &ftrace_profile_fops);
if (!entry)
- pr_warning("Could not create debugfs "
+ pr_warning("Could not create tracefs "
"'function_profile_enabled' entry\n");
}
#else /* CONFIG_FUNCTION_PROFILER */
-static __init void ftrace_profile_debugfs(struct dentry *d_tracer)
+static __init void ftrace_profile_tracefs(struct dentry *d_tracer)
{
}
#endif /* CONFIG_FUNCTION_PROFILER */
@@ -1120,7 +1150,8 @@ static struct ftrace_ops global_ops = {
.local_hash.filter_hash = EMPTY_HASH,
INIT_OPS_HASH(global_ops)
.flags = FTRACE_OPS_FL_RECURSION_SAFE |
- FTRACE_OPS_FL_INITIALIZED,
+ FTRACE_OPS_FL_INITIALIZED |
+ FTRACE_OPS_FL_PID,
};
/*
@@ -4712,7 +4743,7 @@ void ftrace_destroy_filter_files(struct ftrace_ops *ops)
mutex_unlock(&ftrace_lock);
}
-static __init int ftrace_init_dyn_debugfs(struct dentry *d_tracer)
+static __init int ftrace_init_dyn_tracefs(struct dentry *d_tracer)
{
trace_create_file("available_filter_functions", 0444,
@@ -5010,7 +5041,9 @@ static void ftrace_update_trampoline(struct ftrace_ops *ops)
static struct ftrace_ops global_ops = {
.func = ftrace_stub,
- .flags = FTRACE_OPS_FL_RECURSION_SAFE | FTRACE_OPS_FL_INITIALIZED,
+ .flags = FTRACE_OPS_FL_RECURSION_SAFE |
+ FTRACE_OPS_FL_INITIALIZED |
+ FTRACE_OPS_FL_PID,
};
static int __init ftrace_nodyn_init(void)
@@ -5020,7 +5053,7 @@ static int __init ftrace_nodyn_init(void)
}
core_initcall(ftrace_nodyn_init);
-static inline int ftrace_init_dyn_debugfs(struct dentry *d_tracer) { return 0; }
+static inline int ftrace_init_dyn_tracefs(struct dentry *d_tracer) { return 0; }
static inline void ftrace_startup_enable(int command) { }
static inline void ftrace_startup_all(int command) { }
/* Keep as macros so we do not need to define the commands */
@@ -5067,11 +5100,6 @@ void ftrace_init_array_ops(struct trace_array *tr, ftrace_func_t func)
if (WARN_ON(tr->ops->func != ftrace_stub))
printk("ftrace ops had %pS for function\n",
tr->ops->func);
- /* Only the top level instance does pid tracing */
- if (!list_empty(&ftrace_pids)) {
- set_ftrace_pid_function(func);
- func = ftrace_pid_func;
- }
}
tr->ops->func = func;
tr->ops->private = tr;
@@ -5209,13 +5237,6 @@ static void ftrace_ops_recurs_func(unsigned long ip, unsigned long parent_ip,
ftrace_func_t ftrace_ops_get_func(struct ftrace_ops *ops)
{
/*
- * If this is a dynamic ops or we force list func,
- * then it needs to call the list anyway.
- */
- if (ops->flags & FTRACE_OPS_FL_DYNAMIC || FTRACE_FORCE_LIST_FUNC)
- return ftrace_ops_list_func;
-
- /*
* If the func handles its own recursion, call it directly.
* Otherwise call the recursion protected function that
* will call the ftrace ops function.
@@ -5365,7 +5386,7 @@ static void *fpid_start(struct seq_file *m, loff_t *pos)
{
mutex_lock(&ftrace_lock);
- if (list_empty(&ftrace_pids) && (!*pos))
+ if (!ftrace_pids_enabled() && (!*pos))
return (void *) 1;
return seq_list_start(&ftrace_pids, *pos);
@@ -5473,7 +5494,7 @@ static const struct file_operations ftrace_pid_fops = {
.release = ftrace_pid_release,
};
-static __init int ftrace_init_debugfs(void)
+static __init int ftrace_init_tracefs(void)
{
struct dentry *d_tracer;
@@ -5481,16 +5502,16 @@ static __init int ftrace_init_debugfs(void)
if (IS_ERR(d_tracer))
return 0;
- ftrace_init_dyn_debugfs(d_tracer);
+ ftrace_init_dyn_tracefs(d_tracer);
trace_create_file("set_ftrace_pid", 0644, d_tracer,
NULL, &ftrace_pid_fops);
- ftrace_profile_debugfs(d_tracer);
+ ftrace_profile_tracefs(d_tracer);
return 0;
}
-fs_initcall(ftrace_init_debugfs);
+fs_initcall(ftrace_init_tracefs);
/**
* ftrace_kill - kill ftrace
@@ -5604,6 +5625,7 @@ static struct ftrace_ops graph_ops = {
.func = ftrace_stub,
.flags = FTRACE_OPS_FL_RECURSION_SAFE |
FTRACE_OPS_FL_INITIALIZED |
+ FTRACE_OPS_FL_PID |
FTRACE_OPS_FL_STUB,
#ifdef FTRACE_GRAPH_TRAMP_ADDR
.trampoline = FTRACE_GRAPH_TRAMP_ADDR,
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 5040d44fe5a3..6260717c18e3 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -3,7 +3,7 @@
*
* Copyright (C) 2008 Steven Rostedt <srostedt@redhat.com>
*/
-#include <linux/ftrace_event.h>
+#include <linux/trace_events.h>
#include <linux/ring_buffer.h>
#include <linux/trace_clock.h>
#include <linux/trace_seq.h>
@@ -115,63 +115,11 @@ int ring_buffer_print_entry_header(struct trace_seq *s)
*
*/
-/*
- * A fast way to enable or disable all ring buffers is to
- * call tracing_on or tracing_off. Turning off the ring buffers
- * prevents all ring buffers from being recorded to.
- * Turning this switch on, makes it OK to write to the
- * ring buffer, if the ring buffer is enabled itself.
- *
- * There's three layers that must be on in order to write
- * to the ring buffer.
- *
- * 1) This global flag must be set.
- * 2) The ring buffer must be enabled for recording.
- * 3) The per cpu buffer must be enabled for recording.
- *
- * In case of an anomaly, this global flag has a bit set that
- * will permantly disable all ring buffers.
- */
-
-/*
- * Global flag to disable all recording to ring buffers
- * This has two bits: ON, DISABLED
- *
- * ON DISABLED
- * ---- ----------
- * 0 0 : ring buffers are off
- * 1 0 : ring buffers are on
- * X 1 : ring buffers are permanently disabled
- */
-
-enum {
- RB_BUFFERS_ON_BIT = 0,
- RB_BUFFERS_DISABLED_BIT = 1,
-};
-
-enum {
- RB_BUFFERS_ON = 1 << RB_BUFFERS_ON_BIT,
- RB_BUFFERS_DISABLED = 1 << RB_BUFFERS_DISABLED_BIT,
-};
-
-static unsigned long ring_buffer_flags __read_mostly = RB_BUFFERS_ON;
-
/* Used for individual buffers (after the counter) */
#define RB_BUFFER_OFF (1 << 20)
#define BUF_PAGE_HDR_SIZE offsetof(struct buffer_data_page, data)
-/**
- * tracing_off_permanent - permanently disable ring buffers
- *
- * This function, once called, will disable all ring buffers
- * permanently.
- */
-void tracing_off_permanent(void)
-{
- set_bit(RB_BUFFERS_DISABLED_BIT, &ring_buffer_flags);
-}
-
#define RB_EVNT_HDR_SIZE (offsetof(struct ring_buffer_event, array))
#define RB_ALIGNMENT 4U
#define RB_MAX_SMALL_DATA (RB_ALIGNMENT * RINGBUF_TYPE_DATA_TYPE_LEN_MAX)
@@ -452,6 +400,23 @@ struct rb_irq_work {
};
/*
+ * Used for which event context the event is in.
+ * NMI = 0
+ * IRQ = 1
+ * SOFTIRQ = 2
+ * NORMAL = 3
+ *
+ * See trace_recursive_lock() comment below for more details.
+ */
+enum {
+ RB_CTX_NMI,
+ RB_CTX_IRQ,
+ RB_CTX_SOFTIRQ,
+ RB_CTX_NORMAL,
+ RB_CTX_MAX
+};
+
+/*
* head_page == tail_page && head == tail then buffer is empty.
*/
struct ring_buffer_per_cpu {
@@ -462,6 +427,7 @@ struct ring_buffer_per_cpu {
arch_spinlock_t lock;
struct lock_class_key lock_key;
unsigned int nr_pages;
+ unsigned int current_context;
struct list_head *pages;
struct buffer_page *head_page; /* read from head */
struct buffer_page *tail_page; /* write to tail */
@@ -2224,7 +2190,7 @@ static unsigned rb_calculate_event_length(unsigned length)
/* zero length can cause confusions */
if (!length)
- length = 1;
+ length++;
if (length > RB_MAX_SMALL_DATA || RB_FORCE_8BYTE_ALIGNMENT)
length += sizeof(event.array[0]);
@@ -2636,8 +2602,6 @@ rb_reserve_next_event(struct ring_buffer *buffer,
return NULL;
}
-#ifdef CONFIG_TRACING
-
/*
* The lock and unlock are done within a preempt disable section.
* The current_context per_cpu variable can only be modified
@@ -2675,48 +2639,38 @@ rb_reserve_next_event(struct ring_buffer *buffer,
* just so happens that it is the same bit corresponding to
* the current context.
*/
-static DEFINE_PER_CPU(unsigned int, current_context);
-static __always_inline int trace_recursive_lock(void)
+static __always_inline int
+trace_recursive_lock(struct ring_buffer_per_cpu *cpu_buffer)
{
- unsigned int val = this_cpu_read(current_context);
+ unsigned int val = cpu_buffer->current_context;
int bit;
if (in_interrupt()) {
if (in_nmi())
- bit = 0;
+ bit = RB_CTX_NMI;
else if (in_irq())
- bit = 1;
+ bit = RB_CTX_IRQ;
else
- bit = 2;
+ bit = RB_CTX_SOFTIRQ;
} else
- bit = 3;
+ bit = RB_CTX_NORMAL;
if (unlikely(val & (1 << bit)))
return 1;
val |= (1 << bit);
- this_cpu_write(current_context, val);
+ cpu_buffer->current_context = val;
return 0;
}
-static __always_inline void trace_recursive_unlock(void)
+static __always_inline void
+trace_recursive_unlock(struct ring_buffer_per_cpu *cpu_buffer)
{
- unsigned int val = this_cpu_read(current_context);
-
- val--;
- val &= this_cpu_read(current_context);
- this_cpu_write(current_context, val);
+ cpu_buffer->current_context &= cpu_buffer->current_context - 1;
}
-#else
-
-#define trace_recursive_lock() (0)
-#define trace_recursive_unlock() do { } while (0)
-
-#endif
-
/**
* ring_buffer_lock_reserve - reserve a part of the buffer
* @buffer: the ring buffer to reserve from
@@ -2739,41 +2693,37 @@ ring_buffer_lock_reserve(struct ring_buffer *buffer, unsigned long length)
struct ring_buffer_event *event;
int cpu;
- if (ring_buffer_flags != RB_BUFFERS_ON)
- return NULL;
-
/* If we are tracing schedule, we don't want to recurse */
preempt_disable_notrace();
- if (atomic_read(&buffer->record_disabled))
- goto out_nocheck;
-
- if (trace_recursive_lock())
- goto out_nocheck;
+ if (unlikely(atomic_read(&buffer->record_disabled)))
+ goto out;
cpu = raw_smp_processor_id();
- if (!cpumask_test_cpu(cpu, buffer->cpumask))
+ if (unlikely(!cpumask_test_cpu(cpu, buffer->cpumask)))
goto out;
cpu_buffer = buffer->buffers[cpu];
- if (atomic_read(&cpu_buffer->record_disabled))
+ if (unlikely(atomic_read(&cpu_buffer->record_disabled)))
goto out;
- if (length > BUF_MAX_DATA_SIZE)
+ if (unlikely(length > BUF_MAX_DATA_SIZE))
+ goto out;
+
+ if (unlikely(trace_recursive_lock(cpu_buffer)))
goto out;
event = rb_reserve_next_event(buffer, cpu_buffer, length);
if (!event)
- goto out;
+ goto out_unlock;
return event;
+ out_unlock:
+ trace_recursive_unlock(cpu_buffer);
out:
- trace_recursive_unlock();
-
- out_nocheck:
preempt_enable_notrace();
return NULL;
}
@@ -2863,7 +2813,7 @@ int ring_buffer_unlock_commit(struct ring_buffer *buffer,
rb_wakeups(buffer, cpu_buffer);
- trace_recursive_unlock();
+ trace_recursive_unlock(cpu_buffer);
preempt_enable_notrace();
@@ -2974,7 +2924,7 @@ void ring_buffer_discard_commit(struct ring_buffer *buffer,
out:
rb_end_commit(cpu_buffer);
- trace_recursive_unlock();
+ trace_recursive_unlock(cpu_buffer);
preempt_enable_notrace();
@@ -3004,9 +2954,6 @@ int ring_buffer_write(struct ring_buffer *buffer,
int ret = -EBUSY;
int cpu;
- if (ring_buffer_flags != RB_BUFFERS_ON)
- return -EBUSY;
-
preempt_disable_notrace();
if (atomic_read(&buffer->record_disabled))
@@ -3025,9 +2972,12 @@ int ring_buffer_write(struct ring_buffer *buffer,
if (length > BUF_MAX_DATA_SIZE)
goto out;
+ if (unlikely(trace_recursive_lock(cpu_buffer)))
+ goto out;
+
event = rb_reserve_next_event(buffer, cpu_buffer, length);
if (!event)
- goto out;
+ goto out_unlock;
body = rb_event_data(event);
@@ -3038,6 +2988,10 @@ int ring_buffer_write(struct ring_buffer *buffer,
rb_wakeups(buffer, cpu_buffer);
ret = 0;
+
+ out_unlock:
+ trace_recursive_unlock(cpu_buffer);
+
out:
preempt_enable_notrace();
@@ -3864,19 +3818,36 @@ rb_iter_peek(struct ring_buffer_iter *iter, u64 *ts)
}
EXPORT_SYMBOL_GPL(ring_buffer_iter_peek);
-static inline int rb_ok_to_lock(void)
+static inline bool rb_reader_lock(struct ring_buffer_per_cpu *cpu_buffer)
{
+ if (likely(!in_nmi())) {
+ raw_spin_lock(&cpu_buffer->reader_lock);
+ return true;
+ }
+
/*
* If an NMI die dumps out the content of the ring buffer
- * do not grab locks. We also permanently disable the ring
- * buffer too. A one time deal is all you get from reading
- * the ring buffer from an NMI.
+ * trylock must be used to prevent a deadlock if the NMI
+ * preempted a task that holds the ring buffer locks. If
+ * we get the lock then all is fine, if not, then continue
+ * to do the read, but this can corrupt the ring buffer,
+ * so it must be permanently disabled from future writes.
+ * Reading from NMI is a oneshot deal.
*/
- if (likely(!in_nmi()))
- return 1;
+ if (raw_spin_trylock(&cpu_buffer->reader_lock))
+ return true;
- tracing_off_permanent();
- return 0;
+ /* Continue without locking, but disable the ring buffer */
+ atomic_inc(&cpu_buffer->record_disabled);
+ return false;
+}
+
+static inline void
+rb_reader_unlock(struct ring_buffer_per_cpu *cpu_buffer, bool locked)
+{
+ if (likely(locked))
+ raw_spin_unlock(&cpu_buffer->reader_lock);
+ return;
}
/**
@@ -3896,21 +3867,18 @@ ring_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts,
struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[cpu];
struct ring_buffer_event *event;
unsigned long flags;
- int dolock;
+ bool dolock;
if (!cpumask_test_cpu(cpu, buffer->cpumask))
return NULL;
- dolock = rb_ok_to_lock();
again:
local_irq_save(flags);
- if (dolock)
- raw_spin_lock(&cpu_buffer->reader_lock);
+ dolock = rb_reader_lock(cpu_buffer);
event = rb_buffer_peek(cpu_buffer, ts, lost_events);
if (event && event->type_len == RINGBUF_TYPE_PADDING)
rb_advance_reader(cpu_buffer);
- if (dolock)
- raw_spin_unlock(&cpu_buffer->reader_lock);
+ rb_reader_unlock(cpu_buffer, dolock);
local_irq_restore(flags);
if (event && event->type_len == RINGBUF_TYPE_PADDING)
@@ -3963,9 +3931,7 @@ ring_buffer_consume(struct ring_buffer *buffer, int cpu, u64 *ts,
struct ring_buffer_per_cpu *cpu_buffer;
struct ring_buffer_event *event = NULL;
unsigned long flags;
- int dolock;
-
- dolock = rb_ok_to_lock();
+ bool dolock;
again:
/* might be called in atomic */
@@ -3976,8 +3942,7 @@ ring_buffer_consume(struct ring_buffer *buffer, int cpu, u64 *ts,
cpu_buffer = buffer->buffers[cpu];
local_irq_save(flags);
- if (dolock)
- raw_spin_lock(&cpu_buffer->reader_lock);
+ dolock = rb_reader_lock(cpu_buffer);
event = rb_buffer_peek(cpu_buffer, ts, lost_events);
if (event) {
@@ -3985,8 +3950,7 @@ ring_buffer_consume(struct ring_buffer *buffer, int cpu, u64 *ts,
rb_advance_reader(cpu_buffer);
}
- if (dolock)
- raw_spin_unlock(&cpu_buffer->reader_lock);
+ rb_reader_unlock(cpu_buffer, dolock);
local_irq_restore(flags);
out:
@@ -4267,21 +4231,17 @@ int ring_buffer_empty(struct ring_buffer *buffer)
{
struct ring_buffer_per_cpu *cpu_buffer;
unsigned long flags;
- int dolock;
+ bool dolock;
int cpu;
int ret;
- dolock = rb_ok_to_lock();
-
/* yes this is racy, but if you don't like the race, lock the buffer */
for_each_buffer_cpu(buffer, cpu) {
cpu_buffer = buffer->buffers[cpu];
local_irq_save(flags);
- if (dolock)
- raw_spin_lock(&cpu_buffer->reader_lock);
+ dolock = rb_reader_lock(cpu_buffer);
ret = rb_per_cpu_empty(cpu_buffer);
- if (dolock)
- raw_spin_unlock(&cpu_buffer->reader_lock);
+ rb_reader_unlock(cpu_buffer, dolock);
local_irq_restore(flags);
if (!ret)
@@ -4301,21 +4261,17 @@ int ring_buffer_empty_cpu(struct ring_buffer *buffer, int cpu)
{
struct ring_buffer_per_cpu *cpu_buffer;
unsigned long flags;
- int dolock;
+ bool dolock;
int ret;
if (!cpumask_test_cpu(cpu, buffer->cpumask))
return 1;
- dolock = rb_ok_to_lock();
-
cpu_buffer = buffer->buffers[cpu];
local_irq_save(flags);
- if (dolock)
- raw_spin_lock(&cpu_buffer->reader_lock);
+ dolock = rb_reader_lock(cpu_buffer);
ret = rb_per_cpu_empty(cpu_buffer);
- if (dolock)
- raw_spin_unlock(&cpu_buffer->reader_lock);
+ rb_reader_unlock(cpu_buffer, dolock);
local_irq_restore(flags);
return ret;
@@ -4353,9 +4309,6 @@ int ring_buffer_swap_cpu(struct ring_buffer *buffer_a,
ret = -EAGAIN;
- if (ring_buffer_flags != RB_BUFFERS_ON)
- goto out;
-
if (atomic_read(&buffer_a->record_disabled))
goto out;
diff --git a/kernel/trace/ring_buffer_benchmark.c b/kernel/trace/ring_buffer_benchmark.c
index 13d945c0d03f..a1503a027ee2 100644
--- a/kernel/trace/ring_buffer_benchmark.c
+++ b/kernel/trace/ring_buffer_benchmark.c
@@ -32,11 +32,11 @@ static struct task_struct *producer;
static struct task_struct *consumer;
static unsigned long read;
-static int disable_reader;
+static unsigned int disable_reader;
module_param(disable_reader, uint, 0644);
MODULE_PARM_DESC(disable_reader, "only run producer");
-static int write_iteration = 50;
+static unsigned int write_iteration = 50;
module_param(write_iteration, uint, 0644);
MODULE_PARM_DESC(write_iteration, "# of writes between timestamp readings");
@@ -46,16 +46,16 @@ static int consumer_nice = MAX_NICE;
static int producer_fifo = -1;
static int consumer_fifo = -1;
-module_param(producer_nice, uint, 0644);
+module_param(producer_nice, int, 0644);
MODULE_PARM_DESC(producer_nice, "nice prio for producer");
-module_param(consumer_nice, uint, 0644);
+module_param(consumer_nice, int, 0644);
MODULE_PARM_DESC(consumer_nice, "nice prio for consumer");
-module_param(producer_fifo, uint, 0644);
+module_param(producer_fifo, int, 0644);
MODULE_PARM_DESC(producer_fifo, "fifo prio for producer");
-module_param(consumer_fifo, uint, 0644);
+module_param(consumer_fifo, int, 0644);
MODULE_PARM_DESC(consumer_fifo, "fifo prio for consumer");
static int read_events;
@@ -263,6 +263,8 @@ static void ring_buffer_producer(void)
if (cnt % wakeup_interval)
cond_resched();
#endif
+ if (kthread_should_stop())
+ kill_test = 1;
} while (ktime_before(end_time, timeout) && !kill_test);
trace_printk("End ring buffer hammer\n");
@@ -285,7 +287,7 @@ static void ring_buffer_producer(void)
entries = ring_buffer_entries(buffer);
overruns = ring_buffer_overruns(buffer);
- if (kill_test)
+ if (kill_test && !kthread_should_stop())
trace_printk("ERROR!\n");
if (!disable_reader) {
@@ -379,7 +381,7 @@ static int ring_buffer_consumer_thread(void *arg)
}
__set_current_state(TASK_RUNNING);
- if (kill_test)
+ if (!kthread_should_stop())
wait_to_die();
return 0;
@@ -399,13 +401,16 @@ static int ring_buffer_producer_thread(void *arg)
}
ring_buffer_producer();
+ if (kill_test)
+ goto out_kill;
trace_printk("Sleeping for 10 secs\n");
set_current_state(TASK_INTERRUPTIBLE);
schedule_timeout(HZ * SLEEP_TIME);
}
- if (kill_test)
+out_kill:
+ if (!kthread_should_stop())
wait_to_die();
return 0;
@@ -450,7 +455,7 @@ static int __init ring_buffer_benchmark_init(void)
if (producer_fifo >= 0) {
struct sched_param param = {
- .sched_priority = consumer_fifo
+ .sched_priority = producer_fifo
};
sched_setscheduler(producer, SCHED_FIFO, &param);
} else
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 62c6506d663f..abcbf7ff8743 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -20,6 +20,7 @@
#include <linux/notifier.h>
#include <linux/irqflags.h>
#include <linux/debugfs.h>
+#include <linux/tracefs.h>
#include <linux/pagemap.h>
#include <linux/hardirq.h>
#include <linux/linkage.h>
@@ -31,6 +32,7 @@
#include <linux/splice.h>
#include <linux/kdebug.h>
#include <linux/string.h>
+#include <linux/mount.h>
#include <linux/rwsem.h>
#include <linux/slab.h>
#include <linux/ctype.h>
@@ -123,6 +125,42 @@ enum ftrace_dump_mode ftrace_dump_on_oops;
/* When set, tracing will stop when a WARN*() is hit */
int __disable_trace_on_warning;
+#ifdef CONFIG_TRACE_ENUM_MAP_FILE
+/* Map of enums to their values, for "enum_map" file */
+struct trace_enum_map_head {
+ struct module *mod;
+ unsigned long length;
+};
+
+union trace_enum_map_item;
+
+struct trace_enum_map_tail {
+ /*
+ * "end" is first and points to NULL as it must be different
+ * than "mod" or "enum_string"
+ */
+ union trace_enum_map_item *next;
+ const char *end; /* points to NULL */
+};
+
+static DEFINE_MUTEX(trace_enum_mutex);
+
+/*
+ * The trace_enum_maps are saved in an array with two extra elements,
+ * one at the beginning, and one at the end. The beginning item contains
+ * the count of the saved maps (head.length), and the module they
+ * belong to if not built in (head.mod). The ending item contains a
+ * pointer to the next array of saved enum_map items.
+ */
+union trace_enum_map_item {
+ struct trace_enum_map map;
+ struct trace_enum_map_head head;
+ struct trace_enum_map_tail tail;
+};
+
+static union trace_enum_map_item *trace_enum_maps;
+#endif /* CONFIG_TRACE_ENUM_MAP_FILE */
+
static int tracing_set_tracer(struct trace_array *tr, const char *buf);
#define MAX_TRACER_SIZE 100
@@ -259,11 +297,11 @@ void trace_array_put(struct trace_array *this_tr)
mutex_unlock(&trace_types_lock);
}
-int filter_check_discard(struct ftrace_event_file *file, void *rec,
+int filter_check_discard(struct trace_event_file *file, void *rec,
struct ring_buffer *buffer,
struct ring_buffer_event *event)
{
- if (unlikely(file->flags & FTRACE_EVENT_FL_FILTERED) &&
+ if (unlikely(file->flags & EVENT_FILE_FL_FILTERED) &&
!filter_match_preds(file->filter, rec)) {
ring_buffer_discard_commit(buffer, event);
return 1;
@@ -273,7 +311,7 @@ int filter_check_discard(struct ftrace_event_file *file, void *rec,
}
EXPORT_SYMBOL_GPL(filter_check_discard);
-int call_filter_check_discard(struct ftrace_event_call *call, void *rec,
+int call_filter_check_discard(struct trace_event_call *call, void *rec,
struct ring_buffer *buffer,
struct ring_buffer_event *event)
{
@@ -838,6 +876,7 @@ static struct {
{ trace_clock_jiffies, "uptime", 0 },
{ trace_clock, "perf", 1 },
{ ktime_get_mono_fast_ns, "mono", 1 },
+ { ktime_get_raw_fast_ns, "mono_raw", 1 },
ARCH_TRACE_CLOCKS
};
@@ -1655,13 +1694,13 @@ static struct ring_buffer *temp_buffer;
struct ring_buffer_event *
trace_event_buffer_lock_reserve(struct ring_buffer **current_rb,
- struct ftrace_event_file *ftrace_file,
+ struct trace_event_file *trace_file,
int type, unsigned long len,
unsigned long flags, int pc)
{
struct ring_buffer_event *entry;
- *current_rb = ftrace_file->tr->trace_buffer.buffer;
+ *current_rb = trace_file->tr->trace_buffer.buffer;
entry = trace_buffer_lock_reserve(*current_rb,
type, len, flags, pc);
/*
@@ -1670,7 +1709,7 @@ trace_event_buffer_lock_reserve(struct ring_buffer **current_rb,
* to store the trace event for the tigger to use. It's recusive
* safe and will not be recorded anywhere.
*/
- if (!entry && ftrace_file->flags & FTRACE_EVENT_FL_TRIGGER_COND) {
+ if (!entry && trace_file->flags & EVENT_FILE_FL_TRIGGER_COND) {
*current_rb = temp_buffer;
entry = trace_buffer_lock_reserve(*current_rb,
type, len, flags, pc);
@@ -1722,7 +1761,7 @@ trace_function(struct trace_array *tr,
unsigned long ip, unsigned long parent_ip, unsigned long flags,
int pc)
{
- struct ftrace_event_call *call = &event_function;
+ struct trace_event_call *call = &event_function;
struct ring_buffer *buffer = tr->trace_buffer.buffer;
struct ring_buffer_event *event;
struct ftrace_entry *entry;
@@ -1757,7 +1796,7 @@ static void __ftrace_trace_stack(struct ring_buffer *buffer,
unsigned long flags,
int skip, int pc, struct pt_regs *regs)
{
- struct ftrace_event_call *call = &event_kernel_stack;
+ struct trace_event_call *call = &event_kernel_stack;
struct ring_buffer_event *event;
struct stack_entry *entry;
struct stack_trace trace;
@@ -1885,7 +1924,7 @@ static DEFINE_PER_CPU(int, user_stack_count);
void
ftrace_trace_userstack(struct ring_buffer *buffer, unsigned long flags, int pc)
{
- struct ftrace_event_call *call = &event_user_stack;
+ struct trace_event_call *call = &event_user_stack;
struct ring_buffer_event *event;
struct userstack_entry *entry;
struct stack_trace trace;
@@ -2091,7 +2130,7 @@ static void trace_printk_start_stop_comm(int enabled)
*/
int trace_vbprintk(unsigned long ip, const char *fmt, va_list args)
{
- struct ftrace_event_call *call = &event_bprint;
+ struct trace_event_call *call = &event_bprint;
struct ring_buffer_event *event;
struct ring_buffer *buffer;
struct trace_array *tr = &global_trace;
@@ -2149,7 +2188,7 @@ static int
__trace_array_vprintk(struct ring_buffer *buffer,
unsigned long ip, const char *fmt, va_list args)
{
- struct ftrace_event_call *call = &event_print;
+ struct trace_event_call *call = &event_print;
struct ring_buffer_event *event;
int len = 0, size, pc;
struct print_entry *entry;
@@ -3908,6 +3947,182 @@ static const struct file_operations tracing_saved_cmdlines_size_fops = {
.write = tracing_saved_cmdlines_size_write,
};
+#ifdef CONFIG_TRACE_ENUM_MAP_FILE
+static union trace_enum_map_item *
+update_enum_map(union trace_enum_map_item *ptr)
+{
+ if (!ptr->map.enum_string) {
+ if (ptr->tail.next) {
+ ptr = ptr->tail.next;
+ /* Set ptr to the next real item (skip head) */
+ ptr++;
+ } else
+ return NULL;
+ }
+ return ptr;
+}
+
+static void *enum_map_next(struct seq_file *m, void *v, loff_t *pos)
+{
+ union trace_enum_map_item *ptr = v;
+
+ /*
+ * Paranoid! If ptr points to end, we don't want to increment past it.
+ * This really should never happen.
+ */
+ ptr = update_enum_map(ptr);
+ if (WARN_ON_ONCE(!ptr))
+ return NULL;
+
+ ptr++;
+
+ (*pos)++;
+
+ ptr = update_enum_map(ptr);
+
+ return ptr;
+}
+
+static void *enum_map_start(struct seq_file *m, loff_t *pos)
+{
+ union trace_enum_map_item *v;
+ loff_t l = 0;
+
+ mutex_lock(&trace_enum_mutex);
+
+ v = trace_enum_maps;
+ if (v)
+ v++;
+
+ while (v && l < *pos) {
+ v = enum_map_next(m, v, &l);
+ }
+
+ return v;
+}
+
+static void enum_map_stop(struct seq_file *m, void *v)
+{
+ mutex_unlock(&trace_enum_mutex);
+}
+
+static int enum_map_show(struct seq_file *m, void *v)
+{
+ union trace_enum_map_item *ptr = v;
+
+ seq_printf(m, "%s %ld (%s)\n",
+ ptr->map.enum_string, ptr->map.enum_value,
+ ptr->map.system);
+
+ return 0;
+}
+
+static const struct seq_operations tracing_enum_map_seq_ops = {
+ .start = enum_map_start,
+ .next = enum_map_next,
+ .stop = enum_map_stop,
+ .show = enum_map_show,
+};
+
+static int tracing_enum_map_open(struct inode *inode, struct file *filp)
+{
+ if (tracing_disabled)
+ return -ENODEV;
+
+ return seq_open(filp, &tracing_enum_map_seq_ops);
+}
+
+static const struct file_operations tracing_enum_map_fops = {
+ .open = tracing_enum_map_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release,
+};
+
+static inline union trace_enum_map_item *
+trace_enum_jmp_to_tail(union trace_enum_map_item *ptr)
+{
+ /* Return tail of array given the head */
+ return ptr + ptr->head.length + 1;
+}
+
+static void
+trace_insert_enum_map_file(struct module *mod, struct trace_enum_map **start,
+ int len)
+{
+ struct trace_enum_map **stop;
+ struct trace_enum_map **map;
+ union trace_enum_map_item *map_array;
+ union trace_enum_map_item *ptr;
+
+ stop = start + len;
+
+ /*
+ * The trace_enum_maps contains the map plus a head and tail item,
+ * where the head holds the module and length of array, and the
+ * tail holds a pointer to the next list.
+ */
+ map_array = kmalloc(sizeof(*map_array) * (len + 2), GFP_KERNEL);
+ if (!map_array) {
+ pr_warning("Unable to allocate trace enum mapping\n");
+ return;
+ }
+
+ mutex_lock(&trace_enum_mutex);
+
+ if (!trace_enum_maps)
+ trace_enum_maps = map_array;
+ else {
+ ptr = trace_enum_maps;
+ for (;;) {
+ ptr = trace_enum_jmp_to_tail(ptr);
+ if (!ptr->tail.next)
+ break;
+ ptr = ptr->tail.next;
+
+ }
+ ptr->tail.next = map_array;
+ }
+ map_array->head.mod = mod;
+ map_array->head.length = len;
+ map_array++;
+
+ for (map = start; (unsigned long)map < (unsigned long)stop; map++) {
+ map_array->map = **map;
+ map_array++;
+ }
+ memset(map_array, 0, sizeof(*map_array));
+
+ mutex_unlock(&trace_enum_mutex);
+}
+
+static void trace_create_enum_file(struct dentry *d_tracer)
+{
+ trace_create_file("enum_map", 0444, d_tracer,
+ NULL, &tracing_enum_map_fops);
+}
+
+#else /* CONFIG_TRACE_ENUM_MAP_FILE */
+static inline void trace_create_enum_file(struct dentry *d_tracer) { }
+static inline void trace_insert_enum_map_file(struct module *mod,
+ struct trace_enum_map **start, int len) { }
+#endif /* !CONFIG_TRACE_ENUM_MAP_FILE */
+
+static void trace_insert_enum_map(struct module *mod,
+ struct trace_enum_map **start, int len)
+{
+ struct trace_enum_map **map;
+
+ if (len <= 0)
+ return;
+
+ map = start;
+
+ trace_event_enum_update(map, len);
+
+ trace_insert_enum_map_file(mod, start, len);
+}
+
static ssize_t
tracing_set_trace_read(struct file *filp, char __user *ubuf,
size_t cnt, loff_t *ppos)
@@ -4105,9 +4320,24 @@ static void tracing_set_nop(struct trace_array *tr)
tr->current_trace = &nop_trace;
}
-static int tracing_set_tracer(struct trace_array *tr, const char *buf)
+static void update_tracer_options(struct trace_array *tr, struct tracer *t)
{
static struct trace_option_dentry *topts;
+
+ /* Only enable if the directory has been created already. */
+ if (!tr->dir)
+ return;
+
+ /* Currently, only the top instance has options */
+ if (!(tr->flags & TRACE_ARRAY_FL_GLOBAL))
+ return;
+
+ destroy_trace_option_files(topts);
+ topts = create_trace_option_files(tr, t);
+}
+
+static int tracing_set_tracer(struct trace_array *tr, const char *buf)
+{
struct tracer *t;
#ifdef CONFIG_TRACER_MAX_TRACE
bool had_max_tr;
@@ -4172,11 +4402,7 @@ static int tracing_set_tracer(struct trace_array *tr, const char *buf)
free_snapshot(tr);
}
#endif
- /* Currently, only the top instance has options */
- if (tr->flags & TRACE_ARRAY_FL_GLOBAL) {
- destroy_trace_option_files(topts);
- topts = create_trace_option_files(tr, t);
- }
+ update_tracer_options(tr, t);
#ifdef CONFIG_TRACER_MAX_TRACE
if (t->use_max_tr && !had_max_tr) {
@@ -5817,6 +6043,14 @@ static inline __init int register_snapshot_cmd(void) { return 0; }
static struct dentry *tracing_get_dentry(struct trace_array *tr)
{
+ if (WARN_ON(!tr->dir))
+ return ERR_PTR(-ENODEV);
+
+ /* Top directory uses NULL as the parent */
+ if (tr->flags & TRACE_ARRAY_FL_GLOBAL)
+ return NULL;
+
+ /* All sub buffers have a descriptor */
return tr->dir;
}
@@ -5831,10 +6065,10 @@ static struct dentry *tracing_dentry_percpu(struct trace_array *tr, int cpu)
if (IS_ERR(d_tracer))
return NULL;
- tr->percpu_dir = debugfs_create_dir("per_cpu", d_tracer);
+ tr->percpu_dir = tracefs_create_dir("per_cpu", d_tracer);
WARN_ONCE(!tr->percpu_dir,
- "Could not create debugfs directory 'per_cpu/%d'\n", cpu);
+ "Could not create tracefs directory 'per_cpu/%d'\n", cpu);
return tr->percpu_dir;
}
@@ -5846,12 +6080,12 @@ trace_create_cpu_file(const char *name, umode_t mode, struct dentry *parent,
struct dentry *ret = trace_create_file(name, mode, parent, data, fops);
if (ret) /* See tracing_get_cpu() */
- ret->d_inode->i_cdev = (void *)(cpu + 1);
+ d_inode(ret)->i_cdev = (void *)(cpu + 1);
return ret;
}
static void
-tracing_init_debugfs_percpu(struct trace_array *tr, long cpu)
+tracing_init_tracefs_percpu(struct trace_array *tr, long cpu)
{
struct dentry *d_percpu = tracing_dentry_percpu(tr, cpu);
struct dentry *d_cpu;
@@ -5861,9 +6095,9 @@ tracing_init_debugfs_percpu(struct trace_array *tr, long cpu)
return;
snprintf(cpu_dir, 30, "cpu%ld", cpu);
- d_cpu = debugfs_create_dir(cpu_dir, d_percpu);
+ d_cpu = tracefs_create_dir(cpu_dir, d_percpu);
if (!d_cpu) {
- pr_warning("Could not create debugfs '%s' entry\n", cpu_dir);
+ pr_warning("Could not create tracefs '%s' entry\n", cpu_dir);
return;
}
@@ -6015,9 +6249,9 @@ struct dentry *trace_create_file(const char *name,
{
struct dentry *ret;
- ret = debugfs_create_file(name, mode, parent, data, fops);
+ ret = tracefs_create_file(name, mode, parent, data, fops);
if (!ret)
- pr_warning("Could not create debugfs '%s' entry\n", name);
+ pr_warning("Could not create tracefs '%s' entry\n", name);
return ret;
}
@@ -6034,9 +6268,9 @@ static struct dentry *trace_options_init_dentry(struct trace_array *tr)
if (IS_ERR(d_tracer))
return NULL;
- tr->options = debugfs_create_dir("options", d_tracer);
+ tr->options = tracefs_create_dir("options", d_tracer);
if (!tr->options) {
- pr_warning("Could not create debugfs directory 'options'\n");
+ pr_warning("Could not create tracefs directory 'options'\n");
return NULL;
}
@@ -6105,7 +6339,7 @@ destroy_trace_option_files(struct trace_option_dentry *topts)
return;
for (cnt = 0; topts[cnt].opt; cnt++)
- debugfs_remove(topts[cnt].entry);
+ tracefs_remove(topts[cnt].entry);
kfree(topts);
}
@@ -6194,7 +6428,7 @@ static const struct file_operations rb_simple_fops = {
struct dentry *trace_instance_dir;
static void
-init_tracer_debugfs(struct trace_array *tr, struct dentry *d_tracer);
+init_tracer_tracefs(struct trace_array *tr, struct dentry *d_tracer);
static int
allocate_trace_buffer(struct trace_array *tr, struct trace_buffer *buf, int size)
@@ -6271,7 +6505,7 @@ static void free_trace_buffers(struct trace_array *tr)
#endif
}
-static int new_instance_create(const char *name)
+static int instance_mkdir(const char *name)
{
struct trace_array *tr;
int ret;
@@ -6310,17 +6544,17 @@ static int new_instance_create(const char *name)
if (allocate_trace_buffers(tr, trace_buf_size) < 0)
goto out_free_tr;
- tr->dir = debugfs_create_dir(name, trace_instance_dir);
+ tr->dir = tracefs_create_dir(name, trace_instance_dir);
if (!tr->dir)
goto out_free_tr;
ret = event_trace_add_tracer(tr->dir, tr);
if (ret) {
- debugfs_remove_recursive(tr->dir);
+ tracefs_remove_recursive(tr->dir);
goto out_free_tr;
}
- init_tracer_debugfs(tr, tr->dir);
+ init_tracer_tracefs(tr, tr->dir);
list_add(&tr->list, &ftrace_trace_arrays);
@@ -6341,7 +6575,7 @@ static int new_instance_create(const char *name)
}
-static int instance_delete(const char *name)
+static int instance_rmdir(const char *name)
{
struct trace_array *tr;
int found = 0;
@@ -6382,82 +6616,17 @@ static int instance_delete(const char *name)
return ret;
}
-static int instance_mkdir (struct inode *inode, struct dentry *dentry, umode_t mode)
-{
- struct dentry *parent;
- int ret;
-
- /* Paranoid: Make sure the parent is the "instances" directory */
- parent = hlist_entry(inode->i_dentry.first, struct dentry, d_u.d_alias);
- if (WARN_ON_ONCE(parent != trace_instance_dir))
- return -ENOENT;
-
- /*
- * The inode mutex is locked, but debugfs_create_dir() will also
- * take the mutex. As the instances directory can not be destroyed
- * or changed in any other way, it is safe to unlock it, and
- * let the dentry try. If two users try to make the same dir at
- * the same time, then the new_instance_create() will determine the
- * winner.
- */
- mutex_unlock(&inode->i_mutex);
-
- ret = new_instance_create(dentry->d_iname);
-
- mutex_lock(&inode->i_mutex);
-
- return ret;
-}
-
-static int instance_rmdir(struct inode *inode, struct dentry *dentry)
-{
- struct dentry *parent;
- int ret;
-
- /* Paranoid: Make sure the parent is the "instances" directory */
- parent = hlist_entry(inode->i_dentry.first, struct dentry, d_u.d_alias);
- if (WARN_ON_ONCE(parent != trace_instance_dir))
- return -ENOENT;
-
- /* The caller did a dget() on dentry */
- mutex_unlock(&dentry->d_inode->i_mutex);
-
- /*
- * The inode mutex is locked, but debugfs_create_dir() will also
- * take the mutex. As the instances directory can not be destroyed
- * or changed in any other way, it is safe to unlock it, and
- * let the dentry try. If two users try to make the same dir at
- * the same time, then the instance_delete() will determine the
- * winner.
- */
- mutex_unlock(&inode->i_mutex);
-
- ret = instance_delete(dentry->d_iname);
-
- mutex_lock_nested(&inode->i_mutex, I_MUTEX_PARENT);
- mutex_lock(&dentry->d_inode->i_mutex);
-
- return ret;
-}
-
-static const struct inode_operations instance_dir_inode_operations = {
- .lookup = simple_lookup,
- .mkdir = instance_mkdir,
- .rmdir = instance_rmdir,
-};
-
static __init void create_trace_instances(struct dentry *d_tracer)
{
- trace_instance_dir = debugfs_create_dir("instances", d_tracer);
+ trace_instance_dir = tracefs_create_instance_dir("instances", d_tracer,
+ instance_mkdir,
+ instance_rmdir);
if (WARN_ON(!trace_instance_dir))
return;
-
- /* Hijack the dir inode operations, to allow mkdir */
- trace_instance_dir->d_inode->i_op = &instance_dir_inode_operations;
}
static void
-init_tracer_debugfs(struct trace_array *tr, struct dentry *d_tracer)
+init_tracer_tracefs(struct trace_array *tr, struct dentry *d_tracer)
{
int cpu;
@@ -6511,10 +6680,32 @@ init_tracer_debugfs(struct trace_array *tr, struct dentry *d_tracer)
#endif
for_each_tracing_cpu(cpu)
- tracing_init_debugfs_percpu(tr, cpu);
+ tracing_init_tracefs_percpu(tr, cpu);
}
+static struct vfsmount *trace_automount(void *ingore)
+{
+ struct vfsmount *mnt;
+ struct file_system_type *type;
+
+ /*
+ * To maintain backward compatibility for tools that mount
+ * debugfs to get to the tracing facility, tracefs is automatically
+ * mounted to the debugfs/tracing directory.
+ */
+ type = get_fs_type("tracefs");
+ if (!type)
+ return NULL;
+ mnt = vfs_kern_mount(type, 0, "tracefs", NULL);
+ put_filesystem(type);
+ if (IS_ERR(mnt))
+ return NULL;
+ mntget(mnt);
+
+ return mnt;
+}
+
/**
* tracing_init_dentry - initialize top level trace array
*
@@ -6526,23 +6717,112 @@ struct dentry *tracing_init_dentry(void)
{
struct trace_array *tr = &global_trace;
+ /* The top level trace array uses NULL as parent */
if (tr->dir)
- return tr->dir;
+ return NULL;
if (WARN_ON(!debugfs_initialized()))
return ERR_PTR(-ENODEV);
- tr->dir = debugfs_create_dir("tracing", NULL);
-
+ /*
+ * As there may still be users that expect the tracing
+ * files to exist in debugfs/tracing, we must automount
+ * the tracefs file system there, so older tools still
+ * work with the newer kerenl.
+ */
+ tr->dir = debugfs_create_automount("tracing", NULL,
+ trace_automount, NULL);
if (!tr->dir) {
pr_warn_once("Could not create debugfs directory 'tracing'\n");
return ERR_PTR(-ENOMEM);
}
- return tr->dir;
+ return NULL;
+}
+
+extern struct trace_enum_map *__start_ftrace_enum_maps[];
+extern struct trace_enum_map *__stop_ftrace_enum_maps[];
+
+static void __init trace_enum_init(void)
+{
+ int len;
+
+ len = __stop_ftrace_enum_maps - __start_ftrace_enum_maps;
+ trace_insert_enum_map(NULL, __start_ftrace_enum_maps, len);
+}
+
+#ifdef CONFIG_MODULES
+static void trace_module_add_enums(struct module *mod)
+{
+ if (!mod->num_trace_enums)
+ return;
+
+ /*
+ * Modules with bad taint do not have events created, do
+ * not bother with enums either.
+ */
+ if (trace_module_has_bad_taint(mod))
+ return;
+
+ trace_insert_enum_map(mod, mod->trace_enums, mod->num_trace_enums);
}
-static __init int tracer_init_debugfs(void)
+#ifdef CONFIG_TRACE_ENUM_MAP_FILE
+static void trace_module_remove_enums(struct module *mod)
+{
+ union trace_enum_map_item *map;
+ union trace_enum_map_item **last = &trace_enum_maps;
+
+ if (!mod->num_trace_enums)
+ return;
+
+ mutex_lock(&trace_enum_mutex);
+
+ map = trace_enum_maps;
+
+ while (map) {
+ if (map->head.mod == mod)
+ break;
+ map = trace_enum_jmp_to_tail(map);
+ last = &map->tail.next;
+ map = map->tail.next;
+ }
+ if (!map)
+ goto out;
+
+ *last = trace_enum_jmp_to_tail(map)->tail.next;
+ kfree(map);
+ out:
+ mutex_unlock(&trace_enum_mutex);
+}
+#else
+static inline void trace_module_remove_enums(struct module *mod) { }
+#endif /* CONFIG_TRACE_ENUM_MAP_FILE */
+
+static int trace_module_notify(struct notifier_block *self,
+ unsigned long val, void *data)
+{
+ struct module *mod = data;
+
+ switch (val) {
+ case MODULE_STATE_COMING:
+ trace_module_add_enums(mod);
+ break;
+ case MODULE_STATE_GOING:
+ trace_module_remove_enums(mod);
+ break;
+ }
+
+ return 0;
+}
+
+static struct notifier_block trace_module_nb = {
+ .notifier_call = trace_module_notify,
+ .priority = 0,
+};
+#endif /* CONFIG_MODULES */
+
+static __init int tracer_init_tracefs(void)
{
struct dentry *d_tracer;
@@ -6552,7 +6832,7 @@ static __init int tracer_init_debugfs(void)
if (IS_ERR(d_tracer))
return 0;
- init_tracer_debugfs(&global_trace, d_tracer);
+ init_tracer_tracefs(&global_trace, d_tracer);
trace_create_file("tracing_thresh", 0644, d_tracer,
&global_trace, &tracing_thresh_fops);
@@ -6566,6 +6846,14 @@ static __init int tracer_init_debugfs(void)
trace_create_file("saved_cmdlines_size", 0644, d_tracer,
NULL, &tracing_saved_cmdlines_size_fops);
+ trace_enum_init();
+
+ trace_create_enum_file(d_tracer);
+
+#ifdef CONFIG_MODULES
+ register_module_notifier(&trace_module_nb);
+#endif
+
#ifdef CONFIG_DYNAMIC_FTRACE
trace_create_file("dyn_ftrace_total_info", 0444, d_tracer,
&ftrace_update_tot_cnt, &tracing_dyn_info_fops);
@@ -6575,6 +6863,10 @@ static __init int tracer_init_debugfs(void)
create_trace_options_dir(&global_trace);
+ /* If the tracer was started via cmdline, create options for it here */
+ if (global_trace.current_trace != &nop_trace)
+ update_tracer_options(&global_trace, global_trace.current_trace);
+
return 0;
}
@@ -6888,7 +7180,7 @@ void __init trace_init(void)
tracepoint_printk = 0;
}
tracer_alloc_buffers();
- trace_event_init();
+ trace_event_init();
}
__init static int clear_boot_tracer(void)
@@ -6910,5 +7202,5 @@ __init static int clear_boot_tracer(void)
return 0;
}
-fs_initcall(tracer_init_debugfs);
+fs_initcall(tracer_init_tracefs);
late_initcall(clear_boot_tracer);
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index dd8205a35760..74bde81601a9 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -12,7 +12,7 @@
#include <linux/ftrace.h>
#include <linux/hw_breakpoint.h>
#include <linux/trace_seq.h>
-#include <linux/ftrace_event.h>
+#include <linux/trace_events.h>
#include <linux/compiler.h>
#include <linux/trace_seq.h>
@@ -211,8 +211,8 @@ struct trace_array {
#ifdef CONFIG_FTRACE_SYSCALLS
int sys_refcount_enter;
int sys_refcount_exit;
- struct ftrace_event_file __rcu *enter_syscall_files[NR_syscalls];
- struct ftrace_event_file __rcu *exit_syscall_files[NR_syscalls];
+ struct trace_event_file __rcu *enter_syscall_files[NR_syscalls];
+ struct trace_event_file __rcu *exit_syscall_files[NR_syscalls];
#endif
int stop_count;
int clock_id;
@@ -334,7 +334,7 @@ struct tracer_flags {
/**
- * struct tracer - a specific tracer and its callbacks to interact with debugfs
+ * struct tracer - a specific tracer and its callbacks to interact with tracefs
* @name: the name chosen to select it on the available_tracers file
* @init: called when one switches to this tracer (echo name > current_tracer)
* @reset: called when one switches to another tracer
@@ -444,6 +444,7 @@ enum {
TRACE_CONTROL_BIT,
+ TRACE_BRANCH_BIT,
/*
* Abuse of the trace_recursion.
* As we need a way to maintain state if we are tracing the function
@@ -858,7 +859,7 @@ void ftrace_destroy_filter_files(struct ftrace_ops *ops);
#define ftrace_destroy_filter_files(ops) do { } while (0)
#endif /* CONFIG_FUNCTION_TRACER && CONFIG_DYNAMIC_FTRACE */
-int ftrace_event_is_function(struct ftrace_event_call *call);
+int ftrace_event_is_function(struct trace_event_call *call);
/*
* struct trace_parser - servers for reading the user input separated by spaces
@@ -992,7 +993,7 @@ struct event_subsystem {
int ref_count;
};
-struct ftrace_subsystem_dir {
+struct trace_subsystem_dir {
struct list_head list;
struct event_subsystem *subsystem;
struct trace_array *tr;
@@ -1052,30 +1053,30 @@ struct filter_pred {
extern enum regex_type
filter_parse_regex(char *buff, int len, char **search, int *not);
-extern void print_event_filter(struct ftrace_event_file *file,
+extern void print_event_filter(struct trace_event_file *file,
struct trace_seq *s);
-extern int apply_event_filter(struct ftrace_event_file *file,
+extern int apply_event_filter(struct trace_event_file *file,
char *filter_string);
-extern int apply_subsystem_event_filter(struct ftrace_subsystem_dir *dir,
+extern int apply_subsystem_event_filter(struct trace_subsystem_dir *dir,
char *filter_string);
extern void print_subsystem_event_filter(struct event_subsystem *system,
struct trace_seq *s);
extern int filter_assign_type(const char *type);
-extern int create_event_filter(struct ftrace_event_call *call,
+extern int create_event_filter(struct trace_event_call *call,
char *filter_str, bool set_str,
struct event_filter **filterp);
extern void free_event_filter(struct event_filter *filter);
struct ftrace_event_field *
-trace_find_event_field(struct ftrace_event_call *call, char *name);
+trace_find_event_field(struct trace_event_call *call, char *name);
extern void trace_event_enable_cmd_record(bool enable);
extern int event_trace_add_tracer(struct dentry *parent, struct trace_array *tr);
extern int event_trace_del_tracer(struct trace_array *tr);
-extern struct ftrace_event_file *find_event_file(struct trace_array *tr,
- const char *system,
- const char *event);
+extern struct trace_event_file *find_event_file(struct trace_array *tr,
+ const char *system,
+ const char *event);
static inline void *event_file_data(struct file *filp)
{
@@ -1180,7 +1181,7 @@ struct event_trigger_ops {
* commands need to do this if they themselves log to the trace
* buffer (see the @post_trigger() member below). @trigger_type
* values are defined by adding new values to the trigger_type
- * enum in include/linux/ftrace_event.h.
+ * enum in include/linux/trace_events.h.
*
* @post_trigger: A flag that says whether or not this command needs
* to have its action delayed until after the current event has
@@ -1242,23 +1243,23 @@ struct event_command {
enum event_trigger_type trigger_type;
bool post_trigger;
int (*func)(struct event_command *cmd_ops,
- struct ftrace_event_file *file,
+ struct trace_event_file *file,
char *glob, char *cmd, char *params);
int (*reg)(char *glob,
struct event_trigger_ops *ops,
struct event_trigger_data *data,
- struct ftrace_event_file *file);
+ struct trace_event_file *file);
void (*unreg)(char *glob,
struct event_trigger_ops *ops,
struct event_trigger_data *data,
- struct ftrace_event_file *file);
+ struct trace_event_file *file);
int (*set_filter)(char *filter_str,
struct event_trigger_data *data,
- struct ftrace_event_file *file);
+ struct trace_event_file *file);
struct event_trigger_ops *(*get_trigger_ops)(char *cmd, char *param);
};
-extern int trace_event_enable_disable(struct ftrace_event_file *file,
+extern int trace_event_enable_disable(struct trace_event_file *file,
int enable, int soft_disable);
extern int tracing_alloc_snapshot(void);
@@ -1286,7 +1287,7 @@ int set_tracer_flag(struct trace_array *tr, unsigned int mask, int enabled);
#undef FTRACE_ENTRY
#define FTRACE_ENTRY(call, struct_name, id, tstruct, print, filter) \
- extern struct ftrace_event_call \
+ extern struct trace_event_call \
__aligned(4) event_##call;
#undef FTRACE_ENTRY_DUP
#define FTRACE_ENTRY_DUP(call, struct_name, id, tstruct, print, filter) \
@@ -1295,7 +1296,7 @@ int set_tracer_flag(struct trace_array *tr, unsigned int mask, int enabled);
#include "trace_entries.h"
#if defined(CONFIG_PERF_EVENTS) && defined(CONFIG_FUNCTION_TRACER)
-int perf_ftrace_event_register(struct ftrace_event_call *call,
+int perf_ftrace_event_register(struct trace_event_call *call,
enum trace_reg type, void *data);
#else
#define perf_ftrace_event_register NULL
@@ -1309,8 +1310,10 @@ static inline void init_ftrace_syscalls(void) { }
#ifdef CONFIG_EVENT_TRACING
void trace_event_init(void);
+void trace_event_enum_update(struct trace_enum_map **map, int len);
#else
static inline void __init trace_event_init(void) { }
+static inline void trace_event_enum_update(struct trace_enum_map **map, int len) { }
#endif
extern struct trace_iterator *tracepoint_print_iter;
diff --git a/kernel/trace/trace_branch.c b/kernel/trace/trace_branch.c
index 57cbf1efdd44..e2e12ad3186f 100644
--- a/kernel/trace/trace_branch.c
+++ b/kernel/trace/trace_branch.c
@@ -29,16 +29,19 @@ static struct trace_array *branch_tracer;
static void
probe_likely_condition(struct ftrace_branch_data *f, int val, int expect)
{
- struct ftrace_event_call *call = &event_branch;
+ struct trace_event_call *call = &event_branch;
struct trace_array *tr = branch_tracer;
struct trace_array_cpu *data;
struct ring_buffer_event *event;
struct trace_branch *entry;
struct ring_buffer *buffer;
unsigned long flags;
- int cpu, pc;
+ int pc;
const char *p;
+ if (current->trace_recursion & TRACE_BRANCH_BIT)
+ return;
+
/*
* I would love to save just the ftrace_likely_data pointer, but
* this code can also be used by modules. Ugly things can happen
@@ -49,10 +52,10 @@ probe_likely_condition(struct ftrace_branch_data *f, int val, int expect)
if (unlikely(!tr))
return;
- local_irq_save(flags);
- cpu = raw_smp_processor_id();
- data = per_cpu_ptr(tr->trace_buffer.data, cpu);
- if (atomic_inc_return(&data->disabled) != 1)
+ raw_local_irq_save(flags);
+ current->trace_recursion |= TRACE_BRANCH_BIT;
+ data = this_cpu_ptr(tr->trace_buffer.data);
+ if (atomic_read(&data->disabled))
goto out;
pc = preempt_count();
@@ -81,8 +84,8 @@ probe_likely_condition(struct ftrace_branch_data *f, int val, int expect)
__buffer_unlock_commit(buffer, event);
out:
- atomic_dec(&data->disabled);
- local_irq_restore(flags);
+ current->trace_recursion &= ~TRACE_BRANCH_BIT;
+ raw_local_irq_restore(flags);
}
static inline
@@ -191,7 +194,7 @@ __init static int init_branch_tracer(void)
{
int ret;
- ret = register_ftrace_event(&trace_branch_event);
+ ret = register_trace_event(&trace_branch_event);
if (!ret) {
printk(KERN_WARNING "Warning: could not register "
"branch events\n");
diff --git a/kernel/trace/trace_clock.c b/kernel/trace/trace_clock.c
index 57b67b1f24d1..0f06532a755b 100644
--- a/kernel/trace/trace_clock.c
+++ b/kernel/trace/trace_clock.c
@@ -56,6 +56,7 @@ u64 notrace trace_clock(void)
{
return local_clock();
}
+EXPORT_SYMBOL_GPL(trace_clock);
/*
* trace_jiffy_clock(): Simply use jiffies as a clock counter.
@@ -68,6 +69,7 @@ u64 notrace trace_clock_jiffies(void)
{
return jiffies_64_to_clock_t(jiffies_64 - INITIAL_JIFFIES);
}
+EXPORT_SYMBOL_GPL(trace_clock_jiffies);
/*
* trace_clock_global(): special globally coherent trace clock
@@ -123,6 +125,7 @@ u64 notrace trace_clock_global(void)
return now;
}
+EXPORT_SYMBOL_GPL(trace_clock_global);
static atomic64_t trace_counter;
diff --git a/kernel/trace/trace_entries.h b/kernel/trace/trace_entries.h
index e2d027ac66a2..ee7b94a4810a 100644
--- a/kernel/trace/trace_entries.h
+++ b/kernel/trace/trace_entries.h
@@ -223,7 +223,7 @@ FTRACE_ENTRY(bprint, bprint_entry,
__dynamic_array( u32, buf )
),
- F_printk("%pf: %s",
+ F_printk("%ps: %s",
(void *)__entry->ip, __entry->fmt),
FILTER_OTHER
@@ -238,7 +238,7 @@ FTRACE_ENTRY(print, print_entry,
__dynamic_array( char, buf )
),
- F_printk("%pf: %s",
+ F_printk("%ps: %s",
(void *)__entry->ip, __entry->buf),
FILTER_OTHER
@@ -253,7 +253,7 @@ FTRACE_ENTRY(bputs, bputs_entry,
__field( const char *, str )
),
- F_printk("%pf: %s",
+ F_printk("%ps: %s",
(void *)__entry->ip, __entry->str),
FILTER_OTHER
diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c
index 6fa484de2ba1..abfc903e741e 100644
--- a/kernel/trace/trace_event_perf.c
+++ b/kernel/trace/trace_event_perf.c
@@ -21,7 +21,7 @@ typedef typeof(unsigned long [PERF_MAX_TRACE_SIZE / sizeof(unsigned long)])
/* Count the events in use (per event id, not per instance) */
static int total_ref_count;
-static int perf_trace_event_perm(struct ftrace_event_call *tp_event,
+static int perf_trace_event_perm(struct trace_event_call *tp_event,
struct perf_event *p_event)
{
if (tp_event->perf_perm) {
@@ -83,7 +83,7 @@ static int perf_trace_event_perm(struct ftrace_event_call *tp_event,
return 0;
}
-static int perf_trace_event_reg(struct ftrace_event_call *tp_event,
+static int perf_trace_event_reg(struct trace_event_call *tp_event,
struct perf_event *p_event)
{
struct hlist_head __percpu *list;
@@ -143,7 +143,7 @@ fail:
static void perf_trace_event_unreg(struct perf_event *p_event)
{
- struct ftrace_event_call *tp_event = p_event->tp_event;
+ struct trace_event_call *tp_event = p_event->tp_event;
int i;
if (--tp_event->perf_refcount > 0)
@@ -172,17 +172,17 @@ out:
static int perf_trace_event_open(struct perf_event *p_event)
{
- struct ftrace_event_call *tp_event = p_event->tp_event;
+ struct trace_event_call *tp_event = p_event->tp_event;
return tp_event->class->reg(tp_event, TRACE_REG_PERF_OPEN, p_event);
}
static void perf_trace_event_close(struct perf_event *p_event)
{
- struct ftrace_event_call *tp_event = p_event->tp_event;
+ struct trace_event_call *tp_event = p_event->tp_event;
tp_event->class->reg(tp_event, TRACE_REG_PERF_CLOSE, p_event);
}
-static int perf_trace_event_init(struct ftrace_event_call *tp_event,
+static int perf_trace_event_init(struct trace_event_call *tp_event,
struct perf_event *p_event)
{
int ret;
@@ -206,7 +206,7 @@ static int perf_trace_event_init(struct ftrace_event_call *tp_event,
int perf_trace_init(struct perf_event *p_event)
{
- struct ftrace_event_call *tp_event;
+ struct trace_event_call *tp_event;
u64 event_id = p_event->attr.config;
int ret = -EINVAL;
@@ -236,7 +236,7 @@ void perf_trace_destroy(struct perf_event *p_event)
int perf_trace_add(struct perf_event *p_event, int flags)
{
- struct ftrace_event_call *tp_event = p_event->tp_event;
+ struct trace_event_call *tp_event = p_event->tp_event;
struct hlist_head __percpu *pcpu_list;
struct hlist_head *list;
@@ -255,7 +255,7 @@ int perf_trace_add(struct perf_event *p_event, int flags)
void perf_trace_del(struct perf_event *p_event, int flags)
{
- struct ftrace_event_call *tp_event = p_event->tp_event;
+ struct trace_event_call *tp_event = p_event->tp_event;
hlist_del_rcu(&p_event->hlist_entry);
tp_event->class->reg(tp_event, TRACE_REG_PERF_DEL, p_event);
}
@@ -357,7 +357,7 @@ static void perf_ftrace_function_disable(struct perf_event *event)
ftrace_function_local_disable(&event->ftrace_ops);
}
-int perf_ftrace_event_register(struct ftrace_event_call *call,
+int perf_ftrace_event_register(struct trace_event_call *call,
enum trace_reg type, void *data)
{
switch (type) {
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index db54dda10ccc..404a372ad85a 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -13,7 +13,7 @@
#include <linux/workqueue.h>
#include <linux/spinlock.h>
#include <linux/kthread.h>
-#include <linux/debugfs.h>
+#include <linux/tracefs.h>
#include <linux/uaccess.h>
#include <linux/module.h>
#include <linux/ctype.h>
@@ -61,14 +61,14 @@ static int system_refcount_dec(struct event_subsystem *system)
#define do_for_each_event_file_safe(tr, file) \
list_for_each_entry(tr, &ftrace_trace_arrays, list) { \
- struct ftrace_event_file *___n; \
+ struct trace_event_file *___n; \
list_for_each_entry_safe(file, ___n, &tr->events, list)
#define while_for_each_event_file() \
}
static struct list_head *
-trace_get_fields(struct ftrace_event_call *event_call)
+trace_get_fields(struct trace_event_call *event_call)
{
if (!event_call->class->get_fields)
return &event_call->class->fields;
@@ -89,7 +89,7 @@ __find_event_field(struct list_head *head, char *name)
}
struct ftrace_event_field *
-trace_find_event_field(struct ftrace_event_call *call, char *name)
+trace_find_event_field(struct trace_event_call *call, char *name)
{
struct ftrace_event_field *field;
struct list_head *head;
@@ -129,7 +129,7 @@ static int __trace_define_field(struct list_head *head, const char *type,
return 0;
}
-int trace_define_field(struct ftrace_event_call *call, const char *type,
+int trace_define_field(struct trace_event_call *call, const char *type,
const char *name, int offset, int size, int is_signed,
int filter_type)
{
@@ -166,7 +166,7 @@ static int trace_define_common_fields(void)
return ret;
}
-static void trace_destroy_fields(struct ftrace_event_call *call)
+static void trace_destroy_fields(struct trace_event_call *call)
{
struct ftrace_event_field *field, *next;
struct list_head *head;
@@ -178,11 +178,11 @@ static void trace_destroy_fields(struct ftrace_event_call *call)
}
}
-int trace_event_raw_init(struct ftrace_event_call *call)
+int trace_event_raw_init(struct trace_event_call *call)
{
int id;
- id = register_ftrace_event(&call->event);
+ id = register_trace_event(&call->event);
if (!id)
return -ENODEV;
@@ -190,18 +190,18 @@ int trace_event_raw_init(struct ftrace_event_call *call)
}
EXPORT_SYMBOL_GPL(trace_event_raw_init);
-void *ftrace_event_buffer_reserve(struct ftrace_event_buffer *fbuffer,
- struct ftrace_event_file *ftrace_file,
- unsigned long len)
+void *trace_event_buffer_reserve(struct trace_event_buffer *fbuffer,
+ struct trace_event_file *trace_file,
+ unsigned long len)
{
- struct ftrace_event_call *event_call = ftrace_file->event_call;
+ struct trace_event_call *event_call = trace_file->event_call;
local_save_flags(fbuffer->flags);
fbuffer->pc = preempt_count();
- fbuffer->ftrace_file = ftrace_file;
+ fbuffer->trace_file = trace_file;
fbuffer->event =
- trace_event_buffer_lock_reserve(&fbuffer->buffer, ftrace_file,
+ trace_event_buffer_lock_reserve(&fbuffer->buffer, trace_file,
event_call->event.type, len,
fbuffer->flags, fbuffer->pc);
if (!fbuffer->event)
@@ -210,13 +210,13 @@ void *ftrace_event_buffer_reserve(struct ftrace_event_buffer *fbuffer,
fbuffer->entry = ring_buffer_event_data(fbuffer->event);
return fbuffer->entry;
}
-EXPORT_SYMBOL_GPL(ftrace_event_buffer_reserve);
+EXPORT_SYMBOL_GPL(trace_event_buffer_reserve);
static DEFINE_SPINLOCK(tracepoint_iter_lock);
-static void output_printk(struct ftrace_event_buffer *fbuffer)
+static void output_printk(struct trace_event_buffer *fbuffer)
{
- struct ftrace_event_call *event_call;
+ struct trace_event_call *event_call;
struct trace_event *event;
unsigned long flags;
struct trace_iterator *iter = tracepoint_print_iter;
@@ -224,12 +224,12 @@ static void output_printk(struct ftrace_event_buffer *fbuffer)
if (!iter)
return;
- event_call = fbuffer->ftrace_file->event_call;
+ event_call = fbuffer->trace_file->event_call;
if (!event_call || !event_call->event.funcs ||
!event_call->event.funcs->trace)
return;
- event = &fbuffer->ftrace_file->event_call->event;
+ event = &fbuffer->trace_file->event_call->event;
spin_lock_irqsave(&tracepoint_iter_lock, flags);
trace_seq_init(&iter->seq);
@@ -241,21 +241,21 @@ static void output_printk(struct ftrace_event_buffer *fbuffer)
spin_unlock_irqrestore(&tracepoint_iter_lock, flags);
}
-void ftrace_event_buffer_commit(struct ftrace_event_buffer *fbuffer)
+void trace_event_buffer_commit(struct trace_event_buffer *fbuffer)
{
if (tracepoint_printk)
output_printk(fbuffer);
- event_trigger_unlock_commit(fbuffer->ftrace_file, fbuffer->buffer,
+ event_trigger_unlock_commit(fbuffer->trace_file, fbuffer->buffer,
fbuffer->event, fbuffer->entry,
fbuffer->flags, fbuffer->pc);
}
-EXPORT_SYMBOL_GPL(ftrace_event_buffer_commit);
+EXPORT_SYMBOL_GPL(trace_event_buffer_commit);
-int ftrace_event_reg(struct ftrace_event_call *call,
- enum trace_reg type, void *data)
+int trace_event_reg(struct trace_event_call *call,
+ enum trace_reg type, void *data)
{
- struct ftrace_event_file *file = data;
+ struct trace_event_file *file = data;
WARN_ON(!(call->flags & TRACE_EVENT_FL_TRACEPOINT));
switch (type) {
@@ -288,34 +288,34 @@ int ftrace_event_reg(struct ftrace_event_call *call,
}
return 0;
}
-EXPORT_SYMBOL_GPL(ftrace_event_reg);
+EXPORT_SYMBOL_GPL(trace_event_reg);
void trace_event_enable_cmd_record(bool enable)
{
- struct ftrace_event_file *file;
+ struct trace_event_file *file;
struct trace_array *tr;
mutex_lock(&event_mutex);
do_for_each_event_file(tr, file) {
- if (!(file->flags & FTRACE_EVENT_FL_ENABLED))
+ if (!(file->flags & EVENT_FILE_FL_ENABLED))
continue;
if (enable) {
tracing_start_cmdline_record();
- set_bit(FTRACE_EVENT_FL_RECORDED_CMD_BIT, &file->flags);
+ set_bit(EVENT_FILE_FL_RECORDED_CMD_BIT, &file->flags);
} else {
tracing_stop_cmdline_record();
- clear_bit(FTRACE_EVENT_FL_RECORDED_CMD_BIT, &file->flags);
+ clear_bit(EVENT_FILE_FL_RECORDED_CMD_BIT, &file->flags);
}
} while_for_each_event_file();
mutex_unlock(&event_mutex);
}
-static int __ftrace_event_enable_disable(struct ftrace_event_file *file,
+static int __ftrace_event_enable_disable(struct trace_event_file *file,
int enable, int soft_disable)
{
- struct ftrace_event_call *call = file->event_call;
+ struct trace_event_call *call = file->event_call;
int ret = 0;
int disable;
@@ -337,24 +337,24 @@ static int __ftrace_event_enable_disable(struct ftrace_event_file *file,
if (soft_disable) {
if (atomic_dec_return(&file->sm_ref) > 0)
break;
- disable = file->flags & FTRACE_EVENT_FL_SOFT_DISABLED;
- clear_bit(FTRACE_EVENT_FL_SOFT_MODE_BIT, &file->flags);
+ disable = file->flags & EVENT_FILE_FL_SOFT_DISABLED;
+ clear_bit(EVENT_FILE_FL_SOFT_MODE_BIT, &file->flags);
} else
- disable = !(file->flags & FTRACE_EVENT_FL_SOFT_MODE);
+ disable = !(file->flags & EVENT_FILE_FL_SOFT_MODE);
- if (disable && (file->flags & FTRACE_EVENT_FL_ENABLED)) {
- clear_bit(FTRACE_EVENT_FL_ENABLED_BIT, &file->flags);
- if (file->flags & FTRACE_EVENT_FL_RECORDED_CMD) {
+ if (disable && (file->flags & EVENT_FILE_FL_ENABLED)) {
+ clear_bit(EVENT_FILE_FL_ENABLED_BIT, &file->flags);
+ if (file->flags & EVENT_FILE_FL_RECORDED_CMD) {
tracing_stop_cmdline_record();
- clear_bit(FTRACE_EVENT_FL_RECORDED_CMD_BIT, &file->flags);
+ clear_bit(EVENT_FILE_FL_RECORDED_CMD_BIT, &file->flags);
}
call->class->reg(call, TRACE_REG_UNREGISTER, file);
}
/* If in SOFT_MODE, just set the SOFT_DISABLE_BIT, else clear it */
- if (file->flags & FTRACE_EVENT_FL_SOFT_MODE)
- set_bit(FTRACE_EVENT_FL_SOFT_DISABLED_BIT, &file->flags);
+ if (file->flags & EVENT_FILE_FL_SOFT_MODE)
+ set_bit(EVENT_FILE_FL_SOFT_DISABLED_BIT, &file->flags);
else
- clear_bit(FTRACE_EVENT_FL_SOFT_DISABLED_BIT, &file->flags);
+ clear_bit(EVENT_FILE_FL_SOFT_DISABLED_BIT, &file->flags);
break;
case 1:
/*
@@ -366,31 +366,31 @@ static int __ftrace_event_enable_disable(struct ftrace_event_file *file,
* it still seems to be disabled.
*/
if (!soft_disable)
- clear_bit(FTRACE_EVENT_FL_SOFT_DISABLED_BIT, &file->flags);
+ clear_bit(EVENT_FILE_FL_SOFT_DISABLED_BIT, &file->flags);
else {
if (atomic_inc_return(&file->sm_ref) > 1)
break;
- set_bit(FTRACE_EVENT_FL_SOFT_MODE_BIT, &file->flags);
+ set_bit(EVENT_FILE_FL_SOFT_MODE_BIT, &file->flags);
}
- if (!(file->flags & FTRACE_EVENT_FL_ENABLED)) {
+ if (!(file->flags & EVENT_FILE_FL_ENABLED)) {
/* Keep the event disabled, when going to SOFT_MODE. */
if (soft_disable)
- set_bit(FTRACE_EVENT_FL_SOFT_DISABLED_BIT, &file->flags);
+ set_bit(EVENT_FILE_FL_SOFT_DISABLED_BIT, &file->flags);
if (trace_flags & TRACE_ITER_RECORD_CMD) {
tracing_start_cmdline_record();
- set_bit(FTRACE_EVENT_FL_RECORDED_CMD_BIT, &file->flags);
+ set_bit(EVENT_FILE_FL_RECORDED_CMD_BIT, &file->flags);
}
ret = call->class->reg(call, TRACE_REG_REGISTER, file);
if (ret) {
tracing_stop_cmdline_record();
pr_info("event trace: Could not enable event "
- "%s\n", ftrace_event_name(call));
+ "%s\n", trace_event_name(call));
break;
}
- set_bit(FTRACE_EVENT_FL_ENABLED_BIT, &file->flags);
+ set_bit(EVENT_FILE_FL_ENABLED_BIT, &file->flags);
/* WAS_ENABLED gets set but never cleared. */
call->flags |= TRACE_EVENT_FL_WAS_ENABLED;
@@ -401,13 +401,13 @@ static int __ftrace_event_enable_disable(struct ftrace_event_file *file,
return ret;
}
-int trace_event_enable_disable(struct ftrace_event_file *file,
+int trace_event_enable_disable(struct trace_event_file *file,
int enable, int soft_disable)
{
return __ftrace_event_enable_disable(file, enable, soft_disable);
}
-static int ftrace_event_enable_disable(struct ftrace_event_file *file,
+static int ftrace_event_enable_disable(struct trace_event_file *file,
int enable)
{
return __ftrace_event_enable_disable(file, enable, 0);
@@ -415,7 +415,7 @@ static int ftrace_event_enable_disable(struct ftrace_event_file *file,
static void ftrace_clear_events(struct trace_array *tr)
{
- struct ftrace_event_file *file;
+ struct trace_event_file *file;
mutex_lock(&event_mutex);
list_for_each_entry(file, &tr->events, list) {
@@ -449,14 +449,14 @@ static void __get_system(struct event_subsystem *system)
system_refcount_inc(system);
}
-static void __get_system_dir(struct ftrace_subsystem_dir *dir)
+static void __get_system_dir(struct trace_subsystem_dir *dir)
{
WARN_ON_ONCE(dir->ref_count == 0);
dir->ref_count++;
__get_system(dir->subsystem);
}
-static void __put_system_dir(struct ftrace_subsystem_dir *dir)
+static void __put_system_dir(struct trace_subsystem_dir *dir)
{
WARN_ON_ONCE(dir->ref_count == 0);
/* If the subsystem is about to be freed, the dir must be too */
@@ -467,26 +467,26 @@ static void __put_system_dir(struct ftrace_subsystem_dir *dir)
kfree(dir);
}
-static void put_system(struct ftrace_subsystem_dir *dir)
+static void put_system(struct trace_subsystem_dir *dir)
{
mutex_lock(&event_mutex);
__put_system_dir(dir);
mutex_unlock(&event_mutex);
}
-static void remove_subsystem(struct ftrace_subsystem_dir *dir)
+static void remove_subsystem(struct trace_subsystem_dir *dir)
{
if (!dir)
return;
if (!--dir->nr_events) {
- debugfs_remove_recursive(dir->entry);
+ tracefs_remove_recursive(dir->entry);
list_del(&dir->list);
__put_system_dir(dir);
}
}
-static void remove_event_file_dir(struct ftrace_event_file *file)
+static void remove_event_file_dir(struct trace_event_file *file)
{
struct dentry *dir = file->dir;
struct dentry *child;
@@ -494,12 +494,12 @@ static void remove_event_file_dir(struct ftrace_event_file *file)
if (dir) {
spin_lock(&dir->d_lock); /* probably unneeded */
list_for_each_entry(child, &dir->d_subdirs, d_child) {
- if (child->d_inode) /* probably unneeded */
- child->d_inode->i_private = NULL;
+ if (d_really_is_positive(child)) /* probably unneeded */
+ d_inode(child)->i_private = NULL;
}
spin_unlock(&dir->d_lock);
- debugfs_remove_recursive(dir);
+ tracefs_remove_recursive(dir);
}
list_del(&file->list);
@@ -515,15 +515,15 @@ static int
__ftrace_set_clr_event_nolock(struct trace_array *tr, const char *match,
const char *sub, const char *event, int set)
{
- struct ftrace_event_file *file;
- struct ftrace_event_call *call;
+ struct trace_event_file *file;
+ struct trace_event_call *call;
const char *name;
int ret = -EINVAL;
list_for_each_entry(file, &tr->events, list) {
call = file->event_call;
- name = ftrace_event_name(call);
+ name = trace_event_name(call);
if (!name || !call->class || !call->class->reg)
continue;
@@ -565,6 +565,7 @@ static int __ftrace_set_clr_event(struct trace_array *tr, const char *match,
static int ftrace_set_clr_event(struct trace_array *tr, char *buf, int set)
{
char *event = NULL, *sub = NULL, *match;
+ int ret;
/*
* The buf format can be <subsystem>:<event-name>
@@ -590,7 +591,13 @@ static int ftrace_set_clr_event(struct trace_array *tr, char *buf, int set)
event = NULL;
}
- return __ftrace_set_clr_event(tr, match, sub, event, set);
+ ret = __ftrace_set_clr_event(tr, match, sub, event, set);
+
+ /* Put back the colon to allow this to be called again */
+ if (buf)
+ *(buf - 1) = ':';
+
+ return ret;
}
/**
@@ -664,8 +671,8 @@ ftrace_event_write(struct file *file, const char __user *ubuf,
static void *
t_next(struct seq_file *m, void *v, loff_t *pos)
{
- struct ftrace_event_file *file = v;
- struct ftrace_event_call *call;
+ struct trace_event_file *file = v;
+ struct trace_event_call *call;
struct trace_array *tr = m->private;
(*pos)++;
@@ -685,13 +692,13 @@ t_next(struct seq_file *m, void *v, loff_t *pos)
static void *t_start(struct seq_file *m, loff_t *pos)
{
- struct ftrace_event_file *file;
+ struct trace_event_file *file;
struct trace_array *tr = m->private;
loff_t l;
mutex_lock(&event_mutex);
- file = list_entry(&tr->events, struct ftrace_event_file, list);
+ file = list_entry(&tr->events, struct trace_event_file, list);
for (l = 0; l <= *pos; ) {
file = t_next(m, file, &l);
if (!file)
@@ -703,13 +710,13 @@ static void *t_start(struct seq_file *m, loff_t *pos)
static void *
s_next(struct seq_file *m, void *v, loff_t *pos)
{
- struct ftrace_event_file *file = v;
+ struct trace_event_file *file = v;
struct trace_array *tr = m->private;
(*pos)++;
list_for_each_entry_continue(file, &tr->events, list) {
- if (file->flags & FTRACE_EVENT_FL_ENABLED)
+ if (file->flags & EVENT_FILE_FL_ENABLED)
return file;
}
@@ -718,13 +725,13 @@ s_next(struct seq_file *m, void *v, loff_t *pos)
static void *s_start(struct seq_file *m, loff_t *pos)
{
- struct ftrace_event_file *file;
+ struct trace_event_file *file;
struct trace_array *tr = m->private;
loff_t l;
mutex_lock(&event_mutex);
- file = list_entry(&tr->events, struct ftrace_event_file, list);
+ file = list_entry(&tr->events, struct trace_event_file, list);
for (l = 0; l <= *pos; ) {
file = s_next(m, file, &l);
if (!file)
@@ -735,12 +742,12 @@ static void *s_start(struct seq_file *m, loff_t *pos)
static int t_show(struct seq_file *m, void *v)
{
- struct ftrace_event_file *file = v;
- struct ftrace_event_call *call = file->event_call;
+ struct trace_event_file *file = v;
+ struct trace_event_call *call = file->event_call;
if (strcmp(call->class->system, TRACE_SYSTEM) != 0)
seq_printf(m, "%s:", call->class->system);
- seq_printf(m, "%s\n", ftrace_event_name(call));
+ seq_printf(m, "%s\n", trace_event_name(call));
return 0;
}
@@ -754,7 +761,7 @@ static ssize_t
event_enable_read(struct file *filp, char __user *ubuf, size_t cnt,
loff_t *ppos)
{
- struct ftrace_event_file *file;
+ struct trace_event_file *file;
unsigned long flags;
char buf[4] = "0";
@@ -767,12 +774,12 @@ event_enable_read(struct file *filp, char __user *ubuf, size_t cnt,
if (!file)
return -ENODEV;
- if (flags & FTRACE_EVENT_FL_ENABLED &&
- !(flags & FTRACE_EVENT_FL_SOFT_DISABLED))
+ if (flags & EVENT_FILE_FL_ENABLED &&
+ !(flags & EVENT_FILE_FL_SOFT_DISABLED))
strcpy(buf, "1");
- if (flags & FTRACE_EVENT_FL_SOFT_DISABLED ||
- flags & FTRACE_EVENT_FL_SOFT_MODE)
+ if (flags & EVENT_FILE_FL_SOFT_DISABLED ||
+ flags & EVENT_FILE_FL_SOFT_MODE)
strcat(buf, "*");
strcat(buf, "\n");
@@ -784,7 +791,7 @@ static ssize_t
event_enable_write(struct file *filp, const char __user *ubuf, size_t cnt,
loff_t *ppos)
{
- struct ftrace_event_file *file;
+ struct trace_event_file *file;
unsigned long val;
int ret;
@@ -821,10 +828,10 @@ system_enable_read(struct file *filp, char __user *ubuf, size_t cnt,
loff_t *ppos)
{
const char set_to_char[4] = { '?', '0', '1', 'X' };
- struct ftrace_subsystem_dir *dir = filp->private_data;
+ struct trace_subsystem_dir *dir = filp->private_data;
struct event_subsystem *system = dir->subsystem;
- struct ftrace_event_call *call;
- struct ftrace_event_file *file;
+ struct trace_event_call *call;
+ struct trace_event_file *file;
struct trace_array *tr = dir->tr;
char buf[2];
int set = 0;
@@ -833,7 +840,7 @@ system_enable_read(struct file *filp, char __user *ubuf, size_t cnt,
mutex_lock(&event_mutex);
list_for_each_entry(file, &tr->events, list) {
call = file->event_call;
- if (!ftrace_event_name(call) || !call->class || !call->class->reg)
+ if (!trace_event_name(call) || !call->class || !call->class->reg)
continue;
if (system && strcmp(call->class->system, system->name) != 0)
@@ -844,7 +851,7 @@ system_enable_read(struct file *filp, char __user *ubuf, size_t cnt,
* or if all events or cleared, or if we have
* a mixture.
*/
- set |= (1 << !!(file->flags & FTRACE_EVENT_FL_ENABLED));
+ set |= (1 << !!(file->flags & EVENT_FILE_FL_ENABLED));
/*
* If we have a mixture, no need to look further.
@@ -866,7 +873,7 @@ static ssize_t
system_enable_write(struct file *filp, const char __user *ubuf, size_t cnt,
loff_t *ppos)
{
- struct ftrace_subsystem_dir *dir = filp->private_data;
+ struct trace_subsystem_dir *dir = filp->private_data;
struct event_subsystem *system = dir->subsystem;
const char *name = NULL;
unsigned long val;
@@ -910,7 +917,7 @@ enum {
static void *f_next(struct seq_file *m, void *v, loff_t *pos)
{
- struct ftrace_event_call *call = event_file_data(m->private);
+ struct trace_event_call *call = event_file_data(m->private);
struct list_head *common_head = &ftrace_common_fields;
struct list_head *head = trace_get_fields(call);
struct list_head *node = v;
@@ -942,13 +949,13 @@ static void *f_next(struct seq_file *m, void *v, loff_t *pos)
static int f_show(struct seq_file *m, void *v)
{
- struct ftrace_event_call *call = event_file_data(m->private);
+ struct trace_event_call *call = event_file_data(m->private);
struct ftrace_event_field *field;
const char *array_descriptor;
switch ((unsigned long)v) {
case FORMAT_HEADER:
- seq_printf(m, "name: %s\n", ftrace_event_name(call));
+ seq_printf(m, "name: %s\n", trace_event_name(call));
seq_printf(m, "ID: %d\n", call->event.type);
seq_puts(m, "format:\n");
return 0;
@@ -1055,7 +1062,7 @@ static ssize_t
event_filter_read(struct file *filp, char __user *ubuf, size_t cnt,
loff_t *ppos)
{
- struct ftrace_event_file *file;
+ struct trace_event_file *file;
struct trace_seq *s;
int r = -ENODEV;
@@ -1088,7 +1095,7 @@ static ssize_t
event_filter_write(struct file *filp, const char __user *ubuf, size_t cnt,
loff_t *ppos)
{
- struct ftrace_event_file *file;
+ struct trace_event_file *file;
char *buf;
int err = -ENODEV;
@@ -1125,7 +1132,7 @@ static LIST_HEAD(event_subsystems);
static int subsystem_open(struct inode *inode, struct file *filp)
{
struct event_subsystem *system = NULL;
- struct ftrace_subsystem_dir *dir = NULL; /* Initialize for gcc */
+ struct trace_subsystem_dir *dir = NULL; /* Initialize for gcc */
struct trace_array *tr;
int ret;
@@ -1174,7 +1181,7 @@ static int subsystem_open(struct inode *inode, struct file *filp)
static int system_tr_open(struct inode *inode, struct file *filp)
{
- struct ftrace_subsystem_dir *dir;
+ struct trace_subsystem_dir *dir;
struct trace_array *tr = inode->i_private;
int ret;
@@ -1207,7 +1214,7 @@ static int system_tr_open(struct inode *inode, struct file *filp)
static int subsystem_release(struct inode *inode, struct file *file)
{
- struct ftrace_subsystem_dir *dir = file->private_data;
+ struct trace_subsystem_dir *dir = file->private_data;
trace_array_put(dir->tr);
@@ -1228,7 +1235,7 @@ static ssize_t
subsystem_filter_read(struct file *filp, char __user *ubuf, size_t cnt,
loff_t *ppos)
{
- struct ftrace_subsystem_dir *dir = filp->private_data;
+ struct trace_subsystem_dir *dir = filp->private_data;
struct event_subsystem *system = dir->subsystem;
struct trace_seq *s;
int r;
@@ -1255,7 +1262,7 @@ static ssize_t
subsystem_filter_write(struct file *filp, const char __user *ubuf, size_t cnt,
loff_t *ppos)
{
- struct ftrace_subsystem_dir *dir = filp->private_data;
+ struct trace_subsystem_dir *dir = filp->private_data;
char *buf;
int err;
@@ -1490,9 +1497,9 @@ create_new_subsystem(const char *name)
static struct dentry *
event_subsystem_dir(struct trace_array *tr, const char *name,
- struct ftrace_event_file *file, struct dentry *parent)
+ struct trace_event_file *file, struct dentry *parent)
{
- struct ftrace_subsystem_dir *dir;
+ struct trace_subsystem_dir *dir;
struct event_subsystem *system;
struct dentry *entry;
@@ -1526,7 +1533,7 @@ event_subsystem_dir(struct trace_array *tr, const char *name,
} else
__get_system(system);
- dir->entry = debugfs_create_dir(name, parent);
+ dir->entry = tracefs_create_dir(name, parent);
if (!dir->entry) {
pr_warn("Failed to create system directory %s\n", name);
__put_system(system);
@@ -1539,12 +1546,12 @@ event_subsystem_dir(struct trace_array *tr, const char *name,
dir->subsystem = system;
file->system = dir;
- entry = debugfs_create_file("filter", 0644, dir->entry, dir,
+ entry = tracefs_create_file("filter", 0644, dir->entry, dir,
&ftrace_subsystem_filter_fops);
if (!entry) {
kfree(system->filter);
system->filter = NULL;
- pr_warn("Could not create debugfs '%s/filter' entry\n", name);
+ pr_warn("Could not create tracefs '%s/filter' entry\n", name);
}
trace_create_file("enable", 0644, dir->entry, dir,
@@ -1564,9 +1571,9 @@ event_subsystem_dir(struct trace_array *tr, const char *name,
}
static int
-event_create_dir(struct dentry *parent, struct ftrace_event_file *file)
+event_create_dir(struct dentry *parent, struct trace_event_file *file)
{
- struct ftrace_event_call *call = file->event_call;
+ struct trace_event_call *call = file->event_call;
struct trace_array *tr = file->tr;
struct list_head *head;
struct dentry *d_events;
@@ -1584,10 +1591,10 @@ event_create_dir(struct dentry *parent, struct ftrace_event_file *file)
} else
d_events = parent;
- name = ftrace_event_name(call);
- file->dir = debugfs_create_dir(name, d_events);
+ name = trace_event_name(call);
+ file->dir = tracefs_create_dir(name, d_events);
if (!file->dir) {
- pr_warn("Could not create debugfs '%s' directory\n", name);
+ pr_warn("Could not create tracefs '%s' directory\n", name);
return -1;
}
@@ -1627,9 +1634,9 @@ event_create_dir(struct dentry *parent, struct ftrace_event_file *file)
return 0;
}
-static void remove_event_from_tracers(struct ftrace_event_call *call)
+static void remove_event_from_tracers(struct trace_event_call *call)
{
- struct ftrace_event_file *file;
+ struct trace_event_file *file;
struct trace_array *tr;
do_for_each_event_file_safe(tr, file) {
@@ -1647,10 +1654,10 @@ static void remove_event_from_tracers(struct ftrace_event_call *call)
} while_for_each_event_file();
}
-static void event_remove(struct ftrace_event_call *call)
+static void event_remove(struct trace_event_call *call)
{
struct trace_array *tr;
- struct ftrace_event_file *file;
+ struct trace_event_file *file;
do_for_each_event_file(tr, file) {
if (file->event_call != call)
@@ -1666,17 +1673,17 @@ static void event_remove(struct ftrace_event_call *call)
} while_for_each_event_file();
if (call->event.funcs)
- __unregister_ftrace_event(&call->event);
+ __unregister_trace_event(&call->event);
remove_event_from_tracers(call);
list_del(&call->list);
}
-static int event_init(struct ftrace_event_call *call)
+static int event_init(struct trace_event_call *call)
{
int ret = 0;
const char *name;
- name = ftrace_event_name(call);
+ name = trace_event_name(call);
if (WARN_ON(!name))
return -EINVAL;
@@ -1690,7 +1697,7 @@ static int event_init(struct ftrace_event_call *call)
}
static int
-__register_event(struct ftrace_event_call *call, struct module *mod)
+__register_event(struct trace_event_call *call, struct module *mod)
{
int ret;
@@ -1704,11 +1711,136 @@ __register_event(struct ftrace_event_call *call, struct module *mod)
return 0;
}
-static struct ftrace_event_file *
-trace_create_new_event(struct ftrace_event_call *call,
+static char *enum_replace(char *ptr, struct trace_enum_map *map, int len)
+{
+ int rlen;
+ int elen;
+
+ /* Find the length of the enum value as a string */
+ elen = snprintf(ptr, 0, "%ld", map->enum_value);
+ /* Make sure there's enough room to replace the string with the value */
+ if (len < elen)
+ return NULL;
+
+ snprintf(ptr, elen + 1, "%ld", map->enum_value);
+
+ /* Get the rest of the string of ptr */
+ rlen = strlen(ptr + len);
+ memmove(ptr + elen, ptr + len, rlen);
+ /* Make sure we end the new string */
+ ptr[elen + rlen] = 0;
+
+ return ptr + elen;
+}
+
+static void update_event_printk(struct trace_event_call *call,
+ struct trace_enum_map *map)
+{
+ char *ptr;
+ int quote = 0;
+ int len = strlen(map->enum_string);
+
+ for (ptr = call->print_fmt; *ptr; ptr++) {
+ if (*ptr == '\\') {
+ ptr++;
+ /* paranoid */
+ if (!*ptr)
+ break;
+ continue;
+ }
+ if (*ptr == '"') {
+ quote ^= 1;
+ continue;
+ }
+ if (quote)
+ continue;
+ if (isdigit(*ptr)) {
+ /* skip numbers */
+ do {
+ ptr++;
+ /* Check for alpha chars like ULL */
+ } while (isalnum(*ptr));
+ if (!*ptr)
+ break;
+ /*
+ * A number must have some kind of delimiter after
+ * it, and we can ignore that too.
+ */
+ continue;
+ }
+ if (isalpha(*ptr) || *ptr == '_') {
+ if (strncmp(map->enum_string, ptr, len) == 0 &&
+ !isalnum(ptr[len]) && ptr[len] != '_') {
+ ptr = enum_replace(ptr, map, len);
+ /* Hmm, enum string smaller than value */
+ if (WARN_ON_ONCE(!ptr))
+ return;
+ /*
+ * No need to decrement here, as enum_replace()
+ * returns the pointer to the character passed
+ * the enum, and two enums can not be placed
+ * back to back without something in between.
+ * We can skip that something in between.
+ */
+ continue;
+ }
+ skip_more:
+ do {
+ ptr++;
+ } while (isalnum(*ptr) || *ptr == '_');
+ if (!*ptr)
+ break;
+ /*
+ * If what comes after this variable is a '.' or
+ * '->' then we can continue to ignore that string.
+ */
+ if (*ptr == '.' || (ptr[0] == '-' && ptr[1] == '>')) {
+ ptr += *ptr == '.' ? 1 : 2;
+ if (!*ptr)
+ break;
+ goto skip_more;
+ }
+ /*
+ * Once again, we can skip the delimiter that came
+ * after the string.
+ */
+ continue;
+ }
+ }
+}
+
+void trace_event_enum_update(struct trace_enum_map **map, int len)
+{
+ struct trace_event_call *call, *p;
+ const char *last_system = NULL;
+ int last_i;
+ int i;
+
+ down_write(&trace_event_sem);
+ list_for_each_entry_safe(call, p, &ftrace_events, list) {
+ /* events are usually grouped together with systems */
+ if (!last_system || call->class->system != last_system) {
+ last_i = 0;
+ last_system = call->class->system;
+ }
+
+ for (i = last_i; i < len; i++) {
+ if (call->class->system == map[i]->system) {
+ /* Save the first system if need be */
+ if (!last_i)
+ last_i = i;
+ update_event_printk(call, map[i]);
+ }
+ }
+ }
+ up_write(&trace_event_sem);
+}
+
+static struct trace_event_file *
+trace_create_new_event(struct trace_event_call *call,
struct trace_array *tr)
{
- struct ftrace_event_file *file;
+ struct trace_event_file *file;
file = kmem_cache_alloc(file_cachep, GFP_TRACE);
if (!file)
@@ -1726,9 +1858,9 @@ trace_create_new_event(struct ftrace_event_call *call,
/* Add an event to a trace directory */
static int
-__trace_add_new_event(struct ftrace_event_call *call, struct trace_array *tr)
+__trace_add_new_event(struct trace_event_call *call, struct trace_array *tr)
{
- struct ftrace_event_file *file;
+ struct trace_event_file *file;
file = trace_create_new_event(call, tr);
if (!file)
@@ -1743,10 +1875,10 @@ __trace_add_new_event(struct ftrace_event_call *call, struct trace_array *tr)
* the filesystem is initialized.
*/
static __init int
-__trace_early_add_new_event(struct ftrace_event_call *call,
+__trace_early_add_new_event(struct trace_event_call *call,
struct trace_array *tr)
{
- struct ftrace_event_file *file;
+ struct trace_event_file *file;
file = trace_create_new_event(call, tr);
if (!file)
@@ -1756,10 +1888,10 @@ __trace_early_add_new_event(struct ftrace_event_call *call,
}
struct ftrace_module_file_ops;
-static void __add_event_to_tracers(struct ftrace_event_call *call);
+static void __add_event_to_tracers(struct trace_event_call *call);
/* Add an additional event_call dynamically */
-int trace_add_event_call(struct ftrace_event_call *call)
+int trace_add_event_call(struct trace_event_call *call)
{
int ret;
mutex_lock(&trace_types_lock);
@@ -1778,7 +1910,7 @@ int trace_add_event_call(struct ftrace_event_call *call)
* Must be called under locking of trace_types_lock, event_mutex and
* trace_event_sem.
*/
-static void __trace_remove_event_call(struct ftrace_event_call *call)
+static void __trace_remove_event_call(struct trace_event_call *call)
{
event_remove(call);
trace_destroy_fields(call);
@@ -1786,10 +1918,10 @@ static void __trace_remove_event_call(struct ftrace_event_call *call)
call->filter = NULL;
}
-static int probe_remove_event_call(struct ftrace_event_call *call)
+static int probe_remove_event_call(struct trace_event_call *call)
{
struct trace_array *tr;
- struct ftrace_event_file *file;
+ struct trace_event_file *file;
#ifdef CONFIG_PERF_EVENTS
if (call->perf_refcount)
@@ -1800,10 +1932,10 @@ static int probe_remove_event_call(struct ftrace_event_call *call)
continue;
/*
* We can't rely on ftrace_event_enable_disable(enable => 0)
- * we are going to do, FTRACE_EVENT_FL_SOFT_MODE can suppress
+ * we are going to do, EVENT_FILE_FL_SOFT_MODE can suppress
* TRACE_REG_UNREGISTER.
*/
- if (file->flags & FTRACE_EVENT_FL_ENABLED)
+ if (file->flags & EVENT_FILE_FL_ENABLED)
return -EBUSY;
/*
* The do_for_each_event_file_safe() is
@@ -1820,7 +1952,7 @@ static int probe_remove_event_call(struct ftrace_event_call *call)
}
/* Remove an event_call */
-int trace_remove_event_call(struct ftrace_event_call *call)
+int trace_remove_event_call(struct trace_event_call *call)
{
int ret;
@@ -1844,7 +1976,7 @@ int trace_remove_event_call(struct ftrace_event_call *call)
static void trace_module_add_events(struct module *mod)
{
- struct ftrace_event_call **call, **start, **end;
+ struct trace_event_call **call, **start, **end;
if (!mod->num_trace_events)
return;
@@ -1867,7 +1999,7 @@ static void trace_module_add_events(struct module *mod)
static void trace_module_remove_events(struct module *mod)
{
- struct ftrace_event_call *call, *p;
+ struct trace_event_call *call, *p;
bool clear_trace = false;
down_write(&trace_event_sem);
@@ -1915,7 +2047,7 @@ static int trace_module_notify(struct notifier_block *self,
static struct notifier_block trace_module_nb = {
.notifier_call = trace_module_notify,
- .priority = 0,
+ .priority = 1, /* higher than trace.c module notify */
};
#endif /* CONFIG_MODULES */
@@ -1923,28 +2055,28 @@ static struct notifier_block trace_module_nb = {
static void
__trace_add_event_dirs(struct trace_array *tr)
{
- struct ftrace_event_call *call;
+ struct trace_event_call *call;
int ret;
list_for_each_entry(call, &ftrace_events, list) {
ret = __trace_add_new_event(call, tr);
if (ret < 0)
pr_warn("Could not create directory for event %s\n",
- ftrace_event_name(call));
+ trace_event_name(call));
}
}
-struct ftrace_event_file *
+struct trace_event_file *
find_event_file(struct trace_array *tr, const char *system, const char *event)
{
- struct ftrace_event_file *file;
- struct ftrace_event_call *call;
+ struct trace_event_file *file;
+ struct trace_event_call *call;
const char *name;
list_for_each_entry(file, &tr->events, list) {
call = file->event_call;
- name = ftrace_event_name(call);
+ name = trace_event_name(call);
if (!name || !call->class || !call->class->reg)
continue;
@@ -1966,7 +2098,7 @@ find_event_file(struct trace_array *tr, const char *system, const char *event)
#define DISABLE_EVENT_STR "disable_event"
struct event_probe_data {
- struct ftrace_event_file *file;
+ struct trace_event_file *file;
unsigned long count;
int ref;
bool enable;
@@ -1982,9 +2114,9 @@ event_enable_probe(unsigned long ip, unsigned long parent_ip, void **_data)
return;
if (data->enable)
- clear_bit(FTRACE_EVENT_FL_SOFT_DISABLED_BIT, &data->file->flags);
+ clear_bit(EVENT_FILE_FL_SOFT_DISABLED_BIT, &data->file->flags);
else
- set_bit(FTRACE_EVENT_FL_SOFT_DISABLED_BIT, &data->file->flags);
+ set_bit(EVENT_FILE_FL_SOFT_DISABLED_BIT, &data->file->flags);
}
static void
@@ -2000,7 +2132,7 @@ event_enable_count_probe(unsigned long ip, unsigned long parent_ip, void **_data
return;
/* Skip if the event is in a state we want to switch to */
- if (data->enable == !(data->file->flags & FTRACE_EVENT_FL_SOFT_DISABLED))
+ if (data->enable == !(data->file->flags & EVENT_FILE_FL_SOFT_DISABLED))
return;
if (data->count != -1)
@@ -2020,7 +2152,7 @@ event_enable_print(struct seq_file *m, unsigned long ip,
seq_printf(m, "%s:%s:%s",
data->enable ? ENABLE_EVENT_STR : DISABLE_EVENT_STR,
data->file->event_call->class->system,
- ftrace_event_name(data->file->event_call));
+ trace_event_name(data->file->event_call));
if (data->count == -1)
seq_puts(m, ":unlimited\n");
@@ -2094,7 +2226,7 @@ event_enable_func(struct ftrace_hash *hash,
char *glob, char *cmd, char *param, int enabled)
{
struct trace_array *tr = top_trace_array();
- struct ftrace_event_file *file;
+ struct trace_event_file *file;
struct ftrace_probe_ops *ops;
struct event_probe_data *data;
const char *system;
@@ -2226,16 +2358,16 @@ static inline int register_event_cmds(void) { return 0; }
#endif /* CONFIG_DYNAMIC_FTRACE */
/*
- * The top level array has already had its ftrace_event_file
+ * The top level array has already had its trace_event_file
* descriptors created in order to allow for early events to
- * be recorded. This function is called after the debugfs has been
+ * be recorded. This function is called after the tracefs has been
* initialized, and we now have to create the files associated
* to the events.
*/
static __init void
__trace_early_add_event_dirs(struct trace_array *tr)
{
- struct ftrace_event_file *file;
+ struct trace_event_file *file;
int ret;
@@ -2243,7 +2375,7 @@ __trace_early_add_event_dirs(struct trace_array *tr)
ret = event_create_dir(tr->event_dir, file);
if (ret < 0)
pr_warn("Could not create directory for event %s\n",
- ftrace_event_name(file->event_call));
+ trace_event_name(file->event_call));
}
}
@@ -2256,7 +2388,7 @@ __trace_early_add_event_dirs(struct trace_array *tr)
static __init void
__trace_early_add_events(struct trace_array *tr)
{
- struct ftrace_event_call *call;
+ struct trace_event_call *call;
int ret;
list_for_each_entry(call, &ftrace_events, list) {
@@ -2267,7 +2399,7 @@ __trace_early_add_events(struct trace_array *tr)
ret = __trace_early_add_new_event(call, tr);
if (ret < 0)
pr_warn("Could not create early event %s\n",
- ftrace_event_name(call));
+ trace_event_name(call));
}
}
@@ -2275,13 +2407,13 @@ __trace_early_add_events(struct trace_array *tr)
static void
__trace_remove_event_dirs(struct trace_array *tr)
{
- struct ftrace_event_file *file, *next;
+ struct trace_event_file *file, *next;
list_for_each_entry_safe(file, next, &tr->events, list)
remove_event_file_dir(file);
}
-static void __add_event_to_tracers(struct ftrace_event_call *call)
+static void __add_event_to_tracers(struct trace_event_call *call)
{
struct trace_array *tr;
@@ -2289,8 +2421,8 @@ static void __add_event_to_tracers(struct ftrace_event_call *call)
__trace_add_new_event(call, tr);
}
-extern struct ftrace_event_call *__start_ftrace_events[];
-extern struct ftrace_event_call *__stop_ftrace_events[];
+extern struct trace_event_call *__start_ftrace_events[];
+extern struct trace_event_call *__stop_ftrace_events[];
static char bootup_event_buf[COMMAND_LINE_SIZE] __initdata;
@@ -2311,16 +2443,16 @@ create_event_toplevel_files(struct dentry *parent, struct trace_array *tr)
struct dentry *d_events;
struct dentry *entry;
- entry = debugfs_create_file("set_event", 0644, parent,
+ entry = tracefs_create_file("set_event", 0644, parent,
tr, &ftrace_set_event_fops);
if (!entry) {
- pr_warn("Could not create debugfs 'set_event' entry\n");
+ pr_warn("Could not create tracefs 'set_event' entry\n");
return -ENOMEM;
}
- d_events = debugfs_create_dir("events", parent);
+ d_events = tracefs_create_dir("events", parent);
if (!d_events) {
- pr_warn("Could not create debugfs 'events' directory\n");
+ pr_warn("Could not create tracefs 'events' directory\n");
return -ENOMEM;
}
@@ -2412,7 +2544,7 @@ int event_trace_del_tracer(struct trace_array *tr)
down_write(&trace_event_sem);
__trace_remove_event_dirs(tr);
- debugfs_remove_recursive(tr->event_dir);
+ tracefs_remove_recursive(tr->event_dir);
up_write(&trace_event_sem);
tr->event_dir = NULL;
@@ -2425,7 +2557,7 @@ int event_trace_del_tracer(struct trace_array *tr)
static __init int event_trace_memsetup(void)
{
field_cachep = KMEM_CACHE(ftrace_event_field, SLAB_PANIC);
- file_cachep = KMEM_CACHE(ftrace_event_file, SLAB_PANIC);
+ file_cachep = KMEM_CACHE(trace_event_file, SLAB_PANIC);
return 0;
}
@@ -2461,7 +2593,7 @@ early_enable_events(struct trace_array *tr, bool disable_first)
static __init int event_trace_enable(void)
{
struct trace_array *tr = top_trace_array();
- struct ftrace_event_call **iter, *call;
+ struct trace_event_call **iter, *call;
int ret;
if (!tr)
@@ -2534,10 +2666,10 @@ static __init int event_trace_init(void)
if (IS_ERR(d_tracer))
return 0;
- entry = debugfs_create_file("available_events", 0444, d_tracer,
+ entry = tracefs_create_file("available_events", 0444, d_tracer,
tr, &ftrace_avail_fops);
if (!entry)
- pr_warn("Could not create debugfs 'available_events' entry\n");
+ pr_warn("Could not create tracefs 'available_events' entry\n");
if (trace_define_common_fields())
pr_warn("tracing: Failed to allocate common fields");
@@ -2622,9 +2754,9 @@ static __init void event_test_stuff(void)
*/
static __init void event_trace_self_tests(void)
{
- struct ftrace_subsystem_dir *dir;
- struct ftrace_event_file *file;
- struct ftrace_event_call *call;
+ struct trace_subsystem_dir *dir;
+ struct trace_event_file *file;
+ struct trace_event_call *call;
struct event_subsystem *system;
struct trace_array *tr;
int ret;
@@ -2655,13 +2787,13 @@ static __init void event_trace_self_tests(void)
continue;
#endif
- pr_info("Testing event %s: ", ftrace_event_name(call));
+ pr_info("Testing event %s: ", trace_event_name(call));
/*
* If an event is already enabled, someone is using
* it and the self test should not be on.
*/
- if (file->flags & FTRACE_EVENT_FL_ENABLED) {
+ if (file->flags & EVENT_FILE_FL_ENABLED) {
pr_warn("Enabled event during self test!\n");
WARN_ON_ONCE(1);
continue;
diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index ced69da0ff55..d81d6f302b14 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -643,7 +643,7 @@ static void append_filter_err(struct filter_parse_state *ps,
free_page((unsigned long) buf);
}
-static inline struct event_filter *event_filter(struct ftrace_event_file *file)
+static inline struct event_filter *event_filter(struct trace_event_file *file)
{
if (file->event_call->flags & TRACE_EVENT_FL_USE_CALL_FILTER)
return file->event_call->filter;
@@ -652,7 +652,7 @@ static inline struct event_filter *event_filter(struct ftrace_event_file *file)
}
/* caller must hold event_mutex */
-void print_event_filter(struct ftrace_event_file *file, struct trace_seq *s)
+void print_event_filter(struct trace_event_file *file, struct trace_seq *s)
{
struct event_filter *filter = event_filter(file);
@@ -780,14 +780,14 @@ static void __free_preds(struct event_filter *filter)
filter->n_preds = 0;
}
-static void filter_disable(struct ftrace_event_file *file)
+static void filter_disable(struct trace_event_file *file)
{
- struct ftrace_event_call *call = file->event_call;
+ struct trace_event_call *call = file->event_call;
if (call->flags & TRACE_EVENT_FL_USE_CALL_FILTER)
call->flags &= ~TRACE_EVENT_FL_FILTERED;
else
- file->flags &= ~FTRACE_EVENT_FL_FILTERED;
+ file->flags &= ~EVENT_FILE_FL_FILTERED;
}
static void __free_filter(struct event_filter *filter)
@@ -837,9 +837,9 @@ static int __alloc_preds(struct event_filter *filter, int n_preds)
return 0;
}
-static inline void __remove_filter(struct ftrace_event_file *file)
+static inline void __remove_filter(struct trace_event_file *file)
{
- struct ftrace_event_call *call = file->event_call;
+ struct trace_event_call *call = file->event_call;
filter_disable(file);
if (call->flags & TRACE_EVENT_FL_USE_CALL_FILTER)
@@ -848,10 +848,10 @@ static inline void __remove_filter(struct ftrace_event_file *file)
remove_filter_string(file->filter);
}
-static void filter_free_subsystem_preds(struct ftrace_subsystem_dir *dir,
+static void filter_free_subsystem_preds(struct trace_subsystem_dir *dir,
struct trace_array *tr)
{
- struct ftrace_event_file *file;
+ struct trace_event_file *file;
list_for_each_entry(file, &tr->events, list) {
if (file->system != dir)
@@ -860,9 +860,9 @@ static void filter_free_subsystem_preds(struct ftrace_subsystem_dir *dir,
}
}
-static inline void __free_subsystem_filter(struct ftrace_event_file *file)
+static inline void __free_subsystem_filter(struct trace_event_file *file)
{
- struct ftrace_event_call *call = file->event_call;
+ struct trace_event_call *call = file->event_call;
if (call->flags & TRACE_EVENT_FL_USE_CALL_FILTER) {
__free_filter(call->filter);
@@ -873,10 +873,10 @@ static inline void __free_subsystem_filter(struct ftrace_event_file *file)
}
}
-static void filter_free_subsystem_filters(struct ftrace_subsystem_dir *dir,
+static void filter_free_subsystem_filters(struct trace_subsystem_dir *dir,
struct trace_array *tr)
{
- struct ftrace_event_file *file;
+ struct trace_event_file *file;
list_for_each_entry(file, &tr->events, list) {
if (file->system != dir)
@@ -1056,6 +1056,9 @@ static void parse_init(struct filter_parse_state *ps,
static char infix_next(struct filter_parse_state *ps)
{
+ if (!ps->infix.cnt)
+ return 0;
+
ps->infix.cnt--;
return ps->infix.string[ps->infix.tail++];
@@ -1071,6 +1074,9 @@ static char infix_peek(struct filter_parse_state *ps)
static void infix_advance(struct filter_parse_state *ps)
{
+ if (!ps->infix.cnt)
+ return;
+
ps->infix.cnt--;
ps->infix.tail++;
}
@@ -1336,7 +1342,7 @@ parse_operand:
}
static struct filter_pred *create_pred(struct filter_parse_state *ps,
- struct ftrace_event_call *call,
+ struct trace_event_call *call,
int op, char *operand1, char *operand2)
{
struct ftrace_event_field *field;
@@ -1369,19 +1375,28 @@ static int check_preds(struct filter_parse_state *ps)
{
int n_normal_preds = 0, n_logical_preds = 0;
struct postfix_elt *elt;
+ int cnt = 0;
list_for_each_entry(elt, &ps->postfix, list) {
- if (elt->op == OP_NONE)
+ if (elt->op == OP_NONE) {
+ cnt++;
continue;
+ }
if (elt->op == OP_AND || elt->op == OP_OR) {
n_logical_preds++;
+ cnt--;
continue;
}
+ if (elt->op != OP_NOT)
+ cnt--;
n_normal_preds++;
+ /* all ops should have operands */
+ if (cnt < 0)
+ break;
}
- if (!n_normal_preds || n_logical_preds >= n_normal_preds) {
+ if (cnt != 1 || !n_normal_preds || n_logical_preds >= n_normal_preds) {
parse_error(ps, FILT_ERR_INVALID_FILTER, 0);
return -EINVAL;
}
@@ -1549,7 +1564,7 @@ static int fold_pred_tree(struct event_filter *filter,
filter->preds);
}
-static int replace_preds(struct ftrace_event_call *call,
+static int replace_preds(struct trace_event_call *call,
struct event_filter *filter,
struct filter_parse_state *ps,
bool dry_run)
@@ -1662,20 +1677,20 @@ fail:
return err;
}
-static inline void event_set_filtered_flag(struct ftrace_event_file *file)
+static inline void event_set_filtered_flag(struct trace_event_file *file)
{
- struct ftrace_event_call *call = file->event_call;
+ struct trace_event_call *call = file->event_call;
if (call->flags & TRACE_EVENT_FL_USE_CALL_FILTER)
call->flags |= TRACE_EVENT_FL_FILTERED;
else
- file->flags |= FTRACE_EVENT_FL_FILTERED;
+ file->flags |= EVENT_FILE_FL_FILTERED;
}
-static inline void event_set_filter(struct ftrace_event_file *file,
+static inline void event_set_filter(struct trace_event_file *file,
struct event_filter *filter)
{
- struct ftrace_event_call *call = file->event_call;
+ struct trace_event_call *call = file->event_call;
if (call->flags & TRACE_EVENT_FL_USE_CALL_FILTER)
rcu_assign_pointer(call->filter, filter);
@@ -1683,9 +1698,9 @@ static inline void event_set_filter(struct ftrace_event_file *file,
rcu_assign_pointer(file->filter, filter);
}
-static inline void event_clear_filter(struct ftrace_event_file *file)
+static inline void event_clear_filter(struct trace_event_file *file)
{
- struct ftrace_event_call *call = file->event_call;
+ struct trace_event_call *call = file->event_call;
if (call->flags & TRACE_EVENT_FL_USE_CALL_FILTER)
RCU_INIT_POINTER(call->filter, NULL);
@@ -1694,33 +1709,33 @@ static inline void event_clear_filter(struct ftrace_event_file *file)
}
static inline void
-event_set_no_set_filter_flag(struct ftrace_event_file *file)
+event_set_no_set_filter_flag(struct trace_event_file *file)
{
- struct ftrace_event_call *call = file->event_call;
+ struct trace_event_call *call = file->event_call;
if (call->flags & TRACE_EVENT_FL_USE_CALL_FILTER)
call->flags |= TRACE_EVENT_FL_NO_SET_FILTER;
else
- file->flags |= FTRACE_EVENT_FL_NO_SET_FILTER;
+ file->flags |= EVENT_FILE_FL_NO_SET_FILTER;
}
static inline void
-event_clear_no_set_filter_flag(struct ftrace_event_file *file)
+event_clear_no_set_filter_flag(struct trace_event_file *file)
{
- struct ftrace_event_call *call = file->event_call;
+ struct trace_event_call *call = file->event_call;
if (call->flags & TRACE_EVENT_FL_USE_CALL_FILTER)
call->flags &= ~TRACE_EVENT_FL_NO_SET_FILTER;
else
- file->flags &= ~FTRACE_EVENT_FL_NO_SET_FILTER;
+ file->flags &= ~EVENT_FILE_FL_NO_SET_FILTER;
}
static inline bool
-event_no_set_filter_flag(struct ftrace_event_file *file)
+event_no_set_filter_flag(struct trace_event_file *file)
{
- struct ftrace_event_call *call = file->event_call;
+ struct trace_event_call *call = file->event_call;
- if (file->flags & FTRACE_EVENT_FL_NO_SET_FILTER)
+ if (file->flags & EVENT_FILE_FL_NO_SET_FILTER)
return true;
if ((call->flags & TRACE_EVENT_FL_USE_CALL_FILTER) &&
@@ -1735,12 +1750,12 @@ struct filter_list {
struct event_filter *filter;
};
-static int replace_system_preds(struct ftrace_subsystem_dir *dir,
+static int replace_system_preds(struct trace_subsystem_dir *dir,
struct trace_array *tr,
struct filter_parse_state *ps,
char *filter_string)
{
- struct ftrace_event_file *file;
+ struct trace_event_file *file;
struct filter_list *filter_item;
struct filter_list *tmp;
LIST_HEAD(filter_list);
@@ -1884,8 +1899,8 @@ static void create_filter_finish(struct filter_parse_state *ps)
}
/**
- * create_filter - create a filter for a ftrace_event_call
- * @call: ftrace_event_call to create a filter for
+ * create_filter - create a filter for a trace_event_call
+ * @call: trace_event_call to create a filter for
* @filter_str: filter string
* @set_str: remember @filter_str and enable detailed error in filter
* @filterp: out param for created filter (always updated on return)
@@ -1899,7 +1914,7 @@ static void create_filter_finish(struct filter_parse_state *ps)
* information if @set_str is %true and the caller is responsible for
* freeing it.
*/
-static int create_filter(struct ftrace_event_call *call,
+static int create_filter(struct trace_event_call *call,
char *filter_str, bool set_str,
struct event_filter **filterp)
{
@@ -1919,7 +1934,7 @@ static int create_filter(struct ftrace_event_call *call,
return err;
}
-int create_event_filter(struct ftrace_event_call *call,
+int create_event_filter(struct trace_event_call *call,
char *filter_str, bool set_str,
struct event_filter **filterp)
{
@@ -1935,7 +1950,7 @@ int create_event_filter(struct ftrace_event_call *call,
* Identical to create_filter() except that it creates a subsystem filter
* and always remembers @filter_str.
*/
-static int create_system_filter(struct ftrace_subsystem_dir *dir,
+static int create_system_filter(struct trace_subsystem_dir *dir,
struct trace_array *tr,
char *filter_str, struct event_filter **filterp)
{
@@ -1961,9 +1976,9 @@ static int create_system_filter(struct ftrace_subsystem_dir *dir,
}
/* caller must hold event_mutex */
-int apply_event_filter(struct ftrace_event_file *file, char *filter_string)
+int apply_event_filter(struct trace_event_file *file, char *filter_string)
{
- struct ftrace_event_call *call = file->event_call;
+ struct trace_event_call *call = file->event_call;
struct event_filter *filter;
int err;
@@ -2012,7 +2027,7 @@ int apply_event_filter(struct ftrace_event_file *file, char *filter_string)
return err;
}
-int apply_subsystem_event_filter(struct ftrace_subsystem_dir *dir,
+int apply_subsystem_event_filter(struct trace_subsystem_dir *dir,
char *filter_string)
{
struct event_subsystem *system = dir->subsystem;
@@ -2075,7 +2090,7 @@ struct function_filter_data {
static char **
ftrace_function_filter_re(char *buf, int len, int *count)
{
- char *str, *sep, **re;
+ char *str, **re;
str = kstrndup(buf, len, GFP_KERNEL);
if (!str)
@@ -2085,8 +2100,7 @@ ftrace_function_filter_re(char *buf, int len, int *count)
* The argv_split function takes white space
* as a separator, so convert ',' into spaces.
*/
- while ((sep = strchr(str, ',')))
- *sep = ' ';
+ strreplace(str, ',', ' ');
re = argv_split(GFP_KERNEL, str, count);
kfree(str);
@@ -2212,7 +2226,7 @@ int ftrace_profile_set_filter(struct perf_event *event, int event_id,
{
int err;
struct event_filter *filter;
- struct ftrace_event_call *call;
+ struct trace_event_call *call;
mutex_lock(&event_mutex);
@@ -2268,7 +2282,7 @@ out_unlock:
static struct test_filter_data_t {
char *filter;
- struct ftrace_raw_ftrace_test_filter rec;
+ struct trace_event_raw_ftrace_test_filter rec;
int match;
char *not_visited;
} test_filter_data[] = {
diff --git a/kernel/trace/trace_events_trigger.c b/kernel/trace/trace_events_trigger.c
index 8712df9decb4..42a4009fd75a 100644
--- a/kernel/trace/trace_events_trigger.c
+++ b/kernel/trace/trace_events_trigger.c
@@ -40,7 +40,7 @@ trigger_data_free(struct event_trigger_data *data)
/**
* event_triggers_call - Call triggers associated with a trace event
- * @file: The ftrace_event_file associated with the event
+ * @file: The trace_event_file associated with the event
* @rec: The trace entry for the event, NULL for unconditional invocation
*
* For each trigger associated with an event, invoke the trigger
@@ -63,7 +63,7 @@ trigger_data_free(struct event_trigger_data *data)
* any trigger that should be deferred, ETT_NONE if nothing to defer.
*/
enum event_trigger_type
-event_triggers_call(struct ftrace_event_file *file, void *rec)
+event_triggers_call(struct trace_event_file *file, void *rec)
{
struct event_trigger_data *data;
enum event_trigger_type tt = ETT_NONE;
@@ -92,7 +92,7 @@ EXPORT_SYMBOL_GPL(event_triggers_call);
/**
* event_triggers_post_call - Call 'post_triggers' for a trace event
- * @file: The ftrace_event_file associated with the event
+ * @file: The trace_event_file associated with the event
* @tt: enum event_trigger_type containing a set bit for each trigger to invoke
*
* For each trigger associated with an event, invoke the trigger
@@ -103,7 +103,7 @@ EXPORT_SYMBOL_GPL(event_triggers_call);
* Called from tracepoint handlers (with rcu_read_lock_sched() held).
*/
void
-event_triggers_post_call(struct ftrace_event_file *file,
+event_triggers_post_call(struct trace_event_file *file,
enum event_trigger_type tt)
{
struct event_trigger_data *data;
@@ -119,7 +119,7 @@ EXPORT_SYMBOL_GPL(event_triggers_post_call);
static void *trigger_next(struct seq_file *m, void *t, loff_t *pos)
{
- struct ftrace_event_file *event_file = event_file_data(m->private);
+ struct trace_event_file *event_file = event_file_data(m->private);
if (t == SHOW_AVAILABLE_TRIGGERS)
return NULL;
@@ -129,7 +129,7 @@ static void *trigger_next(struct seq_file *m, void *t, loff_t *pos)
static void *trigger_start(struct seq_file *m, loff_t *pos)
{
- struct ftrace_event_file *event_file;
+ struct trace_event_file *event_file;
/* ->stop() is called even if ->start() fails */
mutex_lock(&event_mutex);
@@ -201,7 +201,7 @@ static int event_trigger_regex_open(struct inode *inode, struct file *file)
return ret;
}
-static int trigger_process_regex(struct ftrace_event_file *file, char *buff)
+static int trigger_process_regex(struct trace_event_file *file, char *buff)
{
char *command, *next = buff;
struct event_command *p;
@@ -227,7 +227,7 @@ static ssize_t event_trigger_regex_write(struct file *file,
const char __user *ubuf,
size_t cnt, loff_t *ppos)
{
- struct ftrace_event_file *event_file;
+ struct trace_event_file *event_file;
ssize_t ret;
char *buf;
@@ -430,7 +430,7 @@ event_trigger_free(struct event_trigger_ops *ops,
trigger_data_free(data);
}
-static int trace_event_trigger_enable_disable(struct ftrace_event_file *file,
+static int trace_event_trigger_enable_disable(struct trace_event_file *file,
int trigger_enable)
{
int ret = 0;
@@ -438,12 +438,12 @@ static int trace_event_trigger_enable_disable(struct ftrace_event_file *file,
if (trigger_enable) {
if (atomic_inc_return(&file->tm_ref) > 1)
return ret;
- set_bit(FTRACE_EVENT_FL_TRIGGER_MODE_BIT, &file->flags);
+ set_bit(EVENT_FILE_FL_TRIGGER_MODE_BIT, &file->flags);
ret = trace_event_enable_disable(file, 1, 1);
} else {
if (atomic_dec_return(&file->tm_ref) > 0)
return ret;
- clear_bit(FTRACE_EVENT_FL_TRIGGER_MODE_BIT, &file->flags);
+ clear_bit(EVENT_FILE_FL_TRIGGER_MODE_BIT, &file->flags);
ret = trace_event_enable_disable(file, 0, 1);
}
@@ -466,7 +466,7 @@ static int trace_event_trigger_enable_disable(struct ftrace_event_file *file,
void
clear_event_triggers(struct trace_array *tr)
{
- struct ftrace_event_file *file;
+ struct trace_event_file *file;
list_for_each_entry(file, &tr->events, list) {
struct event_trigger_data *data;
@@ -480,7 +480,7 @@ clear_event_triggers(struct trace_array *tr)
/**
* update_cond_flag - Set or reset the TRIGGER_COND bit
- * @file: The ftrace_event_file associated with the event
+ * @file: The trace_event_file associated with the event
*
* If an event has triggers and any of those triggers has a filter or
* a post_trigger, trigger invocation needs to be deferred until after
@@ -488,7 +488,7 @@ clear_event_triggers(struct trace_array *tr)
* its TRIGGER_COND bit set, otherwise the TRIGGER_COND bit should be
* cleared.
*/
-static void update_cond_flag(struct ftrace_event_file *file)
+static void update_cond_flag(struct trace_event_file *file)
{
struct event_trigger_data *data;
bool set_cond = false;
@@ -501,9 +501,9 @@ static void update_cond_flag(struct ftrace_event_file *file)
}
if (set_cond)
- set_bit(FTRACE_EVENT_FL_TRIGGER_COND_BIT, &file->flags);
+ set_bit(EVENT_FILE_FL_TRIGGER_COND_BIT, &file->flags);
else
- clear_bit(FTRACE_EVENT_FL_TRIGGER_COND_BIT, &file->flags);
+ clear_bit(EVENT_FILE_FL_TRIGGER_COND_BIT, &file->flags);
}
/**
@@ -511,7 +511,7 @@ static void update_cond_flag(struct ftrace_event_file *file)
* @glob: The raw string used to register the trigger
* @ops: The trigger ops associated with the trigger
* @data: Trigger-specific data to associate with the trigger
- * @file: The ftrace_event_file associated with the event
+ * @file: The trace_event_file associated with the event
*
* Common implementation for event trigger registration.
*
@@ -522,7 +522,7 @@ static void update_cond_flag(struct ftrace_event_file *file)
*/
static int register_trigger(char *glob, struct event_trigger_ops *ops,
struct event_trigger_data *data,
- struct ftrace_event_file *file)
+ struct trace_event_file *file)
{
struct event_trigger_data *test;
int ret = 0;
@@ -557,7 +557,7 @@ out:
* @glob: The raw string used to register the trigger
* @ops: The trigger ops associated with the trigger
* @test: Trigger-specific data used to find the trigger to remove
- * @file: The ftrace_event_file associated with the event
+ * @file: The trace_event_file associated with the event
*
* Common implementation for event trigger unregistration.
*
@@ -566,7 +566,7 @@ out:
*/
static void unregister_trigger(char *glob, struct event_trigger_ops *ops,
struct event_trigger_data *test,
- struct ftrace_event_file *file)
+ struct trace_event_file *file)
{
struct event_trigger_data *data;
bool unregistered = false;
@@ -588,7 +588,7 @@ static void unregister_trigger(char *glob, struct event_trigger_ops *ops,
/**
* event_trigger_callback - Generic event_command @func implementation
* @cmd_ops: The command ops, used for trigger registration
- * @file: The ftrace_event_file associated with the event
+ * @file: The trace_event_file associated with the event
* @glob: The raw string used to register the trigger
* @cmd: The cmd portion of the string used to register the trigger
* @param: The params portion of the string used to register the trigger
@@ -603,7 +603,7 @@ static void unregister_trigger(char *glob, struct event_trigger_ops *ops,
*/
static int
event_trigger_callback(struct event_command *cmd_ops,
- struct ftrace_event_file *file,
+ struct trace_event_file *file,
char *glob, char *cmd, char *param)
{
struct event_trigger_data *trigger_data;
@@ -688,7 +688,7 @@ event_trigger_callback(struct event_command *cmd_ops,
* set_trigger_filter - Generic event_command @set_filter implementation
* @filter_str: The filter string for the trigger, NULL to remove filter
* @trigger_data: Trigger-specific data
- * @file: The ftrace_event_file associated with the event
+ * @file: The trace_event_file associated with the event
*
* Common implementation for event command filter parsing and filter
* instantiation.
@@ -702,7 +702,7 @@ event_trigger_callback(struct event_command *cmd_ops,
*/
static int set_trigger_filter(char *filter_str,
struct event_trigger_data *trigger_data,
- struct ftrace_event_file *file)
+ struct trace_event_file *file)
{
struct event_trigger_data *data = trigger_data;
struct event_filter *filter = NULL, *tmp;
@@ -900,7 +900,7 @@ snapshot_count_trigger(struct event_trigger_data *data)
static int
register_snapshot_trigger(char *glob, struct event_trigger_ops *ops,
struct event_trigger_data *data,
- struct ftrace_event_file *file)
+ struct trace_event_file *file)
{
int ret = register_trigger(glob, ops, data, file);
@@ -968,7 +968,7 @@ static __init int register_trigger_snapshot_cmd(void) { return 0; }
* Skip 3:
* stacktrace_trigger()
* event_triggers_post_call()
- * ftrace_raw_event_xxx()
+ * trace_event_raw_event_xxx()
*/
#define STACK_SKIP 3
@@ -1053,7 +1053,7 @@ static __init void unregister_trigger_traceon_traceoff_cmds(void)
#define DISABLE_EVENT_STR "disable_event"
struct enable_trigger_data {
- struct ftrace_event_file *file;
+ struct trace_event_file *file;
bool enable;
};
@@ -1063,9 +1063,9 @@ event_enable_trigger(struct event_trigger_data *data)
struct enable_trigger_data *enable_data = data->private_data;
if (enable_data->enable)
- clear_bit(FTRACE_EVENT_FL_SOFT_DISABLED_BIT, &enable_data->file->flags);
+ clear_bit(EVENT_FILE_FL_SOFT_DISABLED_BIT, &enable_data->file->flags);
else
- set_bit(FTRACE_EVENT_FL_SOFT_DISABLED_BIT, &enable_data->file->flags);
+ set_bit(EVENT_FILE_FL_SOFT_DISABLED_BIT, &enable_data->file->flags);
}
static void
@@ -1077,7 +1077,7 @@ event_enable_count_trigger(struct event_trigger_data *data)
return;
/* Skip if the event is in a state we want to switch to */
- if (enable_data->enable == !(enable_data->file->flags & FTRACE_EVENT_FL_SOFT_DISABLED))
+ if (enable_data->enable == !(enable_data->file->flags & EVENT_FILE_FL_SOFT_DISABLED))
return;
if (data->count != -1)
@@ -1095,7 +1095,7 @@ event_enable_trigger_print(struct seq_file *m, struct event_trigger_ops *ops,
seq_printf(m, "%s:%s:%s",
enable_data->enable ? ENABLE_EVENT_STR : DISABLE_EVENT_STR,
enable_data->file->event_call->class->system,
- ftrace_event_name(enable_data->file->event_call));
+ trace_event_name(enable_data->file->event_call));
if (data->count == -1)
seq_puts(m, ":unlimited");
@@ -1159,10 +1159,10 @@ static struct event_trigger_ops event_disable_count_trigger_ops = {
static int
event_enable_trigger_func(struct event_command *cmd_ops,
- struct ftrace_event_file *file,
+ struct trace_event_file *file,
char *glob, char *cmd, char *param)
{
- struct ftrace_event_file *event_enable_file;
+ struct trace_event_file *event_enable_file;
struct enable_trigger_data *enable_data;
struct event_trigger_data *trigger_data;
struct event_trigger_ops *trigger_ops;
@@ -1294,7 +1294,7 @@ event_enable_trigger_func(struct event_command *cmd_ops,
static int event_enable_register_trigger(char *glob,
struct event_trigger_ops *ops,
struct event_trigger_data *data,
- struct ftrace_event_file *file)
+ struct trace_event_file *file)
{
struct enable_trigger_data *enable_data = data->private_data;
struct enable_trigger_data *test_enable_data;
@@ -1331,7 +1331,7 @@ out:
static void event_enable_unregister_trigger(char *glob,
struct event_trigger_ops *ops,
struct event_trigger_data *test,
- struct ftrace_event_file *file)
+ struct trace_event_file *file)
{
struct enable_trigger_data *test_enable_data = test->private_data;
struct enable_trigger_data *enable_data;
diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c
index 12e2b99be862..adabf7da9113 100644
--- a/kernel/trace/trace_export.c
+++ b/kernel/trace/trace_export.c
@@ -125,7 +125,7 @@ static void __always_unused ____ftrace_check_##name(void) \
#undef FTRACE_ENTRY
#define FTRACE_ENTRY(name, struct_name, id, tstruct, print, filter) \
static int __init \
-ftrace_define_fields_##name(struct ftrace_event_call *event_call) \
+ftrace_define_fields_##name(struct trace_event_call *event_call) \
{ \
struct struct_name field; \
int ret; \
@@ -163,23 +163,23 @@ ftrace_define_fields_##name(struct ftrace_event_call *event_call) \
#define FTRACE_ENTRY_REG(call, struct_name, etype, tstruct, print, filter,\
regfn) \
\
-struct ftrace_event_class __refdata event_class_ftrace_##call = { \
+struct trace_event_class __refdata event_class_ftrace_##call = { \
.system = __stringify(TRACE_SYSTEM), \
.define_fields = ftrace_define_fields_##call, \
.fields = LIST_HEAD_INIT(event_class_ftrace_##call.fields),\
.reg = regfn, \
}; \
\
-struct ftrace_event_call __used event_##call = { \
+struct trace_event_call __used event_##call = { \
.class = &event_class_ftrace_##call, \
{ \
.name = #call, \
}, \
.event.type = etype, \
.print_fmt = print, \
- .flags = TRACE_EVENT_FL_IGNORE_ENABLE | TRACE_EVENT_FL_USE_CALL_FILTER, \
+ .flags = TRACE_EVENT_FL_IGNORE_ENABLE, \
}; \
-struct ftrace_event_call __used \
+struct trace_event_call __used \
__attribute__((section("_ftrace_events"))) *__event_##call = &event_##call;
#undef FTRACE_ENTRY
@@ -187,7 +187,7 @@ __attribute__((section("_ftrace_events"))) *__event_##call = &event_##call;
FTRACE_ENTRY_REG(call, struct_name, etype, \
PARAMS(tstruct), PARAMS(print), filter, NULL)
-int ftrace_event_is_function(struct ftrace_event_call *call)
+int ftrace_event_is_function(struct trace_event_call *call)
{
return call == &event_function;
}
diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
index 2d25ad1526bb..8968bf720c12 100644
--- a/kernel/trace/trace_functions_graph.c
+++ b/kernel/trace/trace_functions_graph.c
@@ -6,7 +6,6 @@
* is Copyright (c) Steven Rostedt <srostedt@redhat.com>
*
*/
-#include <linux/debugfs.h>
#include <linux/uaccess.h>
#include <linux/ftrace.h>
#include <linux/slab.h>
@@ -151,7 +150,7 @@ ftrace_push_return_trace(unsigned long ret, unsigned long func, int *depth,
* The curr_ret_stack is initialized to -1 and get increased
* in this function. So it can be less than -1 only if it was
* filtered out via ftrace_graph_notrace_addr() which can be
- * set from set_graph_notrace file in debugfs by user.
+ * set from set_graph_notrace file in tracefs by user.
*/
if (current->curr_ret_stack < -1)
return -EBUSY;
@@ -279,7 +278,7 @@ int __trace_graph_entry(struct trace_array *tr,
unsigned long flags,
int pc)
{
- struct ftrace_event_call *call = &event_funcgraph_entry;
+ struct trace_event_call *call = &event_funcgraph_entry;
struct ring_buffer_event *event;
struct ring_buffer *buffer = tr->trace_buffer.buffer;
struct ftrace_graph_ent_entry *entry;
@@ -394,7 +393,7 @@ void __trace_graph_return(struct trace_array *tr,
unsigned long flags,
int pc)
{
- struct ftrace_event_call *call = &event_funcgraph_exit;
+ struct trace_event_call *call = &event_funcgraph_exit;
struct ring_buffer_event *event;
struct ring_buffer *buffer = tr->trace_buffer.buffer;
struct ftrace_graph_ret_entry *entry;
@@ -1309,15 +1308,19 @@ void graph_trace_open(struct trace_iterator *iter)
{
/* pid and depth on the last trace processed */
struct fgraph_data *data;
+ gfp_t gfpflags;
int cpu;
iter->private = NULL;
- data = kzalloc(sizeof(*data), GFP_KERNEL);
+ /* We can be called in atomic context via ftrace_dump() */
+ gfpflags = (in_atomic() || irqs_disabled()) ? GFP_ATOMIC : GFP_KERNEL;
+
+ data = kzalloc(sizeof(*data), gfpflags);
if (!data)
goto out_err;
- data->cpu_data = alloc_percpu(struct fgraph_cpu_data);
+ data->cpu_data = alloc_percpu_gfp(struct fgraph_cpu_data, gfpflags);
if (!data->cpu_data)
goto out_err_free;
@@ -1432,7 +1435,7 @@ static const struct file_operations graph_depth_fops = {
.llseek = generic_file_llseek,
};
-static __init int init_graph_debugfs(void)
+static __init int init_graph_tracefs(void)
{
struct dentry *d_tracer;
@@ -1445,18 +1448,18 @@ static __init int init_graph_debugfs(void)
return 0;
}
-fs_initcall(init_graph_debugfs);
+fs_initcall(init_graph_tracefs);
static __init int init_graph_trace(void)
{
max_bytes_for_cpu = snprintf(NULL, 0, "%d", nr_cpu_ids - 1);
- if (!register_ftrace_event(&graph_trace_entry_event)) {
+ if (!register_trace_event(&graph_trace_entry_event)) {
pr_warning("Warning: could not register graph trace events\n");
return 1;
}
- if (!register_ftrace_event(&graph_trace_ret_event)) {
+ if (!register_trace_event(&graph_trace_ret_event)) {
pr_warning("Warning: could not register graph trace events\n");
return 1;
}
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index d73f565b4e06..c9956440d0e6 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -165,11 +165,9 @@ DEFINE_BASIC_FETCH_FUNCS(memory)
static void FETCH_FUNC_NAME(memory, string)(struct pt_regs *regs,
void *addr, void *dest)
{
- long ret;
int maxlen = get_rloc_len(*(u32 *)dest);
u8 *dst = get_rloc_data(dest);
- u8 *src = addr;
- mm_segment_t old_fs = get_fs();
+ long ret;
if (!maxlen)
return;
@@ -178,23 +176,13 @@ static void FETCH_FUNC_NAME(memory, string)(struct pt_regs *regs,
* Try to get string again, since the string can be changed while
* probing.
*/
- set_fs(KERNEL_DS);
- pagefault_disable();
-
- do
- ret = __copy_from_user_inatomic(dst++, src++, 1);
- while (dst[-1] && ret == 0 && src - (u8 *)addr < maxlen);
-
- dst[-1] = '\0';
- pagefault_enable();
- set_fs(old_fs);
+ ret = strncpy_from_unsafe(dst, addr, maxlen);
if (ret < 0) { /* Failed to fetch string */
- ((u8 *)get_rloc_data(dest))[0] = '\0';
+ dst[0] = '\0';
*(u32 *)dest = make_data_rloc(0, get_rloc_offs(*(u32 *)dest));
} else {
- *(u32 *)dest = make_data_rloc(src - (u8 *)addr,
- get_rloc_offs(*(u32 *)dest));
+ *(u32 *)dest = make_data_rloc(ret, get_rloc_offs(*(u32 *)dest));
}
}
NOKPROBE_SYMBOL(FETCH_FUNC_NAME(memory, string));
@@ -250,7 +238,7 @@ DEFINE_FETCH_symbol(string_size)
#define fetch_file_offset_string_size NULL
/* Fetch type information table */
-const struct fetch_type kprobes_fetch_type_table[] = {
+static const struct fetch_type kprobes_fetch_type_table[] = {
/* Special types */
[FETCH_TYPE_STRING] = __ASSIGN_FETCH_TYPE("string", string, string,
sizeof(u32), 1, "__data_loc char[]"),
@@ -348,7 +336,7 @@ static struct trace_kprobe *find_trace_kprobe(const char *event,
struct trace_kprobe *tk;
list_for_each_entry(tk, &probe_list, list)
- if (strcmp(ftrace_event_name(&tk->tp.call), event) == 0 &&
+ if (strcmp(trace_event_name(&tk->tp.call), event) == 0 &&
strcmp(tk->tp.call.class->system, group) == 0)
return tk;
return NULL;
@@ -359,7 +347,7 @@ static struct trace_kprobe *find_trace_kprobe(const char *event,
* if the file is NULL, enable "perf" handler, or enable "trace" handler.
*/
static int
-enable_trace_kprobe(struct trace_kprobe *tk, struct ftrace_event_file *file)
+enable_trace_kprobe(struct trace_kprobe *tk, struct trace_event_file *file)
{
int ret = 0;
@@ -394,7 +382,7 @@ enable_trace_kprobe(struct trace_kprobe *tk, struct ftrace_event_file *file)
* if the file is NULL, disable "perf" handler, or disable "trace" handler.
*/
static int
-disable_trace_kprobe(struct trace_kprobe *tk, struct ftrace_event_file *file)
+disable_trace_kprobe(struct trace_kprobe *tk, struct trace_event_file *file)
{
struct event_file_link *link = NULL;
int wait = 0;
@@ -523,7 +511,7 @@ static int register_trace_kprobe(struct trace_kprobe *tk)
mutex_lock(&probe_lock);
/* Delete old (same name) event if exist */
- old_tk = find_trace_kprobe(ftrace_event_name(&tk->tp.call),
+ old_tk = find_trace_kprobe(trace_event_name(&tk->tp.call),
tk->tp.call.class->system);
if (old_tk) {
ret = unregister_trace_kprobe(old_tk);
@@ -572,7 +560,7 @@ static int trace_kprobe_module_callback(struct notifier_block *nb,
if (ret)
pr_warning("Failed to re-register probe %s on"
"%s: %d\n",
- ftrace_event_name(&tk->tp.call),
+ trace_event_name(&tk->tp.call),
mod->name, ret);
}
}
@@ -760,7 +748,8 @@ static int create_trace_kprobe(int argc, char **argv)
/* Parse fetch argument */
ret = traceprobe_parse_probe_arg(arg, &tk->tp.size, parg,
- is_return, true);
+ is_return, true,
+ kprobes_fetch_type_table);
if (ret) {
pr_info("Parse error at argument[%d]. (%d)\n", i, ret);
goto error;
@@ -828,7 +817,7 @@ static int probes_seq_show(struct seq_file *m, void *v)
seq_putc(m, trace_kprobe_is_return(tk) ? 'r' : 'p');
seq_printf(m, ":%s/%s", tk->tp.call.class->system,
- ftrace_event_name(&tk->tp.call));
+ trace_event_name(&tk->tp.call));
if (!tk->symbol)
seq_printf(m, " 0x%p", tk->rp.kp.addr);
@@ -887,7 +876,7 @@ static int probes_profile_seq_show(struct seq_file *m, void *v)
struct trace_kprobe *tk = v;
seq_printf(m, " %-44s %15lu %15lu\n",
- ftrace_event_name(&tk->tp.call), tk->nhit,
+ trace_event_name(&tk->tp.call), tk->nhit,
tk->rp.kp.nmissed);
return 0;
@@ -916,18 +905,18 @@ static const struct file_operations kprobe_profile_ops = {
/* Kprobe handler */
static nokprobe_inline void
__kprobe_trace_func(struct trace_kprobe *tk, struct pt_regs *regs,
- struct ftrace_event_file *ftrace_file)
+ struct trace_event_file *trace_file)
{
struct kprobe_trace_entry_head *entry;
struct ring_buffer_event *event;
struct ring_buffer *buffer;
int size, dsize, pc;
unsigned long irq_flags;
- struct ftrace_event_call *call = &tk->tp.call;
+ struct trace_event_call *call = &tk->tp.call;
- WARN_ON(call != ftrace_file->event_call);
+ WARN_ON(call != trace_file->event_call);
- if (ftrace_trigger_soft_disabled(ftrace_file))
+ if (trace_trigger_soft_disabled(trace_file))
return;
local_save_flags(irq_flags);
@@ -936,7 +925,7 @@ __kprobe_trace_func(struct trace_kprobe *tk, struct pt_regs *regs,
dsize = __get_data_size(&tk->tp, regs);
size = sizeof(*entry) + tk->tp.size + dsize;
- event = trace_event_buffer_lock_reserve(&buffer, ftrace_file,
+ event = trace_event_buffer_lock_reserve(&buffer, trace_file,
call->event.type,
size, irq_flags, pc);
if (!event)
@@ -946,7 +935,7 @@ __kprobe_trace_func(struct trace_kprobe *tk, struct pt_regs *regs,
entry->ip = (unsigned long)tk->rp.kp.addr;
store_trace_args(sizeof(*entry), &tk->tp, regs, (u8 *)&entry[1], dsize);
- event_trigger_unlock_commit_regs(ftrace_file, buffer, event,
+ event_trigger_unlock_commit_regs(trace_file, buffer, event,
entry, irq_flags, pc, regs);
}
@@ -964,18 +953,18 @@ NOKPROBE_SYMBOL(kprobe_trace_func);
static nokprobe_inline void
__kretprobe_trace_func(struct trace_kprobe *tk, struct kretprobe_instance *ri,
struct pt_regs *regs,
- struct ftrace_event_file *ftrace_file)
+ struct trace_event_file *trace_file)
{
struct kretprobe_trace_entry_head *entry;
struct ring_buffer_event *event;
struct ring_buffer *buffer;
int size, pc, dsize;
unsigned long irq_flags;
- struct ftrace_event_call *call = &tk->tp.call;
+ struct trace_event_call *call = &tk->tp.call;
- WARN_ON(call != ftrace_file->event_call);
+ WARN_ON(call != trace_file->event_call);
- if (ftrace_trigger_soft_disabled(ftrace_file))
+ if (trace_trigger_soft_disabled(trace_file))
return;
local_save_flags(irq_flags);
@@ -984,7 +973,7 @@ __kretprobe_trace_func(struct trace_kprobe *tk, struct kretprobe_instance *ri,
dsize = __get_data_size(&tk->tp, regs);
size = sizeof(*entry) + tk->tp.size + dsize;
- event = trace_event_buffer_lock_reserve(&buffer, ftrace_file,
+ event = trace_event_buffer_lock_reserve(&buffer, trace_file,
call->event.type,
size, irq_flags, pc);
if (!event)
@@ -995,7 +984,7 @@ __kretprobe_trace_func(struct trace_kprobe *tk, struct kretprobe_instance *ri,
entry->ret_ip = (unsigned long)ri->ret_addr;
store_trace_args(sizeof(*entry), &tk->tp, regs, (u8 *)&entry[1], dsize);
- event_trigger_unlock_commit_regs(ftrace_file, buffer, event,
+ event_trigger_unlock_commit_regs(trace_file, buffer, event,
entry, irq_flags, pc, regs);
}
@@ -1024,7 +1013,7 @@ print_kprobe_event(struct trace_iterator *iter, int flags,
field = (struct kprobe_trace_entry_head *)iter->ent;
tp = container_of(event, struct trace_probe, call.event);
- trace_seq_printf(s, "%s: (", ftrace_event_name(&tp->call));
+ trace_seq_printf(s, "%s: (", trace_event_name(&tp->call));
if (!seq_print_ip_sym(s, field->ip, flags | TRACE_ITER_SYM_OFFSET))
goto out;
@@ -1055,7 +1044,7 @@ print_kretprobe_event(struct trace_iterator *iter, int flags,
field = (struct kretprobe_trace_entry_head *)iter->ent;
tp = container_of(event, struct trace_probe, call.event);
- trace_seq_printf(s, "%s: (", ftrace_event_name(&tp->call));
+ trace_seq_printf(s, "%s: (", trace_event_name(&tp->call));
if (!seq_print_ip_sym(s, field->ret_ip, flags | TRACE_ITER_SYM_OFFSET))
goto out;
@@ -1080,7 +1069,7 @@ print_kretprobe_event(struct trace_iterator *iter, int flags,
}
-static int kprobe_event_define_fields(struct ftrace_event_call *event_call)
+static int kprobe_event_define_fields(struct trace_event_call *event_call)
{
int ret, i;
struct kprobe_trace_entry_head field;
@@ -1103,7 +1092,7 @@ static int kprobe_event_define_fields(struct ftrace_event_call *event_call)
return 0;
}
-static int kretprobe_event_define_fields(struct ftrace_event_call *event_call)
+static int kretprobe_event_define_fields(struct trace_event_call *event_call)
{
int ret, i;
struct kretprobe_trace_entry_head field;
@@ -1133,12 +1122,16 @@ static int kretprobe_event_define_fields(struct ftrace_event_call *event_call)
static void
kprobe_perf_func(struct trace_kprobe *tk, struct pt_regs *regs)
{
- struct ftrace_event_call *call = &tk->tp.call;
+ struct trace_event_call *call = &tk->tp.call;
+ struct bpf_prog *prog = call->prog;
struct kprobe_trace_entry_head *entry;
struct hlist_head *head;
int size, __size, dsize;
int rctx;
+ if (prog && !trace_call_bpf(prog, regs))
+ return;
+
head = this_cpu_ptr(call->perf_events);
if (hlist_empty(head))
return;
@@ -1164,12 +1157,16 @@ static void
kretprobe_perf_func(struct trace_kprobe *tk, struct kretprobe_instance *ri,
struct pt_regs *regs)
{
- struct ftrace_event_call *call = &tk->tp.call;
+ struct trace_event_call *call = &tk->tp.call;
+ struct bpf_prog *prog = call->prog;
struct kretprobe_trace_entry_head *entry;
struct hlist_head *head;
int size, __size, dsize;
int rctx;
+ if (prog && !trace_call_bpf(prog, regs))
+ return;
+
head = this_cpu_ptr(call->perf_events);
if (hlist_empty(head))
return;
@@ -1197,11 +1194,11 @@ NOKPROBE_SYMBOL(kretprobe_perf_func);
* kprobe_trace_self_tests_init() does enable_trace_probe/disable_trace_probe
* lockless, but we can't race with this __init function.
*/
-static int kprobe_register(struct ftrace_event_call *event,
+static int kprobe_register(struct trace_event_call *event,
enum trace_reg type, void *data)
{
struct trace_kprobe *tk = (struct trace_kprobe *)event->data;
- struct ftrace_event_file *file = data;
+ struct trace_event_file *file = data;
switch (type) {
case TRACE_REG_REGISTER:
@@ -1267,10 +1264,10 @@ static struct trace_event_functions kprobe_funcs = {
static int register_kprobe_event(struct trace_kprobe *tk)
{
- struct ftrace_event_call *call = &tk->tp.call;
+ struct trace_event_call *call = &tk->tp.call;
int ret;
- /* Initialize ftrace_event_call */
+ /* Initialize trace_event_call */
INIT_LIST_HEAD(&call->class->fields);
if (trace_kprobe_is_return(tk)) {
call->event.funcs = &kretprobe_funcs;
@@ -1281,20 +1278,20 @@ static int register_kprobe_event(struct trace_kprobe *tk)
}
if (set_print_fmt(&tk->tp, trace_kprobe_is_return(tk)) < 0)
return -ENOMEM;
- ret = register_ftrace_event(&call->event);
+ ret = register_trace_event(&call->event);
if (!ret) {
kfree(call->print_fmt);
return -ENODEV;
}
- call->flags = 0;
+ call->flags = TRACE_EVENT_FL_KPROBE;
call->class->reg = kprobe_register;
call->data = tk;
ret = trace_add_event_call(call);
if (ret) {
pr_info("Failed to register kprobe event: %s\n",
- ftrace_event_name(call));
+ trace_event_name(call));
kfree(call->print_fmt);
- unregister_ftrace_event(&call->event);
+ unregister_trace_event(&call->event);
}
return ret;
}
@@ -1310,7 +1307,7 @@ static int unregister_kprobe_event(struct trace_kprobe *tk)
return ret;
}
-/* Make a debugfs interface for controlling probe points */
+/* Make a tracefs interface for controlling probe points */
static __init int init_kprobe_trace(void)
{
struct dentry *d_tracer;
@@ -1323,20 +1320,20 @@ static __init int init_kprobe_trace(void)
if (IS_ERR(d_tracer))
return 0;
- entry = debugfs_create_file("kprobe_events", 0644, d_tracer,
+ entry = tracefs_create_file("kprobe_events", 0644, d_tracer,
NULL, &kprobe_events_ops);
/* Event list interface */
if (!entry)
- pr_warning("Could not create debugfs "
+ pr_warning("Could not create tracefs "
"'kprobe_events' entry\n");
/* Profile interface */
- entry = debugfs_create_file("kprobe_profile", 0444, d_tracer,
+ entry = tracefs_create_file("kprobe_profile", 0444, d_tracer,
NULL, &kprobe_profile_ops);
if (!entry)
- pr_warning("Could not create debugfs "
+ pr_warning("Could not create tracefs "
"'kprobe_profile' entry\n");
return 0;
}
@@ -1355,10 +1352,10 @@ static __used int kprobe_trace_selftest_target(int a1, int a2, int a3,
return a1 + a2 + a3 + a4 + a5 + a6;
}
-static struct ftrace_event_file *
+static struct trace_event_file *
find_trace_probe_file(struct trace_kprobe *tk, struct trace_array *tr)
{
- struct ftrace_event_file *file;
+ struct trace_event_file *file;
list_for_each_entry(file, &tr->events, list)
if (file->event_call == &tk->tp.call)
@@ -1376,7 +1373,7 @@ static __init int kprobe_trace_self_tests_init(void)
int ret, warn = 0;
int (*target)(int, int, int, int, int, int);
struct trace_kprobe *tk;
- struct ftrace_event_file *file;
+ struct trace_event_file *file;
if (tracing_is_disabled())
return -ENODEV;
diff --git a/kernel/trace/trace_mmiotrace.c b/kernel/trace/trace_mmiotrace.c
index 7a9ba62e9fef..638e110c5bfd 100644
--- a/kernel/trace/trace_mmiotrace.c
+++ b/kernel/trace/trace_mmiotrace.c
@@ -298,7 +298,7 @@ static void __trace_mmiotrace_rw(struct trace_array *tr,
struct trace_array_cpu *data,
struct mmiotrace_rw *rw)
{
- struct ftrace_event_call *call = &event_mmiotrace_rw;
+ struct trace_event_call *call = &event_mmiotrace_rw;
struct ring_buffer *buffer = tr->trace_buffer.buffer;
struct ring_buffer_event *event;
struct trace_mmiotrace_rw *entry;
@@ -328,7 +328,7 @@ static void __trace_mmiotrace_map(struct trace_array *tr,
struct trace_array_cpu *data,
struct mmiotrace_map *map)
{
- struct ftrace_event_call *call = &event_mmiotrace_map;
+ struct trace_event_call *call = &event_mmiotrace_map;
struct ring_buffer *buffer = tr->trace_buffer.buffer;
struct ring_buffer_event *event;
struct trace_mmiotrace_map *entry;
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index 692bf7184c8c..dfab253727dc 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -60,9 +60,9 @@ enum print_line_t trace_print_printk_msg_only(struct trace_iterator *iter)
}
const char *
-ftrace_print_flags_seq(struct trace_seq *p, const char *delim,
- unsigned long flags,
- const struct trace_print_flags *flag_array)
+trace_print_flags_seq(struct trace_seq *p, const char *delim,
+ unsigned long flags,
+ const struct trace_print_flags *flag_array)
{
unsigned long mask;
const char *str;
@@ -95,11 +95,11 @@ ftrace_print_flags_seq(struct trace_seq *p, const char *delim,
return ret;
}
-EXPORT_SYMBOL(ftrace_print_flags_seq);
+EXPORT_SYMBOL(trace_print_flags_seq);
const char *
-ftrace_print_symbols_seq(struct trace_seq *p, unsigned long val,
- const struct trace_print_flags *symbol_array)
+trace_print_symbols_seq(struct trace_seq *p, unsigned long val,
+ const struct trace_print_flags *symbol_array)
{
int i;
const char *ret = trace_seq_buffer_ptr(p);
@@ -120,11 +120,11 @@ ftrace_print_symbols_seq(struct trace_seq *p, unsigned long val,
return ret;
}
-EXPORT_SYMBOL(ftrace_print_symbols_seq);
+EXPORT_SYMBOL(trace_print_symbols_seq);
#if BITS_PER_LONG == 32
const char *
-ftrace_print_symbols_seq_u64(struct trace_seq *p, unsigned long long val,
+trace_print_symbols_seq_u64(struct trace_seq *p, unsigned long long val,
const struct trace_print_flags_u64 *symbol_array)
{
int i;
@@ -146,12 +146,12 @@ ftrace_print_symbols_seq_u64(struct trace_seq *p, unsigned long long val,
return ret;
}
-EXPORT_SYMBOL(ftrace_print_symbols_seq_u64);
+EXPORT_SYMBOL(trace_print_symbols_seq_u64);
#endif
const char *
-ftrace_print_bitmask_seq(struct trace_seq *p, void *bitmask_ptr,
- unsigned int bitmask_size)
+trace_print_bitmask_seq(struct trace_seq *p, void *bitmask_ptr,
+ unsigned int bitmask_size)
{
const char *ret = trace_seq_buffer_ptr(p);
@@ -160,10 +160,10 @@ ftrace_print_bitmask_seq(struct trace_seq *p, void *bitmask_ptr,
return ret;
}
-EXPORT_SYMBOL_GPL(ftrace_print_bitmask_seq);
+EXPORT_SYMBOL_GPL(trace_print_bitmask_seq);
const char *
-ftrace_print_hex_seq(struct trace_seq *p, const unsigned char *buf, int buf_len)
+trace_print_hex_seq(struct trace_seq *p, const unsigned char *buf, int buf_len)
{
int i;
const char *ret = trace_seq_buffer_ptr(p);
@@ -175,15 +175,16 @@ ftrace_print_hex_seq(struct trace_seq *p, const unsigned char *buf, int buf_len)
return ret;
}
-EXPORT_SYMBOL(ftrace_print_hex_seq);
+EXPORT_SYMBOL(trace_print_hex_seq);
const char *
-ftrace_print_array_seq(struct trace_seq *p, const void *buf, int buf_len,
- size_t el_size)
+trace_print_array_seq(struct trace_seq *p, const void *buf, int count,
+ size_t el_size)
{
const char *ret = trace_seq_buffer_ptr(p);
const char *prefix = "";
void *ptr = (void *)buf;
+ size_t buf_len = count * el_size;
trace_seq_putc(p, '{');
@@ -219,17 +220,17 @@ ftrace_print_array_seq(struct trace_seq *p, const void *buf, int buf_len,
return ret;
}
-EXPORT_SYMBOL(ftrace_print_array_seq);
+EXPORT_SYMBOL(trace_print_array_seq);
-int ftrace_raw_output_prep(struct trace_iterator *iter,
- struct trace_event *trace_event)
+int trace_raw_output_prep(struct trace_iterator *iter,
+ struct trace_event *trace_event)
{
- struct ftrace_event_call *event;
+ struct trace_event_call *event;
struct trace_seq *s = &iter->seq;
struct trace_seq *p = &iter->tmp_seq;
struct trace_entry *entry;
- event = container_of(trace_event, struct ftrace_event_call, event);
+ event = container_of(trace_event, struct trace_event_call, event);
entry = iter->ent;
if (entry->type != event->event.type) {
@@ -238,14 +239,14 @@ int ftrace_raw_output_prep(struct trace_iterator *iter,
}
trace_seq_init(p);
- trace_seq_printf(s, "%s: ", ftrace_event_name(event));
+ trace_seq_printf(s, "%s: ", trace_event_name(event));
return trace_handle_return(s);
}
-EXPORT_SYMBOL(ftrace_raw_output_prep);
+EXPORT_SYMBOL(trace_raw_output_prep);
-static int ftrace_output_raw(struct trace_iterator *iter, char *name,
- char *fmt, va_list ap)
+static int trace_output_raw(struct trace_iterator *iter, char *name,
+ char *fmt, va_list ap)
{
struct trace_seq *s = &iter->seq;
@@ -255,18 +256,18 @@ static int ftrace_output_raw(struct trace_iterator *iter, char *name,
return trace_handle_return(s);
}
-int ftrace_output_call(struct trace_iterator *iter, char *name, char *fmt, ...)
+int trace_output_call(struct trace_iterator *iter, char *name, char *fmt, ...)
{
va_list ap;
int ret;
va_start(ap, fmt);
- ret = ftrace_output_raw(iter, name, fmt, ap);
+ ret = trace_output_raw(iter, name, fmt, ap);
va_end(ap);
return ret;
}
-EXPORT_SYMBOL_GPL(ftrace_output_call);
+EXPORT_SYMBOL_GPL(trace_output_call);
#ifdef CONFIG_KRETPROBES
static inline const char *kretprobed(const char *name)
@@ -674,7 +675,7 @@ static int trace_search_list(struct list_head **list)
}
/* Did we used up all 65 thousand events??? */
- if ((last + 1) > FTRACE_MAX_EVENT)
+ if ((last + 1) > TRACE_EVENT_TYPE_MAX)
return 0;
*list = &e->list;
@@ -692,7 +693,7 @@ void trace_event_read_unlock(void)
}
/**
- * register_ftrace_event - register output for an event type
+ * register_trace_event - register output for an event type
* @event: the event type to register
*
* Event types are stored in a hash and this hash is used to
@@ -706,7 +707,7 @@ void trace_event_read_unlock(void)
*
* Returns the event type number or zero on error.
*/
-int register_ftrace_event(struct trace_event *event)
+int register_trace_event(struct trace_event *event)
{
unsigned key;
int ret = 0;
@@ -724,7 +725,7 @@ int register_ftrace_event(struct trace_event *event)
if (!event->type) {
struct list_head *list = NULL;
- if (next_event_type > FTRACE_MAX_EVENT) {
+ if (next_event_type > TRACE_EVENT_TYPE_MAX) {
event->type = trace_search_list(&list);
if (!event->type)
@@ -770,12 +771,12 @@ int register_ftrace_event(struct trace_event *event)
return ret;
}
-EXPORT_SYMBOL_GPL(register_ftrace_event);
+EXPORT_SYMBOL_GPL(register_trace_event);
/*
* Used by module code with the trace_event_sem held for write.
*/
-int __unregister_ftrace_event(struct trace_event *event)
+int __unregister_trace_event(struct trace_event *event)
{
hlist_del(&event->node);
list_del(&event->list);
@@ -783,18 +784,18 @@ int __unregister_ftrace_event(struct trace_event *event)
}
/**
- * unregister_ftrace_event - remove a no longer used event
+ * unregister_trace_event - remove a no longer used event
* @event: the event to remove
*/
-int unregister_ftrace_event(struct trace_event *event)
+int unregister_trace_event(struct trace_event *event)
{
down_write(&trace_event_sem);
- __unregister_ftrace_event(event);
+ __unregister_trace_event(event);
up_write(&trace_event_sem);
return 0;
}
-EXPORT_SYMBOL_GPL(unregister_ftrace_event);
+EXPORT_SYMBOL_GPL(unregister_trace_event);
/*
* Standard events
@@ -1242,7 +1243,7 @@ __init static int init_events(void)
for (i = 0; events[i]; i++) {
event = events[i];
- ret = register_ftrace_event(event);
+ ret = register_trace_event(event);
if (!ret) {
printk(KERN_WARNING "event %d failed to register\n",
event->type);
diff --git a/kernel/trace/trace_output.h b/kernel/trace/trace_output.h
index 8ef2c40efb3c..4cbfe85b99c8 100644
--- a/kernel/trace/trace_output.h
+++ b/kernel/trace/trace_output.h
@@ -32,7 +32,7 @@ extern int
trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry);
/* used by module unregistering */
-extern int __unregister_ftrace_event(struct trace_event *event);
+extern int __unregister_trace_event(struct trace_event *event);
extern struct rw_semaphore trace_event_sem;
#define SEQ_PUT_FIELD(s, x) \
diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c
index b983b2fd2ca1..1769a81da8a7 100644
--- a/kernel/trace/trace_probe.c
+++ b/kernel/trace/trace_probe.c
@@ -356,17 +356,14 @@ static int parse_probe_vars(char *arg, const struct fetch_type *t,
/* Recursive argument parser */
static int parse_probe_arg(char *arg, const struct fetch_type *t,
- struct fetch_param *f, bool is_return, bool is_kprobe)
+ struct fetch_param *f, bool is_return, bool is_kprobe,
+ const struct fetch_type *ftbl)
{
- const struct fetch_type *ftbl;
unsigned long param;
long offset;
char *tmp;
int ret = 0;
- ftbl = is_kprobe ? kprobes_fetch_type_table : uprobes_fetch_type_table;
- BUG_ON(ftbl == NULL);
-
switch (arg[0]) {
case '$':
ret = parse_probe_vars(arg + 1, t, f, is_return, is_kprobe);
@@ -447,7 +444,7 @@ static int parse_probe_arg(char *arg, const struct fetch_type *t,
dprm->fetch_size = get_fetch_size_function(t,
dprm->fetch, ftbl);
ret = parse_probe_arg(arg, t2, &dprm->orig, is_return,
- is_kprobe);
+ is_kprobe, ftbl);
if (ret)
kfree(dprm);
else {
@@ -505,15 +502,12 @@ static int __parse_bitfield_probe_arg(const char *bf,
/* String length checking wrapper */
int traceprobe_parse_probe_arg(char *arg, ssize_t *size,
- struct probe_arg *parg, bool is_return, bool is_kprobe)
+ struct probe_arg *parg, bool is_return, bool is_kprobe,
+ const struct fetch_type *ftbl)
{
- const struct fetch_type *ftbl;
const char *t;
int ret;
- ftbl = is_kprobe ? kprobes_fetch_type_table : uprobes_fetch_type_table;
- BUG_ON(ftbl == NULL);
-
if (strlen(arg) > MAX_ARGSTR_LEN) {
pr_info("Argument is too long.: %s\n", arg);
return -ENOSPC;
@@ -535,7 +529,8 @@ int traceprobe_parse_probe_arg(char *arg, ssize_t *size,
}
parg->offset = *size;
*size += parg->type->size;
- ret = parse_probe_arg(arg, parg->type, &parg->fetch, is_return, is_kprobe);
+ ret = parse_probe_arg(arg, parg->type, &parg->fetch, is_return,
+ is_kprobe, ftbl);
if (ret >= 0 && t != NULL)
ret = __parse_bitfield_probe_arg(t, parg->type, &parg->fetch);
diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h
index 4f815fbce16d..b98dee914542 100644
--- a/kernel/trace/trace_probe.h
+++ b/kernel/trace/trace_probe.h
@@ -25,7 +25,7 @@
#include <linux/seq_file.h>
#include <linux/slab.h>
#include <linux/smp.h>
-#include <linux/debugfs.h>
+#include <linux/tracefs.h>
#include <linux/types.h>
#include <linux/string.h>
#include <linux/ctype.h>
@@ -229,13 +229,6 @@ ASSIGN_FETCH_FUNC(file_offset, ftype), \
#define FETCH_TYPE_STRING 0
#define FETCH_TYPE_STRSIZE 1
-/*
- * Fetch type information table.
- * It's declared as a weak symbol due to conditional compilation.
- */
-extern __weak const struct fetch_type kprobes_fetch_type_table[];
-extern __weak const struct fetch_type uprobes_fetch_type_table[];
-
#ifdef CONFIG_KPROBE_EVENT
struct symbol_cache;
unsigned long update_symbol_cache(struct symbol_cache *sc);
@@ -279,8 +272,8 @@ struct probe_arg {
struct trace_probe {
unsigned int flags; /* For TP_FLAG_* */
- struct ftrace_event_class class;
- struct ftrace_event_call call;
+ struct trace_event_class class;
+ struct trace_event_call call;
struct list_head files;
ssize_t size; /* trace entry size */
unsigned int nr_args;
@@ -288,7 +281,7 @@ struct trace_probe {
};
struct event_file_link {
- struct ftrace_event_file *file;
+ struct trace_event_file *file;
struct list_head list;
};
@@ -321,7 +314,7 @@ static inline int is_good_name(const char *name)
}
static inline struct event_file_link *
-find_event_file_link(struct trace_probe *tp, struct ftrace_event_file *file)
+find_event_file_link(struct trace_probe *tp, struct trace_event_file *file)
{
struct event_file_link *link;
@@ -333,7 +326,8 @@ find_event_file_link(struct trace_probe *tp, struct ftrace_event_file *file)
}
extern int traceprobe_parse_probe_arg(char *arg, ssize_t *size,
- struct probe_arg *parg, bool is_return, bool is_kprobe);
+ struct probe_arg *parg, bool is_return, bool is_kprobe,
+ const struct fetch_type *ftbl);
extern int traceprobe_conflict_field_name(const char *name,
struct probe_arg *args, int narg);
diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c
index 419ca37e72c9..f270088e9929 100644
--- a/kernel/trace/trace_sched_switch.c
+++ b/kernel/trace/trace_sched_switch.c
@@ -26,7 +26,7 @@ probe_sched_switch(void *ignore, struct task_struct *prev, struct task_struct *n
}
static void
-probe_sched_wakeup(void *ignore, struct task_struct *wakee, int success)
+probe_sched_wakeup(void *ignore, struct task_struct *wakee)
{
if (unlikely(!sched_ref))
return;
diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c
index d6e1003724e9..12cbe77b4136 100644
--- a/kernel/trace/trace_sched_wakeup.c
+++ b/kernel/trace/trace_sched_wakeup.c
@@ -369,7 +369,7 @@ tracing_sched_switch_trace(struct trace_array *tr,
struct task_struct *next,
unsigned long flags, int pc)
{
- struct ftrace_event_call *call = &event_context_switch;
+ struct trace_event_call *call = &event_context_switch;
struct ring_buffer *buffer = tr->trace_buffer.buffer;
struct ring_buffer_event *event;
struct ctx_switch_entry *entry;
@@ -397,7 +397,7 @@ tracing_sched_wakeup_trace(struct trace_array *tr,
struct task_struct *curr,
unsigned long flags, int pc)
{
- struct ftrace_event_call *call = &event_wakeup;
+ struct trace_event_call *call = &event_wakeup;
struct ring_buffer_event *event;
struct ctx_switch_entry *entry;
struct ring_buffer *buffer = tr->trace_buffer.buffer;
@@ -514,7 +514,7 @@ static void wakeup_reset(struct trace_array *tr)
}
static void
-probe_wakeup(void *ignore, struct task_struct *p, int success)
+probe_wakeup(void *ignore, struct task_struct *p)
{
struct trace_array_cpu *data;
int cpu = smp_processor_id();
diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c
index c3e4fcfddd45..3f34496244e9 100644
--- a/kernel/trace/trace_stack.c
+++ b/kernel/trace/trace_stack.c
@@ -327,11 +327,11 @@ static void t_stop(struct seq_file *m, void *p)
local_irq_enable();
}
-static int trace_lookup_stack(struct seq_file *m, long i)
+static void trace_lookup_stack(struct seq_file *m, long i)
{
unsigned long addr = stack_dump_trace[i];
- return seq_printf(m, "%pS\n", (void *)addr);
+ seq_printf(m, "%pS\n", (void *)addr);
}
static void print_disabled(struct seq_file *m)
diff --git a/kernel/trace/trace_stat.c b/kernel/trace/trace_stat.c
index 75e19e86c954..6cf935316769 100644
--- a/kernel/trace/trace_stat.c
+++ b/kernel/trace/trace_stat.c
@@ -12,7 +12,7 @@
#include <linux/list.h>
#include <linux/slab.h>
#include <linux/rbtree.h>
-#include <linux/debugfs.h>
+#include <linux/tracefs.h>
#include "trace_stat.h"
#include "trace.h"
@@ -65,7 +65,7 @@ static void reset_stat_session(struct stat_session *session)
static void destroy_session(struct stat_session *session)
{
- debugfs_remove(session->file);
+ tracefs_remove(session->file);
__reset_stat_session(session);
mutex_destroy(&session->stat_mutex);
kfree(session);
@@ -279,9 +279,9 @@ static int tracing_stat_init(void)
if (IS_ERR(d_tracing))
return 0;
- stat_dir = debugfs_create_dir("trace_stat", d_tracing);
+ stat_dir = tracefs_create_dir("trace_stat", d_tracing);
if (!stat_dir)
- pr_warning("Could not create debugfs "
+ pr_warning("Could not create tracefs "
"'trace_stat' entry\n");
return 0;
}
@@ -291,7 +291,7 @@ static int init_stat_file(struct stat_session *session)
if (!stat_dir && tracing_stat_init())
return -ENODEV;
- session->file = debugfs_create_file(session->ts->name, 0644,
+ session->file = tracefs_create_file(session->ts->name, 0644,
stat_dir,
session, &tracing_stat_fops);
if (!session->file)
diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c
index f97f6e3a676c..7d567a4b9fa7 100644
--- a/kernel/trace/trace_syscalls.c
+++ b/kernel/trace/trace_syscalls.c
@@ -13,13 +13,13 @@
static DEFINE_MUTEX(syscall_trace_lock);
-static int syscall_enter_register(struct ftrace_event_call *event,
+static int syscall_enter_register(struct trace_event_call *event,
enum trace_reg type, void *data);
-static int syscall_exit_register(struct ftrace_event_call *event,
+static int syscall_exit_register(struct trace_event_call *event,
enum trace_reg type, void *data);
static struct list_head *
-syscall_get_enter_fields(struct ftrace_event_call *call)
+syscall_get_enter_fields(struct trace_event_call *call)
{
struct syscall_metadata *entry = call->data;
@@ -219,7 +219,7 @@ __set_enter_print_fmt(struct syscall_metadata *entry, char *buf, int len)
return pos;
}
-static int __init set_syscall_print_fmt(struct ftrace_event_call *call)
+static int __init set_syscall_print_fmt(struct trace_event_call *call)
{
char *print_fmt;
int len;
@@ -244,7 +244,7 @@ static int __init set_syscall_print_fmt(struct ftrace_event_call *call)
return 0;
}
-static void __init free_syscall_print_fmt(struct ftrace_event_call *call)
+static void __init free_syscall_print_fmt(struct trace_event_call *call)
{
struct syscall_metadata *entry = call->data;
@@ -252,7 +252,7 @@ static void __init free_syscall_print_fmt(struct ftrace_event_call *call)
kfree(call->print_fmt);
}
-static int __init syscall_enter_define_fields(struct ftrace_event_call *call)
+static int __init syscall_enter_define_fields(struct trace_event_call *call)
{
struct syscall_trace_enter trace;
struct syscall_metadata *meta = call->data;
@@ -275,7 +275,7 @@ static int __init syscall_enter_define_fields(struct ftrace_event_call *call)
return ret;
}
-static int __init syscall_exit_define_fields(struct ftrace_event_call *call)
+static int __init syscall_exit_define_fields(struct trace_event_call *call)
{
struct syscall_trace_exit trace;
int ret;
@@ -293,7 +293,7 @@ static int __init syscall_exit_define_fields(struct ftrace_event_call *call)
static void ftrace_syscall_enter(void *data, struct pt_regs *regs, long id)
{
struct trace_array *tr = data;
- struct ftrace_event_file *ftrace_file;
+ struct trace_event_file *trace_file;
struct syscall_trace_enter *entry;
struct syscall_metadata *sys_data;
struct ring_buffer_event *event;
@@ -308,11 +308,11 @@ static void ftrace_syscall_enter(void *data, struct pt_regs *regs, long id)
return;
/* Here we're inside tp handler's rcu_read_lock_sched (__DO_TRACE) */
- ftrace_file = rcu_dereference_sched(tr->enter_syscall_files[syscall_nr]);
- if (!ftrace_file)
+ trace_file = rcu_dereference_sched(tr->enter_syscall_files[syscall_nr]);
+ if (!trace_file)
return;
- if (ftrace_trigger_soft_disabled(ftrace_file))
+ if (trace_trigger_soft_disabled(trace_file))
return;
sys_data = syscall_nr_to_meta(syscall_nr);
@@ -334,14 +334,14 @@ static void ftrace_syscall_enter(void *data, struct pt_regs *regs, long id)
entry->nr = syscall_nr;
syscall_get_arguments(current, regs, 0, sys_data->nb_args, entry->args);
- event_trigger_unlock_commit(ftrace_file, buffer, event, entry,
+ event_trigger_unlock_commit(trace_file, buffer, event, entry,
irq_flags, pc);
}
static void ftrace_syscall_exit(void *data, struct pt_regs *regs, long ret)
{
struct trace_array *tr = data;
- struct ftrace_event_file *ftrace_file;
+ struct trace_event_file *trace_file;
struct syscall_trace_exit *entry;
struct syscall_metadata *sys_data;
struct ring_buffer_event *event;
@@ -355,11 +355,11 @@ static void ftrace_syscall_exit(void *data, struct pt_regs *regs, long ret)
return;
/* Here we're inside tp handler's rcu_read_lock_sched (__DO_TRACE()) */
- ftrace_file = rcu_dereference_sched(tr->exit_syscall_files[syscall_nr]);
- if (!ftrace_file)
+ trace_file = rcu_dereference_sched(tr->exit_syscall_files[syscall_nr]);
+ if (!trace_file)
return;
- if (ftrace_trigger_soft_disabled(ftrace_file))
+ if (trace_trigger_soft_disabled(trace_file))
return;
sys_data = syscall_nr_to_meta(syscall_nr);
@@ -380,12 +380,12 @@ static void ftrace_syscall_exit(void *data, struct pt_regs *regs, long ret)
entry->nr = syscall_nr;
entry->ret = syscall_get_return_value(current, regs);
- event_trigger_unlock_commit(ftrace_file, buffer, event, entry,
+ event_trigger_unlock_commit(trace_file, buffer, event, entry,
irq_flags, pc);
}
-static int reg_event_syscall_enter(struct ftrace_event_file *file,
- struct ftrace_event_call *call)
+static int reg_event_syscall_enter(struct trace_event_file *file,
+ struct trace_event_call *call)
{
struct trace_array *tr = file->tr;
int ret = 0;
@@ -405,8 +405,8 @@ static int reg_event_syscall_enter(struct ftrace_event_file *file,
return ret;
}
-static void unreg_event_syscall_enter(struct ftrace_event_file *file,
- struct ftrace_event_call *call)
+static void unreg_event_syscall_enter(struct trace_event_file *file,
+ struct trace_event_call *call)
{
struct trace_array *tr = file->tr;
int num;
@@ -422,8 +422,8 @@ static void unreg_event_syscall_enter(struct ftrace_event_file *file,
mutex_unlock(&syscall_trace_lock);
}
-static int reg_event_syscall_exit(struct ftrace_event_file *file,
- struct ftrace_event_call *call)
+static int reg_event_syscall_exit(struct trace_event_file *file,
+ struct trace_event_call *call)
{
struct trace_array *tr = file->tr;
int ret = 0;
@@ -443,8 +443,8 @@ static int reg_event_syscall_exit(struct ftrace_event_file *file,
return ret;
}
-static void unreg_event_syscall_exit(struct ftrace_event_file *file,
- struct ftrace_event_call *call)
+static void unreg_event_syscall_exit(struct trace_event_file *file,
+ struct trace_event_call *call)
{
struct trace_array *tr = file->tr;
int num;
@@ -460,7 +460,7 @@ static void unreg_event_syscall_exit(struct ftrace_event_file *file,
mutex_unlock(&syscall_trace_lock);
}
-static int __init init_syscall_trace(struct ftrace_event_call *call)
+static int __init init_syscall_trace(struct trace_event_call *call)
{
int id;
int num;
@@ -493,7 +493,7 @@ struct trace_event_functions exit_syscall_print_funcs = {
.trace = print_syscall_exit,
};
-struct ftrace_event_class __refdata event_class_syscall_enter = {
+struct trace_event_class __refdata event_class_syscall_enter = {
.system = "syscalls",
.reg = syscall_enter_register,
.define_fields = syscall_enter_define_fields,
@@ -501,7 +501,7 @@ struct ftrace_event_class __refdata event_class_syscall_enter = {
.raw_init = init_syscall_trace,
};
-struct ftrace_event_class __refdata event_class_syscall_exit = {
+struct trace_event_class __refdata event_class_syscall_exit = {
.system = "syscalls",
.reg = syscall_exit_register,
.define_fields = syscall_exit_define_fields,
@@ -584,7 +584,7 @@ static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id)
perf_trace_buf_submit(rec, size, rctx, 0, 1, regs, head, NULL);
}
-static int perf_sysenter_enable(struct ftrace_event_call *call)
+static int perf_sysenter_enable(struct trace_event_call *call)
{
int ret = 0;
int num;
@@ -605,7 +605,7 @@ static int perf_sysenter_enable(struct ftrace_event_call *call)
return ret;
}
-static void perf_sysenter_disable(struct ftrace_event_call *call)
+static void perf_sysenter_disable(struct trace_event_call *call)
{
int num;
@@ -656,7 +656,7 @@ static void perf_syscall_exit(void *ignore, struct pt_regs *regs, long ret)
perf_trace_buf_submit(rec, size, rctx, 0, 1, regs, head, NULL);
}
-static int perf_sysexit_enable(struct ftrace_event_call *call)
+static int perf_sysexit_enable(struct trace_event_call *call)
{
int ret = 0;
int num;
@@ -677,7 +677,7 @@ static int perf_sysexit_enable(struct ftrace_event_call *call)
return ret;
}
-static void perf_sysexit_disable(struct ftrace_event_call *call)
+static void perf_sysexit_disable(struct trace_event_call *call)
{
int num;
@@ -693,10 +693,10 @@ static void perf_sysexit_disable(struct ftrace_event_call *call)
#endif /* CONFIG_PERF_EVENTS */
-static int syscall_enter_register(struct ftrace_event_call *event,
+static int syscall_enter_register(struct trace_event_call *event,
enum trace_reg type, void *data)
{
- struct ftrace_event_file *file = data;
+ struct trace_event_file *file = data;
switch (type) {
case TRACE_REG_REGISTER:
@@ -721,10 +721,10 @@ static int syscall_enter_register(struct ftrace_event_call *event,
return 0;
}
-static int syscall_exit_register(struct ftrace_event_call *event,
+static int syscall_exit_register(struct trace_event_call *event,
enum trace_reg type, void *data)
{
- struct ftrace_event_file *file = data;
+ struct trace_event_file *file = data;
switch (type) {
case TRACE_REG_REGISTER:
diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c
index 7dc1c8abecd6..d2f6d0be3503 100644
--- a/kernel/trace/trace_uprobe.c
+++ b/kernel/trace/trace_uprobe.c
@@ -196,7 +196,7 @@ DEFINE_FETCH_file_offset(string)
DEFINE_FETCH_file_offset(string_size)
/* Fetch type information table */
-const struct fetch_type uprobes_fetch_type_table[] = {
+static const struct fetch_type uprobes_fetch_type_table[] = {
/* Special types */
[FETCH_TYPE_STRING] = __ASSIGN_FETCH_TYPE("string", string, string,
sizeof(u32), 1, "__data_loc char[]"),
@@ -293,7 +293,7 @@ static struct trace_uprobe *find_probe_event(const char *event, const char *grou
struct trace_uprobe *tu;
list_for_each_entry(tu, &uprobe_list, list)
- if (strcmp(ftrace_event_name(&tu->tp.call), event) == 0 &&
+ if (strcmp(trace_event_name(&tu->tp.call), event) == 0 &&
strcmp(tu->tp.call.class->system, group) == 0)
return tu;
@@ -323,7 +323,7 @@ static int register_trace_uprobe(struct trace_uprobe *tu)
mutex_lock(&uprobe_lock);
/* register as an event */
- old_tu = find_probe_event(ftrace_event_name(&tu->tp.call),
+ old_tu = find_probe_event(trace_event_name(&tu->tp.call),
tu->tp.call.class->system);
if (old_tu) {
/* delete old event */
@@ -443,7 +443,7 @@ static int create_trace_uprobe(int argc, char **argv)
if (ret)
goto fail_address_parse;
- inode = igrab(path.dentry->d_inode);
+ inode = igrab(d_inode(path.dentry));
path_put(&path);
if (!inode || !S_ISREG(inode->i_mode)) {
@@ -535,7 +535,8 @@ static int create_trace_uprobe(int argc, char **argv)
/* Parse fetch argument */
ret = traceprobe_parse_probe_arg(arg, &tu->tp.size, parg,
- is_return, false);
+ is_return, false,
+ uprobes_fetch_type_table);
if (ret) {
pr_info("Parse error at argument[%d]. (%d)\n", i, ret);
goto error;
@@ -599,8 +600,23 @@ static int probes_seq_show(struct seq_file *m, void *v)
int i;
seq_printf(m, "%c:%s/%s", c, tu->tp.call.class->system,
- ftrace_event_name(&tu->tp.call));
- seq_printf(m, " %s:0x%p", tu->filename, (void *)tu->offset);
+ trace_event_name(&tu->tp.call));
+ seq_printf(m, " %s:", tu->filename);
+
+ /* Don't print "0x (null)" when offset is 0 */
+ if (tu->offset) {
+ seq_printf(m, "0x%p", (void *)tu->offset);
+ } else {
+ switch (sizeof(void *)) {
+ case 4:
+ seq_printf(m, "0x00000000");
+ break;
+ case 8:
+ default:
+ seq_printf(m, "0x0000000000000000");
+ break;
+ }
+ }
for (i = 0; i < tu->tp.nr_args; i++)
seq_printf(m, " %s=%s", tu->tp.args[i].name, tu->tp.args[i].comm);
@@ -650,7 +666,7 @@ static int probes_profile_seq_show(struct seq_file *m, void *v)
struct trace_uprobe *tu = v;
seq_printf(m, " %s %-44s %15lu\n", tu->filename,
- ftrace_event_name(&tu->tp.call), tu->nhit);
+ trace_event_name(&tu->tp.call), tu->nhit);
return 0;
}
@@ -769,26 +785,26 @@ static void uprobe_buffer_put(struct uprobe_cpu_buffer *ucb)
static void __uprobe_trace_func(struct trace_uprobe *tu,
unsigned long func, struct pt_regs *regs,
struct uprobe_cpu_buffer *ucb, int dsize,
- struct ftrace_event_file *ftrace_file)
+ struct trace_event_file *trace_file)
{
struct uprobe_trace_entry_head *entry;
struct ring_buffer_event *event;
struct ring_buffer *buffer;
void *data;
int size, esize;
- struct ftrace_event_call *call = &tu->tp.call;
+ struct trace_event_call *call = &tu->tp.call;
- WARN_ON(call != ftrace_file->event_call);
+ WARN_ON(call != trace_file->event_call);
if (WARN_ON_ONCE(tu->tp.size + dsize > PAGE_SIZE))
return;
- if (ftrace_trigger_soft_disabled(ftrace_file))
+ if (trace_trigger_soft_disabled(trace_file))
return;
esize = SIZEOF_TRACE_ENTRY(is_ret_probe(tu));
size = esize + tu->tp.size + dsize;
- event = trace_event_buffer_lock_reserve(&buffer, ftrace_file,
+ event = trace_event_buffer_lock_reserve(&buffer, trace_file,
call->event.type, size, 0, 0);
if (!event)
return;
@@ -805,7 +821,7 @@ static void __uprobe_trace_func(struct trace_uprobe *tu,
memcpy(data, ucb->buf, tu->tp.size + dsize);
- event_trigger_unlock_commit(ftrace_file, buffer, event, entry, 0, 0);
+ event_trigger_unlock_commit(trace_file, buffer, event, entry, 0, 0);
}
/* uprobe handler */
@@ -852,12 +868,12 @@ print_uprobe_event(struct trace_iterator *iter, int flags, struct trace_event *e
if (is_ret_probe(tu)) {
trace_seq_printf(s, "%s: (0x%lx <- 0x%lx)",
- ftrace_event_name(&tu->tp.call),
+ trace_event_name(&tu->tp.call),
entry->vaddr[1], entry->vaddr[0]);
data = DATAOF_TRACE_ENTRY(entry, true);
} else {
trace_seq_printf(s, "%s: (0x%lx)",
- ftrace_event_name(&tu->tp.call),
+ trace_event_name(&tu->tp.call),
entry->vaddr[0]);
data = DATAOF_TRACE_ENTRY(entry, false);
}
@@ -880,7 +896,7 @@ typedef bool (*filter_func_t)(struct uprobe_consumer *self,
struct mm_struct *mm);
static int
-probe_event_enable(struct trace_uprobe *tu, struct ftrace_event_file *file,
+probe_event_enable(struct trace_uprobe *tu, struct trace_event_file *file,
filter_func_t filter)
{
bool enabled = trace_probe_is_enabled(&tu->tp);
@@ -937,7 +953,7 @@ probe_event_enable(struct trace_uprobe *tu, struct ftrace_event_file *file,
}
static void
-probe_event_disable(struct trace_uprobe *tu, struct ftrace_event_file *file)
+probe_event_disable(struct trace_uprobe *tu, struct trace_event_file *file)
{
if (!trace_probe_is_enabled(&tu->tp))
return;
@@ -966,7 +982,7 @@ probe_event_disable(struct trace_uprobe *tu, struct ftrace_event_file *file)
uprobe_buffer_disable();
}
-static int uprobe_event_define_fields(struct ftrace_event_call *event_call)
+static int uprobe_event_define_fields(struct trace_event_call *event_call)
{
int ret, i, size;
struct uprobe_trace_entry_head field;
@@ -1005,7 +1021,7 @@ __uprobe_perf_filter(struct trace_uprobe_filter *filter, struct mm_struct *mm)
return true;
list_for_each_entry(event, &filter->perf_events, hw.tp_list) {
- if (event->hw.tp_target->mm == mm)
+ if (event->hw.target->mm == mm)
return true;
}
@@ -1015,7 +1031,7 @@ __uprobe_perf_filter(struct trace_uprobe_filter *filter, struct mm_struct *mm)
static inline bool
uprobe_filter_event(struct trace_uprobe *tu, struct perf_event *event)
{
- return __uprobe_perf_filter(&tu->filter, event->hw.tp_target->mm);
+ return __uprobe_perf_filter(&tu->filter, event->hw.target->mm);
}
static int uprobe_perf_close(struct trace_uprobe *tu, struct perf_event *event)
@@ -1023,10 +1039,10 @@ static int uprobe_perf_close(struct trace_uprobe *tu, struct perf_event *event)
bool done;
write_lock(&tu->filter.rwlock);
- if (event->hw.tp_target) {
+ if (event->hw.target) {
list_del(&event->hw.tp_list);
done = tu->filter.nr_systemwide ||
- (event->hw.tp_target->flags & PF_EXITING) ||
+ (event->hw.target->flags & PF_EXITING) ||
uprobe_filter_event(tu, event);
} else {
tu->filter.nr_systemwide--;
@@ -1046,7 +1062,7 @@ static int uprobe_perf_open(struct trace_uprobe *tu, struct perf_event *event)
int err;
write_lock(&tu->filter.rwlock);
- if (event->hw.tp_target) {
+ if (event->hw.target) {
/*
* event->parent != NULL means copy_process(), we can avoid
* uprobe_apply(). current->mm must be probed and we can rely
@@ -1092,13 +1108,17 @@ static void __uprobe_perf_func(struct trace_uprobe *tu,
unsigned long func, struct pt_regs *regs,
struct uprobe_cpu_buffer *ucb, int dsize)
{
- struct ftrace_event_call *call = &tu->tp.call;
+ struct trace_event_call *call = &tu->tp.call;
struct uprobe_trace_entry_head *entry;
+ struct bpf_prog *prog = call->prog;
struct hlist_head *head;
void *data;
int size, esize;
int rctx;
+ if (prog && !trace_call_bpf(prog, regs))
+ return;
+
esize = SIZEOF_TRACE_ENTRY(is_ret_probe(tu));
size = esize + tu->tp.size + dsize;
@@ -1158,11 +1178,11 @@ static void uretprobe_perf_func(struct trace_uprobe *tu, unsigned long func,
#endif /* CONFIG_PERF_EVENTS */
static int
-trace_uprobe_register(struct ftrace_event_call *event, enum trace_reg type,
+trace_uprobe_register(struct trace_event_call *event, enum trace_reg type,
void *data)
{
struct trace_uprobe *tu = event->data;
- struct ftrace_event_file *file = data;
+ struct trace_event_file *file = data;
switch (type) {
case TRACE_REG_REGISTER:
@@ -1271,10 +1291,10 @@ static struct trace_event_functions uprobe_funcs = {
static int register_uprobe_event(struct trace_uprobe *tu)
{
- struct ftrace_event_call *call = &tu->tp.call;
+ struct trace_event_call *call = &tu->tp.call;
int ret;
- /* Initialize ftrace_event_call */
+ /* Initialize trace_event_call */
INIT_LIST_HEAD(&call->class->fields);
call->event.funcs = &uprobe_funcs;
call->class->define_fields = uprobe_event_define_fields;
@@ -1282,21 +1302,22 @@ static int register_uprobe_event(struct trace_uprobe *tu)
if (set_print_fmt(&tu->tp, is_ret_probe(tu)) < 0)
return -ENOMEM;
- ret = register_ftrace_event(&call->event);
+ ret = register_trace_event(&call->event);
if (!ret) {
kfree(call->print_fmt);
return -ENODEV;
}
+ call->flags = TRACE_EVENT_FL_UPROBE;
call->class->reg = trace_uprobe_register;
call->data = tu;
ret = trace_add_event_call(call);
if (ret) {
pr_info("Failed to register uprobe event: %s\n",
- ftrace_event_name(call));
+ trace_event_name(call));
kfree(call->print_fmt);
- unregister_ftrace_event(&call->event);
+ unregister_trace_event(&call->event);
}
return ret;
diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c
index 4109f8320684..88fefa68c516 100644
--- a/kernel/user_namespace.c
+++ b/kernel/user_namespace.c
@@ -39,6 +39,7 @@ static void set_cred_user_ns(struct cred *cred, struct user_namespace *user_ns)
cred->cap_inheritable = CAP_EMPTY_SET;
cred->cap_permitted = CAP_FULL_SET;
cred->cap_effective = CAP_FULL_SET;
+ cred->cap_ambient = CAP_EMPTY_SET;
cred->cap_bset = CAP_FULL_SET;
#ifdef CONFIG_KEYS
key_put(cred->request_key_auth);
@@ -976,8 +977,8 @@ static int userns_install(struct nsproxy *nsproxy, struct ns_common *ns)
if (user_ns == current_user_ns())
return -EINVAL;
- /* Threaded processes may not enter a different user namespace */
- if (atomic_read(&current->mm->mm_users) > 1)
+ /* Tasks that share a thread group must share a user namespace */
+ if (!thread_group_empty(current))
return -EINVAL;
if (current->fs->users != 1)
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index 3174bf8e3538..64ed1c37bd1f 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -19,20 +19,74 @@
#include <linux/sysctl.h>
#include <linux/smpboot.h>
#include <linux/sched/rt.h>
+#include <linux/tick.h>
#include <asm/irq_regs.h>
#include <linux/kvm_para.h>
#include <linux/perf_event.h>
+#include <linux/kthread.h>
-int watchdog_user_enabled = 1;
+/*
+ * The run state of the lockup detectors is controlled by the content of the
+ * 'watchdog_enabled' variable. Each lockup detector has its dedicated bit -
+ * bit 0 for the hard lockup detector and bit 1 for the soft lockup detector.
+ *
+ * 'watchdog_user_enabled', 'nmi_watchdog_enabled' and 'soft_watchdog_enabled'
+ * are variables that are only used as an 'interface' between the parameters
+ * in /proc/sys/kernel and the internal state bits in 'watchdog_enabled'. The
+ * 'watchdog_thresh' variable is handled differently because its value is not
+ * boolean, and the lockup detectors are 'suspended' while 'watchdog_thresh'
+ * is equal zero.
+ */
+#define NMI_WATCHDOG_ENABLED_BIT 0
+#define SOFT_WATCHDOG_ENABLED_BIT 1
+#define NMI_WATCHDOG_ENABLED (1 << NMI_WATCHDOG_ENABLED_BIT)
+#define SOFT_WATCHDOG_ENABLED (1 << SOFT_WATCHDOG_ENABLED_BIT)
+
+static DEFINE_MUTEX(watchdog_proc_mutex);
+
+#ifdef CONFIG_HARDLOCKUP_DETECTOR
+static unsigned long __read_mostly watchdog_enabled = SOFT_WATCHDOG_ENABLED|NMI_WATCHDOG_ENABLED;
+#else
+static unsigned long __read_mostly watchdog_enabled = SOFT_WATCHDOG_ENABLED;
+#endif
+int __read_mostly nmi_watchdog_enabled;
+int __read_mostly soft_watchdog_enabled;
+int __read_mostly watchdog_user_enabled;
int __read_mostly watchdog_thresh = 10;
+
#ifdef CONFIG_SMP
int __read_mostly sysctl_softlockup_all_cpu_backtrace;
#else
#define sysctl_softlockup_all_cpu_backtrace 0
#endif
+static struct cpumask watchdog_cpumask __read_mostly;
+unsigned long *watchdog_cpumask_bits = cpumask_bits(&watchdog_cpumask);
+/* Helper for online, unparked cpus. */
+#define for_each_watchdog_cpu(cpu) \
+ for_each_cpu_and((cpu), cpu_online_mask, &watchdog_cpumask)
+
+/*
+ * The 'watchdog_running' variable is set to 1 when the watchdog threads
+ * are registered/started and is set to 0 when the watchdog threads are
+ * unregistered/stopped, so it is an indicator whether the threads exist.
+ */
static int __read_mostly watchdog_running;
+/*
+ * If a subsystem has a need to deactivate the watchdog temporarily, it
+ * can use the suspend/resume interface to achieve this. The content of
+ * the 'watchdog_suspended' variable reflects this state. Existing threads
+ * are parked/unparked by the lockup_detector_{suspend|resume} functions
+ * (see comment blocks pertaining to those functions for further details).
+ *
+ * 'watchdog_suspended' also prevents threads from being registered/started
+ * or unregistered/stopped via parameters in /proc/sys/kernel, so the state
+ * of 'watchdog_running' cannot change while the watchdog is deactivated
+ * temporarily (see related code in 'proc' handlers).
+ */
+static int __read_mostly watchdog_suspended;
+
static u64 __read_mostly sample_period;
static DEFINE_PER_CPU(unsigned long, watchdog_touch_ts);
@@ -58,8 +112,6 @@ static unsigned long soft_lockup_nmi_warn;
#ifdef CONFIG_HARDLOCKUP_DETECTOR
static int hardlockup_panic =
CONFIG_BOOTPARAM_HARDLOCKUP_PANIC_VALUE;
-
-static bool hardlockup_detector_enabled = true;
/*
* We may not want to enable hard lockup detection by default in all cases,
* for example when running the kernel as a guest on a hypervisor. In these
@@ -68,14 +120,9 @@ static bool hardlockup_detector_enabled = true;
* kernel command line parameters are parsed, because otherwise it is not
* possible to override this in hardlockup_panic_setup().
*/
-void watchdog_enable_hardlockup_detector(bool val)
+void hardlockup_detector_disable(void)
{
- hardlockup_detector_enabled = val;
-}
-
-bool watchdog_hardlockup_detector_is_enabled(void)
-{
- return hardlockup_detector_enabled;
+ watchdog_enabled &= ~NMI_WATCHDOG_ENABLED;
}
static int __init hardlockup_panic_setup(char *str)
@@ -85,15 +132,9 @@ static int __init hardlockup_panic_setup(char *str)
else if (!strncmp(str, "nopanic", 7))
hardlockup_panic = 0;
else if (!strncmp(str, "0", 1))
- watchdog_user_enabled = 0;
- else if (!strncmp(str, "1", 1) || !strncmp(str, "2", 1)) {
- /*
- * Setting 'nmi_watchdog=1' or 'nmi_watchdog=2' (legacy option)
- * has the same effect.
- */
- watchdog_user_enabled = 1;
- watchdog_enable_hardlockup_detector(true);
- }
+ watchdog_enabled &= ~NMI_WATCHDOG_ENABLED;
+ else if (!strncmp(str, "1", 1))
+ watchdog_enabled |= NMI_WATCHDOG_ENABLED;
return 1;
}
__setup("nmi_watchdog=", hardlockup_panic_setup);
@@ -112,19 +153,18 @@ __setup("softlockup_panic=", softlockup_panic_setup);
static int __init nowatchdog_setup(char *str)
{
- watchdog_user_enabled = 0;
+ watchdog_enabled = 0;
return 1;
}
__setup("nowatchdog", nowatchdog_setup);
-/* deprecated */
static int __init nosoftlockup_setup(char *str)
{
- watchdog_user_enabled = 0;
+ watchdog_enabled &= ~SOFT_WATCHDOG_ENABLED;
return 1;
}
__setup("nosoftlockup", nosoftlockup_setup);
-/* */
+
#ifdef CONFIG_SMP
static int __init softlockup_all_cpu_backtrace_setup(char *str)
{
@@ -194,7 +234,7 @@ void touch_all_softlockup_watchdogs(void)
* do we care if a 0 races with a timestamp?
* all it means is the softlock check starts one cycle later
*/
- for_each_online_cpu(cpu)
+ for_each_watchdog_cpu(cpu)
per_cpu(watchdog_touch_ts, cpu) = 0;
}
@@ -239,10 +279,11 @@ static int is_softlockup(unsigned long touch_ts)
{
unsigned long now = get_timestamp();
- /* Warn about unreasonable delays: */
- if (time_after(now, touch_ts + get_softlockup_thresh()))
- return now - touch_ts;
-
+ if (watchdog_enabled & SOFT_WATCHDOG_ENABLED) {
+ /* Warn about unreasonable delays. */
+ if (time_after(now, touch_ts + get_softlockup_thresh()))
+ return now - touch_ts;
+ }
return 0;
}
@@ -477,6 +518,21 @@ static void watchdog(unsigned int cpu)
__this_cpu_write(soft_lockup_hrtimer_cnt,
__this_cpu_read(hrtimer_interrupts));
__touch_watchdog();
+
+ /*
+ * watchdog_nmi_enable() clears the NMI_WATCHDOG_ENABLED bit in the
+ * failure path. Check for failures that can occur asynchronously -
+ * for example, when CPUs are on-lined - and shut down the hardware
+ * perf event on each CPU accordingly.
+ *
+ * The only non-obvious place this bit can be cleared is through
+ * watchdog_nmi_enable(), so a pr_info() is placed there. Placing a
+ * pr_info here would be too noisy as it would result in a message
+ * every few seconds if the hardlockup was disabled but the softlockup
+ * enabled.
+ */
+ if (!(watchdog_enabled & NMI_WATCHDOG_ENABLED))
+ watchdog_nmi_disable(cpu);
}
#ifdef CONFIG_HARDLOCKUP_DETECTOR
@@ -492,14 +548,9 @@ static int watchdog_nmi_enable(unsigned int cpu)
struct perf_event_attr *wd_attr;
struct perf_event *event = per_cpu(watchdog_ev, cpu);
- /*
- * Some kernels need to default hard lockup detection to
- * 'disabled', for example a guest on a hypervisor.
- */
- if (!watchdog_hardlockup_detector_is_enabled()) {
- event = ERR_PTR(-ENOENT);
- goto handle_err;
- }
+ /* nothing to do if the hard lockup detector is disabled */
+ if (!(watchdog_enabled & NMI_WATCHDOG_ENABLED))
+ goto out;
/* is it already setup and enabled? */
if (event && event->state > PERF_EVENT_STATE_OFF)
@@ -515,7 +566,6 @@ static int watchdog_nmi_enable(unsigned int cpu)
/* Try to register using hardware perf events */
event = perf_event_create_kernel_counter(wd_attr, cpu, NULL, watchdog_overflow_callback, NULL);
-handle_err:
/* save cpu0 error for future comparision */
if (cpu == 0 && IS_ERR(event))
cpu0_err = PTR_ERR(event);
@@ -527,6 +577,18 @@ handle_err:
goto out_save;
}
+ /*
+ * Disable the hard lockup detector if _any_ CPU fails to set up
+ * set up the hardware perf event. The watchdog() function checks
+ * the NMI_WATCHDOG_ENABLED bit periodically.
+ *
+ * The barriers are for syncing up watchdog_enabled across all the
+ * cpus, as clear_bit() does not use barriers.
+ */
+ smp_mb__before_atomic();
+ clear_bit(NMI_WATCHDOG_ENABLED_BIT, &watchdog_enabled);
+ smp_mb__after_atomic();
+
/* skip displaying the same error again */
if (cpu > 0 && (PTR_ERR(event) == cpu0_err))
return PTR_ERR(event);
@@ -540,6 +602,9 @@ handle_err:
else
pr_err("disabled (cpu%i): unable to create perf event: %ld\n",
cpu, PTR_ERR(event));
+
+ pr_info("Shutting down hard lockup detector on all cpus\n");
+
return PTR_ERR(event);
/* success path */
@@ -567,6 +632,7 @@ static void watchdog_nmi_disable(unsigned int cpu)
cpu0_err = 0;
}
}
+
#else
static int watchdog_nmi_enable(unsigned int cpu) { return 0; }
static void watchdog_nmi_disable(unsigned int cpu) { return; }
@@ -583,60 +649,108 @@ static struct smp_hotplug_thread watchdog_threads = {
.unpark = watchdog_enable,
};
-static void restart_watchdog_hrtimer(void *info)
+/*
+ * park all watchdog threads that are specified in 'watchdog_cpumask'
+ */
+static int watchdog_park_threads(void)
{
- struct hrtimer *hrtimer = raw_cpu_ptr(&watchdog_hrtimer);
- int ret;
+ int cpu, ret = 0;
+ get_online_cpus();
+ for_each_watchdog_cpu(cpu) {
+ ret = kthread_park(per_cpu(softlockup_watchdog, cpu));
+ if (ret)
+ break;
+ }
+ if (ret) {
+ for_each_watchdog_cpu(cpu)
+ kthread_unpark(per_cpu(softlockup_watchdog, cpu));
+ }
+ put_online_cpus();
+
+ return ret;
+}
+
+/*
+ * unpark all watchdog threads that are specified in 'watchdog_cpumask'
+ */
+static void watchdog_unpark_threads(void)
+{
+ int cpu;
+
+ get_online_cpus();
+ for_each_watchdog_cpu(cpu)
+ kthread_unpark(per_cpu(softlockup_watchdog, cpu));
+ put_online_cpus();
+}
+
+/*
+ * Suspend the hard and soft lockup detector by parking the watchdog threads.
+ */
+int lockup_detector_suspend(void)
+{
+ int ret = 0;
+
+ mutex_lock(&watchdog_proc_mutex);
/*
- * No need to cancel and restart hrtimer if it is currently executing
- * because it will reprogram itself with the new period now.
- * We should never see it unqueued here because we are running per-cpu
- * with interrupts disabled.
+ * Multiple suspend requests can be active in parallel (counted by
+ * the 'watchdog_suspended' variable). If the watchdog threads are
+ * running, the first caller takes care that they will be parked.
+ * The state of 'watchdog_running' cannot change while a suspend
+ * request is active (see related code in 'proc' handlers).
*/
- ret = hrtimer_try_to_cancel(hrtimer);
- if (ret == 1)
- hrtimer_start(hrtimer, ns_to_ktime(sample_period),
- HRTIMER_MODE_REL_PINNED);
+ if (watchdog_running && !watchdog_suspended)
+ ret = watchdog_park_threads();
+
+ if (ret == 0)
+ watchdog_suspended++;
+
+ mutex_unlock(&watchdog_proc_mutex);
+
+ return ret;
}
-static void update_timers(int cpu)
+/*
+ * Resume the hard and soft lockup detector by unparking the watchdog threads.
+ */
+void lockup_detector_resume(void)
{
+ mutex_lock(&watchdog_proc_mutex);
+
+ watchdog_suspended--;
/*
- * Make sure that perf event counter will adopt to a new
- * sampling period. Updating the sampling period directly would
- * be much nicer but we do not have an API for that now so
- * let's use a big hammer.
- * Hrtimer will adopt the new period on the next tick but this
- * might be late already so we have to restart the timer as well.
+ * The watchdog threads are unparked if they were previously running
+ * and if there is no more active suspend request.
*/
- watchdog_nmi_disable(cpu);
- smp_call_function_single(cpu, restart_watchdog_hrtimer, NULL, 1);
- watchdog_nmi_enable(cpu);
+ if (watchdog_running && !watchdog_suspended)
+ watchdog_unpark_threads();
+
+ mutex_unlock(&watchdog_proc_mutex);
}
-static void update_timers_all_cpus(void)
+static void update_watchdog_all_cpus(void)
{
- int cpu;
-
- get_online_cpus();
- for_each_online_cpu(cpu)
- update_timers(cpu);
- put_online_cpus();
+ watchdog_park_threads();
+ watchdog_unpark_threads();
}
-static int watchdog_enable_all_cpus(bool sample_period_changed)
+static int watchdog_enable_all_cpus(void)
{
int err = 0;
if (!watchdog_running) {
- err = smpboot_register_percpu_thread(&watchdog_threads);
+ err = smpboot_register_percpu_thread_cpumask(&watchdog_threads,
+ &watchdog_cpumask);
if (err)
pr_err("Failed to create watchdog threads, disabled\n");
else
watchdog_running = 1;
- } else if (sample_period_changed) {
- update_timers_all_cpus();
+ } else {
+ /*
+ * Enable/disable the lockup detectors or
+ * change the sample period 'on the fly'.
+ */
+ update_watchdog_all_cpus();
}
return err;
@@ -654,58 +768,221 @@ static void watchdog_disable_all_cpus(void)
}
/*
- * proc handler for /proc/sys/kernel/nmi_watchdog,watchdog_thresh
+ * Update the run state of the lockup detectors.
*/
+static int proc_watchdog_update(void)
+{
+ int err = 0;
-int proc_dowatchdog(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp, loff_t *ppos)
+ /*
+ * Watchdog threads won't be started if they are already active.
+ * The 'watchdog_running' variable in watchdog_*_all_cpus() takes
+ * care of this. If those threads are already active, the sample
+ * period will be updated and the lockup detectors will be enabled
+ * or disabled 'on the fly'.
+ */
+ if (watchdog_enabled && watchdog_thresh)
+ err = watchdog_enable_all_cpus();
+ else
+ watchdog_disable_all_cpus();
+
+ return err;
+
+}
+
+/*
+ * common function for watchdog, nmi_watchdog and soft_watchdog parameter
+ *
+ * caller | table->data points to | 'which' contains the flag(s)
+ * -------------------|-----------------------|-----------------------------
+ * proc_watchdog | watchdog_user_enabled | NMI_WATCHDOG_ENABLED or'ed
+ * | | with SOFT_WATCHDOG_ENABLED
+ * -------------------|-----------------------|-----------------------------
+ * proc_nmi_watchdog | nmi_watchdog_enabled | NMI_WATCHDOG_ENABLED
+ * -------------------|-----------------------|-----------------------------
+ * proc_soft_watchdog | soft_watchdog_enabled | SOFT_WATCHDOG_ENABLED
+ */
+static int proc_watchdog_common(int which, struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp, loff_t *ppos)
{
- int err, old_thresh, old_enabled;
- bool old_hardlockup;
- static DEFINE_MUTEX(watchdog_proc_mutex);
+ int err, old, new;
+ int *watchdog_param = (int *)table->data;
mutex_lock(&watchdog_proc_mutex);
- old_thresh = ACCESS_ONCE(watchdog_thresh);
- old_enabled = ACCESS_ONCE(watchdog_user_enabled);
- old_hardlockup = watchdog_hardlockup_detector_is_enabled();
- err = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
- if (err || !write)
+ if (watchdog_suspended) {
+ /* no parameter changes allowed while watchdog is suspended */
+ err = -EAGAIN;
goto out;
+ }
- set_sample_period();
/*
- * Watchdog threads shouldn't be enabled if they are
- * disabled. The 'watchdog_running' variable check in
- * watchdog_*_all_cpus() function takes care of this.
+ * If the parameter is being read return the state of the corresponding
+ * bit(s) in 'watchdog_enabled', else update 'watchdog_enabled' and the
+ * run state of the lockup detectors.
*/
- if (watchdog_user_enabled && watchdog_thresh) {
+ if (!write) {
+ *watchdog_param = (watchdog_enabled & which) != 0;
+ err = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
+ } else {
+ err = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
+ if (err)
+ goto out;
+
/*
- * Prevent a change in watchdog_thresh accidentally overriding
- * the enablement of the hardlockup detector.
+ * There is a race window between fetching the current value
+ * from 'watchdog_enabled' and storing the new value. During
+ * this race window, watchdog_nmi_enable() can sneak in and
+ * clear the NMI_WATCHDOG_ENABLED bit in 'watchdog_enabled'.
+ * The 'cmpxchg' detects this race and the loop retries.
*/
- if (watchdog_user_enabled != old_enabled)
- watchdog_enable_hardlockup_detector(true);
- err = watchdog_enable_all_cpus(old_thresh != watchdog_thresh);
- } else
- watchdog_disable_all_cpus();
+ do {
+ old = watchdog_enabled;
+ /*
+ * If the parameter value is not zero set the
+ * corresponding bit(s), else clear it(them).
+ */
+ if (*watchdog_param)
+ new = old | which;
+ else
+ new = old & ~which;
+ } while (cmpxchg(&watchdog_enabled, old, new) != old);
+
+ /*
+ * Update the run state of the lockup detectors.
+ * Restore 'watchdog_enabled' on failure.
+ */
+ err = proc_watchdog_update();
+ if (err)
+ watchdog_enabled = old;
+ }
+out:
+ mutex_unlock(&watchdog_proc_mutex);
+ return err;
+}
+
+/*
+ * /proc/sys/kernel/watchdog
+ */
+int proc_watchdog(struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+ return proc_watchdog_common(NMI_WATCHDOG_ENABLED|SOFT_WATCHDOG_ENABLED,
+ table, write, buffer, lenp, ppos);
+}
+
+/*
+ * /proc/sys/kernel/nmi_watchdog
+ */
+int proc_nmi_watchdog(struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+ return proc_watchdog_common(NMI_WATCHDOG_ENABLED,
+ table, write, buffer, lenp, ppos);
+}
+
+/*
+ * /proc/sys/kernel/soft_watchdog
+ */
+int proc_soft_watchdog(struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+ return proc_watchdog_common(SOFT_WATCHDOG_ENABLED,
+ table, write, buffer, lenp, ppos);
+}
+
+/*
+ * /proc/sys/kernel/watchdog_thresh
+ */
+int proc_watchdog_thresh(struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+ int err, old;
+
+ mutex_lock(&watchdog_proc_mutex);
+
+ if (watchdog_suspended) {
+ /* no parameter changes allowed while watchdog is suspended */
+ err = -EAGAIN;
+ goto out;
+ }
+
+ old = ACCESS_ONCE(watchdog_thresh);
+ err = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
+
+ if (err || !write)
+ goto out;
+
+ /*
+ * Update the sample period.
+ * Restore 'watchdog_thresh' on failure.
+ */
+ set_sample_period();
+ err = proc_watchdog_update();
+ if (err)
+ watchdog_thresh = old;
+out:
+ mutex_unlock(&watchdog_proc_mutex);
+ return err;
+}
- /* Restore old values on failure */
- if (err) {
- watchdog_thresh = old_thresh;
- watchdog_user_enabled = old_enabled;
- watchdog_enable_hardlockup_detector(old_hardlockup);
+/*
+ * The cpumask is the mask of possible cpus that the watchdog can run
+ * on, not the mask of cpus it is actually running on. This allows the
+ * user to specify a mask that will include cpus that have not yet
+ * been brought online, if desired.
+ */
+int proc_watchdog_cpumask(struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+ int err;
+
+ mutex_lock(&watchdog_proc_mutex);
+
+ if (watchdog_suspended) {
+ /* no parameter changes allowed while watchdog is suspended */
+ err = -EAGAIN;
+ goto out;
+ }
+
+ err = proc_do_large_bitmap(table, write, buffer, lenp, ppos);
+ if (!err && write) {
+ /* Remove impossible cpus to keep sysctl output cleaner. */
+ cpumask_and(&watchdog_cpumask, &watchdog_cpumask,
+ cpu_possible_mask);
+
+ if (watchdog_running) {
+ /*
+ * Failure would be due to being unable to allocate
+ * a temporary cpumask, so we are likely not in a
+ * position to do much else to make things better.
+ */
+ if (smpboot_update_cpumask_percpu_thread(
+ &watchdog_threads, &watchdog_cpumask) != 0)
+ pr_err("cpumask update failed\n");
+ }
}
out:
mutex_unlock(&watchdog_proc_mutex);
return err;
}
+
#endif /* CONFIG_SYSCTL */
void __init lockup_detector_init(void)
{
set_sample_period();
- if (watchdog_user_enabled)
- watchdog_enable_all_cpus(false);
+#ifdef CONFIG_NO_HZ_FULL
+ if (tick_nohz_full_enabled()) {
+ pr_info("Disabling watchdog on nohz_full cores by default\n");
+ cpumask_copy(&watchdog_cpumask, housekeeping_mask);
+ } else
+ cpumask_copy(&watchdog_cpumask, cpu_possible_mask);
+#else
+ cpumask_copy(&watchdog_cpumask, cpu_possible_mask);
+#endif
+
+ if (watchdog_enabled)
+ watchdog_enable_all_cpus();
}
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 41ff75b478c6..ca71582fcfab 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -127,6 +127,11 @@ enum {
*
* PR: wq_pool_mutex protected for writes. Sched-RCU protected for reads.
*
+ * PW: wq_pool_mutex and wq->mutex protected for writes. Either for reads.
+ *
+ * PWR: wq_pool_mutex and wq->mutex protected for writes. Either or
+ * sched-RCU for reads.
+ *
* WQ: wq->mutex protected.
*
* WR: wq->mutex protected for writes. Sched-RCU protected for reads.
@@ -159,6 +164,7 @@ struct worker_pool {
/* see manage_workers() for details on the two manager mutexes */
struct mutex manager_arb; /* manager arbitration */
+ struct worker *manager; /* L: purely informational */
struct mutex attach_mutex; /* attach/detach exclusion */
struct list_head workers; /* A: attached workers */
struct completion *detach_completion; /* all workers detached */
@@ -230,7 +236,7 @@ struct wq_device;
*/
struct workqueue_struct {
struct list_head pwqs; /* WR: all pwqs of this wq */
- struct list_head list; /* PL: list of all workqueues */
+ struct list_head list; /* PR: list of all workqueues */
struct mutex mutex; /* protects this wq */
int work_color; /* WQ: current work color */
@@ -246,8 +252,8 @@ struct workqueue_struct {
int nr_drainers; /* WQ: drain in progress */
int saved_max_active; /* WQ: saved pwq max_active */
- struct workqueue_attrs *unbound_attrs; /* WQ: only for unbound wqs */
- struct pool_workqueue *dfl_pwq; /* WQ: only for unbound wqs */
+ struct workqueue_attrs *unbound_attrs; /* PW: only for unbound wqs */
+ struct pool_workqueue *dfl_pwq; /* PW: only for unbound wqs */
#ifdef CONFIG_SYSFS
struct wq_device *wq_dev; /* I: for sysfs interface */
@@ -257,10 +263,17 @@ struct workqueue_struct {
#endif
char name[WQ_NAME_LEN]; /* I: workqueue name */
+ /*
+ * Destruction of workqueue_struct is sched-RCU protected to allow
+ * walking the workqueues list without grabbing wq_pool_mutex.
+ * This is used to dump all workqueues from sysrq.
+ */
+ struct rcu_head rcu;
+
/* hot fields used during command issue, aligned to cacheline */
unsigned int flags ____cacheline_aligned; /* WQ: WQ_* flags */
struct pool_workqueue __percpu *cpu_pwqs; /* I: per-cpu pwqs */
- struct pool_workqueue __rcu *numa_pwq_tbl[]; /* FR: unbound pwqs indexed by node */
+ struct pool_workqueue __rcu *numa_pwq_tbl[]; /* PWR: unbound pwqs indexed by node */
};
static struct kmem_cache *pwq_cache;
@@ -272,12 +285,7 @@ static bool wq_disable_numa;
module_param_named(disable_numa, wq_disable_numa, bool, 0444);
/* see the comment above the definition of WQ_POWER_EFFICIENT */
-#ifdef CONFIG_WQ_POWER_EFFICIENT_DEFAULT
-static bool wq_power_efficient = true;
-#else
-static bool wq_power_efficient;
-#endif
-
+static bool wq_power_efficient = IS_ENABLED(CONFIG_WQ_POWER_EFFICIENT_DEFAULT);
module_param_named(power_efficient, wq_power_efficient, bool, 0444);
static bool wq_numa_enabled; /* unbound NUMA affinity enabled */
@@ -288,9 +296,11 @@ static struct workqueue_attrs *wq_update_unbound_numa_attrs_buf;
static DEFINE_MUTEX(wq_pool_mutex); /* protects pools and workqueues list */
static DEFINE_SPINLOCK(wq_mayday_lock); /* protects wq->maydays list */
-static LIST_HEAD(workqueues); /* PL: list of all workqueues */
+static LIST_HEAD(workqueues); /* PR: list of all workqueues */
static bool workqueue_freezing; /* PL: have wqs started freezing? */
+static cpumask_var_t wq_unbound_cpumask; /* PL: low level cpumask for all unbound wqs */
+
/* the per-cpu worker pools */
static DEFINE_PER_CPU_SHARED_ALIGNED(struct worker_pool [NR_STD_WORKER_POOLS],
cpu_worker_pools);
@@ -322,21 +332,26 @@ struct workqueue_struct *system_freezable_power_efficient_wq __read_mostly;
EXPORT_SYMBOL_GPL(system_freezable_power_efficient_wq);
static int worker_thread(void *__worker);
-static void copy_workqueue_attrs(struct workqueue_attrs *to,
- const struct workqueue_attrs *from);
+static void workqueue_sysfs_unregister(struct workqueue_struct *wq);
#define CREATE_TRACE_POINTS
#include <trace/events/workqueue.h>
#define assert_rcu_or_pool_mutex() \
- rcu_lockdep_assert(rcu_read_lock_sched_held() || \
- lockdep_is_held(&wq_pool_mutex), \
- "sched RCU or wq_pool_mutex should be held")
+ RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held() && \
+ !lockdep_is_held(&wq_pool_mutex), \
+ "sched RCU or wq_pool_mutex should be held")
#define assert_rcu_or_wq_mutex(wq) \
- rcu_lockdep_assert(rcu_read_lock_sched_held() || \
- lockdep_is_held(&wq->mutex), \
- "sched RCU or wq->mutex should be held")
+ RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held() && \
+ !lockdep_is_held(&wq->mutex), \
+ "sched RCU or wq->mutex should be held")
+
+#define assert_rcu_or_wq_mutex_or_pool_mutex(wq) \
+ RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held() && \
+ !lockdep_is_held(&wq->mutex) && \
+ !lockdep_is_held(&wq_pool_mutex), \
+ "sched RCU, wq->mutex or wq_pool_mutex should be held")
#define for_each_cpu_worker_pool(pool, cpu) \
for ((pool) = &per_cpu(cpu_worker_pools, cpu)[0]; \
@@ -542,7 +557,8 @@ static int worker_pool_assign_id(struct worker_pool *pool)
* @wq: the target workqueue
* @node: the node ID
*
- * This must be called either with pwq_lock held or sched RCU read locked.
+ * This must be called with any of wq_pool_mutex, wq->mutex or sched RCU
+ * read locked.
* If the pwq needs to be used beyond the locking in effect, the caller is
* responsible for guaranteeing that the pwq stays online.
*
@@ -551,7 +567,7 @@ static int worker_pool_assign_id(struct worker_pool *pool)
static struct pool_workqueue *unbound_pwq_by_node(struct workqueue_struct *wq,
int node)
{
- assert_rcu_or_wq_mutex(wq);
+ assert_rcu_or_wq_mutex_or_pool_mutex(wq);
return rcu_dereference_raw(wq->numa_pwq_tbl[node]);
}
@@ -967,7 +983,7 @@ static struct worker *find_worker_executing_work(struct worker_pool *pool,
* move_linked_works - move linked works to a list
* @work: start of series of works to be scheduled
* @head: target list to append @work to
- * @nextp: out paramter for nested worklist walking
+ * @nextp: out parameter for nested worklist walking
*
* Schedule linked works starting from @work to @head. Work series to
* be scheduled starts at @work and includes any consecutive work with
@@ -1698,9 +1714,7 @@ static struct worker *create_worker(struct worker_pool *pool)
goto fail;
set_user_nice(worker->task, pool->attrs->nice);
-
- /* prevent userland from meddling with cpumask of workqueue workers */
- worker->task->flags |= PF_NO_SETAFFINITY;
+ kthread_bind_mask(worker->task, pool->attrs->cpumask);
/* successful, attach the worker to the pool */
worker_attach_to_pool(worker, pool);
@@ -1911,9 +1925,11 @@ static bool manage_workers(struct worker *worker)
*/
if (!mutex_trylock(&pool->manager_arb))
return false;
+ pool->manager = worker;
maybe_create_worker(pool);
+ pool->manager = NULL;
mutex_unlock(&pool->manager_arb);
return true;
}
@@ -2303,6 +2319,7 @@ repeat:
struct wq_barrier {
struct work_struct work;
struct completion done;
+ struct task_struct *task; /* purely informational */
};
static void wq_barrier_func(struct work_struct *work)
@@ -2351,6 +2368,7 @@ static void insert_wq_barrier(struct pool_workqueue *pwq,
INIT_WORK_ONSTACK(&barr->work, wq_barrier_func);
__set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(&barr->work));
init_completion(&barr->done);
+ barr->task = current;
/*
* If @target is currently being executed, schedule the
@@ -2594,7 +2612,7 @@ void flush_workqueue(struct workqueue_struct *wq)
out_unlock:
mutex_unlock(&wq->mutex);
}
-EXPORT_SYMBOL_GPL(flush_workqueue);
+EXPORT_SYMBOL(flush_workqueue);
/**
* drain_workqueue - drain a workqueue
@@ -2603,7 +2621,7 @@ EXPORT_SYMBOL_GPL(flush_workqueue);
* Wait until the workqueue becomes empty. While draining is in progress,
* only chain queueing is allowed. IOW, only currently pending or running
* work items on @wq can queue further work items on it. @wq is flushed
- * repeatedly until it becomes empty. The number of flushing is detemined
+ * repeatedly until it becomes empty. The number of flushing is determined
* by the depth of chaining and should be relatively short. Whine if it
* takes too long.
*/
@@ -2934,36 +2952,6 @@ int schedule_on_each_cpu(work_func_t func)
}
/**
- * flush_scheduled_work - ensure that any scheduled work has run to completion.
- *
- * Forces execution of the kernel-global workqueue and blocks until its
- * completion.
- *
- * Think twice before calling this function! It's very easy to get into
- * trouble if you don't take great care. Either of the following situations
- * will lead to deadlock:
- *
- * One of the work items currently on the workqueue needs to acquire
- * a lock held by your code or its caller.
- *
- * Your code is running in the context of a work routine.
- *
- * They will be detected by lockdep when they occur, but the first might not
- * occur very often. It depends on what work items are on the workqueue and
- * what locks they need, which you have no control over.
- *
- * In most situations flushing the entire workqueue is overkill; you merely
- * need to know that a particular work item isn't queued and isn't running.
- * In such cases you should use cancel_delayed_work_sync() or
- * cancel_work_sync() instead.
- */
-void flush_scheduled_work(void)
-{
- flush_workqueue(system_wq);
-}
-EXPORT_SYMBOL(flush_scheduled_work);
-
-/**
* execute_in_process_context - reliably execute the routine with user context
* @fn: the function to execute
* @ew: guaranteed storage for the execute work structure (must
@@ -2989,323 +2977,6 @@ int execute_in_process_context(work_func_t fn, struct execute_work *ew)
}
EXPORT_SYMBOL_GPL(execute_in_process_context);
-#ifdef CONFIG_SYSFS
-/*
- * Workqueues with WQ_SYSFS flag set is visible to userland via
- * /sys/bus/workqueue/devices/WQ_NAME. All visible workqueues have the
- * following attributes.
- *
- * per_cpu RO bool : whether the workqueue is per-cpu or unbound
- * max_active RW int : maximum number of in-flight work items
- *
- * Unbound workqueues have the following extra attributes.
- *
- * id RO int : the associated pool ID
- * nice RW int : nice value of the workers
- * cpumask RW mask : bitmask of allowed CPUs for the workers
- */
-struct wq_device {
- struct workqueue_struct *wq;
- struct device dev;
-};
-
-static struct workqueue_struct *dev_to_wq(struct device *dev)
-{
- struct wq_device *wq_dev = container_of(dev, struct wq_device, dev);
-
- return wq_dev->wq;
-}
-
-static ssize_t per_cpu_show(struct device *dev, struct device_attribute *attr,
- char *buf)
-{
- struct workqueue_struct *wq = dev_to_wq(dev);
-
- return scnprintf(buf, PAGE_SIZE, "%d\n", (bool)!(wq->flags & WQ_UNBOUND));
-}
-static DEVICE_ATTR_RO(per_cpu);
-
-static ssize_t max_active_show(struct device *dev,
- struct device_attribute *attr, char *buf)
-{
- struct workqueue_struct *wq = dev_to_wq(dev);
-
- return scnprintf(buf, PAGE_SIZE, "%d\n", wq->saved_max_active);
-}
-
-static ssize_t max_active_store(struct device *dev,
- struct device_attribute *attr, const char *buf,
- size_t count)
-{
- struct workqueue_struct *wq = dev_to_wq(dev);
- int val;
-
- if (sscanf(buf, "%d", &val) != 1 || val <= 0)
- return -EINVAL;
-
- workqueue_set_max_active(wq, val);
- return count;
-}
-static DEVICE_ATTR_RW(max_active);
-
-static struct attribute *wq_sysfs_attrs[] = {
- &dev_attr_per_cpu.attr,
- &dev_attr_max_active.attr,
- NULL,
-};
-ATTRIBUTE_GROUPS(wq_sysfs);
-
-static ssize_t wq_pool_ids_show(struct device *dev,
- struct device_attribute *attr, char *buf)
-{
- struct workqueue_struct *wq = dev_to_wq(dev);
- const char *delim = "";
- int node, written = 0;
-
- rcu_read_lock_sched();
- for_each_node(node) {
- written += scnprintf(buf + written, PAGE_SIZE - written,
- "%s%d:%d", delim, node,
- unbound_pwq_by_node(wq, node)->pool->id);
- delim = " ";
- }
- written += scnprintf(buf + written, PAGE_SIZE - written, "\n");
- rcu_read_unlock_sched();
-
- return written;
-}
-
-static ssize_t wq_nice_show(struct device *dev, struct device_attribute *attr,
- char *buf)
-{
- struct workqueue_struct *wq = dev_to_wq(dev);
- int written;
-
- mutex_lock(&wq->mutex);
- written = scnprintf(buf, PAGE_SIZE, "%d\n", wq->unbound_attrs->nice);
- mutex_unlock(&wq->mutex);
-
- return written;
-}
-
-/* prepare workqueue_attrs for sysfs store operations */
-static struct workqueue_attrs *wq_sysfs_prep_attrs(struct workqueue_struct *wq)
-{
- struct workqueue_attrs *attrs;
-
- attrs = alloc_workqueue_attrs(GFP_KERNEL);
- if (!attrs)
- return NULL;
-
- mutex_lock(&wq->mutex);
- copy_workqueue_attrs(attrs, wq->unbound_attrs);
- mutex_unlock(&wq->mutex);
- return attrs;
-}
-
-static ssize_t wq_nice_store(struct device *dev, struct device_attribute *attr,
- const char *buf, size_t count)
-{
- struct workqueue_struct *wq = dev_to_wq(dev);
- struct workqueue_attrs *attrs;
- int ret;
-
- attrs = wq_sysfs_prep_attrs(wq);
- if (!attrs)
- return -ENOMEM;
-
- if (sscanf(buf, "%d", &attrs->nice) == 1 &&
- attrs->nice >= MIN_NICE && attrs->nice <= MAX_NICE)
- ret = apply_workqueue_attrs(wq, attrs);
- else
- ret = -EINVAL;
-
- free_workqueue_attrs(attrs);
- return ret ?: count;
-}
-
-static ssize_t wq_cpumask_show(struct device *dev,
- struct device_attribute *attr, char *buf)
-{
- struct workqueue_struct *wq = dev_to_wq(dev);
- int written;
-
- mutex_lock(&wq->mutex);
- written = scnprintf(buf, PAGE_SIZE, "%*pb\n",
- cpumask_pr_args(wq->unbound_attrs->cpumask));
- mutex_unlock(&wq->mutex);
- return written;
-}
-
-static ssize_t wq_cpumask_store(struct device *dev,
- struct device_attribute *attr,
- const char *buf, size_t count)
-{
- struct workqueue_struct *wq = dev_to_wq(dev);
- struct workqueue_attrs *attrs;
- int ret;
-
- attrs = wq_sysfs_prep_attrs(wq);
- if (!attrs)
- return -ENOMEM;
-
- ret = cpumask_parse(buf, attrs->cpumask);
- if (!ret)
- ret = apply_workqueue_attrs(wq, attrs);
-
- free_workqueue_attrs(attrs);
- return ret ?: count;
-}
-
-static ssize_t wq_numa_show(struct device *dev, struct device_attribute *attr,
- char *buf)
-{
- struct workqueue_struct *wq = dev_to_wq(dev);
- int written;
-
- mutex_lock(&wq->mutex);
- written = scnprintf(buf, PAGE_SIZE, "%d\n",
- !wq->unbound_attrs->no_numa);
- mutex_unlock(&wq->mutex);
-
- return written;
-}
-
-static ssize_t wq_numa_store(struct device *dev, struct device_attribute *attr,
- const char *buf, size_t count)
-{
- struct workqueue_struct *wq = dev_to_wq(dev);
- struct workqueue_attrs *attrs;
- int v, ret;
-
- attrs = wq_sysfs_prep_attrs(wq);
- if (!attrs)
- return -ENOMEM;
-
- ret = -EINVAL;
- if (sscanf(buf, "%d", &v) == 1) {
- attrs->no_numa = !v;
- ret = apply_workqueue_attrs(wq, attrs);
- }
-
- free_workqueue_attrs(attrs);
- return ret ?: count;
-}
-
-static struct device_attribute wq_sysfs_unbound_attrs[] = {
- __ATTR(pool_ids, 0444, wq_pool_ids_show, NULL),
- __ATTR(nice, 0644, wq_nice_show, wq_nice_store),
- __ATTR(cpumask, 0644, wq_cpumask_show, wq_cpumask_store),
- __ATTR(numa, 0644, wq_numa_show, wq_numa_store),
- __ATTR_NULL,
-};
-
-static struct bus_type wq_subsys = {
- .name = "workqueue",
- .dev_groups = wq_sysfs_groups,
-};
-
-static int __init wq_sysfs_init(void)
-{
- return subsys_virtual_register(&wq_subsys, NULL);
-}
-core_initcall(wq_sysfs_init);
-
-static void wq_device_release(struct device *dev)
-{
- struct wq_device *wq_dev = container_of(dev, struct wq_device, dev);
-
- kfree(wq_dev);
-}
-
-/**
- * workqueue_sysfs_register - make a workqueue visible in sysfs
- * @wq: the workqueue to register
- *
- * Expose @wq in sysfs under /sys/bus/workqueue/devices.
- * alloc_workqueue*() automatically calls this function if WQ_SYSFS is set
- * which is the preferred method.
- *
- * Workqueue user should use this function directly iff it wants to apply
- * workqueue_attrs before making the workqueue visible in sysfs; otherwise,
- * apply_workqueue_attrs() may race against userland updating the
- * attributes.
- *
- * Return: 0 on success, -errno on failure.
- */
-int workqueue_sysfs_register(struct workqueue_struct *wq)
-{
- struct wq_device *wq_dev;
- int ret;
-
- /*
- * Adjusting max_active or creating new pwqs by applyting
- * attributes breaks ordering guarantee. Disallow exposing ordered
- * workqueues.
- */
- if (WARN_ON(wq->flags & __WQ_ORDERED))
- return -EINVAL;
-
- wq->wq_dev = wq_dev = kzalloc(sizeof(*wq_dev), GFP_KERNEL);
- if (!wq_dev)
- return -ENOMEM;
-
- wq_dev->wq = wq;
- wq_dev->dev.bus = &wq_subsys;
- wq_dev->dev.init_name = wq->name;
- wq_dev->dev.release = wq_device_release;
-
- /*
- * unbound_attrs are created separately. Suppress uevent until
- * everything is ready.
- */
- dev_set_uevent_suppress(&wq_dev->dev, true);
-
- ret = device_register(&wq_dev->dev);
- if (ret) {
- kfree(wq_dev);
- wq->wq_dev = NULL;
- return ret;
- }
-
- if (wq->flags & WQ_UNBOUND) {
- struct device_attribute *attr;
-
- for (attr = wq_sysfs_unbound_attrs; attr->attr.name; attr++) {
- ret = device_create_file(&wq_dev->dev, attr);
- if (ret) {
- device_unregister(&wq_dev->dev);
- wq->wq_dev = NULL;
- return ret;
- }
- }
- }
-
- dev_set_uevent_suppress(&wq_dev->dev, false);
- kobject_uevent(&wq_dev->dev.kobj, KOBJ_ADD);
- return 0;
-}
-
-/**
- * workqueue_sysfs_unregister - undo workqueue_sysfs_register()
- * @wq: the workqueue to unregister
- *
- * If @wq is registered to sysfs by workqueue_sysfs_register(), unregister.
- */
-static void workqueue_sysfs_unregister(struct workqueue_struct *wq)
-{
- struct wq_device *wq_dev = wq->wq_dev;
-
- if (!wq->wq_dev)
- return;
-
- wq->wq_dev = NULL;
- device_unregister(&wq_dev->dev);
-}
-#else /* CONFIG_SYSFS */
-static void workqueue_sysfs_unregister(struct workqueue_struct *wq) { }
-#endif /* CONFIG_SYSFS */
-
/**
* free_workqueue_attrs - free a workqueue_attrs
* @attrs: workqueue_attrs to free
@@ -3385,7 +3056,7 @@ static bool wqattrs_equal(const struct workqueue_attrs *a,
* init_worker_pool - initialize a newly zalloc'd worker_pool
* @pool: worker_pool to initialize
*
- * Initiailize a newly zalloc'd @pool. It also allocates @pool->attrs.
+ * Initialize a newly zalloc'd @pool. It also allocates @pool->attrs.
*
* Return: 0 on success, -errno on failure. Even on failure, all fields
* inside @pool proper are initialized and put_unbound_pool() can be called
@@ -3424,6 +3095,20 @@ static int init_worker_pool(struct worker_pool *pool)
return 0;
}
+static void rcu_free_wq(struct rcu_head *rcu)
+{
+ struct workqueue_struct *wq =
+ container_of(rcu, struct workqueue_struct, rcu);
+
+ if (!(wq->flags & WQ_UNBOUND))
+ free_percpu(wq->cpu_pwqs);
+ else
+ free_workqueue_attrs(wq->unbound_attrs);
+
+ kfree(wq->rescuer);
+ kfree(wq);
+}
+
static void rcu_free_pool(struct rcu_head *rcu)
{
struct worker_pool *pool = container_of(rcu, struct worker_pool, rcu);
@@ -3601,12 +3286,10 @@ static void pwq_unbound_release_workfn(struct work_struct *work)
/*
* If we're the last pwq going away, @wq is already dead and no one
- * is gonna access it anymore. Free it.
+ * is gonna access it anymore. Schedule RCU free.
*/
- if (is_last) {
- free_workqueue_attrs(wq->unbound_attrs);
- kfree(wq);
- }
+ if (is_last)
+ call_rcu_sched(&wq->rcu, rcu_free_wq);
}
/**
@@ -3717,20 +3400,9 @@ static struct pool_workqueue *alloc_unbound_pwq(struct workqueue_struct *wq,
return pwq;
}
-/* undo alloc_unbound_pwq(), used only in the error path */
-static void free_unbound_pwq(struct pool_workqueue *pwq)
-{
- lockdep_assert_held(&wq_pool_mutex);
-
- if (pwq) {
- put_unbound_pool(pwq->pool);
- kmem_cache_free(pwq_cache, pwq);
- }
-}
-
/**
- * wq_calc_node_mask - calculate a wq_attrs' cpumask for the specified node
- * @attrs: the wq_attrs of interest
+ * wq_calc_node_cpumask - calculate a wq_attrs' cpumask for the specified node
+ * @attrs: the wq_attrs of the default pwq of the target workqueue
* @node: the target NUMA node
* @cpu_going_down: if >= 0, the CPU to consider as offline
* @cpumask: outarg, the resulting cpumask
@@ -3780,6 +3452,7 @@ static struct pool_workqueue *numa_pwq_tbl_install(struct workqueue_struct *wq,
{
struct pool_workqueue *old_pwq;
+ lockdep_assert_held(&wq_pool_mutex);
lockdep_assert_held(&wq->mutex);
/* link_pwq() can handle duplicate calls */
@@ -3790,46 +3463,59 @@ static struct pool_workqueue *numa_pwq_tbl_install(struct workqueue_struct *wq,
return old_pwq;
}
-/**
- * apply_workqueue_attrs - apply new workqueue_attrs to an unbound workqueue
- * @wq: the target workqueue
- * @attrs: the workqueue_attrs to apply, allocated with alloc_workqueue_attrs()
- *
- * Apply @attrs to an unbound workqueue @wq. Unless disabled, on NUMA
- * machines, this function maps a separate pwq to each NUMA node with
- * possibles CPUs in @attrs->cpumask so that work items are affine to the
- * NUMA node it was issued on. Older pwqs are released as in-flight work
- * items finish. Note that a work item which repeatedly requeues itself
- * back-to-back will stay on its current pwq.
- *
- * Performs GFP_KERNEL allocations.
- *
- * Return: 0 on success and -errno on failure.
- */
-int apply_workqueue_attrs(struct workqueue_struct *wq,
- const struct workqueue_attrs *attrs)
+/* context to store the prepared attrs & pwqs before applying */
+struct apply_wqattrs_ctx {
+ struct workqueue_struct *wq; /* target workqueue */
+ struct workqueue_attrs *attrs; /* attrs to apply */
+ struct list_head list; /* queued for batching commit */
+ struct pool_workqueue *dfl_pwq;
+ struct pool_workqueue *pwq_tbl[];
+};
+
+/* free the resources after success or abort */
+static void apply_wqattrs_cleanup(struct apply_wqattrs_ctx *ctx)
+{
+ if (ctx) {
+ int node;
+
+ for_each_node(node)
+ put_pwq_unlocked(ctx->pwq_tbl[node]);
+ put_pwq_unlocked(ctx->dfl_pwq);
+
+ free_workqueue_attrs(ctx->attrs);
+
+ kfree(ctx);
+ }
+}
+
+/* allocate the attrs and pwqs for later installation */
+static struct apply_wqattrs_ctx *
+apply_wqattrs_prepare(struct workqueue_struct *wq,
+ const struct workqueue_attrs *attrs)
{
+ struct apply_wqattrs_ctx *ctx;
struct workqueue_attrs *new_attrs, *tmp_attrs;
- struct pool_workqueue **pwq_tbl, *dfl_pwq;
- int node, ret;
+ int node;
- /* only unbound workqueues can change attributes */
- if (WARN_ON(!(wq->flags & WQ_UNBOUND)))
- return -EINVAL;
+ lockdep_assert_held(&wq_pool_mutex);
- /* creating multiple pwqs breaks ordering guarantee */
- if (WARN_ON((wq->flags & __WQ_ORDERED) && !list_empty(&wq->pwqs)))
- return -EINVAL;
+ ctx = kzalloc(sizeof(*ctx) + nr_node_ids * sizeof(ctx->pwq_tbl[0]),
+ GFP_KERNEL);
- pwq_tbl = kzalloc(nr_node_ids * sizeof(pwq_tbl[0]), GFP_KERNEL);
new_attrs = alloc_workqueue_attrs(GFP_KERNEL);
tmp_attrs = alloc_workqueue_attrs(GFP_KERNEL);
- if (!pwq_tbl || !new_attrs || !tmp_attrs)
- goto enomem;
+ if (!ctx || !new_attrs || !tmp_attrs)
+ goto out_free;
- /* make a copy of @attrs and sanitize it */
+ /*
+ * Calculate the attrs of the default pwq.
+ * If the user configured cpumask doesn't overlap with the
+ * wq_unbound_cpumask, we fallback to the wq_unbound_cpumask.
+ */
copy_workqueue_attrs(new_attrs, attrs);
- cpumask_and(new_attrs->cpumask, new_attrs->cpumask, cpu_possible_mask);
+ cpumask_and(new_attrs->cpumask, new_attrs->cpumask, wq_unbound_cpumask);
+ if (unlikely(cpumask_empty(new_attrs->cpumask)))
+ cpumask_copy(new_attrs->cpumask, wq_unbound_cpumask);
/*
* We may create multiple pwqs with differing cpumasks. Make a
@@ -3839,75 +3525,129 @@ int apply_workqueue_attrs(struct workqueue_struct *wq,
copy_workqueue_attrs(tmp_attrs, new_attrs);
/*
- * CPUs should stay stable across pwq creations and installations.
- * Pin CPUs, determine the target cpumask for each node and create
- * pwqs accordingly.
- */
- get_online_cpus();
-
- mutex_lock(&wq_pool_mutex);
-
- /*
* If something goes wrong during CPU up/down, we'll fall back to
* the default pwq covering whole @attrs->cpumask. Always create
* it even if we don't use it immediately.
*/
- dfl_pwq = alloc_unbound_pwq(wq, new_attrs);
- if (!dfl_pwq)
- goto enomem_pwq;
+ ctx->dfl_pwq = alloc_unbound_pwq(wq, new_attrs);
+ if (!ctx->dfl_pwq)
+ goto out_free;
for_each_node(node) {
- if (wq_calc_node_cpumask(attrs, node, -1, tmp_attrs->cpumask)) {
- pwq_tbl[node] = alloc_unbound_pwq(wq, tmp_attrs);
- if (!pwq_tbl[node])
- goto enomem_pwq;
+ if (wq_calc_node_cpumask(new_attrs, node, -1, tmp_attrs->cpumask)) {
+ ctx->pwq_tbl[node] = alloc_unbound_pwq(wq, tmp_attrs);
+ if (!ctx->pwq_tbl[node])
+ goto out_free;
} else {
- dfl_pwq->refcnt++;
- pwq_tbl[node] = dfl_pwq;
+ ctx->dfl_pwq->refcnt++;
+ ctx->pwq_tbl[node] = ctx->dfl_pwq;
}
}
- mutex_unlock(&wq_pool_mutex);
+ /* save the user configured attrs and sanitize it. */
+ copy_workqueue_attrs(new_attrs, attrs);
+ cpumask_and(new_attrs->cpumask, new_attrs->cpumask, cpu_possible_mask);
+ ctx->attrs = new_attrs;
+
+ ctx->wq = wq;
+ free_workqueue_attrs(tmp_attrs);
+ return ctx;
+
+out_free:
+ free_workqueue_attrs(tmp_attrs);
+ free_workqueue_attrs(new_attrs);
+ apply_wqattrs_cleanup(ctx);
+ return NULL;
+}
+
+/* set attrs and install prepared pwqs, @ctx points to old pwqs on return */
+static void apply_wqattrs_commit(struct apply_wqattrs_ctx *ctx)
+{
+ int node;
/* all pwqs have been created successfully, let's install'em */
- mutex_lock(&wq->mutex);
+ mutex_lock(&ctx->wq->mutex);
- copy_workqueue_attrs(wq->unbound_attrs, new_attrs);
+ copy_workqueue_attrs(ctx->wq->unbound_attrs, ctx->attrs);
/* save the previous pwq and install the new one */
for_each_node(node)
- pwq_tbl[node] = numa_pwq_tbl_install(wq, node, pwq_tbl[node]);
+ ctx->pwq_tbl[node] = numa_pwq_tbl_install(ctx->wq, node,
+ ctx->pwq_tbl[node]);
/* @dfl_pwq might not have been used, ensure it's linked */
- link_pwq(dfl_pwq);
- swap(wq->dfl_pwq, dfl_pwq);
+ link_pwq(ctx->dfl_pwq);
+ swap(ctx->wq->dfl_pwq, ctx->dfl_pwq);
- mutex_unlock(&wq->mutex);
+ mutex_unlock(&ctx->wq->mutex);
+}
- /* put the old pwqs */
- for_each_node(node)
- put_pwq_unlocked(pwq_tbl[node]);
- put_pwq_unlocked(dfl_pwq);
+static void apply_wqattrs_lock(void)
+{
+ /* CPUs should stay stable across pwq creations and installations */
+ get_online_cpus();
+ mutex_lock(&wq_pool_mutex);
+}
+static void apply_wqattrs_unlock(void)
+{
+ mutex_unlock(&wq_pool_mutex);
put_online_cpus();
- ret = 0;
- /* fall through */
-out_free:
- free_workqueue_attrs(tmp_attrs);
- free_workqueue_attrs(new_attrs);
- kfree(pwq_tbl);
+}
+
+static int apply_workqueue_attrs_locked(struct workqueue_struct *wq,
+ const struct workqueue_attrs *attrs)
+{
+ struct apply_wqattrs_ctx *ctx;
+ int ret = -ENOMEM;
+
+ /* only unbound workqueues can change attributes */
+ if (WARN_ON(!(wq->flags & WQ_UNBOUND)))
+ return -EINVAL;
+
+ /* creating multiple pwqs breaks ordering guarantee */
+ if (WARN_ON((wq->flags & __WQ_ORDERED) && !list_empty(&wq->pwqs)))
+ return -EINVAL;
+
+ ctx = apply_wqattrs_prepare(wq, attrs);
+
+ /* the ctx has been prepared successfully, let's commit it */
+ if (ctx) {
+ apply_wqattrs_commit(ctx);
+ ret = 0;
+ }
+
+ apply_wqattrs_cleanup(ctx);
+
return ret;
+}
-enomem_pwq:
- free_unbound_pwq(dfl_pwq);
- for_each_node(node)
- if (pwq_tbl && pwq_tbl[node] != dfl_pwq)
- free_unbound_pwq(pwq_tbl[node]);
- mutex_unlock(&wq_pool_mutex);
- put_online_cpus();
-enomem:
- ret = -ENOMEM;
- goto out_free;
+/**
+ * apply_workqueue_attrs - apply new workqueue_attrs to an unbound workqueue
+ * @wq: the target workqueue
+ * @attrs: the workqueue_attrs to apply, allocated with alloc_workqueue_attrs()
+ *
+ * Apply @attrs to an unbound workqueue @wq. Unless disabled, on NUMA
+ * machines, this function maps a separate pwq to each NUMA node with
+ * possibles CPUs in @attrs->cpumask so that work items are affine to the
+ * NUMA node it was issued on. Older pwqs are released as in-flight work
+ * items finish. Note that a work item which repeatedly requeues itself
+ * back-to-back will stay on its current pwq.
+ *
+ * Performs GFP_KERNEL allocations.
+ *
+ * Return: 0 on success and -errno on failure.
+ */
+int apply_workqueue_attrs(struct workqueue_struct *wq,
+ const struct workqueue_attrs *attrs)
+{
+ int ret;
+
+ apply_wqattrs_lock();
+ ret = apply_workqueue_attrs_locked(wq, attrs);
+ apply_wqattrs_unlock();
+
+ return ret;
}
/**
@@ -3943,7 +3683,8 @@ static void wq_update_unbound_numa(struct workqueue_struct *wq, int cpu,
lockdep_assert_held(&wq_pool_mutex);
- if (!wq_numa_enabled || !(wq->flags & WQ_UNBOUND))
+ if (!wq_numa_enabled || !(wq->flags & WQ_UNBOUND) ||
+ wq->unbound_attrs->no_numa)
return;
/*
@@ -3954,48 +3695,37 @@ static void wq_update_unbound_numa(struct workqueue_struct *wq, int cpu,
target_attrs = wq_update_unbound_numa_attrs_buf;
cpumask = target_attrs->cpumask;
- mutex_lock(&wq->mutex);
- if (wq->unbound_attrs->no_numa)
- goto out_unlock;
-
copy_workqueue_attrs(target_attrs, wq->unbound_attrs);
pwq = unbound_pwq_by_node(wq, node);
/*
* Let's determine what needs to be done. If the target cpumask is
- * different from wq's, we need to compare it to @pwq's and create
- * a new one if they don't match. If the target cpumask equals
- * wq's, the default pwq should be used.
+ * different from the default pwq's, we need to compare it to @pwq's
+ * and create a new one if they don't match. If the target cpumask
+ * equals the default pwq's, the default pwq should be used.
*/
- if (wq_calc_node_cpumask(wq->unbound_attrs, node, cpu_off, cpumask)) {
+ if (wq_calc_node_cpumask(wq->dfl_pwq->pool->attrs, node, cpu_off, cpumask)) {
if (cpumask_equal(cpumask, pwq->pool->attrs->cpumask))
- goto out_unlock;
+ return;
} else {
goto use_dfl_pwq;
}
- mutex_unlock(&wq->mutex);
-
/* create a new pwq */
pwq = alloc_unbound_pwq(wq, target_attrs);
if (!pwq) {
pr_warn("workqueue: allocation failed while updating NUMA affinity of \"%s\"\n",
wq->name);
- mutex_lock(&wq->mutex);
goto use_dfl_pwq;
}
- /*
- * Install the new pwq. As this function is called only from CPU
- * hotplug callbacks and applying a new attrs is wrapped with
- * get/put_online_cpus(), @wq->unbound_attrs couldn't have changed
- * inbetween.
- */
+ /* Install the new pwq. */
mutex_lock(&wq->mutex);
old_pwq = numa_pwq_tbl_install(wq, node, pwq);
goto out_unlock;
use_dfl_pwq:
+ mutex_lock(&wq->mutex);
spin_lock_irq(&wq->dfl_pwq->pool->lock);
get_pwq(wq->dfl_pwq);
spin_unlock_irq(&wq->dfl_pwq->pool->lock);
@@ -4124,7 +3854,7 @@ struct workqueue_struct *__alloc_workqueue_key(const char *fmt,
}
wq->rescuer = rescuer;
- rescuer->task->flags |= PF_NO_SETAFFINITY;
+ kthread_bind_mask(rescuer->task, cpu_possible_mask);
wake_up_process(rescuer->task);
}
@@ -4143,7 +3873,7 @@ struct workqueue_struct *__alloc_workqueue_key(const char *fmt,
pwq_adjust_max_active(pwq);
mutex_unlock(&wq->mutex);
- list_add(&wq->list, &workqueues);
+ list_add_tail_rcu(&wq->list, &workqueues);
mutex_unlock(&wq_pool_mutex);
@@ -4199,24 +3929,20 @@ void destroy_workqueue(struct workqueue_struct *wq)
* flushing is complete in case freeze races us.
*/
mutex_lock(&wq_pool_mutex);
- list_del_init(&wq->list);
+ list_del_rcu(&wq->list);
mutex_unlock(&wq_pool_mutex);
workqueue_sysfs_unregister(wq);
- if (wq->rescuer) {
+ if (wq->rescuer)
kthread_stop(wq->rescuer->task);
- kfree(wq->rescuer);
- wq->rescuer = NULL;
- }
if (!(wq->flags & WQ_UNBOUND)) {
/*
* The base ref is never dropped on per-cpu pwqs. Directly
- * free the pwqs and wq.
+ * schedule RCU free.
*/
- free_percpu(wq->cpu_pwqs);
- kfree(wq);
+ call_rcu_sched(&wq->rcu, rcu_free_wq);
} else {
/*
* We're the sole accessor of @wq at this point. Directly
@@ -4437,6 +4163,166 @@ void print_worker_info(const char *log_lvl, struct task_struct *task)
}
}
+static void pr_cont_pool_info(struct worker_pool *pool)
+{
+ pr_cont(" cpus=%*pbl", nr_cpumask_bits, pool->attrs->cpumask);
+ if (pool->node != NUMA_NO_NODE)
+ pr_cont(" node=%d", pool->node);
+ pr_cont(" flags=0x%x nice=%d", pool->flags, pool->attrs->nice);
+}
+
+static void pr_cont_work(bool comma, struct work_struct *work)
+{
+ if (work->func == wq_barrier_func) {
+ struct wq_barrier *barr;
+
+ barr = container_of(work, struct wq_barrier, work);
+
+ pr_cont("%s BAR(%d)", comma ? "," : "",
+ task_pid_nr(barr->task));
+ } else {
+ pr_cont("%s %pf", comma ? "," : "", work->func);
+ }
+}
+
+static void show_pwq(struct pool_workqueue *pwq)
+{
+ struct worker_pool *pool = pwq->pool;
+ struct work_struct *work;
+ struct worker *worker;
+ bool has_in_flight = false, has_pending = false;
+ int bkt;
+
+ pr_info(" pwq %d:", pool->id);
+ pr_cont_pool_info(pool);
+
+ pr_cont(" active=%d/%d%s\n", pwq->nr_active, pwq->max_active,
+ !list_empty(&pwq->mayday_node) ? " MAYDAY" : "");
+
+ hash_for_each(pool->busy_hash, bkt, worker, hentry) {
+ if (worker->current_pwq == pwq) {
+ has_in_flight = true;
+ break;
+ }
+ }
+ if (has_in_flight) {
+ bool comma = false;
+
+ pr_info(" in-flight:");
+ hash_for_each(pool->busy_hash, bkt, worker, hentry) {
+ if (worker->current_pwq != pwq)
+ continue;
+
+ pr_cont("%s %d%s:%pf", comma ? "," : "",
+ task_pid_nr(worker->task),
+ worker == pwq->wq->rescuer ? "(RESCUER)" : "",
+ worker->current_func);
+ list_for_each_entry(work, &worker->scheduled, entry)
+ pr_cont_work(false, work);
+ comma = true;
+ }
+ pr_cont("\n");
+ }
+
+ list_for_each_entry(work, &pool->worklist, entry) {
+ if (get_work_pwq(work) == pwq) {
+ has_pending = true;
+ break;
+ }
+ }
+ if (has_pending) {
+ bool comma = false;
+
+ pr_info(" pending:");
+ list_for_each_entry(work, &pool->worklist, entry) {
+ if (get_work_pwq(work) != pwq)
+ continue;
+
+ pr_cont_work(comma, work);
+ comma = !(*work_data_bits(work) & WORK_STRUCT_LINKED);
+ }
+ pr_cont("\n");
+ }
+
+ if (!list_empty(&pwq->delayed_works)) {
+ bool comma = false;
+
+ pr_info(" delayed:");
+ list_for_each_entry(work, &pwq->delayed_works, entry) {
+ pr_cont_work(comma, work);
+ comma = !(*work_data_bits(work) & WORK_STRUCT_LINKED);
+ }
+ pr_cont("\n");
+ }
+}
+
+/**
+ * show_workqueue_state - dump workqueue state
+ *
+ * Called from a sysrq handler and prints out all busy workqueues and
+ * pools.
+ */
+void show_workqueue_state(void)
+{
+ struct workqueue_struct *wq;
+ struct worker_pool *pool;
+ unsigned long flags;
+ int pi;
+
+ rcu_read_lock_sched();
+
+ pr_info("Showing busy workqueues and worker pools:\n");
+
+ list_for_each_entry_rcu(wq, &workqueues, list) {
+ struct pool_workqueue *pwq;
+ bool idle = true;
+
+ for_each_pwq(pwq, wq) {
+ if (pwq->nr_active || !list_empty(&pwq->delayed_works)) {
+ idle = false;
+ break;
+ }
+ }
+ if (idle)
+ continue;
+
+ pr_info("workqueue %s: flags=0x%x\n", wq->name, wq->flags);
+
+ for_each_pwq(pwq, wq) {
+ spin_lock_irqsave(&pwq->pool->lock, flags);
+ if (pwq->nr_active || !list_empty(&pwq->delayed_works))
+ show_pwq(pwq);
+ spin_unlock_irqrestore(&pwq->pool->lock, flags);
+ }
+ }
+
+ for_each_pool(pool, pi) {
+ struct worker *worker;
+ bool first = true;
+
+ spin_lock_irqsave(&pool->lock, flags);
+ if (pool->nr_workers == pool->nr_idle)
+ goto next_pool;
+
+ pr_info("pool %d:", pool->id);
+ pr_cont_pool_info(pool);
+ pr_cont(" workers=%d", pool->nr_workers);
+ if (pool->manager)
+ pr_cont(" manager: %d",
+ task_pid_nr(pool->manager->task));
+ list_for_each_entry(worker, &pool->idle_list, entry) {
+ pr_cont(" %s%d", first ? "idle: " : "",
+ task_pid_nr(worker->task));
+ first = false;
+ }
+ pr_cont("\n");
+ next_pool:
+ spin_unlock_irqrestore(&pool->lock, flags);
+ }
+
+ rcu_read_unlock_sched();
+}
+
/*
* CPU hotplug.
*
@@ -4521,7 +4407,7 @@ static void rebind_workers(struct worker_pool *pool)
/*
* Restore CPU affinity of all workers. As all idle workers should
* be on the run-queue of the associated CPU before any local
- * wake-ups for concurrency management happen, restore CPU affinty
+ * wake-ups for concurrency management happen, restore CPU affinity
* of all workers first and then clear UNBOUND. As we're called
* from CPU_ONLINE, the following shouldn't fail.
*/
@@ -4834,6 +4720,451 @@ out_unlock:
}
#endif /* CONFIG_FREEZER */
+static int workqueue_apply_unbound_cpumask(void)
+{
+ LIST_HEAD(ctxs);
+ int ret = 0;
+ struct workqueue_struct *wq;
+ struct apply_wqattrs_ctx *ctx, *n;
+
+ lockdep_assert_held(&wq_pool_mutex);
+
+ list_for_each_entry(wq, &workqueues, list) {
+ if (!(wq->flags & WQ_UNBOUND))
+ continue;
+ /* creating multiple pwqs breaks ordering guarantee */
+ if (wq->flags & __WQ_ORDERED)
+ continue;
+
+ ctx = apply_wqattrs_prepare(wq, wq->unbound_attrs);
+ if (!ctx) {
+ ret = -ENOMEM;
+ break;
+ }
+
+ list_add_tail(&ctx->list, &ctxs);
+ }
+
+ list_for_each_entry_safe(ctx, n, &ctxs, list) {
+ if (!ret)
+ apply_wqattrs_commit(ctx);
+ apply_wqattrs_cleanup(ctx);
+ }
+
+ return ret;
+}
+
+/**
+ * workqueue_set_unbound_cpumask - Set the low-level unbound cpumask
+ * @cpumask: the cpumask to set
+ *
+ * The low-level workqueues cpumask is a global cpumask that limits
+ * the affinity of all unbound workqueues. This function check the @cpumask
+ * and apply it to all unbound workqueues and updates all pwqs of them.
+ *
+ * Retun: 0 - Success
+ * -EINVAL - Invalid @cpumask
+ * -ENOMEM - Failed to allocate memory for attrs or pwqs.
+ */
+int workqueue_set_unbound_cpumask(cpumask_var_t cpumask)
+{
+ int ret = -EINVAL;
+ cpumask_var_t saved_cpumask;
+
+ if (!zalloc_cpumask_var(&saved_cpumask, GFP_KERNEL))
+ return -ENOMEM;
+
+ cpumask_and(cpumask, cpumask, cpu_possible_mask);
+ if (!cpumask_empty(cpumask)) {
+ apply_wqattrs_lock();
+
+ /* save the old wq_unbound_cpumask. */
+ cpumask_copy(saved_cpumask, wq_unbound_cpumask);
+
+ /* update wq_unbound_cpumask at first and apply it to wqs. */
+ cpumask_copy(wq_unbound_cpumask, cpumask);
+ ret = workqueue_apply_unbound_cpumask();
+
+ /* restore the wq_unbound_cpumask when failed. */
+ if (ret < 0)
+ cpumask_copy(wq_unbound_cpumask, saved_cpumask);
+
+ apply_wqattrs_unlock();
+ }
+
+ free_cpumask_var(saved_cpumask);
+ return ret;
+}
+
+#ifdef CONFIG_SYSFS
+/*
+ * Workqueues with WQ_SYSFS flag set is visible to userland via
+ * /sys/bus/workqueue/devices/WQ_NAME. All visible workqueues have the
+ * following attributes.
+ *
+ * per_cpu RO bool : whether the workqueue is per-cpu or unbound
+ * max_active RW int : maximum number of in-flight work items
+ *
+ * Unbound workqueues have the following extra attributes.
+ *
+ * id RO int : the associated pool ID
+ * nice RW int : nice value of the workers
+ * cpumask RW mask : bitmask of allowed CPUs for the workers
+ */
+struct wq_device {
+ struct workqueue_struct *wq;
+ struct device dev;
+};
+
+static struct workqueue_struct *dev_to_wq(struct device *dev)
+{
+ struct wq_device *wq_dev = container_of(dev, struct wq_device, dev);
+
+ return wq_dev->wq;
+}
+
+static ssize_t per_cpu_show(struct device *dev, struct device_attribute *attr,
+ char *buf)
+{
+ struct workqueue_struct *wq = dev_to_wq(dev);
+
+ return scnprintf(buf, PAGE_SIZE, "%d\n", (bool)!(wq->flags & WQ_UNBOUND));
+}
+static DEVICE_ATTR_RO(per_cpu);
+
+static ssize_t max_active_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct workqueue_struct *wq = dev_to_wq(dev);
+
+ return scnprintf(buf, PAGE_SIZE, "%d\n", wq->saved_max_active);
+}
+
+static ssize_t max_active_store(struct device *dev,
+ struct device_attribute *attr, const char *buf,
+ size_t count)
+{
+ struct workqueue_struct *wq = dev_to_wq(dev);
+ int val;
+
+ if (sscanf(buf, "%d", &val) != 1 || val <= 0)
+ return -EINVAL;
+
+ workqueue_set_max_active(wq, val);
+ return count;
+}
+static DEVICE_ATTR_RW(max_active);
+
+static struct attribute *wq_sysfs_attrs[] = {
+ &dev_attr_per_cpu.attr,
+ &dev_attr_max_active.attr,
+ NULL,
+};
+ATTRIBUTE_GROUPS(wq_sysfs);
+
+static ssize_t wq_pool_ids_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct workqueue_struct *wq = dev_to_wq(dev);
+ const char *delim = "";
+ int node, written = 0;
+
+ rcu_read_lock_sched();
+ for_each_node(node) {
+ written += scnprintf(buf + written, PAGE_SIZE - written,
+ "%s%d:%d", delim, node,
+ unbound_pwq_by_node(wq, node)->pool->id);
+ delim = " ";
+ }
+ written += scnprintf(buf + written, PAGE_SIZE - written, "\n");
+ rcu_read_unlock_sched();
+
+ return written;
+}
+
+static ssize_t wq_nice_show(struct device *dev, struct device_attribute *attr,
+ char *buf)
+{
+ struct workqueue_struct *wq = dev_to_wq(dev);
+ int written;
+
+ mutex_lock(&wq->mutex);
+ written = scnprintf(buf, PAGE_SIZE, "%d\n", wq->unbound_attrs->nice);
+ mutex_unlock(&wq->mutex);
+
+ return written;
+}
+
+/* prepare workqueue_attrs for sysfs store operations */
+static struct workqueue_attrs *wq_sysfs_prep_attrs(struct workqueue_struct *wq)
+{
+ struct workqueue_attrs *attrs;
+
+ lockdep_assert_held(&wq_pool_mutex);
+
+ attrs = alloc_workqueue_attrs(GFP_KERNEL);
+ if (!attrs)
+ return NULL;
+
+ copy_workqueue_attrs(attrs, wq->unbound_attrs);
+ return attrs;
+}
+
+static ssize_t wq_nice_store(struct device *dev, struct device_attribute *attr,
+ const char *buf, size_t count)
+{
+ struct workqueue_struct *wq = dev_to_wq(dev);
+ struct workqueue_attrs *attrs;
+ int ret = -ENOMEM;
+
+ apply_wqattrs_lock();
+
+ attrs = wq_sysfs_prep_attrs(wq);
+ if (!attrs)
+ goto out_unlock;
+
+ if (sscanf(buf, "%d", &attrs->nice) == 1 &&
+ attrs->nice >= MIN_NICE && attrs->nice <= MAX_NICE)
+ ret = apply_workqueue_attrs_locked(wq, attrs);
+ else
+ ret = -EINVAL;
+
+out_unlock:
+ apply_wqattrs_unlock();
+ free_workqueue_attrs(attrs);
+ return ret ?: count;
+}
+
+static ssize_t wq_cpumask_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct workqueue_struct *wq = dev_to_wq(dev);
+ int written;
+
+ mutex_lock(&wq->mutex);
+ written = scnprintf(buf, PAGE_SIZE, "%*pb\n",
+ cpumask_pr_args(wq->unbound_attrs->cpumask));
+ mutex_unlock(&wq->mutex);
+ return written;
+}
+
+static ssize_t wq_cpumask_store(struct device *dev,
+ struct device_attribute *attr,
+ const char *buf, size_t count)
+{
+ struct workqueue_struct *wq = dev_to_wq(dev);
+ struct workqueue_attrs *attrs;
+ int ret = -ENOMEM;
+
+ apply_wqattrs_lock();
+
+ attrs = wq_sysfs_prep_attrs(wq);
+ if (!attrs)
+ goto out_unlock;
+
+ ret = cpumask_parse(buf, attrs->cpumask);
+ if (!ret)
+ ret = apply_workqueue_attrs_locked(wq, attrs);
+
+out_unlock:
+ apply_wqattrs_unlock();
+ free_workqueue_attrs(attrs);
+ return ret ?: count;
+}
+
+static ssize_t wq_numa_show(struct device *dev, struct device_attribute *attr,
+ char *buf)
+{
+ struct workqueue_struct *wq = dev_to_wq(dev);
+ int written;
+
+ mutex_lock(&wq->mutex);
+ written = scnprintf(buf, PAGE_SIZE, "%d\n",
+ !wq->unbound_attrs->no_numa);
+ mutex_unlock(&wq->mutex);
+
+ return written;
+}
+
+static ssize_t wq_numa_store(struct device *dev, struct device_attribute *attr,
+ const char *buf, size_t count)
+{
+ struct workqueue_struct *wq = dev_to_wq(dev);
+ struct workqueue_attrs *attrs;
+ int v, ret = -ENOMEM;
+
+ apply_wqattrs_lock();
+
+ attrs = wq_sysfs_prep_attrs(wq);
+ if (!attrs)
+ goto out_unlock;
+
+ ret = -EINVAL;
+ if (sscanf(buf, "%d", &v) == 1) {
+ attrs->no_numa = !v;
+ ret = apply_workqueue_attrs_locked(wq, attrs);
+ }
+
+out_unlock:
+ apply_wqattrs_unlock();
+ free_workqueue_attrs(attrs);
+ return ret ?: count;
+}
+
+static struct device_attribute wq_sysfs_unbound_attrs[] = {
+ __ATTR(pool_ids, 0444, wq_pool_ids_show, NULL),
+ __ATTR(nice, 0644, wq_nice_show, wq_nice_store),
+ __ATTR(cpumask, 0644, wq_cpumask_show, wq_cpumask_store),
+ __ATTR(numa, 0644, wq_numa_show, wq_numa_store),
+ __ATTR_NULL,
+};
+
+static struct bus_type wq_subsys = {
+ .name = "workqueue",
+ .dev_groups = wq_sysfs_groups,
+};
+
+static ssize_t wq_unbound_cpumask_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ int written;
+
+ mutex_lock(&wq_pool_mutex);
+ written = scnprintf(buf, PAGE_SIZE, "%*pb\n",
+ cpumask_pr_args(wq_unbound_cpumask));
+ mutex_unlock(&wq_pool_mutex);
+
+ return written;
+}
+
+static ssize_t wq_unbound_cpumask_store(struct device *dev,
+ struct device_attribute *attr, const char *buf, size_t count)
+{
+ cpumask_var_t cpumask;
+ int ret;
+
+ if (!zalloc_cpumask_var(&cpumask, GFP_KERNEL))
+ return -ENOMEM;
+
+ ret = cpumask_parse(buf, cpumask);
+ if (!ret)
+ ret = workqueue_set_unbound_cpumask(cpumask);
+
+ free_cpumask_var(cpumask);
+ return ret ? ret : count;
+}
+
+static struct device_attribute wq_sysfs_cpumask_attr =
+ __ATTR(cpumask, 0644, wq_unbound_cpumask_show,
+ wq_unbound_cpumask_store);
+
+static int __init wq_sysfs_init(void)
+{
+ int err;
+
+ err = subsys_virtual_register(&wq_subsys, NULL);
+ if (err)
+ return err;
+
+ return device_create_file(wq_subsys.dev_root, &wq_sysfs_cpumask_attr);
+}
+core_initcall(wq_sysfs_init);
+
+static void wq_device_release(struct device *dev)
+{
+ struct wq_device *wq_dev = container_of(dev, struct wq_device, dev);
+
+ kfree(wq_dev);
+}
+
+/**
+ * workqueue_sysfs_register - make a workqueue visible in sysfs
+ * @wq: the workqueue to register
+ *
+ * Expose @wq in sysfs under /sys/bus/workqueue/devices.
+ * alloc_workqueue*() automatically calls this function if WQ_SYSFS is set
+ * which is the preferred method.
+ *
+ * Workqueue user should use this function directly iff it wants to apply
+ * workqueue_attrs before making the workqueue visible in sysfs; otherwise,
+ * apply_workqueue_attrs() may race against userland updating the
+ * attributes.
+ *
+ * Return: 0 on success, -errno on failure.
+ */
+int workqueue_sysfs_register(struct workqueue_struct *wq)
+{
+ struct wq_device *wq_dev;
+ int ret;
+
+ /*
+ * Adjusting max_active or creating new pwqs by applying
+ * attributes breaks ordering guarantee. Disallow exposing ordered
+ * workqueues.
+ */
+ if (WARN_ON(wq->flags & __WQ_ORDERED))
+ return -EINVAL;
+
+ wq->wq_dev = wq_dev = kzalloc(sizeof(*wq_dev), GFP_KERNEL);
+ if (!wq_dev)
+ return -ENOMEM;
+
+ wq_dev->wq = wq;
+ wq_dev->dev.bus = &wq_subsys;
+ wq_dev->dev.init_name = wq->name;
+ wq_dev->dev.release = wq_device_release;
+
+ /*
+ * unbound_attrs are created separately. Suppress uevent until
+ * everything is ready.
+ */
+ dev_set_uevent_suppress(&wq_dev->dev, true);
+
+ ret = device_register(&wq_dev->dev);
+ if (ret) {
+ kfree(wq_dev);
+ wq->wq_dev = NULL;
+ return ret;
+ }
+
+ if (wq->flags & WQ_UNBOUND) {
+ struct device_attribute *attr;
+
+ for (attr = wq_sysfs_unbound_attrs; attr->attr.name; attr++) {
+ ret = device_create_file(&wq_dev->dev, attr);
+ if (ret) {
+ device_unregister(&wq_dev->dev);
+ wq->wq_dev = NULL;
+ return ret;
+ }
+ }
+ }
+
+ dev_set_uevent_suppress(&wq_dev->dev, false);
+ kobject_uevent(&wq_dev->dev.kobj, KOBJ_ADD);
+ return 0;
+}
+
+/**
+ * workqueue_sysfs_unregister - undo workqueue_sysfs_register()
+ * @wq: the workqueue to unregister
+ *
+ * If @wq is registered to sysfs by workqueue_sysfs_register(), unregister.
+ */
+static void workqueue_sysfs_unregister(struct workqueue_struct *wq)
+{
+ struct wq_device *wq_dev = wq->wq_dev;
+
+ if (!wq->wq_dev)
+ return;
+
+ wq->wq_dev = NULL;
+ device_unregister(&wq_dev->dev);
+}
+#else /* CONFIG_SYSFS */
+static void workqueue_sysfs_unregister(struct workqueue_struct *wq) { }
+#endif /* CONFIG_SYSFS */
+
static void __init wq_numa_init(void)
{
cpumask_var_t *tbl;
@@ -4883,6 +5214,9 @@ static int __init init_workqueues(void)
WARN_ON(__alignof__(struct pool_workqueue) < __alignof__(long long));
+ BUG_ON(!alloc_cpumask_var(&wq_unbound_cpumask, GFP_KERNEL));
+ cpumask_copy(wq_unbound_cpumask, cpu_possible_mask);
+
pwq_cache = KMEM_CACHE(pool_workqueue, SLAB_PANIC);
cpu_notifier(workqueue_cpu_up_callback, CPU_PRI_WORKQUEUE_UP);