summaryrefslogtreecommitdiffstats
path: root/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'kernel')
-rw-r--r--kernel/audit.c120
-rw-r--r--kernel/audit.h3
-rw-r--r--kernel/audit_tree.c8
-rw-r--r--kernel/audit_watch.c2
-rw-r--r--kernel/auditfilter.c11
-rw-r--r--kernel/auditsc.c155
-rw-r--r--kernel/bpf/Makefile4
-rw-r--r--kernel/bpf/arraymap.c53
-rw-r--r--kernel/bpf/btf.c2348
-rw-r--r--kernel/bpf/cgroup.c48
-rw-r--r--kernel/bpf/core.c266
-rw-r--r--kernel/bpf/cpumap.c132
-rw-r--r--kernel/bpf/devmap.c138
-rw-r--r--kernel/bpf/disasm.c52
-rw-r--r--kernel/bpf/disasm.h5
-rw-r--r--kernel/bpf/hashtab.c12
-rw-r--r--kernel/bpf/helpers.c15
-rw-r--r--kernel/bpf/inode.c175
-rw-r--r--kernel/bpf/offload.c6
-rw-r--r--kernel/bpf/sockmap.c1708
-rw-r--r--kernel/bpf/stackmap.c367
-rw-r--r--kernel/bpf/syscall.c581
-rw-r--r--kernel/bpf/tnum.c10
-rw-r--r--kernel/bpf/verifier.c593
-rw-r--r--kernel/bpf/xskmap.c232
-rw-r--r--kernel/cgroup/Makefile2
-rw-r--r--kernel/cgroup/cgroup-internal.h13
-rw-r--r--kernel/cgroup/cgroup-v1.c14
-rw-r--r--kernel/cgroup/cgroup.c142
-rw-r--r--kernel/cgroup/rdma.c35
-rw-r--r--kernel/cgroup/rstat.c416
-rw-r--r--kernel/cgroup/stat.c338
-rw-r--r--kernel/compat.c108
-rw-r--r--kernel/cpu.c60
-rw-r--r--kernel/crash_core.c2
-rw-r--r--kernel/debug/kdb/kdb_bp.c4
-rw-r--r--kernel/debug/kdb/kdb_main.c89
-rw-r--r--kernel/debug/kdb/kdb_support.c4
-rw-r--r--kernel/delayacct.c17
-rw-r--r--kernel/dma.c14
-rw-r--r--kernel/events/callchain.c25
-rw-r--r--kernel/events/core.c874
-rw-r--r--kernel/events/hw_breakpoint.c124
-rw-r--r--kernel/events/ring_buffer.c7
-rw-r--r--kernel/events/uprobes.c7
-rw-r--r--kernel/exec_domain.c15
-rw-r--r--kernel/exit.c2
-rw-r--r--kernel/fail_function.c10
-rw-r--r--kernel/fork.c18
-rw-r--r--kernel/irq/Kconfig6
-rw-r--r--kernel/irq/affinity.c162
-rw-r--r--kernel/irq/autoprobe.c2
-rw-r--r--kernel/irq/chip.c10
-rw-r--r--kernel/irq/cpuhotplug.c1
-rw-r--r--kernel/irq/debugfs.c8
-rw-r--r--kernel/irq/devres.c1
-rw-r--r--kernel/irq/dummychip.c1
-rw-r--r--kernel/irq/generic-chip.c1
-rw-r--r--kernel/irq/handle.c23
-rw-r--r--kernel/irq/ipi.c3
-rw-r--r--kernel/irq/irq_sim.c21
-rw-r--r--kernel/irq/irqdesc.c23
-rw-r--r--kernel/irq/irqdomain.c2
-rw-r--r--kernel/irq/manage.c21
-rw-r--r--kernel/irq/matrix.c8
-rw-r--r--kernel/irq/msi.c36
-rw-r--r--kernel/irq/pm.c3
-rw-r--r--kernel/irq/proc.c84
-rw-r--r--kernel/irq/resend.c2
-rw-r--r--kernel/irq/spurious.c2
-rw-r--r--kernel/irq/timings.c13
-rw-r--r--kernel/jump_label.c7
-rw-r--r--kernel/kexec.c52
-rw-r--r--kernel/kexec_file.c619
-rw-r--r--kernel/kprobes.c2
-rw-r--r--kernel/kthread.c48
-rw-r--r--kernel/livepatch/shadow.c108
-rw-r--r--kernel/locking/lockdep.c96
-rw-r--r--kernel/locking/lockdep_proc.c45
-rw-r--r--kernel/locking/mcs_spinlock.h10
-rw-r--r--kernel/locking/mutex.c40
-rw-r--r--kernel/locking/qspinlock.c247
-rw-r--r--kernel/locking/qspinlock_paravirt.h49
-rw-r--r--kernel/locking/qspinlock_stat.h9
-rw-r--r--kernel/locking/rtmutex.c3
-rw-r--r--kernel/locking/rtmutex_common.h11
-rw-r--r--kernel/locking/rwsem-xadd.c40
-rw-r--r--kernel/locking/rwsem.c6
-rw-r--r--kernel/locking/rwsem.h38
-rw-r--r--kernel/memremap.c1
-rw-r--r--kernel/module.c17
-rw-r--r--kernel/panic.c67
-rw-r--r--kernel/params.c4
-rw-r--r--kernel/pid.c2
-rw-r--r--kernel/pid_namespace.c73
-rw-r--r--kernel/power/hibernate.c35
-rw-r--r--kernel/power/qos.c3
-rw-r--r--kernel/power/suspend.c29
-rw-r--r--kernel/power/swap.c14
-rw-r--r--kernel/power/user.c7
-rw-r--r--kernel/power/wakelock.c1
-rw-r--r--kernel/printk/printk.c80
-rw-r--r--kernel/printk/printk_safe.c7
-rw-r--r--kernel/rcu/rcu.h50
-rw-r--r--kernel/rcu/rcu_segcblist.c18
-rw-r--r--kernel/rcu/rcu_segcblist.h2
-rw-r--r--kernel/rcu/rcuperf.c23
-rw-r--r--kernel/rcu/rcutorture.c87
-rw-r--r--kernel/rcu/srcutiny.c9
-rw-r--r--kernel/rcu/srcutree.c59
-rw-r--r--kernel/rcu/tree.c430
-rw-r--r--kernel/rcu/tree.h72
-rw-r--r--kernel/rcu/tree_exp.h239
-rw-r--r--kernel/rcu/tree_plugin.h132
-rw-r--r--kernel/rcu/update.c50
-rw-r--r--kernel/resource.c46
-rw-r--r--kernel/sched/Makefile5
-rw-r--r--kernel/sched/autogroup.c28
-rw-r--r--kernel/sched/autogroup.h12
-rw-r--r--kernel/sched/clock.c36
-rw-r--r--kernel/sched/completion.c11
-rw-r--r--kernel/sched/core.c366
-rw-r--r--kernel/sched/cpuacct.c33
-rw-r--r--kernel/sched/cpudeadline.c23
-rw-r--r--kernel/sched/cpudeadline.h29
-rw-r--r--kernel/sched/cpufreq.c1
-rw-r--r--kernel/sched/cpufreq_schedutil.c477
-rw-r--r--kernel/sched/cpupri.c15
-rw-r--r--kernel/sched/cpupri.h25
-rw-r--r--kernel/sched/cputime.c58
-rw-r--r--kernel/sched/deadline.c94
-rw-r--r--kernel/sched/debug.c160
-rw-r--r--kernel/sched/fair.c1473
-rw-r--r--kernel/sched/features.h5
-rw-r--r--kernel/sched/idle.c176
-rw-r--r--kernel/sched/idle_task.c110
-rw-r--r--kernel/sched/isolation.c14
-rw-r--r--kernel/sched/loadavg.c34
-rw-r--r--kernel/sched/membarrier.c27
-rw-r--r--kernel/sched/rt.c66
-rw-r--r--kernel/sched/sched.h678
-rw-r--r--kernel/sched/stats.c33
-rw-r--r--kernel/sched/stats.h86
-rw-r--r--kernel/sched/stop_task.c11
-rw-r--r--kernel/sched/swait.c6
-rw-r--r--kernel/sched/topology.c48
-rw-r--r--kernel/sched/wait.c13
-rw-r--r--kernel/sched/wait_bit.c127
-rw-r--r--kernel/seccomp.c147
-rw-r--r--kernel/signal.c237
-rw-r--r--kernel/softirq.c89
-rw-r--r--kernel/stop_machine.c43
-rw-r--r--kernel/sys.c102
-rw-r--r--kernel/sys_ni.c629
-rw-r--r--kernel/sysctl.c35
-rw-r--r--kernel/sysctl_binary.c20
-rw-r--r--kernel/time/Kconfig10
-rw-r--r--kernel/time/alarmtimer.c34
-rw-r--r--kernel/time/clocksource.c152
-rw-r--r--kernel/time/hrtimer.c63
-rw-r--r--kernel/time/ntp.c2
-rw-r--r--kernel/time/posix-cpu-timers.c4
-rw-r--r--kernel/time/posix-stubs.c22
-rw-r--r--kernel/time/posix-timers.c35
-rw-r--r--kernel/time/tick-broadcast.c8
-rw-r--r--kernel/time/tick-common.c5
-rw-r--r--kernel/time/tick-oneshot.c11
-rw-r--r--kernel/time/tick-sched.c302
-rw-r--r--kernel/time/tick-sched.h18
-rw-r--r--kernel/time/time.c72
-rw-r--r--kernel/time/timekeeping.c221
-rw-r--r--kernel/time/timekeeping_internal.h2
-rw-r--r--kernel/time/timer.c14
-rw-r--r--kernel/time/timer_list.c18
-rw-r--r--kernel/torture.c2
-rw-r--r--kernel/trace/Kconfig5
-rw-r--r--kernel/trace/bpf_trace.c401
-rw-r--r--kernel/trace/ftrace.c11
-rw-r--r--kernel/trace/ring_buffer.c240
-rw-r--r--kernel/trace/trace.c164
-rw-r--r--kernel/trace/trace.h49
-rw-r--r--kernel/trace/trace_benchmark.c4
-rw-r--r--kernel/trace/trace_clock.c4
-rw-r--r--kernel/trace/trace_entries.h8
-rw-r--r--kernel/trace/trace_event_perf.c106
-rw-r--r--kernel/trace/trace_events.c36
-rw-r--r--kernel/trace/trace_events_filter.c2407
-rw-r--r--kernel/trace/trace_events_hist.c4456
-rw-r--r--kernel/trace/trace_events_trigger.c70
-rw-r--r--kernel/trace/trace_export.c9
-rw-r--r--kernel/trace/trace_kprobe.c126
-rw-r--r--kernel/trace/trace_printk.c4
-rw-r--r--kernel/trace/trace_probe.c8
-rw-r--r--kernel/trace/trace_probe.h13
-rw-r--r--kernel/trace/trace_stack.c2
-rw-r--r--kernel/trace/trace_uprobe.c152
-rw-r--r--kernel/trace/tracing_map.c232
-rw-r--r--kernel/trace/tracing_map.h18
-rw-r--r--kernel/tracepoint.c7
-rw-r--r--kernel/ucount.c1
-rw-r--r--kernel/uid16.c25
-rw-r--r--kernel/uid16.h14
-rw-r--r--kernel/umh.c129
-rw-r--r--kernel/utsname.c20
-rw-r--r--kernel/workqueue.c215
-rw-r--r--kernel/workqueue_internal.h3
206 files changed, 20281 insertions, 8039 deletions
diff --git a/kernel/audit.c b/kernel/audit.c
index 227db99b0f19..e7478cb58079 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -38,7 +38,8 @@
* 6) Support low-overhead kernel-based filtering to minimize the
* information that must be passed to user-space.
*
- * Example user-space utilities: http://people.redhat.com/sgrubb/audit/
+ * Audit userspace, documentation, tests, and bug/issue trackers:
+ * https://github.com/linux-audit
*/
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
@@ -180,9 +181,21 @@ static char *audit_feature_names[2] = {
"loginuid_immutable",
};
-
-/* Serialize requests from userspace. */
-DEFINE_MUTEX(audit_cmd_mutex);
+/**
+ * struct audit_ctl_mutex - serialize requests from userspace
+ * @lock: the mutex used for locking
+ * @owner: the task which owns the lock
+ *
+ * Description:
+ * This is the lock struct used to ensure we only process userspace requests
+ * in an orderly fashion. We can't simply use a mutex/lock here because we
+ * need to track lock ownership so we don't end up blocking the lock owner in
+ * audit_log_start() or similar.
+ */
+static struct audit_ctl_mutex {
+ struct mutex lock;
+ void *owner;
+} audit_cmd_mutex;
/* AUDIT_BUFSIZ is the size of the temporary buffer used for formatting
* audit records. Since printk uses a 1024 byte buffer, this buffer
@@ -227,6 +240,36 @@ int auditd_test_task(struct task_struct *task)
}
/**
+ * audit_ctl_lock - Take the audit control lock
+ */
+void audit_ctl_lock(void)
+{
+ mutex_lock(&audit_cmd_mutex.lock);
+ audit_cmd_mutex.owner = current;
+}
+
+/**
+ * audit_ctl_unlock - Drop the audit control lock
+ */
+void audit_ctl_unlock(void)
+{
+ audit_cmd_mutex.owner = NULL;
+ mutex_unlock(&audit_cmd_mutex.lock);
+}
+
+/**
+ * audit_ctl_owner_current - Test to see if the current task owns the lock
+ *
+ * Description:
+ * Return true if the current task owns the audit control lock, false if it
+ * doesn't own the lock.
+ */
+static bool audit_ctl_owner_current(void)
+{
+ return (current == audit_cmd_mutex.owner);
+}
+
+/**
* auditd_pid_vnr - Return the auditd PID relative to the namespace
*
* Description:
@@ -443,15 +486,15 @@ static int audit_set_failure(u32 state)
* Drop any references inside the auditd connection tracking struct and free
* the memory.
*/
- static void auditd_conn_free(struct rcu_head *rcu)
- {
+static void auditd_conn_free(struct rcu_head *rcu)
+{
struct auditd_connection *ac;
ac = container_of(rcu, struct auditd_connection, rcu);
put_pid(ac->pid);
put_net(ac->net);
kfree(ac);
- }
+}
/**
* auditd_set - Set/Reset the auditd connection state
@@ -860,8 +903,8 @@ int audit_send_list(void *_dest)
struct sock *sk = audit_get_sk(dest->net);
/* wait for parent to finish and send an ACK */
- mutex_lock(&audit_cmd_mutex);
- mutex_unlock(&audit_cmd_mutex);
+ audit_ctl_lock();
+ audit_ctl_unlock();
while ((skb = __skb_dequeue(&dest->q)) != NULL)
netlink_unicast(sk, skb, dest->portid, 0);
@@ -902,8 +945,8 @@ static int audit_send_reply_thread(void *arg)
struct audit_reply *reply = (struct audit_reply *)arg;
struct sock *sk = audit_get_sk(reply->net);
- mutex_lock(&audit_cmd_mutex);
- mutex_unlock(&audit_cmd_mutex);
+ audit_ctl_lock();
+ audit_ctl_unlock();
/* Ignore failure. It'll only happen if the sender goes away,
because our timeout is set to infinite. */
@@ -1056,8 +1099,9 @@ static void audit_log_feature_change(int which, u32 old_feature, u32 new_feature
if (audit_enabled == AUDIT_OFF)
return;
-
- ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_FEATURE_CHANGE);
+ ab = audit_log_start(audit_context(), GFP_KERNEL, AUDIT_FEATURE_CHANGE);
+ if (!ab)
+ return;
audit_log_task_info(ab, current);
audit_log_format(ab, " feature=%s old=%u new=%u old_lock=%u new_lock=%u res=%d",
audit_feature_names[which], !!old_feature, !!new_feature,
@@ -1466,7 +1510,7 @@ static void audit_receive(struct sk_buff *skb)
nlh = nlmsg_hdr(skb);
len = skb->len;
- mutex_lock(&audit_cmd_mutex);
+ audit_ctl_lock();
while (nlmsg_ok(nlh, len)) {
err = audit_receive_msg(skb, nlh);
/* if err or if this message says it wants a response */
@@ -1475,7 +1519,7 @@ static void audit_receive(struct sk_buff *skb)
nlh = nlmsg_next(nlh, &len);
}
- mutex_unlock(&audit_cmd_mutex);
+ audit_ctl_unlock();
}
/* Run custom bind function on netlink socket group connect or bind requests. */
@@ -1547,6 +1591,9 @@ static int __init audit_init(void)
for (i = 0; i < AUDIT_INODE_BUCKETS; i++)
INIT_LIST_HEAD(&audit_inode_hash[i]);
+ mutex_init(&audit_cmd_mutex.lock);
+ audit_cmd_mutex.owner = NULL;
+
pr_info("initializing netlink subsys (%s)\n",
audit_default ? "enabled" : "disabled");
register_pernet_subsys(&audit_net_ops);
@@ -1567,19 +1614,26 @@ static int __init audit_init(void)
}
postcore_initcall(audit_init);
-/* Process kernel command-line parameter at boot time. audit=0 or audit=1. */
+/*
+ * Process kernel command-line parameter at boot time.
+ * audit={0|off} or audit={1|on}.
+ */
static int __init audit_enable(char *str)
{
- long val;
-
- if (kstrtol(str, 0, &val))
- panic("audit: invalid 'audit' parameter value (%s)\n", str);
- audit_default = (val ? AUDIT_ON : AUDIT_OFF);
+ if (!strcasecmp(str, "off") || !strcmp(str, "0"))
+ audit_default = AUDIT_OFF;
+ else if (!strcasecmp(str, "on") || !strcmp(str, "1"))
+ audit_default = AUDIT_ON;
+ else {
+ pr_err("audit: invalid 'audit' parameter value (%s)\n", str);
+ audit_default = AUDIT_ON;
+ }
if (audit_default == AUDIT_OFF)
audit_initialized = AUDIT_DISABLED;
if (audit_set_enabled(audit_default))
- panic("audit: error setting audit state (%d)\n", audit_default);
+ pr_err("audit: error setting audit state (%d)\n",
+ audit_default);
pr_info("%s\n", audit_default ?
"enabled (after initialization)" : "disabled (until reboot)");
@@ -1710,8 +1764,7 @@ struct audit_buffer *audit_log_start(struct audit_context *ctx, gfp_t gfp_mask,
* using a PID anchored in the caller's namespace
* 2. generator holding the audit_cmd_mutex - we don't want to block
* while holding the mutex */
- if (!(auditd_test_task(current) ||
- (current == __mutex_owner(&audit_cmd_mutex)))) {
+ if (!(auditd_test_task(current) || audit_ctl_owner_current())) {
long stime = audit_backlog_wait_time;
while (audit_backlog_limit &&
@@ -2254,33 +2307,22 @@ EXPORT_SYMBOL(audit_log_task_info);
/**
* audit_log_link_denied - report a link restriction denial
* @operation: specific link operation
- * @link: the path that triggered the restriction
*/
-void audit_log_link_denied(const char *operation, const struct path *link)
+void audit_log_link_denied(const char *operation)
{
struct audit_buffer *ab;
- struct audit_names *name;
- name = kzalloc(sizeof(*name), GFP_NOFS);
- if (!name)
+ if (!audit_enabled || audit_dummy_context())
return;
/* Generate AUDIT_ANOM_LINK with subject, operation, outcome. */
- ab = audit_log_start(current->audit_context, GFP_KERNEL,
- AUDIT_ANOM_LINK);
+ ab = audit_log_start(audit_context(), GFP_KERNEL, AUDIT_ANOM_LINK);
if (!ab)
- goto out;
+ return;
audit_log_format(ab, "op=%s", operation);
audit_log_task_info(ab, current);
audit_log_format(ab, " res=0");
audit_log_end(ab);
-
- /* Generate AUDIT_PATH record with object. */
- name->type = AUDIT_TYPE_NORMAL;
- audit_copy_inode(name, link->dentry, d_backing_inode(link->dentry));
- audit_log_name(current->audit_context, name, link, 0, NULL);
-out:
- kfree(name);
}
/**
diff --git a/kernel/audit.h b/kernel/audit.h
index af5bc59487ed..214e14948370 100644
--- a/kernel/audit.h
+++ b/kernel/audit.h
@@ -341,4 +341,5 @@ extern struct list_head *audit_killed_trees(void);
#define audit_filter_inodes(t,c) AUDIT_DISABLED
#endif
-extern struct mutex audit_cmd_mutex;
+extern void audit_ctl_lock(void);
+extern void audit_ctl_unlock(void);
diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c
index fd353120e0d9..67e6956c0b61 100644
--- a/kernel/audit_tree.c
+++ b/kernel/audit_tree.c
@@ -709,7 +709,7 @@ static int prune_tree_thread(void *unused)
schedule();
}
- mutex_lock(&audit_cmd_mutex);
+ audit_ctl_lock();
mutex_lock(&audit_filter_mutex);
while (!list_empty(&prune_list)) {
@@ -727,7 +727,7 @@ static int prune_tree_thread(void *unused)
}
mutex_unlock(&audit_filter_mutex);
- mutex_unlock(&audit_cmd_mutex);
+ audit_ctl_unlock();
}
return 0;
}
@@ -924,7 +924,7 @@ static void audit_schedule_prune(void)
*/
void audit_kill_trees(struct list_head *list)
{
- mutex_lock(&audit_cmd_mutex);
+ audit_ctl_lock();
mutex_lock(&audit_filter_mutex);
while (!list_empty(list)) {
@@ -942,7 +942,7 @@ void audit_kill_trees(struct list_head *list)
}
mutex_unlock(&audit_filter_mutex);
- mutex_unlock(&audit_cmd_mutex);
+ audit_ctl_unlock();
}
/*
diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c
index 9eb8b3511636..f1ba88994508 100644
--- a/kernel/audit_watch.c
+++ b/kernel/audit_watch.c
@@ -274,7 +274,7 @@ static void audit_update_watch(struct audit_parent *parent,
/* If the update involves invalidating rules, do the inode-based
* filtering now, so we don't omit records. */
if (invalidating && !audit_dummy_context())
- audit_filter_inodes(current, current->audit_context);
+ audit_filter_inodes(current, audit_context());
/* updating ino will likely change which audit_hash_list we
* are on so we need a new watch for the new list */
diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c
index 4a1758adb222..eaa320148d97 100644
--- a/kernel/auditfilter.c
+++ b/kernel/auditfilter.c
@@ -258,8 +258,8 @@ static inline struct audit_entry *audit_to_entry_common(struct audit_rule_data *
goto exit_err;
#ifdef CONFIG_AUDITSYSCALL
case AUDIT_FILTER_ENTRY:
- if (rule->action == AUDIT_ALWAYS)
- goto exit_err;
+ pr_err("AUDIT_FILTER_ENTRY is deprecated\n");
+ goto exit_err;
case AUDIT_FILTER_EXIT:
case AUDIT_FILTER_TASK:
#endif
@@ -426,7 +426,7 @@ static int audit_field_valid(struct audit_entry *entry, struct audit_field *f)
return -EINVAL;
break;
case AUDIT_EXE:
- if (f->op != Audit_equal)
+ if (f->op != Audit_not_equal && f->op != Audit_equal)
return -EINVAL;
if (entry->rule.listnr != AUDIT_FILTER_EXIT)
return -EINVAL;
@@ -496,7 +496,6 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data,
if (!gid_valid(f->gid))
goto exit_free;
break;
- case AUDIT_SESSIONID:
case AUDIT_ARCH:
entry->rule.arch_f = f;
break;
@@ -1090,8 +1089,6 @@ static void audit_list_rules(int seq, struct sk_buff_head *q)
static void audit_log_rule_change(char *action, struct audit_krule *rule, int res)
{
struct audit_buffer *ab;
- uid_t loginuid = from_kuid(&init_user_ns, audit_get_loginuid(current));
- unsigned int sessionid = audit_get_sessionid(current);
if (!audit_enabled)
return;
@@ -1099,7 +1096,7 @@ static void audit_log_rule_change(char *action, struct audit_krule *rule, int re
ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE);
if (!ab)
return;
- audit_log_format(ab, "auid=%u ses=%u" ,loginuid, sessionid);
+ audit_log_session_info(ab);
audit_log_task_context(ab);
audit_log_format(ab, " op=%s", action);
audit_log_key(ab, rule->filterkey);
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index e80459f7e132..ceb1c4596c51 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -374,7 +374,7 @@ static int audit_field_compare(struct task_struct *tsk,
case AUDIT_COMPARE_EGID_TO_OBJ_GID:
return audit_compare_gid(cred->egid, name, f, ctx);
case AUDIT_COMPARE_AUID_TO_OBJ_UID:
- return audit_compare_uid(tsk->loginuid, name, f, ctx);
+ return audit_compare_uid(audit_get_loginuid(tsk), name, f, ctx);
case AUDIT_COMPARE_SUID_TO_OBJ_UID:
return audit_compare_uid(cred->suid, name, f, ctx);
case AUDIT_COMPARE_SGID_TO_OBJ_GID:
@@ -385,7 +385,8 @@ static int audit_field_compare(struct task_struct *tsk,
return audit_compare_gid(cred->fsgid, name, f, ctx);
/* uid comparisons */
case AUDIT_COMPARE_UID_TO_AUID:
- return audit_uid_comparator(cred->uid, f->op, tsk->loginuid);
+ return audit_uid_comparator(cred->uid, f->op,
+ audit_get_loginuid(tsk));
case AUDIT_COMPARE_UID_TO_EUID:
return audit_uid_comparator(cred->uid, f->op, cred->euid);
case AUDIT_COMPARE_UID_TO_SUID:
@@ -394,11 +395,14 @@ static int audit_field_compare(struct task_struct *tsk,
return audit_uid_comparator(cred->uid, f->op, cred->fsuid);
/* auid comparisons */
case AUDIT_COMPARE_AUID_TO_EUID:
- return audit_uid_comparator(tsk->loginuid, f->op, cred->euid);
+ return audit_uid_comparator(audit_get_loginuid(tsk), f->op,
+ cred->euid);
case AUDIT_COMPARE_AUID_TO_SUID:
- return audit_uid_comparator(tsk->loginuid, f->op, cred->suid);
+ return audit_uid_comparator(audit_get_loginuid(tsk), f->op,
+ cred->suid);
case AUDIT_COMPARE_AUID_TO_FSUID:
- return audit_uid_comparator(tsk->loginuid, f->op, cred->fsuid);
+ return audit_uid_comparator(audit_get_loginuid(tsk), f->op,
+ cred->fsuid);
/* euid comparisons */
case AUDIT_COMPARE_EUID_TO_SUID:
return audit_uid_comparator(cred->euid, f->op, cred->suid);
@@ -471,6 +475,8 @@ static int audit_filter_rules(struct task_struct *tsk,
break;
case AUDIT_EXE:
result = audit_exe_compare(tsk, rule->exe);
+ if (f->op == Audit_not_equal)
+ result = !result;
break;
case AUDIT_UID:
result = audit_uid_comparator(cred->uid, f->op, f->uid);
@@ -511,7 +517,7 @@ static int audit_filter_rules(struct task_struct *tsk,
result = audit_gid_comparator(cred->fsgid, f->op, f->gid);
break;
case AUDIT_SESSIONID:
- sessionid = audit_get_sessionid(current);
+ sessionid = audit_get_sessionid(tsk);
result = audit_comparator(sessionid, f->op, f->val);
break;
case AUDIT_PERS:
@@ -609,7 +615,8 @@ static int audit_filter_rules(struct task_struct *tsk,
result = match_tree_refs(ctx, rule->tree);
break;
case AUDIT_LOGINUID:
- result = audit_uid_comparator(tsk->loginuid, f->op, f->uid);
+ result = audit_uid_comparator(audit_get_loginuid(tsk),
+ f->op, f->uid);
break;
case AUDIT_LOGINUID_SET:
result = audit_comparator(audit_loginuid_set(tsk), f->op, f->val);
@@ -863,7 +870,7 @@ static inline struct audit_context *audit_take_context(struct task_struct *tsk,
audit_filter_inodes(tsk, context);
}
- tsk->audit_context = NULL;
+ audit_set_context(tsk, NULL);
return context;
}
@@ -950,7 +957,7 @@ int audit_alloc(struct task_struct *tsk)
}
context->filterkey = key;
- tsk->audit_context = context;
+ audit_set_context(tsk, context);
set_tsk_thread_flag(tsk, TIF_SYSCALL_AUDIT);
return 0;
}
@@ -1507,34 +1514,31 @@ void __audit_free(struct task_struct *tsk)
void __audit_syscall_entry(int major, unsigned long a1, unsigned long a2,
unsigned long a3, unsigned long a4)
{
- struct task_struct *tsk = current;
- struct audit_context *context = tsk->audit_context;
+ struct audit_context *context = audit_context();
enum audit_state state;
- if (!context)
+ if (!audit_enabled || !context)
return;
BUG_ON(context->in_syscall || context->name_count);
- if (!audit_enabled)
+ state = context->state;
+ if (state == AUDIT_DISABLED)
return;
+ context->dummy = !audit_n_rules;
+ if (!context->dummy && state == AUDIT_BUILD_CONTEXT) {
+ context->prio = 0;
+ if (auditd_test_task(current))
+ return;
+ }
+
context->arch = syscall_get_arch();
context->major = major;
context->argv[0] = a1;
context->argv[1] = a2;
context->argv[2] = a3;
context->argv[3] = a4;
-
- state = context->state;
- context->dummy = !audit_n_rules;
- if (!context->dummy && state == AUDIT_BUILD_CONTEXT) {
- context->prio = 0;
- state = audit_filter_syscall(tsk, context, &audit_filter_list[AUDIT_FILTER_ENTRY]);
- }
- if (state == AUDIT_DISABLED)
- return;
-
context->serial = 0;
context->ctime = current_kernel_time64();
context->in_syscall = 1;
@@ -1555,7 +1559,6 @@ void __audit_syscall_entry(int major, unsigned long a1, unsigned long a2,
*/
void __audit_syscall_exit(int success, long return_code)
{
- struct task_struct *tsk = current;
struct audit_context *context;
if (success)
@@ -1563,12 +1566,12 @@ void __audit_syscall_exit(int success, long return_code)
else
success = AUDITSC_FAILURE;
- context = audit_take_context(tsk, success, return_code);
+ context = audit_take_context(current, success, return_code);
if (!context)
return;
if (context->in_syscall && context->current_state == AUDIT_RECORD_CONTEXT)
- audit_log_exit(context, tsk);
+ audit_log_exit(context, current);
context->in_syscall = 0;
context->prio = context->state == AUDIT_RECORD_CONTEXT ? ~0ULL : 0;
@@ -1590,7 +1593,7 @@ void __audit_syscall_exit(int success, long return_code)
kfree(context->filterkey);
context->filterkey = NULL;
}
- tsk->audit_context = context;
+ audit_set_context(current, context);
}
static inline void handle_one(const struct inode *inode)
@@ -1602,7 +1605,7 @@ static inline void handle_one(const struct inode *inode)
int count;
if (likely(!inode->i_fsnotify_marks))
return;
- context = current->audit_context;
+ context = audit_context();
p = context->trees;
count = context->tree_count;
rcu_read_lock();
@@ -1633,7 +1636,7 @@ static void handle_path(const struct dentry *dentry)
unsigned long seq;
int count;
- context = current->audit_context;
+ context = audit_context();
p = context->trees;
count = context->tree_count;
retry:
@@ -1715,7 +1718,7 @@ static struct audit_names *audit_alloc_name(struct audit_context *context,
struct filename *
__audit_reusename(const __user char *uptr)
{
- struct audit_context *context = current->audit_context;
+ struct audit_context *context = audit_context();
struct audit_names *n;
list_for_each_entry(n, &context->names_list, list) {
@@ -1738,7 +1741,7 @@ __audit_reusename(const __user char *uptr)
*/
void __audit_getname(struct filename *name)
{
- struct audit_context *context = current->audit_context;
+ struct audit_context *context = audit_context();
struct audit_names *n;
if (!context->in_syscall)
@@ -1766,7 +1769,7 @@ void __audit_getname(struct filename *name)
void __audit_inode(struct filename *name, const struct dentry *dentry,
unsigned int flags)
{
- struct audit_context *context = current->audit_context;
+ struct audit_context *context = audit_context();
struct inode *inode = d_backing_inode(dentry);
struct audit_names *n;
bool parent = flags & AUDIT_INODE_PARENT;
@@ -1865,7 +1868,7 @@ void __audit_inode_child(struct inode *parent,
const struct dentry *dentry,
const unsigned char type)
{
- struct audit_context *context = current->audit_context;
+ struct audit_context *context = audit_context();
struct inode *inode = d_backing_inode(dentry);
const char *dname = dentry->d_name.name;
struct audit_names *n, *found_parent = NULL, *found_child = NULL;
@@ -2050,7 +2053,7 @@ static void audit_log_set_loginuid(kuid_t koldloginuid, kuid_t kloginuid,
int audit_set_loginuid(kuid_t loginuid)
{
struct task_struct *task = current;
- unsigned int oldsessionid, sessionid = (unsigned int)-1;
+ unsigned int oldsessionid, sessionid = AUDIT_SID_UNSET;
kuid_t oldloginuid;
int rc;
@@ -2064,7 +2067,7 @@ int audit_set_loginuid(kuid_t loginuid)
/* are we setting or clearing? */
if (uid_valid(loginuid)) {
sessionid = (unsigned int)atomic_inc_return(&session_id);
- if (unlikely(sessionid == (unsigned int)-1))
+ if (unlikely(sessionid == AUDIT_SID_UNSET))
sessionid = (unsigned int)atomic_inc_return(&session_id);
}
@@ -2084,7 +2087,7 @@ out:
*/
void __audit_mq_open(int oflag, umode_t mode, struct mq_attr *attr)
{
- struct audit_context *context = current->audit_context;
+ struct audit_context *context = audit_context();
if (attr)
memcpy(&context->mq_open.attr, attr, sizeof(struct mq_attr));
@@ -2108,7 +2111,7 @@ void __audit_mq_open(int oflag, umode_t mode, struct mq_attr *attr)
void __audit_mq_sendrecv(mqd_t mqdes, size_t msg_len, unsigned int msg_prio,
const struct timespec64 *abs_timeout)
{
- struct audit_context *context = current->audit_context;
+ struct audit_context *context = audit_context();
struct timespec64 *p = &context->mq_sendrecv.abs_timeout;
if (abs_timeout)
@@ -2132,7 +2135,7 @@ void __audit_mq_sendrecv(mqd_t mqdes, size_t msg_len, unsigned int msg_prio,
void __audit_mq_notify(mqd_t mqdes, const struct sigevent *notification)
{
- struct audit_context *context = current->audit_context;
+ struct audit_context *context = audit_context();
if (notification)
context->mq_notify.sigev_signo = notification->sigev_signo;
@@ -2151,7 +2154,7 @@ void __audit_mq_notify(mqd_t mqdes, const struct sigevent *notification)
*/
void __audit_mq_getsetattr(mqd_t mqdes, struct mq_attr *mqstat)
{
- struct audit_context *context = current->audit_context;
+ struct audit_context *context = audit_context();
context->mq_getsetattr.mqdes = mqdes;
context->mq_getsetattr.mqstat = *mqstat;
context->type = AUDIT_MQ_GETSETATTR;
@@ -2164,7 +2167,7 @@ void __audit_mq_getsetattr(mqd_t mqdes, struct mq_attr *mqstat)
*/
void __audit_ipc_obj(struct kern_ipc_perm *ipcp)
{
- struct audit_context *context = current->audit_context;
+ struct audit_context *context = audit_context();
context->ipc.uid = ipcp->uid;
context->ipc.gid = ipcp->gid;
context->ipc.mode = ipcp->mode;
@@ -2184,7 +2187,7 @@ void __audit_ipc_obj(struct kern_ipc_perm *ipcp)
*/
void __audit_ipc_set_perm(unsigned long qbytes, uid_t uid, gid_t gid, umode_t mode)
{
- struct audit_context *context = current->audit_context;
+ struct audit_context *context = audit_context();
context->ipc.qbytes = qbytes;
context->ipc.perm_uid = uid;
@@ -2195,7 +2198,7 @@ void __audit_ipc_set_perm(unsigned long qbytes, uid_t uid, gid_t gid, umode_t mo
void __audit_bprm(struct linux_binprm *bprm)
{
- struct audit_context *context = current->audit_context;
+ struct audit_context *context = audit_context();
context->type = AUDIT_EXECVE;
context->execve.argc = bprm->argc;
@@ -2210,7 +2213,7 @@ void __audit_bprm(struct linux_binprm *bprm)
*/
int __audit_socketcall(int nargs, unsigned long *args)
{
- struct audit_context *context = current->audit_context;
+ struct audit_context *context = audit_context();
if (nargs <= 0 || nargs > AUDITSC_ARGS || !args)
return -EINVAL;
@@ -2228,7 +2231,7 @@ int __audit_socketcall(int nargs, unsigned long *args)
*/
void __audit_fd_pair(int fd1, int fd2)
{
- struct audit_context *context = current->audit_context;
+ struct audit_context *context = audit_context();
context->fds[0] = fd1;
context->fds[1] = fd2;
}
@@ -2242,7 +2245,7 @@ void __audit_fd_pair(int fd1, int fd2)
*/
int __audit_sockaddr(int len, void *a)
{
- struct audit_context *context = current->audit_context;
+ struct audit_context *context = audit_context();
if (!context->sockaddr) {
void *p = kmalloc(sizeof(struct sockaddr_storage), GFP_KERNEL);
@@ -2258,7 +2261,7 @@ int __audit_sockaddr(int len, void *a)
void __audit_ptrace(struct task_struct *t)
{
- struct audit_context *context = current->audit_context;
+ struct audit_context *context = audit_context();
context->target_pid = task_tgid_nr(t);
context->target_auid = audit_get_loginuid(t);
@@ -2279,19 +2282,19 @@ void __audit_ptrace(struct task_struct *t)
int audit_signal_info(int sig, struct task_struct *t)
{
struct audit_aux_data_pids *axp;
- struct task_struct *tsk = current;
- struct audit_context *ctx = tsk->audit_context;
- kuid_t uid = current_uid(), t_uid = task_uid(t);
+ struct audit_context *ctx = audit_context();
+ kuid_t uid = current_uid(), auid, t_uid = task_uid(t);
if (auditd_test_task(t) &&
(sig == SIGTERM || sig == SIGHUP ||
sig == SIGUSR1 || sig == SIGUSR2)) {
- audit_sig_pid = task_tgid_nr(tsk);
- if (uid_valid(tsk->loginuid))
- audit_sig_uid = tsk->loginuid;
+ audit_sig_pid = task_tgid_nr(current);
+ auid = audit_get_loginuid(current);
+ if (uid_valid(auid))
+ audit_sig_uid = auid;
else
audit_sig_uid = uid;
- security_task_getsecid(tsk, &audit_sig_sid);
+ security_task_getsecid(current, &audit_sig_sid);
}
if (!audit_signals || audit_dummy_context())
@@ -2347,7 +2350,7 @@ int __audit_log_bprm_fcaps(struct linux_binprm *bprm,
const struct cred *new, const struct cred *old)
{
struct audit_aux_data_bprm_fcaps *ax;
- struct audit_context *context = current->audit_context;
+ struct audit_context *context = audit_context();
struct cpu_vfs_cap_data vcaps;
ax = kmalloc(sizeof(*ax), GFP_KERNEL);
@@ -2387,7 +2390,7 @@ int __audit_log_bprm_fcaps(struct linux_binprm *bprm,
*/
void __audit_log_capset(const struct cred *new, const struct cred *old)
{
- struct audit_context *context = current->audit_context;
+ struct audit_context *context = audit_context();
context->capset.pid = task_tgid_nr(current);
context->capset.cap.effective = new->cap_effective;
context->capset.cap.inheritable = new->cap_effective;
@@ -2398,7 +2401,7 @@ void __audit_log_capset(const struct cred *new, const struct cred *old)
void __audit_mmap_fd(int fd, int flags)
{
- struct audit_context *context = current->audit_context;
+ struct audit_context *context = audit_context();
context->mmap.fd = fd;
context->mmap.flags = flags;
context->type = AUDIT_MMAP;
@@ -2406,7 +2409,7 @@ void __audit_mmap_fd(int fd, int flags)
void __audit_log_kern_module(char *name)
{
- struct audit_context *context = current->audit_context;
+ struct audit_context *context = audit_context();
context->module.name = kmalloc(strlen(name) + 1, GFP_KERNEL);
strcpy(context->module.name, name);
@@ -2415,7 +2418,7 @@ void __audit_log_kern_module(char *name)
void __audit_fanotify(unsigned int response)
{
- audit_log(current->audit_context, GFP_KERNEL,
+ audit_log(audit_context(), GFP_KERNEL,
AUDIT_FANOTIFY, "resp=%u", response);
}
@@ -2466,7 +2469,19 @@ void audit_core_dumps(long signr)
audit_log_end(ab);
}
-void __audit_seccomp(unsigned long syscall, long signr, int code)
+/**
+ * audit_seccomp - record information about a seccomp action
+ * @syscall: syscall number
+ * @signr: signal value
+ * @code: the seccomp action
+ *
+ * Record the information associated with a seccomp action. Event filtering for
+ * seccomp actions that are not to be logged is done in seccomp_log().
+ * Therefore, this function forces auditing independent of the audit_enabled
+ * and dummy context state because seccomp actions should be logged even when
+ * audit is not in use.
+ */
+void audit_seccomp(unsigned long syscall, long signr, int code)
{
struct audit_buffer *ab;
@@ -2480,9 +2495,29 @@ void __audit_seccomp(unsigned long syscall, long signr, int code)
audit_log_end(ab);
}
+void audit_seccomp_actions_logged(const char *names, const char *old_names,
+ int res)
+{
+ struct audit_buffer *ab;
+
+ if (!audit_enabled)
+ return;
+
+ ab = audit_log_start(audit_context(), GFP_KERNEL,
+ AUDIT_CONFIG_CHANGE);
+ if (unlikely(!ab))
+ return;
+
+ audit_log_format(ab, "op=seccomp-logging");
+ audit_log_format(ab, " actions=%s", names);
+ audit_log_format(ab, " old-actions=%s", old_names);
+ audit_log_format(ab, " res=%d", res);
+ audit_log_end(ab);
+}
+
struct list_head *audit_killed_trees(void)
{
- struct audit_context *ctx = current->audit_context;
+ struct audit_context *ctx = audit_context();
if (likely(!ctx || !ctx->in_syscall))
return NULL;
return &ctx->killed_trees;
diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile
index a713fd23ec88..f27f5496d6fe 100644
--- a/kernel/bpf/Makefile
+++ b/kernel/bpf/Makefile
@@ -4,9 +4,13 @@ obj-y := core.o
obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o
obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list.o lpm_trie.o map_in_map.o
obj-$(CONFIG_BPF_SYSCALL) += disasm.o
+obj-$(CONFIG_BPF_SYSCALL) += btf.o
ifeq ($(CONFIG_NET),y)
obj-$(CONFIG_BPF_SYSCALL) += devmap.o
obj-$(CONFIG_BPF_SYSCALL) += cpumap.o
+ifeq ($(CONFIG_XDP_SOCKETS),y)
+obj-$(CONFIG_BPF_SYSCALL) += xskmap.o
+endif
obj-$(CONFIG_BPF_SYSCALL) += offload.o
ifeq ($(CONFIG_STREAM_PARSER),y)
ifeq ($(CONFIG_INET),y)
diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
index 14750e7c5ee4..544e58f5f642 100644
--- a/kernel/bpf/arraymap.c
+++ b/kernel/bpf/arraymap.c
@@ -11,11 +11,13 @@
* General Public License for more details.
*/
#include <linux/bpf.h>
+#include <linux/btf.h>
#include <linux/err.h>
#include <linux/slab.h>
#include <linux/mm.h>
#include <linux/filter.h>
#include <linux/perf_event.h>
+#include <uapi/linux/btf.h>
#include "map_in_map.h"
@@ -336,6 +338,52 @@ static void array_map_free(struct bpf_map *map)
bpf_map_area_free(array);
}
+static void array_map_seq_show_elem(struct bpf_map *map, void *key,
+ struct seq_file *m)
+{
+ void *value;
+
+ rcu_read_lock();
+
+ value = array_map_lookup_elem(map, key);
+ if (!value) {
+ rcu_read_unlock();
+ return;
+ }
+
+ seq_printf(m, "%u: ", *(u32 *)key);
+ btf_type_seq_show(map->btf, map->btf_value_type_id, value, m);
+ seq_puts(m, "\n");
+
+ rcu_read_unlock();
+}
+
+static int array_map_check_btf(const struct bpf_map *map, const struct btf *btf,
+ u32 btf_key_id, u32 btf_value_id)
+{
+ const struct btf_type *key_type, *value_type;
+ u32 key_size, value_size;
+ u32 int_data;
+
+ key_type = btf_type_id_size(btf, &btf_key_id, &key_size);
+ if (!key_type || BTF_INFO_KIND(key_type->info) != BTF_KIND_INT)
+ return -EINVAL;
+
+ int_data = *(u32 *)(key_type + 1);
+ /* bpf array can only take a u32 key. This check makes
+ * sure that the btf matches the attr used during map_create.
+ */
+ if (BTF_INT_BITS(int_data) != 32 || key_size != 4 ||
+ BTF_INT_OFFSET(int_data))
+ return -EINVAL;
+
+ value_type = btf_type_id_size(btf, &btf_value_id, &value_size);
+ if (!value_type || value_size > map->value_size)
+ return -EINVAL;
+
+ return 0;
+}
+
const struct bpf_map_ops array_map_ops = {
.map_alloc_check = array_map_alloc_check,
.map_alloc = array_map_alloc,
@@ -345,6 +393,8 @@ const struct bpf_map_ops array_map_ops = {
.map_update_elem = array_map_update_elem,
.map_delete_elem = array_map_delete_elem,
.map_gen_lookup = array_map_gen_lookup,
+ .map_seq_show_elem = array_map_seq_show_elem,
+ .map_check_btf = array_map_check_btf,
};
const struct bpf_map_ops percpu_array_map_ops = {
@@ -476,7 +526,7 @@ static u32 prog_fd_array_sys_lookup_elem(void *ptr)
}
/* decrement refcnt of all bpf_progs that are stored in this map */
-void bpf_fd_array_map_clear(struct bpf_map *map)
+static void bpf_fd_array_map_clear(struct bpf_map *map)
{
struct bpf_array *array = container_of(map, struct bpf_array, map);
int i;
@@ -495,6 +545,7 @@ const struct bpf_map_ops prog_array_map_ops = {
.map_fd_get_ptr = prog_fd_array_get_ptr,
.map_fd_put_ptr = prog_fd_array_put_ptr,
.map_fd_sys_lookup_elem = prog_fd_array_sys_lookup_elem,
+ .map_release_uref = bpf_fd_array_map_clear,
};
static struct bpf_event_entry *bpf_event_entry_gen(struct file *perf_file,
diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
new file mode 100644
index 000000000000..8653ab004c73
--- /dev/null
+++ b/kernel/bpf/btf.c
@@ -0,0 +1,2348 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/* Copyright (c) 2018 Facebook */
+
+#include <uapi/linux/btf.h>
+#include <uapi/linux/types.h>
+#include <linux/seq_file.h>
+#include <linux/compiler.h>
+#include <linux/errno.h>
+#include <linux/slab.h>
+#include <linux/anon_inodes.h>
+#include <linux/file.h>
+#include <linux/uaccess.h>
+#include <linux/kernel.h>
+#include <linux/idr.h>
+#include <linux/sort.h>
+#include <linux/bpf_verifier.h>
+#include <linux/btf.h>
+
+/* BTF (BPF Type Format) is the meta data format which describes
+ * the data types of BPF program/map. Hence, it basically focus
+ * on the C programming language which the modern BPF is primary
+ * using.
+ *
+ * ELF Section:
+ * ~~~~~~~~~~~
+ * The BTF data is stored under the ".BTF" ELF section
+ *
+ * struct btf_type:
+ * ~~~~~~~~~~~~~~~
+ * Each 'struct btf_type' object describes a C data type.
+ * Depending on the type it is describing, a 'struct btf_type'
+ * object may be followed by more data. F.e.
+ * To describe an array, 'struct btf_type' is followed by
+ * 'struct btf_array'.
+ *
+ * 'struct btf_type' and any extra data following it are
+ * 4 bytes aligned.
+ *
+ * Type section:
+ * ~~~~~~~~~~~~~
+ * The BTF type section contains a list of 'struct btf_type' objects.
+ * Each one describes a C type. Recall from the above section
+ * that a 'struct btf_type' object could be immediately followed by extra
+ * data in order to desribe some particular C types.
+ *
+ * type_id:
+ * ~~~~~~~
+ * Each btf_type object is identified by a type_id. The type_id
+ * is implicitly implied by the location of the btf_type object in
+ * the BTF type section. The first one has type_id 1. The second
+ * one has type_id 2...etc. Hence, an earlier btf_type has
+ * a smaller type_id.
+ *
+ * A btf_type object may refer to another btf_type object by using
+ * type_id (i.e. the "type" in the "struct btf_type").
+ *
+ * NOTE that we cannot assume any reference-order.
+ * A btf_type object can refer to an earlier btf_type object
+ * but it can also refer to a later btf_type object.
+ *
+ * For example, to describe "const void *". A btf_type
+ * object describing "const" may refer to another btf_type
+ * object describing "void *". This type-reference is done
+ * by specifying type_id:
+ *
+ * [1] CONST (anon) type_id=2
+ * [2] PTR (anon) type_id=0
+ *
+ * The above is the btf_verifier debug log:
+ * - Each line started with "[?]" is a btf_type object
+ * - [?] is the type_id of the btf_type object.
+ * - CONST/PTR is the BTF_KIND_XXX
+ * - "(anon)" is the name of the type. It just
+ * happens that CONST and PTR has no name.
+ * - type_id=XXX is the 'u32 type' in btf_type
+ *
+ * NOTE: "void" has type_id 0
+ *
+ * String section:
+ * ~~~~~~~~~~~~~~
+ * The BTF string section contains the names used by the type section.
+ * Each string is referred by an "offset" from the beginning of the
+ * string section.
+ *
+ * Each string is '\0' terminated.
+ *
+ * The first character in the string section must be '\0'
+ * which is used to mean 'anonymous'. Some btf_type may not
+ * have a name.
+ */
+
+/* BTF verification:
+ *
+ * To verify BTF data, two passes are needed.
+ *
+ * Pass #1
+ * ~~~~~~~
+ * The first pass is to collect all btf_type objects to
+ * an array: "btf->types".
+ *
+ * Depending on the C type that a btf_type is describing,
+ * a btf_type may be followed by extra data. We don't know
+ * how many btf_type is there, and more importantly we don't
+ * know where each btf_type is located in the type section.
+ *
+ * Without knowing the location of each type_id, most verifications
+ * cannot be done. e.g. an earlier btf_type may refer to a later
+ * btf_type (recall the "const void *" above), so we cannot
+ * check this type-reference in the first pass.
+ *
+ * In the first pass, it still does some verifications (e.g.
+ * checking the name is a valid offset to the string section).
+ *
+ * Pass #2
+ * ~~~~~~~
+ * The main focus is to resolve a btf_type that is referring
+ * to another type.
+ *
+ * We have to ensure the referring type:
+ * 1) does exist in the BTF (i.e. in btf->types[])
+ * 2) does not cause a loop:
+ * struct A {
+ * struct B b;
+ * };
+ *
+ * struct B {
+ * struct A a;
+ * };
+ *
+ * btf_type_needs_resolve() decides if a btf_type needs
+ * to be resolved.
+ *
+ * The needs_resolve type implements the "resolve()" ops which
+ * essentially does a DFS and detects backedge.
+ *
+ * During resolve (or DFS), different C types have different
+ * "RESOLVED" conditions.
+ *
+ * When resolving a BTF_KIND_STRUCT, we need to resolve all its
+ * members because a member is always referring to another
+ * type. A struct's member can be treated as "RESOLVED" if
+ * it is referring to a BTF_KIND_PTR. Otherwise, the
+ * following valid C struct would be rejected:
+ *
+ * struct A {
+ * int m;
+ * struct A *a;
+ * };
+ *
+ * When resolving a BTF_KIND_PTR, it needs to keep resolving if
+ * it is referring to another BTF_KIND_PTR. Otherwise, we cannot
+ * detect a pointer loop, e.g.:
+ * BTF_KIND_CONST -> BTF_KIND_PTR -> BTF_KIND_CONST -> BTF_KIND_PTR +
+ * ^ |
+ * +-----------------------------------------+
+ *
+ */
+
+#define BITS_PER_U64 (sizeof(u64) * BITS_PER_BYTE)
+#define BITS_PER_BYTE_MASK (BITS_PER_BYTE - 1)
+#define BITS_PER_BYTE_MASKED(bits) ((bits) & BITS_PER_BYTE_MASK)
+#define BITS_ROUNDDOWN_BYTES(bits) ((bits) >> 3)
+#define BITS_ROUNDUP_BYTES(bits) \
+ (BITS_ROUNDDOWN_BYTES(bits) + !!BITS_PER_BYTE_MASKED(bits))
+
+#define BTF_INFO_MASK 0x0f00ffff
+#define BTF_INT_MASK 0x0fffffff
+#define BTF_TYPE_ID_VALID(type_id) ((type_id) <= BTF_MAX_TYPE)
+#define BTF_STR_OFFSET_VALID(name_off) ((name_off) <= BTF_MAX_NAME_OFFSET)
+
+/* 16MB for 64k structs and each has 16 members and
+ * a few MB spaces for the string section.
+ * The hard limit is S32_MAX.
+ */
+#define BTF_MAX_SIZE (16 * 1024 * 1024)
+
+#define for_each_member(i, struct_type, member) \
+ for (i = 0, member = btf_type_member(struct_type); \
+ i < btf_type_vlen(struct_type); \
+ i++, member++)
+
+#define for_each_member_from(i, from, struct_type, member) \
+ for (i = from, member = btf_type_member(struct_type) + from; \
+ i < btf_type_vlen(struct_type); \
+ i++, member++)
+
+static DEFINE_IDR(btf_idr);
+static DEFINE_SPINLOCK(btf_idr_lock);
+
+struct btf {
+ void *data;
+ struct btf_type **types;
+ u32 *resolved_ids;
+ u32 *resolved_sizes;
+ const char *strings;
+ void *nohdr_data;
+ struct btf_header hdr;
+ u32 nr_types;
+ u32 types_size;
+ u32 data_size;
+ refcount_t refcnt;
+ u32 id;
+ struct rcu_head rcu;
+};
+
+enum verifier_phase {
+ CHECK_META,
+ CHECK_TYPE,
+};
+
+struct resolve_vertex {
+ const struct btf_type *t;
+ u32 type_id;
+ u16 next_member;
+};
+
+enum visit_state {
+ NOT_VISITED,
+ VISITED,
+ RESOLVED,
+};
+
+enum resolve_mode {
+ RESOLVE_TBD, /* To Be Determined */
+ RESOLVE_PTR, /* Resolving for Pointer */
+ RESOLVE_STRUCT_OR_ARRAY, /* Resolving for struct/union
+ * or array
+ */
+};
+
+#define MAX_RESOLVE_DEPTH 32
+
+struct btf_sec_info {
+ u32 off;
+ u32 len;
+};
+
+struct btf_verifier_env {
+ struct btf *btf;
+ u8 *visit_states;
+ struct resolve_vertex stack[MAX_RESOLVE_DEPTH];
+ struct bpf_verifier_log log;
+ u32 log_type_id;
+ u32 top_stack;
+ enum verifier_phase phase;
+ enum resolve_mode resolve_mode;
+};
+
+static const char * const btf_kind_str[NR_BTF_KINDS] = {
+ [BTF_KIND_UNKN] = "UNKNOWN",
+ [BTF_KIND_INT] = "INT",
+ [BTF_KIND_PTR] = "PTR",
+ [BTF_KIND_ARRAY] = "ARRAY",
+ [BTF_KIND_STRUCT] = "STRUCT",
+ [BTF_KIND_UNION] = "UNION",
+ [BTF_KIND_ENUM] = "ENUM",
+ [BTF_KIND_FWD] = "FWD",
+ [BTF_KIND_TYPEDEF] = "TYPEDEF",
+ [BTF_KIND_VOLATILE] = "VOLATILE",
+ [BTF_KIND_CONST] = "CONST",
+ [BTF_KIND_RESTRICT] = "RESTRICT",
+};
+
+struct btf_kind_operations {
+ s32 (*check_meta)(struct btf_verifier_env *env,
+ const struct btf_type *t,
+ u32 meta_left);
+ int (*resolve)(struct btf_verifier_env *env,
+ const struct resolve_vertex *v);
+ int (*check_member)(struct btf_verifier_env *env,
+ const struct btf_type *struct_type,
+ const struct btf_member *member,
+ const struct btf_type *member_type);
+ void (*log_details)(struct btf_verifier_env *env,
+ const struct btf_type *t);
+ void (*seq_show)(const struct btf *btf, const struct btf_type *t,
+ u32 type_id, void *data, u8 bits_offsets,
+ struct seq_file *m);
+};
+
+static const struct btf_kind_operations * const kind_ops[NR_BTF_KINDS];
+static struct btf_type btf_void;
+
+static bool btf_type_is_modifier(const struct btf_type *t)
+{
+ /* Some of them is not strictly a C modifier
+ * but they are grouped into the same bucket
+ * for BTF concern:
+ * A type (t) that refers to another
+ * type through t->type AND its size cannot
+ * be determined without following the t->type.
+ *
+ * ptr does not fall into this bucket
+ * because its size is always sizeof(void *).
+ */
+ switch (BTF_INFO_KIND(t->info)) {
+ case BTF_KIND_TYPEDEF:
+ case BTF_KIND_VOLATILE:
+ case BTF_KIND_CONST:
+ case BTF_KIND_RESTRICT:
+ return true;
+ }
+
+ return false;
+}
+
+static bool btf_type_is_void(const struct btf_type *t)
+{
+ /* void => no type and size info.
+ * Hence, FWD is also treated as void.
+ */
+ return t == &btf_void || BTF_INFO_KIND(t->info) == BTF_KIND_FWD;
+}
+
+static bool btf_type_is_void_or_null(const struct btf_type *t)
+{
+ return !t || btf_type_is_void(t);
+}
+
+/* union is only a special case of struct:
+ * all its offsetof(member) == 0
+ */
+static bool btf_type_is_struct(const struct btf_type *t)
+{
+ u8 kind = BTF_INFO_KIND(t->info);
+
+ return kind == BTF_KIND_STRUCT || kind == BTF_KIND_UNION;
+}
+
+static bool btf_type_is_array(const struct btf_type *t)
+{
+ return BTF_INFO_KIND(t->info) == BTF_KIND_ARRAY;
+}
+
+static bool btf_type_is_ptr(const struct btf_type *t)
+{
+ return BTF_INFO_KIND(t->info) == BTF_KIND_PTR;
+}
+
+static bool btf_type_is_int(const struct btf_type *t)
+{
+ return BTF_INFO_KIND(t->info) == BTF_KIND_INT;
+}
+
+/* What types need to be resolved?
+ *
+ * btf_type_is_modifier() is an obvious one.
+ *
+ * btf_type_is_struct() because its member refers to
+ * another type (through member->type).
+
+ * btf_type_is_array() because its element (array->type)
+ * refers to another type. Array can be thought of a
+ * special case of struct while array just has the same
+ * member-type repeated by array->nelems of times.
+ */
+static bool btf_type_needs_resolve(const struct btf_type *t)
+{
+ return btf_type_is_modifier(t) ||
+ btf_type_is_ptr(t) ||
+ btf_type_is_struct(t) ||
+ btf_type_is_array(t);
+}
+
+/* t->size can be used */
+static bool btf_type_has_size(const struct btf_type *t)
+{
+ switch (BTF_INFO_KIND(t->info)) {
+ case BTF_KIND_INT:
+ case BTF_KIND_STRUCT:
+ case BTF_KIND_UNION:
+ case BTF_KIND_ENUM:
+ return true;
+ }
+
+ return false;
+}
+
+static const char *btf_int_encoding_str(u8 encoding)
+{
+ if (encoding == 0)
+ return "(none)";
+ else if (encoding == BTF_INT_SIGNED)
+ return "SIGNED";
+ else if (encoding == BTF_INT_CHAR)
+ return "CHAR";
+ else if (encoding == BTF_INT_BOOL)
+ return "BOOL";
+ else
+ return "UNKN";
+}
+
+static u16 btf_type_vlen(const struct btf_type *t)
+{
+ return BTF_INFO_VLEN(t->info);
+}
+
+static u32 btf_type_int(const struct btf_type *t)
+{
+ return *(u32 *)(t + 1);
+}
+
+static const struct btf_array *btf_type_array(const struct btf_type *t)
+{
+ return (const struct btf_array *)(t + 1);
+}
+
+static const struct btf_member *btf_type_member(const struct btf_type *t)
+{
+ return (const struct btf_member *)(t + 1);
+}
+
+static const struct btf_enum *btf_type_enum(const struct btf_type *t)
+{
+ return (const struct btf_enum *)(t + 1);
+}
+
+static const struct btf_kind_operations *btf_type_ops(const struct btf_type *t)
+{
+ return kind_ops[BTF_INFO_KIND(t->info)];
+}
+
+static bool btf_name_offset_valid(const struct btf *btf, u32 offset)
+{
+ return BTF_STR_OFFSET_VALID(offset) &&
+ offset < btf->hdr.str_len;
+}
+
+static const char *btf_name_by_offset(const struct btf *btf, u32 offset)
+{
+ if (!offset)
+ return "(anon)";
+ else if (offset < btf->hdr.str_len)
+ return &btf->strings[offset];
+ else
+ return "(invalid-name-offset)";
+}
+
+static const struct btf_type *btf_type_by_id(const struct btf *btf, u32 type_id)
+{
+ if (type_id > btf->nr_types)
+ return NULL;
+
+ return btf->types[type_id];
+}
+
+/*
+ * Regular int is not a bit field and it must be either
+ * u8/u16/u32/u64.
+ */
+static bool btf_type_int_is_regular(const struct btf_type *t)
+{
+ u16 nr_bits, nr_bytes;
+ u32 int_data;
+
+ int_data = btf_type_int(t);
+ nr_bits = BTF_INT_BITS(int_data);
+ nr_bytes = BITS_ROUNDUP_BYTES(nr_bits);
+ if (BITS_PER_BYTE_MASKED(nr_bits) ||
+ BTF_INT_OFFSET(int_data) ||
+ (nr_bytes != sizeof(u8) && nr_bytes != sizeof(u16) &&
+ nr_bytes != sizeof(u32) && nr_bytes != sizeof(u64))) {
+ return false;
+ }
+
+ return true;
+}
+
+__printf(2, 3) static void __btf_verifier_log(struct bpf_verifier_log *log,
+ const char *fmt, ...)
+{
+ va_list args;
+
+ va_start(args, fmt);
+ bpf_verifier_vlog(log, fmt, args);
+ va_end(args);
+}
+
+__printf(2, 3) static void btf_verifier_log(struct btf_verifier_env *env,
+ const char *fmt, ...)
+{
+ struct bpf_verifier_log *log = &env->log;
+ va_list args;
+
+ if (!bpf_verifier_log_needed(log))
+ return;
+
+ va_start(args, fmt);
+ bpf_verifier_vlog(log, fmt, args);
+ va_end(args);
+}
+
+__printf(4, 5) static void __btf_verifier_log_type(struct btf_verifier_env *env,
+ const struct btf_type *t,
+ bool log_details,
+ const char *fmt, ...)
+{
+ struct bpf_verifier_log *log = &env->log;
+ u8 kind = BTF_INFO_KIND(t->info);
+ struct btf *btf = env->btf;
+ va_list args;
+
+ if (!bpf_verifier_log_needed(log))
+ return;
+
+ __btf_verifier_log(log, "[%u] %s %s%s",
+ env->log_type_id,
+ btf_kind_str[kind],
+ btf_name_by_offset(btf, t->name_off),
+ log_details ? " " : "");
+
+ if (log_details)
+ btf_type_ops(t)->log_details(env, t);
+
+ if (fmt && *fmt) {
+ __btf_verifier_log(log, " ");
+ va_start(args, fmt);
+ bpf_verifier_vlog(log, fmt, args);
+ va_end(args);
+ }
+
+ __btf_verifier_log(log, "\n");
+}
+
+#define btf_verifier_log_type(env, t, ...) \
+ __btf_verifier_log_type((env), (t), true, __VA_ARGS__)
+#define btf_verifier_log_basic(env, t, ...) \
+ __btf_verifier_log_type((env), (t), false, __VA_ARGS__)
+
+__printf(4, 5)
+static void btf_verifier_log_member(struct btf_verifier_env *env,
+ const struct btf_type *struct_type,
+ const struct btf_member *member,
+ const char *fmt, ...)
+{
+ struct bpf_verifier_log *log = &env->log;
+ struct btf *btf = env->btf;
+ va_list args;
+
+ if (!bpf_verifier_log_needed(log))
+ return;
+
+ /* The CHECK_META phase already did a btf dump.
+ *
+ * If member is logged again, it must hit an error in
+ * parsing this member. It is useful to print out which
+ * struct this member belongs to.
+ */
+ if (env->phase != CHECK_META)
+ btf_verifier_log_type(env, struct_type, NULL);
+
+ __btf_verifier_log(log, "\t%s type_id=%u bits_offset=%u",
+ btf_name_by_offset(btf, member->name_off),
+ member->type, member->offset);
+
+ if (fmt && *fmt) {
+ __btf_verifier_log(log, " ");
+ va_start(args, fmt);
+ bpf_verifier_vlog(log, fmt, args);
+ va_end(args);
+ }
+
+ __btf_verifier_log(log, "\n");
+}
+
+static void btf_verifier_log_hdr(struct btf_verifier_env *env,
+ u32 btf_data_size)
+{
+ struct bpf_verifier_log *log = &env->log;
+ const struct btf *btf = env->btf;
+ const struct btf_header *hdr;
+
+ if (!bpf_verifier_log_needed(log))
+ return;
+
+ hdr = &btf->hdr;
+ __btf_verifier_log(log, "magic: 0x%x\n", hdr->magic);
+ __btf_verifier_log(log, "version: %u\n", hdr->version);
+ __btf_verifier_log(log, "flags: 0x%x\n", hdr->flags);
+ __btf_verifier_log(log, "hdr_len: %u\n", hdr->hdr_len);
+ __btf_verifier_log(log, "type_off: %u\n", hdr->type_off);
+ __btf_verifier_log(log, "type_len: %u\n", hdr->type_len);
+ __btf_verifier_log(log, "str_off: %u\n", hdr->str_off);
+ __btf_verifier_log(log, "str_len: %u\n", hdr->str_len);
+ __btf_verifier_log(log, "btf_total_size: %u\n", btf_data_size);
+}
+
+static int btf_add_type(struct btf_verifier_env *env, struct btf_type *t)
+{
+ struct btf *btf = env->btf;
+
+ /* < 2 because +1 for btf_void which is always in btf->types[0].
+ * btf_void is not accounted in btf->nr_types because btf_void
+ * does not come from the BTF file.
+ */
+ if (btf->types_size - btf->nr_types < 2) {
+ /* Expand 'types' array */
+
+ struct btf_type **new_types;
+ u32 expand_by, new_size;
+
+ if (btf->types_size == BTF_MAX_TYPE) {
+ btf_verifier_log(env, "Exceeded max num of types");
+ return -E2BIG;
+ }
+
+ expand_by = max_t(u32, btf->types_size >> 2, 16);
+ new_size = min_t(u32, BTF_MAX_TYPE,
+ btf->types_size + expand_by);
+
+ new_types = kvzalloc(new_size * sizeof(*new_types),
+ GFP_KERNEL | __GFP_NOWARN);
+ if (!new_types)
+ return -ENOMEM;
+
+ if (btf->nr_types == 0)
+ new_types[0] = &btf_void;
+ else
+ memcpy(new_types, btf->types,
+ sizeof(*btf->types) * (btf->nr_types + 1));
+
+ kvfree(btf->types);
+ btf->types = new_types;
+ btf->types_size = new_size;
+ }
+
+ btf->types[++(btf->nr_types)] = t;
+
+ return 0;
+}
+
+static int btf_alloc_id(struct btf *btf)
+{
+ int id;
+
+ idr_preload(GFP_KERNEL);
+ spin_lock_bh(&btf_idr_lock);
+ id = idr_alloc_cyclic(&btf_idr, btf, 1, INT_MAX, GFP_ATOMIC);
+ if (id > 0)
+ btf->id = id;
+ spin_unlock_bh(&btf_idr_lock);
+ idr_preload_end();
+
+ if (WARN_ON_ONCE(!id))
+ return -ENOSPC;
+
+ return id > 0 ? 0 : id;
+}
+
+static void btf_free_id(struct btf *btf)
+{
+ unsigned long flags;
+
+ /*
+ * In map-in-map, calling map_delete_elem() on outer
+ * map will call bpf_map_put on the inner map.
+ * It will then eventually call btf_free_id()
+ * on the inner map. Some of the map_delete_elem()
+ * implementation may have irq disabled, so
+ * we need to use the _irqsave() version instead
+ * of the _bh() version.
+ */
+ spin_lock_irqsave(&btf_idr_lock, flags);
+ idr_remove(&btf_idr, btf->id);
+ spin_unlock_irqrestore(&btf_idr_lock, flags);
+}
+
+static void btf_free(struct btf *btf)
+{
+ kvfree(btf->types);
+ kvfree(btf->resolved_sizes);
+ kvfree(btf->resolved_ids);
+ kvfree(btf->data);
+ kfree(btf);
+}
+
+static void btf_free_rcu(struct rcu_head *rcu)
+{
+ struct btf *btf = container_of(rcu, struct btf, rcu);
+
+ btf_free(btf);
+}
+
+void btf_put(struct btf *btf)
+{
+ if (btf && refcount_dec_and_test(&btf->refcnt)) {
+ btf_free_id(btf);
+ call_rcu(&btf->rcu, btf_free_rcu);
+ }
+}
+
+static int env_resolve_init(struct btf_verifier_env *env)
+{
+ struct btf *btf = env->btf;
+ u32 nr_types = btf->nr_types;
+ u32 *resolved_sizes = NULL;
+ u32 *resolved_ids = NULL;
+ u8 *visit_states = NULL;
+
+ /* +1 for btf_void */
+ resolved_sizes = kvzalloc((nr_types + 1) * sizeof(*resolved_sizes),
+ GFP_KERNEL | __GFP_NOWARN);
+ if (!resolved_sizes)
+ goto nomem;
+
+ resolved_ids = kvzalloc((nr_types + 1) * sizeof(*resolved_ids),
+ GFP_KERNEL | __GFP_NOWARN);
+ if (!resolved_ids)
+ goto nomem;
+
+ visit_states = kvzalloc((nr_types + 1) * sizeof(*visit_states),
+ GFP_KERNEL | __GFP_NOWARN);
+ if (!visit_states)
+ goto nomem;
+
+ btf->resolved_sizes = resolved_sizes;
+ btf->resolved_ids = resolved_ids;
+ env->visit_states = visit_states;
+
+ return 0;
+
+nomem:
+ kvfree(resolved_sizes);
+ kvfree(resolved_ids);
+ kvfree(visit_states);
+ return -ENOMEM;
+}
+
+static void btf_verifier_env_free(struct btf_verifier_env *env)
+{
+ kvfree(env->visit_states);
+ kfree(env);
+}
+
+static bool env_type_is_resolve_sink(const struct btf_verifier_env *env,
+ const struct btf_type *next_type)
+{
+ switch (env->resolve_mode) {
+ case RESOLVE_TBD:
+ /* int, enum or void is a sink */
+ return !btf_type_needs_resolve(next_type);
+ case RESOLVE_PTR:
+ /* int, enum, void, struct or array is a sink for ptr */
+ return !btf_type_is_modifier(next_type) &&
+ !btf_type_is_ptr(next_type);
+ case RESOLVE_STRUCT_OR_ARRAY:
+ /* int, enum, void or ptr is a sink for struct and array */
+ return !btf_type_is_modifier(next_type) &&
+ !btf_type_is_array(next_type) &&
+ !btf_type_is_struct(next_type);
+ default:
+ BUG();
+ }
+}
+
+static bool env_type_is_resolved(const struct btf_verifier_env *env,
+ u32 type_id)
+{
+ return env->visit_states[type_id] == RESOLVED;
+}
+
+static int env_stack_push(struct btf_verifier_env *env,
+ const struct btf_type *t, u32 type_id)
+{
+ struct resolve_vertex *v;
+
+ if (env->top_stack == MAX_RESOLVE_DEPTH)
+ return -E2BIG;
+
+ if (env->visit_states[type_id] != NOT_VISITED)
+ return -EEXIST;
+
+ env->visit_states[type_id] = VISITED;
+
+ v = &env->stack[env->top_stack++];
+ v->t = t;
+ v->type_id = type_id;
+ v->next_member = 0;
+
+ if (env->resolve_mode == RESOLVE_TBD) {
+ if (btf_type_is_ptr(t))
+ env->resolve_mode = RESOLVE_PTR;
+ else if (btf_type_is_struct(t) || btf_type_is_array(t))
+ env->resolve_mode = RESOLVE_STRUCT_OR_ARRAY;
+ }
+
+ return 0;
+}
+
+static void env_stack_set_next_member(struct btf_verifier_env *env,
+ u16 next_member)
+{
+ env->stack[env->top_stack - 1].next_member = next_member;
+}
+
+static void env_stack_pop_resolved(struct btf_verifier_env *env,
+ u32 resolved_type_id,
+ u32 resolved_size)
+{
+ u32 type_id = env->stack[--(env->top_stack)].type_id;
+ struct btf *btf = env->btf;
+
+ btf->resolved_sizes[type_id] = resolved_size;
+ btf->resolved_ids[type_id] = resolved_type_id;
+ env->visit_states[type_id] = RESOLVED;
+}
+
+static const struct resolve_vertex *env_stack_peak(struct btf_verifier_env *env)
+{
+ return env->top_stack ? &env->stack[env->top_stack - 1] : NULL;
+}
+
+/* The input param "type_id" must point to a needs_resolve type */
+static const struct btf_type *btf_type_id_resolve(const struct btf *btf,
+ u32 *type_id)
+{
+ *type_id = btf->resolved_ids[*type_id];
+ return btf_type_by_id(btf, *type_id);
+}
+
+const struct btf_type *btf_type_id_size(const struct btf *btf,
+ u32 *type_id, u32 *ret_size)
+{
+ const struct btf_type *size_type;
+ u32 size_type_id = *type_id;
+ u32 size = 0;
+
+ size_type = btf_type_by_id(btf, size_type_id);
+ if (btf_type_is_void_or_null(size_type))
+ return NULL;
+
+ if (btf_type_has_size(size_type)) {
+ size = size_type->size;
+ } else if (btf_type_is_array(size_type)) {
+ size = btf->resolved_sizes[size_type_id];
+ } else if (btf_type_is_ptr(size_type)) {
+ size = sizeof(void *);
+ } else {
+ if (WARN_ON_ONCE(!btf_type_is_modifier(size_type)))
+ return NULL;
+
+ size = btf->resolved_sizes[size_type_id];
+ size_type_id = btf->resolved_ids[size_type_id];
+ size_type = btf_type_by_id(btf, size_type_id);
+ if (btf_type_is_void(size_type))
+ return NULL;
+ }
+
+ *type_id = size_type_id;
+ if (ret_size)
+ *ret_size = size;
+
+ return size_type;
+}
+
+static int btf_df_check_member(struct btf_verifier_env *env,
+ const struct btf_type *struct_type,
+ const struct btf_member *member,
+ const struct btf_type *member_type)
+{
+ btf_verifier_log_basic(env, struct_type,
+ "Unsupported check_member");
+ return -EINVAL;
+}
+
+static int btf_df_resolve(struct btf_verifier_env *env,
+ const struct resolve_vertex *v)
+{
+ btf_verifier_log_basic(env, v->t, "Unsupported resolve");
+ return -EINVAL;
+}
+
+static void btf_df_seq_show(const struct btf *btf, const struct btf_type *t,
+ u32 type_id, void *data, u8 bits_offsets,
+ struct seq_file *m)
+{
+ seq_printf(m, "<unsupported kind:%u>", BTF_INFO_KIND(t->info));
+}
+
+static int btf_int_check_member(struct btf_verifier_env *env,
+ const struct btf_type *struct_type,
+ const struct btf_member *member,
+ const struct btf_type *member_type)
+{
+ u32 int_data = btf_type_int(member_type);
+ u32 struct_bits_off = member->offset;
+ u32 struct_size = struct_type->size;
+ u32 nr_copy_bits;
+ u32 bytes_offset;
+
+ if (U32_MAX - struct_bits_off < BTF_INT_OFFSET(int_data)) {
+ btf_verifier_log_member(env, struct_type, member,
+ "bits_offset exceeds U32_MAX");
+ return -EINVAL;
+ }
+
+ struct_bits_off += BTF_INT_OFFSET(int_data);
+ bytes_offset = BITS_ROUNDDOWN_BYTES(struct_bits_off);
+ nr_copy_bits = BTF_INT_BITS(int_data) +
+ BITS_PER_BYTE_MASKED(struct_bits_off);
+
+ if (nr_copy_bits > BITS_PER_U64) {
+ btf_verifier_log_member(env, struct_type, member,
+ "nr_copy_bits exceeds 64");
+ return -EINVAL;
+ }
+
+ if (struct_size < bytes_offset ||
+ struct_size - bytes_offset < BITS_ROUNDUP_BYTES(nr_copy_bits)) {
+ btf_verifier_log_member(env, struct_type, member,
+ "Member exceeds struct_size");
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static s32 btf_int_check_meta(struct btf_verifier_env *env,
+ const struct btf_type *t,
+ u32 meta_left)
+{
+ u32 int_data, nr_bits, meta_needed = sizeof(int_data);
+ u16 encoding;
+
+ if (meta_left < meta_needed) {
+ btf_verifier_log_basic(env, t,
+ "meta_left:%u meta_needed:%u",
+ meta_left, meta_needed);
+ return -EINVAL;
+ }
+
+ if (btf_type_vlen(t)) {
+ btf_verifier_log_type(env, t, "vlen != 0");
+ return -EINVAL;
+ }
+
+ int_data = btf_type_int(t);
+ if (int_data & ~BTF_INT_MASK) {
+ btf_verifier_log_basic(env, t, "Invalid int_data:%x",
+ int_data);
+ return -EINVAL;
+ }
+
+ nr_bits = BTF_INT_BITS(int_data) + BTF_INT_OFFSET(int_data);
+
+ if (nr_bits > BITS_PER_U64) {
+ btf_verifier_log_type(env, t, "nr_bits exceeds %zu",
+ BITS_PER_U64);
+ return -EINVAL;
+ }
+
+ if (BITS_ROUNDUP_BYTES(nr_bits) > t->size) {
+ btf_verifier_log_type(env, t, "nr_bits exceeds type_size");
+ return -EINVAL;
+ }
+
+ /*
+ * Only one of the encoding bits is allowed and it
+ * should be sufficient for the pretty print purpose (i.e. decoding).
+ * Multiple bits can be allowed later if it is found
+ * to be insufficient.
+ */
+ encoding = BTF_INT_ENCODING(int_data);
+ if (encoding &&
+ encoding != BTF_INT_SIGNED &&
+ encoding != BTF_INT_CHAR &&
+ encoding != BTF_INT_BOOL) {
+ btf_verifier_log_type(env, t, "Unsupported encoding");
+ return -ENOTSUPP;
+ }
+
+ btf_verifier_log_type(env, t, NULL);
+
+ return meta_needed;
+}
+
+static void btf_int_log(struct btf_verifier_env *env,
+ const struct btf_type *t)
+{
+ int int_data = btf_type_int(t);
+
+ btf_verifier_log(env,
+ "size=%u bits_offset=%u nr_bits=%u encoding=%s",
+ t->size, BTF_INT_OFFSET(int_data),
+ BTF_INT_BITS(int_data),
+ btf_int_encoding_str(BTF_INT_ENCODING(int_data)));
+}
+
+static void btf_int_bits_seq_show(const struct btf *btf,
+ const struct btf_type *t,
+ void *data, u8 bits_offset,
+ struct seq_file *m)
+{
+ u32 int_data = btf_type_int(t);
+ u16 nr_bits = BTF_INT_BITS(int_data);
+ u16 total_bits_offset;
+ u16 nr_copy_bytes;
+ u16 nr_copy_bits;
+ u8 nr_upper_bits;
+ union {
+ u64 u64_num;
+ u8 u8_nums[8];
+ } print_num;
+
+ total_bits_offset = bits_offset + BTF_INT_OFFSET(int_data);
+ data += BITS_ROUNDDOWN_BYTES(total_bits_offset);
+ bits_offset = BITS_PER_BYTE_MASKED(total_bits_offset);
+ nr_copy_bits = nr_bits + bits_offset;
+ nr_copy_bytes = BITS_ROUNDUP_BYTES(nr_copy_bits);
+
+ print_num.u64_num = 0;
+ memcpy(&print_num.u64_num, data, nr_copy_bytes);
+
+ /* Ditch the higher order bits */
+ nr_upper_bits = BITS_PER_BYTE_MASKED(nr_copy_bits);
+ if (nr_upper_bits) {
+ /* We need to mask out some bits of the upper byte. */
+ u8 mask = (1 << nr_upper_bits) - 1;
+
+ print_num.u8_nums[nr_copy_bytes - 1] &= mask;
+ }
+
+ print_num.u64_num >>= bits_offset;
+
+ seq_printf(m, "0x%llx", print_num.u64_num);
+}
+
+static void btf_int_seq_show(const struct btf *btf, const struct btf_type *t,
+ u32 type_id, void *data, u8 bits_offset,
+ struct seq_file *m)
+{
+ u32 int_data = btf_type_int(t);
+ u8 encoding = BTF_INT_ENCODING(int_data);
+ bool sign = encoding & BTF_INT_SIGNED;
+ u32 nr_bits = BTF_INT_BITS(int_data);
+
+ if (bits_offset || BTF_INT_OFFSET(int_data) ||
+ BITS_PER_BYTE_MASKED(nr_bits)) {
+ btf_int_bits_seq_show(btf, t, data, bits_offset, m);
+ return;
+ }
+
+ switch (nr_bits) {
+ case 64:
+ if (sign)
+ seq_printf(m, "%lld", *(s64 *)data);
+ else
+ seq_printf(m, "%llu", *(u64 *)data);
+ break;
+ case 32:
+ if (sign)
+ seq_printf(m, "%d", *(s32 *)data);
+ else
+ seq_printf(m, "%u", *(u32 *)data);
+ break;
+ case 16:
+ if (sign)
+ seq_printf(m, "%d", *(s16 *)data);
+ else
+ seq_printf(m, "%u", *(u16 *)data);
+ break;
+ case 8:
+ if (sign)
+ seq_printf(m, "%d", *(s8 *)data);
+ else
+ seq_printf(m, "%u", *(u8 *)data);
+ break;
+ default:
+ btf_int_bits_seq_show(btf, t, data, bits_offset, m);
+ }
+}
+
+static const struct btf_kind_operations int_ops = {
+ .check_meta = btf_int_check_meta,
+ .resolve = btf_df_resolve,
+ .check_member = btf_int_check_member,
+ .log_details = btf_int_log,
+ .seq_show = btf_int_seq_show,
+};
+
+static int btf_modifier_check_member(struct btf_verifier_env *env,
+ const struct btf_type *struct_type,
+ const struct btf_member *member,
+ const struct btf_type *member_type)
+{
+ const struct btf_type *resolved_type;
+ u32 resolved_type_id = member->type;
+ struct btf_member resolved_member;
+ struct btf *btf = env->btf;
+
+ resolved_type = btf_type_id_size(btf, &resolved_type_id, NULL);
+ if (!resolved_type) {
+ btf_verifier_log_member(env, struct_type, member,
+ "Invalid member");
+ return -EINVAL;
+ }
+
+ resolved_member = *member;
+ resolved_member.type = resolved_type_id;
+
+ return btf_type_ops(resolved_type)->check_member(env, struct_type,
+ &resolved_member,
+ resolved_type);
+}
+
+static int btf_ptr_check_member(struct btf_verifier_env *env,
+ const struct btf_type *struct_type,
+ const struct btf_member *member,
+ const struct btf_type *member_type)
+{
+ u32 struct_size, struct_bits_off, bytes_offset;
+
+ struct_size = struct_type->size;
+ struct_bits_off = member->offset;
+ bytes_offset = BITS_ROUNDDOWN_BYTES(struct_bits_off);
+
+ if (BITS_PER_BYTE_MASKED(struct_bits_off)) {
+ btf_verifier_log_member(env, struct_type, member,
+ "Member is not byte aligned");
+ return -EINVAL;
+ }
+
+ if (struct_size - bytes_offset < sizeof(void *)) {
+ btf_verifier_log_member(env, struct_type, member,
+ "Member exceeds struct_size");
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static int btf_ref_type_check_meta(struct btf_verifier_env *env,
+ const struct btf_type *t,
+ u32 meta_left)
+{
+ if (btf_type_vlen(t)) {
+ btf_verifier_log_type(env, t, "vlen != 0");
+ return -EINVAL;
+ }
+
+ if (!BTF_TYPE_ID_VALID(t->type)) {
+ btf_verifier_log_type(env, t, "Invalid type_id");
+ return -EINVAL;
+ }
+
+ btf_verifier_log_type(env, t, NULL);
+
+ return 0;
+}
+
+static int btf_modifier_resolve(struct btf_verifier_env *env,
+ const struct resolve_vertex *v)
+{
+ const struct btf_type *t = v->t;
+ const struct btf_type *next_type;
+ u32 next_type_id = t->type;
+ struct btf *btf = env->btf;
+ u32 next_type_size = 0;
+
+ next_type = btf_type_by_id(btf, next_type_id);
+ if (!next_type) {
+ btf_verifier_log_type(env, v->t, "Invalid type_id");
+ return -EINVAL;
+ }
+
+ /* "typedef void new_void", "const void"...etc */
+ if (btf_type_is_void(next_type))
+ goto resolved;
+
+ if (!env_type_is_resolve_sink(env, next_type) &&
+ !env_type_is_resolved(env, next_type_id))
+ return env_stack_push(env, next_type, next_type_id);
+
+ /* Figure out the resolved next_type_id with size.
+ * They will be stored in the current modifier's
+ * resolved_ids and resolved_sizes such that it can
+ * save us a few type-following when we use it later (e.g. in
+ * pretty print).
+ */
+ if (!btf_type_id_size(btf, &next_type_id, &next_type_size) &&
+ !btf_type_is_void(btf_type_id_resolve(btf, &next_type_id))) {
+ btf_verifier_log_type(env, v->t, "Invalid type_id");
+ return -EINVAL;
+ }
+
+resolved:
+ env_stack_pop_resolved(env, next_type_id, next_type_size);
+
+ return 0;
+}
+
+static int btf_ptr_resolve(struct btf_verifier_env *env,
+ const struct resolve_vertex *v)
+{
+ const struct btf_type *next_type;
+ const struct btf_type *t = v->t;
+ u32 next_type_id = t->type;
+ struct btf *btf = env->btf;
+ u32 next_type_size = 0;
+
+ next_type = btf_type_by_id(btf, next_type_id);
+ if (!next_type) {
+ btf_verifier_log_type(env, v->t, "Invalid type_id");
+ return -EINVAL;
+ }
+
+ /* "void *" */
+ if (btf_type_is_void(next_type))
+ goto resolved;
+
+ if (!env_type_is_resolve_sink(env, next_type) &&
+ !env_type_is_resolved(env, next_type_id))
+ return env_stack_push(env, next_type, next_type_id);
+
+ /* If the modifier was RESOLVED during RESOLVE_STRUCT_OR_ARRAY,
+ * the modifier may have stopped resolving when it was resolved
+ * to a ptr (last-resolved-ptr).
+ *
+ * We now need to continue from the last-resolved-ptr to
+ * ensure the last-resolved-ptr will not referring back to
+ * the currenct ptr (t).
+ */
+ if (btf_type_is_modifier(next_type)) {
+ const struct btf_type *resolved_type;
+ u32 resolved_type_id;
+
+ resolved_type_id = next_type_id;
+ resolved_type = btf_type_id_resolve(btf, &resolved_type_id);
+
+ if (btf_type_is_ptr(resolved_type) &&
+ !env_type_is_resolve_sink(env, resolved_type) &&
+ !env_type_is_resolved(env, resolved_type_id))
+ return env_stack_push(env, resolved_type,
+ resolved_type_id);
+ }
+
+ if (!btf_type_id_size(btf, &next_type_id, &next_type_size) &&
+ !btf_type_is_void(btf_type_id_resolve(btf, &next_type_id))) {
+ btf_verifier_log_type(env, v->t, "Invalid type_id");
+ return -EINVAL;
+ }
+
+resolved:
+ env_stack_pop_resolved(env, next_type_id, 0);
+
+ return 0;
+}
+
+static void btf_modifier_seq_show(const struct btf *btf,
+ const struct btf_type *t,
+ u32 type_id, void *data,
+ u8 bits_offset, struct seq_file *m)
+{
+ t = btf_type_id_resolve(btf, &type_id);
+
+ btf_type_ops(t)->seq_show(btf, t, type_id, data, bits_offset, m);
+}
+
+static void btf_ptr_seq_show(const struct btf *btf, const struct btf_type *t,
+ u32 type_id, void *data, u8 bits_offset,
+ struct seq_file *m)
+{
+ /* It is a hashed value */
+ seq_printf(m, "%p", *(void **)data);
+}
+
+static void btf_ref_type_log(struct btf_verifier_env *env,
+ const struct btf_type *t)
+{
+ btf_verifier_log(env, "type_id=%u", t->type);
+}
+
+static struct btf_kind_operations modifier_ops = {
+ .check_meta = btf_ref_type_check_meta,
+ .resolve = btf_modifier_resolve,
+ .check_member = btf_modifier_check_member,
+ .log_details = btf_ref_type_log,
+ .seq_show = btf_modifier_seq_show,
+};
+
+static struct btf_kind_operations ptr_ops = {
+ .check_meta = btf_ref_type_check_meta,
+ .resolve = btf_ptr_resolve,
+ .check_member = btf_ptr_check_member,
+ .log_details = btf_ref_type_log,
+ .seq_show = btf_ptr_seq_show,
+};
+
+static s32 btf_fwd_check_meta(struct btf_verifier_env *env,
+ const struct btf_type *t,
+ u32 meta_left)
+{
+ if (btf_type_vlen(t)) {
+ btf_verifier_log_type(env, t, "vlen != 0");
+ return -EINVAL;
+ }
+
+ if (t->type) {
+ btf_verifier_log_type(env, t, "type != 0");
+ return -EINVAL;
+ }
+
+ btf_verifier_log_type(env, t, NULL);
+
+ return 0;
+}
+
+static struct btf_kind_operations fwd_ops = {
+ .check_meta = btf_fwd_check_meta,
+ .resolve = btf_df_resolve,
+ .check_member = btf_df_check_member,
+ .log_details = btf_ref_type_log,
+ .seq_show = btf_df_seq_show,
+};
+
+static int btf_array_check_member(struct btf_verifier_env *env,
+ const struct btf_type *struct_type,
+ const struct btf_member *member,
+ const struct btf_type *member_type)
+{
+ u32 struct_bits_off = member->offset;
+ u32 struct_size, bytes_offset;
+ u32 array_type_id, array_size;
+ struct btf *btf = env->btf;
+
+ if (BITS_PER_BYTE_MASKED(struct_bits_off)) {
+ btf_verifier_log_member(env, struct_type, member,
+ "Member is not byte aligned");
+ return -EINVAL;
+ }
+
+ array_type_id = member->type;
+ btf_type_id_size(btf, &array_type_id, &array_size);
+ struct_size = struct_type->size;
+ bytes_offset = BITS_ROUNDDOWN_BYTES(struct_bits_off);
+ if (struct_size - bytes_offset < array_size) {
+ btf_verifier_log_member(env, struct_type, member,
+ "Member exceeds struct_size");
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static s32 btf_array_check_meta(struct btf_verifier_env *env,
+ const struct btf_type *t,
+ u32 meta_left)
+{
+ const struct btf_array *array = btf_type_array(t);
+ u32 meta_needed = sizeof(*array);
+
+ if (meta_left < meta_needed) {
+ btf_verifier_log_basic(env, t,
+ "meta_left:%u meta_needed:%u",
+ meta_left, meta_needed);
+ return -EINVAL;
+ }
+
+ if (btf_type_vlen(t)) {
+ btf_verifier_log_type(env, t, "vlen != 0");
+ return -EINVAL;
+ }
+
+ if (t->size) {
+ btf_verifier_log_type(env, t, "size != 0");
+ return -EINVAL;
+ }
+
+ /* Array elem type and index type cannot be in type void,
+ * so !array->type and !array->index_type are not allowed.
+ */
+ if (!array->type || !BTF_TYPE_ID_VALID(array->type)) {
+ btf_verifier_log_type(env, t, "Invalid elem");
+ return -EINVAL;
+ }
+
+ if (!array->index_type || !BTF_TYPE_ID_VALID(array->index_type)) {
+ btf_verifier_log_type(env, t, "Invalid index");
+ return -EINVAL;
+ }
+
+ btf_verifier_log_type(env, t, NULL);
+
+ return meta_needed;
+}
+
+static int btf_array_resolve(struct btf_verifier_env *env,
+ const struct resolve_vertex *v)
+{
+ const struct btf_array *array = btf_type_array(v->t);
+ const struct btf_type *elem_type, *index_type;
+ u32 elem_type_id, index_type_id;
+ struct btf *btf = env->btf;
+ u32 elem_size;
+
+ /* Check array->index_type */
+ index_type_id = array->index_type;
+ index_type = btf_type_by_id(btf, index_type_id);
+ if (btf_type_is_void_or_null(index_type)) {
+ btf_verifier_log_type(env, v->t, "Invalid index");
+ return -EINVAL;
+ }
+
+ if (!env_type_is_resolve_sink(env, index_type) &&
+ !env_type_is_resolved(env, index_type_id))
+ return env_stack_push(env, index_type, index_type_id);
+
+ index_type = btf_type_id_size(btf, &index_type_id, NULL);
+ if (!index_type || !btf_type_is_int(index_type) ||
+ !btf_type_int_is_regular(index_type)) {
+ btf_verifier_log_type(env, v->t, "Invalid index");
+ return -EINVAL;
+ }
+
+ /* Check array->type */
+ elem_type_id = array->type;
+ elem_type = btf_type_by_id(btf, elem_type_id);
+ if (btf_type_is_void_or_null(elem_type)) {
+ btf_verifier_log_type(env, v->t,
+ "Invalid elem");
+ return -EINVAL;
+ }
+
+ if (!env_type_is_resolve_sink(env, elem_type) &&
+ !env_type_is_resolved(env, elem_type_id))
+ return env_stack_push(env, elem_type, elem_type_id);
+
+ elem_type = btf_type_id_size(btf, &elem_type_id, &elem_size);
+ if (!elem_type) {
+ btf_verifier_log_type(env, v->t, "Invalid elem");
+ return -EINVAL;
+ }
+
+ if (btf_type_is_int(elem_type) && !btf_type_int_is_regular(elem_type)) {
+ btf_verifier_log_type(env, v->t, "Invalid array of int");
+ return -EINVAL;
+ }
+
+ if (array->nelems && elem_size > U32_MAX / array->nelems) {
+ btf_verifier_log_type(env, v->t,
+ "Array size overflows U32_MAX");
+ return -EINVAL;
+ }
+
+ env_stack_pop_resolved(env, elem_type_id, elem_size * array->nelems);
+
+ return 0;
+}
+
+static void btf_array_log(struct btf_verifier_env *env,
+ const struct btf_type *t)
+{
+ const struct btf_array *array = btf_type_array(t);
+
+ btf_verifier_log(env, "type_id=%u index_type_id=%u nr_elems=%u",
+ array->type, array->index_type, array->nelems);
+}
+
+static void btf_array_seq_show(const struct btf *btf, const struct btf_type *t,
+ u32 type_id, void *data, u8 bits_offset,
+ struct seq_file *m)
+{
+ const struct btf_array *array = btf_type_array(t);
+ const struct btf_kind_operations *elem_ops;
+ const struct btf_type *elem_type;
+ u32 i, elem_size, elem_type_id;
+
+ elem_type_id = array->type;
+ elem_type = btf_type_id_size(btf, &elem_type_id, &elem_size);
+ elem_ops = btf_type_ops(elem_type);
+ seq_puts(m, "[");
+ for (i = 0; i < array->nelems; i++) {
+ if (i)
+ seq_puts(m, ",");
+
+ elem_ops->seq_show(btf, elem_type, elem_type_id, data,
+ bits_offset, m);
+ data += elem_size;
+ }
+ seq_puts(m, "]");
+}
+
+static struct btf_kind_operations array_ops = {
+ .check_meta = btf_array_check_meta,
+ .resolve = btf_array_resolve,
+ .check_member = btf_array_check_member,
+ .log_details = btf_array_log,
+ .seq_show = btf_array_seq_show,
+};
+
+static int btf_struct_check_member(struct btf_verifier_env *env,
+ const struct btf_type *struct_type,
+ const struct btf_member *member,
+ const struct btf_type *member_type)
+{
+ u32 struct_bits_off = member->offset;
+ u32 struct_size, bytes_offset;
+
+ if (BITS_PER_BYTE_MASKED(struct_bits_off)) {
+ btf_verifier_log_member(env, struct_type, member,
+ "Member is not byte aligned");
+ return -EINVAL;
+ }
+
+ struct_size = struct_type->size;
+ bytes_offset = BITS_ROUNDDOWN_BYTES(struct_bits_off);
+ if (struct_size - bytes_offset < member_type->size) {
+ btf_verifier_log_member(env, struct_type, member,
+ "Member exceeds struct_size");
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static s32 btf_struct_check_meta(struct btf_verifier_env *env,
+ const struct btf_type *t,
+ u32 meta_left)
+{
+ bool is_union = BTF_INFO_KIND(t->info) == BTF_KIND_UNION;
+ const struct btf_member *member;
+ struct btf *btf = env->btf;
+ u32 struct_size = t->size;
+ u32 meta_needed;
+ u16 i;
+
+ meta_needed = btf_type_vlen(t) * sizeof(*member);
+ if (meta_left < meta_needed) {
+ btf_verifier_log_basic(env, t,
+ "meta_left:%u meta_needed:%u",
+ meta_left, meta_needed);
+ return -EINVAL;
+ }
+
+ btf_verifier_log_type(env, t, NULL);
+
+ for_each_member(i, t, member) {
+ if (!btf_name_offset_valid(btf, member->name_off)) {
+ btf_verifier_log_member(env, t, member,
+ "Invalid member name_offset:%u",
+ member->name_off);
+ return -EINVAL;
+ }
+
+ /* A member cannot be in type void */
+ if (!member->type || !BTF_TYPE_ID_VALID(member->type)) {
+ btf_verifier_log_member(env, t, member,
+ "Invalid type_id");
+ return -EINVAL;
+ }
+
+ if (is_union && member->offset) {
+ btf_verifier_log_member(env, t, member,
+ "Invalid member bits_offset");
+ return -EINVAL;
+ }
+
+ if (BITS_ROUNDUP_BYTES(member->offset) > struct_size) {
+ btf_verifier_log_member(env, t, member,
+ "Memmber bits_offset exceeds its struct size");
+ return -EINVAL;
+ }
+
+ btf_verifier_log_member(env, t, member, NULL);
+ }
+
+ return meta_needed;
+}
+
+static int btf_struct_resolve(struct btf_verifier_env *env,
+ const struct resolve_vertex *v)
+{
+ const struct btf_member *member;
+ int err;
+ u16 i;
+
+ /* Before continue resolving the next_member,
+ * ensure the last member is indeed resolved to a
+ * type with size info.
+ */
+ if (v->next_member) {
+ const struct btf_type *last_member_type;
+ const struct btf_member *last_member;
+ u16 last_member_type_id;
+
+ last_member = btf_type_member(v->t) + v->next_member - 1;
+ last_member_type_id = last_member->type;
+ if (WARN_ON_ONCE(!env_type_is_resolved(env,
+ last_member_type_id)))
+ return -EINVAL;
+
+ last_member_type = btf_type_by_id(env->btf,
+ last_member_type_id);
+ err = btf_type_ops(last_member_type)->check_member(env, v->t,
+ last_member,
+ last_member_type);
+ if (err)
+ return err;
+ }
+
+ for_each_member_from(i, v->next_member, v->t, member) {
+ u32 member_type_id = member->type;
+ const struct btf_type *member_type = btf_type_by_id(env->btf,
+ member_type_id);
+
+ if (btf_type_is_void_or_null(member_type)) {
+ btf_verifier_log_member(env, v->t, member,
+ "Invalid member");
+ return -EINVAL;
+ }
+
+ if (!env_type_is_resolve_sink(env, member_type) &&
+ !env_type_is_resolved(env, member_type_id)) {
+ env_stack_set_next_member(env, i + 1);
+ return env_stack_push(env, member_type, member_type_id);
+ }
+
+ err = btf_type_ops(member_type)->check_member(env, v->t,
+ member,
+ member_type);
+ if (err)
+ return err;
+ }
+
+ env_stack_pop_resolved(env, 0, 0);
+
+ return 0;
+}
+
+static void btf_struct_log(struct btf_verifier_env *env,
+ const struct btf_type *t)
+{
+ btf_verifier_log(env, "size=%u vlen=%u", t->size, btf_type_vlen(t));
+}
+
+static void btf_struct_seq_show(const struct btf *btf, const struct btf_type *t,
+ u32 type_id, void *data, u8 bits_offset,
+ struct seq_file *m)
+{
+ const char *seq = BTF_INFO_KIND(t->info) == BTF_KIND_UNION ? "|" : ",";
+ const struct btf_member *member;
+ u32 i;
+
+ seq_puts(m, "{");
+ for_each_member(i, t, member) {
+ const struct btf_type *member_type = btf_type_by_id(btf,
+ member->type);
+ u32 member_offset = member->offset;
+ u32 bytes_offset = BITS_ROUNDDOWN_BYTES(member_offset);
+ u8 bits8_offset = BITS_PER_BYTE_MASKED(member_offset);
+ const struct btf_kind_operations *ops;
+
+ if (i)
+ seq_puts(m, seq);
+
+ ops = btf_type_ops(member_type);
+ ops->seq_show(btf, member_type, member->type,
+ data + bytes_offset, bits8_offset, m);
+ }
+ seq_puts(m, "}");
+}
+
+static struct btf_kind_operations struct_ops = {
+ .check_meta = btf_struct_check_meta,
+ .resolve = btf_struct_resolve,
+ .check_member = btf_struct_check_member,
+ .log_details = btf_struct_log,
+ .seq_show = btf_struct_seq_show,
+};
+
+static int btf_enum_check_member(struct btf_verifier_env *env,
+ const struct btf_type *struct_type,
+ const struct btf_member *member,
+ const struct btf_type *member_type)
+{
+ u32 struct_bits_off = member->offset;
+ u32 struct_size, bytes_offset;
+
+ if (BITS_PER_BYTE_MASKED(struct_bits_off)) {
+ btf_verifier_log_member(env, struct_type, member,
+ "Member is not byte aligned");
+ return -EINVAL;
+ }
+
+ struct_size = struct_type->size;
+ bytes_offset = BITS_ROUNDDOWN_BYTES(struct_bits_off);
+ if (struct_size - bytes_offset < sizeof(int)) {
+ btf_verifier_log_member(env, struct_type, member,
+ "Member exceeds struct_size");
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static s32 btf_enum_check_meta(struct btf_verifier_env *env,
+ const struct btf_type *t,
+ u32 meta_left)
+{
+ const struct btf_enum *enums = btf_type_enum(t);
+ struct btf *btf = env->btf;
+ u16 i, nr_enums;
+ u32 meta_needed;
+
+ nr_enums = btf_type_vlen(t);
+ meta_needed = nr_enums * sizeof(*enums);
+
+ if (meta_left < meta_needed) {
+ btf_verifier_log_basic(env, t,
+ "meta_left:%u meta_needed:%u",
+ meta_left, meta_needed);
+ return -EINVAL;
+ }
+
+ if (t->size != sizeof(int)) {
+ btf_verifier_log_type(env, t, "Expected size:%zu",
+ sizeof(int));
+ return -EINVAL;
+ }
+
+ btf_verifier_log_type(env, t, NULL);
+
+ for (i = 0; i < nr_enums; i++) {
+ if (!btf_name_offset_valid(btf, enums[i].name_off)) {
+ btf_verifier_log(env, "\tInvalid name_offset:%u",
+ enums[i].name_off);
+ return -EINVAL;
+ }
+
+ btf_verifier_log(env, "\t%s val=%d\n",
+ btf_name_by_offset(btf, enums[i].name_off),
+ enums[i].val);
+ }
+
+ return meta_needed;
+}
+
+static void btf_enum_log(struct btf_verifier_env *env,
+ const struct btf_type *t)
+{
+ btf_verifier_log(env, "size=%u vlen=%u", t->size, btf_type_vlen(t));
+}
+
+static void btf_enum_seq_show(const struct btf *btf, const struct btf_type *t,
+ u32 type_id, void *data, u8 bits_offset,
+ struct seq_file *m)
+{
+ const struct btf_enum *enums = btf_type_enum(t);
+ u32 i, nr_enums = btf_type_vlen(t);
+ int v = *(int *)data;
+
+ for (i = 0; i < nr_enums; i++) {
+ if (v == enums[i].val) {
+ seq_printf(m, "%s",
+ btf_name_by_offset(btf, enums[i].name_off));
+ return;
+ }
+ }
+
+ seq_printf(m, "%d", v);
+}
+
+static struct btf_kind_operations enum_ops = {
+ .check_meta = btf_enum_check_meta,
+ .resolve = btf_df_resolve,
+ .check_member = btf_enum_check_member,
+ .log_details = btf_enum_log,
+ .seq_show = btf_enum_seq_show,
+};
+
+static const struct btf_kind_operations * const kind_ops[NR_BTF_KINDS] = {
+ [BTF_KIND_INT] = &int_ops,
+ [BTF_KIND_PTR] = &ptr_ops,
+ [BTF_KIND_ARRAY] = &array_ops,
+ [BTF_KIND_STRUCT] = &struct_ops,
+ [BTF_KIND_UNION] = &struct_ops,
+ [BTF_KIND_ENUM] = &enum_ops,
+ [BTF_KIND_FWD] = &fwd_ops,
+ [BTF_KIND_TYPEDEF] = &modifier_ops,
+ [BTF_KIND_VOLATILE] = &modifier_ops,
+ [BTF_KIND_CONST] = &modifier_ops,
+ [BTF_KIND_RESTRICT] = &modifier_ops,
+};
+
+static s32 btf_check_meta(struct btf_verifier_env *env,
+ const struct btf_type *t,
+ u32 meta_left)
+{
+ u32 saved_meta_left = meta_left;
+ s32 var_meta_size;
+
+ if (meta_left < sizeof(*t)) {
+ btf_verifier_log(env, "[%u] meta_left:%u meta_needed:%zu",
+ env->log_type_id, meta_left, sizeof(*t));
+ return -EINVAL;
+ }
+ meta_left -= sizeof(*t);
+
+ if (t->info & ~BTF_INFO_MASK) {
+ btf_verifier_log(env, "[%u] Invalid btf_info:%x",
+ env->log_type_id, t->info);
+ return -EINVAL;
+ }
+
+ if (BTF_INFO_KIND(t->info) > BTF_KIND_MAX ||
+ BTF_INFO_KIND(t->info) == BTF_KIND_UNKN) {
+ btf_verifier_log(env, "[%u] Invalid kind:%u",
+ env->log_type_id, BTF_INFO_KIND(t->info));
+ return -EINVAL;
+ }
+
+ if (!btf_name_offset_valid(env->btf, t->name_off)) {
+ btf_verifier_log(env, "[%u] Invalid name_offset:%u",
+ env->log_type_id, t->name_off);
+ return -EINVAL;
+ }
+
+ var_meta_size = btf_type_ops(t)->check_meta(env, t, meta_left);
+ if (var_meta_size < 0)
+ return var_meta_size;
+
+ meta_left -= var_meta_size;
+
+ return saved_meta_left - meta_left;
+}
+
+static int btf_check_all_metas(struct btf_verifier_env *env)
+{
+ struct btf *btf = env->btf;
+ struct btf_header *hdr;
+ void *cur, *end;
+
+ hdr = &btf->hdr;
+ cur = btf->nohdr_data + hdr->type_off;
+ end = btf->nohdr_data + hdr->type_len;
+
+ env->log_type_id = 1;
+ while (cur < end) {
+ struct btf_type *t = cur;
+ s32 meta_size;
+
+ meta_size = btf_check_meta(env, t, end - cur);
+ if (meta_size < 0)
+ return meta_size;
+
+ btf_add_type(env, t);
+ cur += meta_size;
+ env->log_type_id++;
+ }
+
+ return 0;
+}
+
+static int btf_resolve(struct btf_verifier_env *env,
+ const struct btf_type *t, u32 type_id)
+{
+ const struct resolve_vertex *v;
+ int err = 0;
+
+ env->resolve_mode = RESOLVE_TBD;
+ env_stack_push(env, t, type_id);
+ while (!err && (v = env_stack_peak(env))) {
+ env->log_type_id = v->type_id;
+ err = btf_type_ops(v->t)->resolve(env, v);
+ }
+
+ env->log_type_id = type_id;
+ if (err == -E2BIG)
+ btf_verifier_log_type(env, t,
+ "Exceeded max resolving depth:%u",
+ MAX_RESOLVE_DEPTH);
+ else if (err == -EEXIST)
+ btf_verifier_log_type(env, t, "Loop detected");
+
+ return err;
+}
+
+static bool btf_resolve_valid(struct btf_verifier_env *env,
+ const struct btf_type *t,
+ u32 type_id)
+{
+ struct btf *btf = env->btf;
+
+ if (!env_type_is_resolved(env, type_id))
+ return false;
+
+ if (btf_type_is_struct(t))
+ return !btf->resolved_ids[type_id] &&
+ !btf->resolved_sizes[type_id];
+
+ if (btf_type_is_modifier(t) || btf_type_is_ptr(t)) {
+ t = btf_type_id_resolve(btf, &type_id);
+ return t && !btf_type_is_modifier(t);
+ }
+
+ if (btf_type_is_array(t)) {
+ const struct btf_array *array = btf_type_array(t);
+ const struct btf_type *elem_type;
+ u32 elem_type_id = array->type;
+ u32 elem_size;
+
+ elem_type = btf_type_id_size(btf, &elem_type_id, &elem_size);
+ return elem_type && !btf_type_is_modifier(elem_type) &&
+ (array->nelems * elem_size ==
+ btf->resolved_sizes[type_id]);
+ }
+
+ return false;
+}
+
+static int btf_check_all_types(struct btf_verifier_env *env)
+{
+ struct btf *btf = env->btf;
+ u32 type_id;
+ int err;
+
+ err = env_resolve_init(env);
+ if (err)
+ return err;
+
+ env->phase++;
+ for (type_id = 1; type_id <= btf->nr_types; type_id++) {
+ const struct btf_type *t = btf_type_by_id(btf, type_id);
+
+ env->log_type_id = type_id;
+ if (btf_type_needs_resolve(t) &&
+ !env_type_is_resolved(env, type_id)) {
+ err = btf_resolve(env, t, type_id);
+ if (err)
+ return err;
+ }
+
+ if (btf_type_needs_resolve(t) &&
+ !btf_resolve_valid(env, t, type_id)) {
+ btf_verifier_log_type(env, t, "Invalid resolve state");
+ return -EINVAL;
+ }
+ }
+
+ return 0;
+}
+
+static int btf_parse_type_sec(struct btf_verifier_env *env)
+{
+ const struct btf_header *hdr = &env->btf->hdr;
+ int err;
+
+ /* Type section must align to 4 bytes */
+ if (hdr->type_off & (sizeof(u32) - 1)) {
+ btf_verifier_log(env, "Unaligned type_off");
+ return -EINVAL;
+ }
+
+ if (!hdr->type_len) {
+ btf_verifier_log(env, "No type found");
+ return -EINVAL;
+ }
+
+ err = btf_check_all_metas(env);
+ if (err)
+ return err;
+
+ return btf_check_all_types(env);
+}
+
+static int btf_parse_str_sec(struct btf_verifier_env *env)
+{
+ const struct btf_header *hdr;
+ struct btf *btf = env->btf;
+ const char *start, *end;
+
+ hdr = &btf->hdr;
+ start = btf->nohdr_data + hdr->str_off;
+ end = start + hdr->str_len;
+
+ if (end != btf->data + btf->data_size) {
+ btf_verifier_log(env, "String section is not at the end");
+ return -EINVAL;
+ }
+
+ if (!hdr->str_len || hdr->str_len - 1 > BTF_MAX_NAME_OFFSET ||
+ start[0] || end[-1]) {
+ btf_verifier_log(env, "Invalid string section");
+ return -EINVAL;
+ }
+
+ btf->strings = start;
+
+ return 0;
+}
+
+static const size_t btf_sec_info_offset[] = {
+ offsetof(struct btf_header, type_off),
+ offsetof(struct btf_header, str_off),
+};
+
+static int btf_sec_info_cmp(const void *a, const void *b)
+{
+ const struct btf_sec_info *x = a;
+ const struct btf_sec_info *y = b;
+
+ return (int)(x->off - y->off) ? : (int)(x->len - y->len);
+}
+
+static int btf_check_sec_info(struct btf_verifier_env *env,
+ u32 btf_data_size)
+{
+ struct btf_sec_info secs[ARRAY_SIZE(btf_sec_info_offset)];
+ u32 total, expected_total, i;
+ const struct btf_header *hdr;
+ const struct btf *btf;
+
+ btf = env->btf;
+ hdr = &btf->hdr;
+
+ /* Populate the secs from hdr */
+ for (i = 0; i < ARRAY_SIZE(btf_sec_info_offset); i++)
+ secs[i] = *(struct btf_sec_info *)((void *)hdr +
+ btf_sec_info_offset[i]);
+
+ sort(secs, ARRAY_SIZE(btf_sec_info_offset),
+ sizeof(struct btf_sec_info), btf_sec_info_cmp, NULL);
+
+ /* Check for gaps and overlap among sections */
+ total = 0;
+ expected_total = btf_data_size - hdr->hdr_len;
+ for (i = 0; i < ARRAY_SIZE(btf_sec_info_offset); i++) {
+ if (expected_total < secs[i].off) {
+ btf_verifier_log(env, "Invalid section offset");
+ return -EINVAL;
+ }
+ if (total < secs[i].off) {
+ /* gap */
+ btf_verifier_log(env, "Unsupported section found");
+ return -EINVAL;
+ }
+ if (total > secs[i].off) {
+ btf_verifier_log(env, "Section overlap found");
+ return -EINVAL;
+ }
+ if (expected_total - total < secs[i].len) {
+ btf_verifier_log(env,
+ "Total section length too long");
+ return -EINVAL;
+ }
+ total += secs[i].len;
+ }
+
+ /* There is data other than hdr and known sections */
+ if (expected_total != total) {
+ btf_verifier_log(env, "Unsupported section found");
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static int btf_parse_hdr(struct btf_verifier_env *env, void __user *btf_data,
+ u32 btf_data_size)
+{
+ const struct btf_header *hdr;
+ u32 hdr_len, hdr_copy;
+ /*
+ * Minimal part of the "struct btf_header" that
+ * contains the hdr_len.
+ */
+ struct btf_min_header {
+ u16 magic;
+ u8 version;
+ u8 flags;
+ u32 hdr_len;
+ } __user *min_hdr;
+ struct btf *btf;
+ int err;
+
+ btf = env->btf;
+ min_hdr = btf_data;
+
+ if (btf_data_size < sizeof(*min_hdr)) {
+ btf_verifier_log(env, "hdr_len not found");
+ return -EINVAL;
+ }
+
+ if (get_user(hdr_len, &min_hdr->hdr_len))
+ return -EFAULT;
+
+ if (btf_data_size < hdr_len) {
+ btf_verifier_log(env, "btf_header not found");
+ return -EINVAL;
+ }
+
+ err = bpf_check_uarg_tail_zero(btf_data, sizeof(btf->hdr), hdr_len);
+ if (err) {
+ if (err == -E2BIG)
+ btf_verifier_log(env, "Unsupported btf_header");
+ return err;
+ }
+
+ hdr_copy = min_t(u32, hdr_len, sizeof(btf->hdr));
+ if (copy_from_user(&btf->hdr, btf_data, hdr_copy))
+ return -EFAULT;
+
+ hdr = &btf->hdr;
+
+ btf_verifier_log_hdr(env, btf_data_size);
+
+ if (hdr->magic != BTF_MAGIC) {
+ btf_verifier_log(env, "Invalid magic");
+ return -EINVAL;
+ }
+
+ if (hdr->version != BTF_VERSION) {
+ btf_verifier_log(env, "Unsupported version");
+ return -ENOTSUPP;
+ }
+
+ if (hdr->flags) {
+ btf_verifier_log(env, "Unsupported flags");
+ return -ENOTSUPP;
+ }
+
+ if (btf_data_size == hdr->hdr_len) {
+ btf_verifier_log(env, "No data");
+ return -EINVAL;
+ }
+
+ err = btf_check_sec_info(env, btf_data_size);
+ if (err)
+ return err;
+
+ return 0;
+}
+
+static struct btf *btf_parse(void __user *btf_data, u32 btf_data_size,
+ u32 log_level, char __user *log_ubuf, u32 log_size)
+{
+ struct btf_verifier_env *env = NULL;
+ struct bpf_verifier_log *log;
+ struct btf *btf = NULL;
+ u8 *data;
+ int err;
+
+ if (btf_data_size > BTF_MAX_SIZE)
+ return ERR_PTR(-E2BIG);
+
+ env = kzalloc(sizeof(*env), GFP_KERNEL | __GFP_NOWARN);
+ if (!env)
+ return ERR_PTR(-ENOMEM);
+
+ log = &env->log;
+ if (log_level || log_ubuf || log_size) {
+ /* user requested verbose verifier output
+ * and supplied buffer to store the verification trace
+ */
+ log->level = log_level;
+ log->ubuf = log_ubuf;
+ log->len_total = log_size;
+
+ /* log attributes have to be sane */
+ if (log->len_total < 128 || log->len_total > UINT_MAX >> 8 ||
+ !log->level || !log->ubuf) {
+ err = -EINVAL;
+ goto errout;
+ }
+ }
+
+ btf = kzalloc(sizeof(*btf), GFP_KERNEL | __GFP_NOWARN);
+ if (!btf) {
+ err = -ENOMEM;
+ goto errout;
+ }
+ env->btf = btf;
+
+ err = btf_parse_hdr(env, btf_data, btf_data_size);
+ if (err)
+ goto errout;
+
+ data = kvmalloc(btf_data_size, GFP_KERNEL | __GFP_NOWARN);
+ if (!data) {
+ err = -ENOMEM;
+ goto errout;
+ }
+
+ btf->data = data;
+ btf->data_size = btf_data_size;
+ btf->nohdr_data = btf->data + btf->hdr.hdr_len;
+
+ if (copy_from_user(data, btf_data, btf_data_size)) {
+ err = -EFAULT;
+ goto errout;
+ }
+
+ err = btf_parse_str_sec(env);
+ if (err)
+ goto errout;
+
+ err = btf_parse_type_sec(env);
+ if (err)
+ goto errout;
+
+ if (log->level && bpf_verifier_log_full(log)) {
+ err = -ENOSPC;
+ goto errout;
+ }
+
+ btf_verifier_env_free(env);
+ refcount_set(&btf->refcnt, 1);
+ return btf;
+
+errout:
+ btf_verifier_env_free(env);
+ if (btf)
+ btf_free(btf);
+ return ERR_PTR(err);
+}
+
+void btf_type_seq_show(const struct btf *btf, u32 type_id, void *obj,
+ struct seq_file *m)
+{
+ const struct btf_type *t = btf_type_by_id(btf, type_id);
+
+ btf_type_ops(t)->seq_show(btf, t, type_id, obj, 0, m);
+}
+
+static int btf_release(struct inode *inode, struct file *filp)
+{
+ btf_put(filp->private_data);
+ return 0;
+}
+
+const struct file_operations btf_fops = {
+ .release = btf_release,
+};
+
+static int __btf_new_fd(struct btf *btf)
+{
+ return anon_inode_getfd("btf", &btf_fops, btf, O_RDONLY | O_CLOEXEC);
+}
+
+int btf_new_fd(const union bpf_attr *attr)
+{
+ struct btf *btf;
+ int ret;
+
+ btf = btf_parse(u64_to_user_ptr(attr->btf),
+ attr->btf_size, attr->btf_log_level,
+ u64_to_user_ptr(attr->btf_log_buf),
+ attr->btf_log_size);
+ if (IS_ERR(btf))
+ return PTR_ERR(btf);
+
+ ret = btf_alloc_id(btf);
+ if (ret) {
+ btf_free(btf);
+ return ret;
+ }
+
+ /*
+ * The BTF ID is published to the userspace.
+ * All BTF free must go through call_rcu() from
+ * now on (i.e. free by calling btf_put()).
+ */
+
+ ret = __btf_new_fd(btf);
+ if (ret < 0)
+ btf_put(btf);
+
+ return ret;
+}
+
+struct btf *btf_get_by_fd(int fd)
+{
+ struct btf *btf;
+ struct fd f;
+
+ f = fdget(fd);
+
+ if (!f.file)
+ return ERR_PTR(-EBADF);
+
+ if (f.file->f_op != &btf_fops) {
+ fdput(f);
+ return ERR_PTR(-EINVAL);
+ }
+
+ btf = f.file->private_data;
+ refcount_inc(&btf->refcnt);
+ fdput(f);
+
+ return btf;
+}
+
+int btf_get_info_by_fd(const struct btf *btf,
+ const union bpf_attr *attr,
+ union bpf_attr __user *uattr)
+{
+ struct bpf_btf_info __user *uinfo;
+ struct bpf_btf_info info = {};
+ u32 info_copy, btf_copy;
+ void __user *ubtf;
+ u32 uinfo_len;
+
+ uinfo = u64_to_user_ptr(attr->info.info);
+ uinfo_len = attr->info.info_len;
+
+ info_copy = min_t(u32, uinfo_len, sizeof(info));
+ if (copy_from_user(&info, uinfo, info_copy))
+ return -EFAULT;
+
+ info.id = btf->id;
+ ubtf = u64_to_user_ptr(info.btf);
+ btf_copy = min_t(u32, btf->data_size, info.btf_size);
+ if (copy_to_user(ubtf, btf->data, btf_copy))
+ return -EFAULT;
+ info.btf_size = btf->data_size;
+
+ if (copy_to_user(uinfo, &info, info_copy) ||
+ put_user(info_copy, &uattr->info.info_len))
+ return -EFAULT;
+
+ return 0;
+}
+
+int btf_get_fd_by_id(u32 id)
+{
+ struct btf *btf;
+ int fd;
+
+ rcu_read_lock();
+ btf = idr_find(&btf_idr, id);
+ if (!btf || !refcount_inc_not_zero(&btf->refcnt))
+ btf = ERR_PTR(-ENOENT);
+ rcu_read_unlock();
+
+ if (IS_ERR(btf))
+ return PTR_ERR(btf);
+
+ fd = __btf_new_fd(btf);
+ if (fd < 0)
+ btf_put(btf);
+
+ return fd;
+}
+
+u32 btf_id(const struct btf *btf)
+{
+ return btf->id;
+}
diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c
index c1c0b60d3f2f..f7c00bd6f8e4 100644
--- a/kernel/bpf/cgroup.c
+++ b/kernel/bpf/cgroup.c
@@ -495,6 +495,51 @@ int __cgroup_bpf_run_filter_sk(struct sock *sk,
EXPORT_SYMBOL(__cgroup_bpf_run_filter_sk);
/**
+ * __cgroup_bpf_run_filter_sock_addr() - Run a program on a sock and
+ * provided by user sockaddr
+ * @sk: sock struct that will use sockaddr
+ * @uaddr: sockaddr struct provided by user
+ * @type: The type of program to be exectuted
+ * @t_ctx: Pointer to attach type specific context
+ *
+ * socket is expected to be of type INET or INET6.
+ *
+ * This function will return %-EPERM if an attached program is found and
+ * returned value != 1 during execution. In all other cases, 0 is returned.
+ */
+int __cgroup_bpf_run_filter_sock_addr(struct sock *sk,
+ struct sockaddr *uaddr,
+ enum bpf_attach_type type,
+ void *t_ctx)
+{
+ struct bpf_sock_addr_kern ctx = {
+ .sk = sk,
+ .uaddr = uaddr,
+ .t_ctx = t_ctx,
+ };
+ struct sockaddr_storage unspec;
+ struct cgroup *cgrp;
+ int ret;
+
+ /* Check socket family since not all sockets represent network
+ * endpoint (e.g. AF_UNIX).
+ */
+ if (sk->sk_family != AF_INET && sk->sk_family != AF_INET6)
+ return 0;
+
+ if (!ctx.uaddr) {
+ memset(&unspec, 0, sizeof(unspec));
+ ctx.uaddr = (struct sockaddr *)&unspec;
+ }
+
+ cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
+ ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[type], &ctx, BPF_PROG_RUN);
+
+ return ret == 1 ? 0 : -EPERM;
+}
+EXPORT_SYMBOL(__cgroup_bpf_run_filter_sock_addr);
+
+/**
* __cgroup_bpf_run_filter_sock_ops() - Run a program on a sock
* @sk: socket to get cgroup from
* @sock_ops: bpf_sock_ops_kern struct to pass to program. Contains
@@ -545,7 +590,7 @@ int __cgroup_bpf_check_dev_permission(short dev_type, u32 major, u32 minor,
EXPORT_SYMBOL(__cgroup_bpf_check_dev_permission);
static const struct bpf_func_proto *
-cgroup_dev_func_proto(enum bpf_func_id func_id)
+cgroup_dev_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
{
switch (func_id) {
case BPF_FUNC_map_lookup_elem:
@@ -566,6 +611,7 @@ cgroup_dev_func_proto(enum bpf_func_id func_id)
static bool cgroup_dev_is_valid_access(int off, int size,
enum bpf_access_type type,
+ const struct bpf_prog *prog,
struct bpf_insn_access_aux *info)
{
const int size_default = sizeof(__u32);
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index d315b393abdd..9f1493705f40 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -31,6 +31,7 @@
#include <linux/rbtree_latch.h>
#include <linux/kallsyms.h>
#include <linux/rcupdate.h>
+#include <linux/perf_event.h>
#include <asm/unaligned.h>
@@ -218,47 +219,84 @@ int bpf_prog_calc_tag(struct bpf_prog *fp)
return 0;
}
-static void bpf_adj_branches(struct bpf_prog *prog, u32 pos, u32 delta)
+static int bpf_adj_delta_to_imm(struct bpf_insn *insn, u32 pos, u32 delta,
+ u32 curr, const bool probe_pass)
{
+ const s64 imm_min = S32_MIN, imm_max = S32_MAX;
+ s64 imm = insn->imm;
+
+ if (curr < pos && curr + imm + 1 > pos)
+ imm += delta;
+ else if (curr > pos + delta && curr + imm + 1 <= pos + delta)
+ imm -= delta;
+ if (imm < imm_min || imm > imm_max)
+ return -ERANGE;
+ if (!probe_pass)
+ insn->imm = imm;
+ return 0;
+}
+
+static int bpf_adj_delta_to_off(struct bpf_insn *insn, u32 pos, u32 delta,
+ u32 curr, const bool probe_pass)
+{
+ const s32 off_min = S16_MIN, off_max = S16_MAX;
+ s32 off = insn->off;
+
+ if (curr < pos && curr + off + 1 > pos)
+ off += delta;
+ else if (curr > pos + delta && curr + off + 1 <= pos + delta)
+ off -= delta;
+ if (off < off_min || off > off_max)
+ return -ERANGE;
+ if (!probe_pass)
+ insn->off = off;
+ return 0;
+}
+
+static int bpf_adj_branches(struct bpf_prog *prog, u32 pos, u32 delta,
+ const bool probe_pass)
+{
+ u32 i, insn_cnt = prog->len + (probe_pass ? delta : 0);
struct bpf_insn *insn = prog->insnsi;
- u32 i, insn_cnt = prog->len;
- bool pseudo_call;
- u8 code;
- int off;
+ int ret = 0;
for (i = 0; i < insn_cnt; i++, insn++) {
+ u8 code;
+
+ /* In the probing pass we still operate on the original,
+ * unpatched image in order to check overflows before we
+ * do any other adjustments. Therefore skip the patchlet.
+ */
+ if (probe_pass && i == pos) {
+ i += delta + 1;
+ insn++;
+ }
code = insn->code;
- if (BPF_CLASS(code) != BPF_JMP)
- continue;
- if (BPF_OP(code) == BPF_EXIT)
+ if (BPF_CLASS(code) != BPF_JMP ||
+ BPF_OP(code) == BPF_EXIT)
continue;
+ /* Adjust offset of jmps if we cross patch boundaries. */
if (BPF_OP(code) == BPF_CALL) {
- if (insn->src_reg == BPF_PSEUDO_CALL)
- pseudo_call = true;
- else
+ if (insn->src_reg != BPF_PSEUDO_CALL)
continue;
+ ret = bpf_adj_delta_to_imm(insn, pos, delta, i,
+ probe_pass);
} else {
- pseudo_call = false;
+ ret = bpf_adj_delta_to_off(insn, pos, delta, i,
+ probe_pass);
}
- off = pseudo_call ? insn->imm : insn->off;
-
- /* Adjust offset of jmps if we cross boundaries. */
- if (i < pos && i + off + 1 > pos)
- off += delta;
- else if (i > pos + delta && i + off + 1 <= pos + delta)
- off -= delta;
-
- if (pseudo_call)
- insn->imm = off;
- else
- insn->off = off;
+ if (ret)
+ break;
}
+
+ return ret;
}
struct bpf_prog *bpf_patch_insn_single(struct bpf_prog *prog, u32 off,
const struct bpf_insn *patch, u32 len)
{
u32 insn_adj_cnt, insn_rest, insn_delta = len - 1;
+ const u32 cnt_max = S16_MAX;
struct bpf_prog *prog_adj;
/* Since our patchlet doesn't expand the image, we're done. */
@@ -269,6 +307,15 @@ struct bpf_prog *bpf_patch_insn_single(struct bpf_prog *prog, u32 off,
insn_adj_cnt = prog->len + insn_delta;
+ /* Reject anything that would potentially let the insn->off
+ * target overflow when we have excessive program expansions.
+ * We need to probe here before we do any reallocation where
+ * we afterwards may not fail anymore.
+ */
+ if (insn_adj_cnt > cnt_max &&
+ bpf_adj_branches(prog, off, insn_delta, true))
+ return NULL;
+
/* Several new instructions need to be inserted. Make room
* for them. Likely, there's no need for a new allocation as
* last page could have large enough tailroom.
@@ -294,7 +341,11 @@ struct bpf_prog *bpf_patch_insn_single(struct bpf_prog *prog, u32 off,
sizeof(*patch) * insn_rest);
memcpy(prog_adj->insnsi + off, patch, sizeof(*patch) * len);
- bpf_adj_branches(prog_adj, off, insn_delta);
+ /* We are guaranteed to not fail at this point, otherwise
+ * the ship has sailed to reverse to the original state. An
+ * overflow cannot happen at this point.
+ */
+ BUG_ON(bpf_adj_branches(prog_adj, off, insn_delta, false));
return prog_adj;
}
@@ -633,23 +684,6 @@ static int bpf_jit_blind_insn(const struct bpf_insn *from,
*to++ = BPF_JMP_REG(from->code, from->dst_reg, BPF_REG_AX, off);
break;
- case BPF_LD | BPF_ABS | BPF_W:
- case BPF_LD | BPF_ABS | BPF_H:
- case BPF_LD | BPF_ABS | BPF_B:
- *to++ = BPF_ALU64_IMM(BPF_MOV, BPF_REG_AX, imm_rnd ^ from->imm);
- *to++ = BPF_ALU64_IMM(BPF_XOR, BPF_REG_AX, imm_rnd);
- *to++ = BPF_LD_IND(from->code, BPF_REG_AX, 0);
- break;
-
- case BPF_LD | BPF_IND | BPF_W:
- case BPF_LD | BPF_IND | BPF_H:
- case BPF_LD | BPF_IND | BPF_B:
- *to++ = BPF_ALU64_IMM(BPF_MOV, BPF_REG_AX, imm_rnd ^ from->imm);
- *to++ = BPF_ALU64_IMM(BPF_XOR, BPF_REG_AX, imm_rnd);
- *to++ = BPF_ALU32_REG(BPF_ADD, BPF_REG_AX, from->src_reg);
- *to++ = BPF_LD_IND(from->code, BPF_REG_AX, 0);
- break;
-
case BPF_LD | BPF_IMM | BPF_DW:
*to++ = BPF_ALU64_IMM(BPF_MOV, BPF_REG_AX, imm_rnd ^ aux[1].imm);
*to++ = BPF_ALU64_IMM(BPF_XOR, BPF_REG_AX, imm_rnd);
@@ -890,14 +924,7 @@ EXPORT_SYMBOL_GPL(__bpf_call_base);
INSN_3(LDX, MEM, W), \
INSN_3(LDX, MEM, DW), \
/* Immediate based. */ \
- INSN_3(LD, IMM, DW), \
- /* Misc (old cBPF carry-over). */ \
- INSN_3(LD, ABS, B), \
- INSN_3(LD, ABS, H), \
- INSN_3(LD, ABS, W), \
- INSN_3(LD, IND, B), \
- INSN_3(LD, IND, H), \
- INSN_3(LD, IND, W)
+ INSN_3(LD, IMM, DW)
bool bpf_opcode_in_insntable(u8 code)
{
@@ -907,6 +934,13 @@ bool bpf_opcode_in_insntable(u8 code)
[0 ... 255] = false,
/* Now overwrite non-defaults ... */
BPF_INSN_MAP(BPF_INSN_2_TBL, BPF_INSN_3_TBL),
+ /* UAPI exposed, but rewritten opcodes. cBPF carry-over. */
+ [BPF_LD | BPF_ABS | BPF_B] = true,
+ [BPF_LD | BPF_ABS | BPF_H] = true,
+ [BPF_LD | BPF_ABS | BPF_W] = true,
+ [BPF_LD | BPF_IND | BPF_B] = true,
+ [BPF_LD | BPF_IND | BPF_H] = true,
+ [BPF_LD | BPF_IND | BPF_W] = true,
};
#undef BPF_INSN_3_TBL
#undef BPF_INSN_2_TBL
@@ -937,8 +971,6 @@ static u64 ___bpf_prog_run(u64 *regs, const struct bpf_insn *insn, u64 *stack)
#undef BPF_INSN_3_LBL
#undef BPF_INSN_2_LBL
u32 tail_call_cnt = 0;
- void *ptr;
- int off;
#define CONT ({ insn++; goto select_insn; })
#define CONT_JMP ({ insn++; goto select_insn; })
@@ -1265,67 +1297,6 @@ out:
atomic64_add((u64) SRC, (atomic64_t *)(unsigned long)
(DST + insn->off));
CONT;
- LD_ABS_W: /* BPF_R0 = ntohl(*(u32 *) (skb->data + imm32)) */
- off = IMM;
-load_word:
- /* BPF_LD + BPD_ABS and BPF_LD + BPF_IND insns are only
- * appearing in the programs where ctx == skb
- * (see may_access_skb() in the verifier). All programs
- * keep 'ctx' in regs[BPF_REG_CTX] == BPF_R6,
- * bpf_convert_filter() saves it in BPF_R6, internal BPF
- * verifier will check that BPF_R6 == ctx.
- *
- * BPF_ABS and BPF_IND are wrappers of function calls,
- * so they scratch BPF_R1-BPF_R5 registers, preserve
- * BPF_R6-BPF_R9, and store return value into BPF_R0.
- *
- * Implicit input:
- * ctx == skb == BPF_R6 == CTX
- *
- * Explicit input:
- * SRC == any register
- * IMM == 32-bit immediate
- *
- * Output:
- * BPF_R0 - 8/16/32-bit skb data converted to cpu endianness
- */
-
- ptr = bpf_load_pointer((struct sk_buff *) (unsigned long) CTX, off, 4, &tmp);
- if (likely(ptr != NULL)) {
- BPF_R0 = get_unaligned_be32(ptr);
- CONT;
- }
-
- return 0;
- LD_ABS_H: /* BPF_R0 = ntohs(*(u16 *) (skb->data + imm32)) */
- off = IMM;
-load_half:
- ptr = bpf_load_pointer((struct sk_buff *) (unsigned long) CTX, off, 2, &tmp);
- if (likely(ptr != NULL)) {
- BPF_R0 = get_unaligned_be16(ptr);
- CONT;
- }
-
- return 0;
- LD_ABS_B: /* BPF_R0 = *(u8 *) (skb->data + imm32) */
- off = IMM;
-load_byte:
- ptr = bpf_load_pointer((struct sk_buff *) (unsigned long) CTX, off, 1, &tmp);
- if (likely(ptr != NULL)) {
- BPF_R0 = *(u8 *)ptr;
- CONT;
- }
-
- return 0;
- LD_IND_W: /* BPF_R0 = ntohl(*(u32 *) (skb->data + src_reg + imm32)) */
- off = IMM + SRC;
- goto load_word;
- LD_IND_H: /* BPF_R0 = ntohs(*(u16 *) (skb->data + src_reg + imm32)) */
- off = IMM + SRC;
- goto load_half;
- LD_IND_B: /* BPF_R0 = *(u8 *) (skb->data + src_reg + imm32) */
- off = IMM + SRC;
- goto load_byte;
default_label:
/* If we ever reach this, we have a bug somewhere. Die hard here
@@ -1572,13 +1543,32 @@ int bpf_prog_array_length(struct bpf_prog_array __rcu *progs)
return cnt;
}
+static bool bpf_prog_array_copy_core(struct bpf_prog **prog,
+ u32 *prog_ids,
+ u32 request_cnt)
+{
+ int i = 0;
+
+ for (; *prog; prog++) {
+ if (*prog == &dummy_bpf_prog.prog)
+ continue;
+ prog_ids[i] = (*prog)->aux->id;
+ if (++i == request_cnt) {
+ prog++;
+ break;
+ }
+ }
+
+ return !!(*prog);
+}
+
int bpf_prog_array_copy_to_user(struct bpf_prog_array __rcu *progs,
__u32 __user *prog_ids, u32 cnt)
{
struct bpf_prog **prog;
unsigned long err = 0;
- u32 i = 0, *ids;
bool nospc;
+ u32 *ids;
/* users of this function are doing:
* cnt = bpf_prog_array_length();
@@ -1595,16 +1585,7 @@ int bpf_prog_array_copy_to_user(struct bpf_prog_array __rcu *progs,
return -ENOMEM;
rcu_read_lock();
prog = rcu_dereference(progs)->progs;
- for (; *prog; prog++) {
- if (*prog == &dummy_bpf_prog.prog)
- continue;
- ids[i] = (*prog)->aux->id;
- if (++i == cnt) {
- prog++;
- break;
- }
- }
- nospc = !!(*prog);
+ nospc = bpf_prog_array_copy_core(prog, ids, cnt);
rcu_read_unlock();
err = copy_to_user(prog_ids, ids, cnt * sizeof(u32));
kfree(ids);
@@ -1635,6 +1616,7 @@ int bpf_prog_array_copy(struct bpf_prog_array __rcu *old_array,
int new_prog_cnt, carry_prog_cnt = 0;
struct bpf_prog **existing_prog;
struct bpf_prog_array *array;
+ bool found_exclude = false;
int new_prog_idx = 0;
/* Figure out how many existing progs we need to carry over to
@@ -1643,14 +1625,20 @@ int bpf_prog_array_copy(struct bpf_prog_array __rcu *old_array,
if (old_array) {
existing_prog = old_array->progs;
for (; *existing_prog; existing_prog++) {
- if (*existing_prog != exclude_prog &&
- *existing_prog != &dummy_bpf_prog.prog)
+ if (*existing_prog == exclude_prog) {
+ found_exclude = true;
+ continue;
+ }
+ if (*existing_prog != &dummy_bpf_prog.prog)
carry_prog_cnt++;
if (*existing_prog == include_prog)
return -EEXIST;
}
}
+ if (exclude_prog && !found_exclude)
+ return -ENOENT;
+
/* How many progs (not NULL) will be in the new array? */
new_prog_cnt = carry_prog_cnt;
if (include_prog)
@@ -1683,22 +1671,25 @@ int bpf_prog_array_copy(struct bpf_prog_array __rcu *old_array,
}
int bpf_prog_array_copy_info(struct bpf_prog_array __rcu *array,
- __u32 __user *prog_ids, u32 request_cnt,
- __u32 __user *prog_cnt)
+ u32 *prog_ids, u32 request_cnt,
+ u32 *prog_cnt)
{
+ struct bpf_prog **prog;
u32 cnt = 0;
if (array)
cnt = bpf_prog_array_length(array);
- if (copy_to_user(prog_cnt, &cnt, sizeof(cnt)))
- return -EFAULT;
+ *prog_cnt = cnt;
/* return early if user requested only program count or nothing to copy */
if (!request_cnt || !cnt)
return 0;
- return bpf_prog_array_copy_to_user(array, prog_ids, request_cnt);
+ /* this function is called under trace/bpf_trace.c: bpf_event_mutex */
+ prog = rcu_dereference_check(array, 1)->progs;
+ return bpf_prog_array_copy_core(prog, prog_ids, request_cnt) ? -ENOSPC
+ : 0;
}
static void bpf_prog_free_deferred(struct work_struct *work)
@@ -1709,6 +1700,10 @@ static void bpf_prog_free_deferred(struct work_struct *work)
aux = container_of(work, struct bpf_prog_aux, work);
if (bpf_prog_is_dev_bound(aux))
bpf_prog_offload_destroy(aux->prog);
+#ifdef CONFIG_PERF_EVENTS
+ if (aux->prog->has_callchain_buf)
+ put_callchain_buffers();
+#endif
for (i = 0; i < aux->func_cnt; i++)
bpf_jit_free(aux->func[i]);
if (aux->func_cnt) {
@@ -1769,6 +1764,8 @@ const struct bpf_func_proto bpf_get_current_pid_tgid_proto __weak;
const struct bpf_func_proto bpf_get_current_uid_gid_proto __weak;
const struct bpf_func_proto bpf_get_current_comm_proto __weak;
const struct bpf_func_proto bpf_sock_map_update_proto __weak;
+const struct bpf_func_proto bpf_sock_hash_update_proto __weak;
+const struct bpf_func_proto bpf_get_current_cgroup_id_proto __weak;
const struct bpf_func_proto * __weak bpf_get_trace_printk_proto(void)
{
@@ -1781,6 +1778,7 @@ bpf_event_output(struct bpf_map *map, u64 flags, void *meta, u64 meta_size,
{
return -ENOTSUPP;
}
+EXPORT_SYMBOL_GPL(bpf_event_output);
/* Always built-in helper functions. */
const struct bpf_func_proto bpf_tail_call_proto = {
@@ -1827,9 +1825,3 @@ int __weak skb_copy_bits(const struct sk_buff *skb, int offset, void *to,
#include <linux/bpf_trace.h>
EXPORT_TRACEPOINT_SYMBOL_GPL(xdp_exception);
-
-/* These are only used within the BPF_SYSCALL code */
-#ifdef CONFIG_BPF_SYSCALL
-EXPORT_TRACEPOINT_SYMBOL_GPL(bpf_prog_get_type);
-EXPORT_TRACEPOINT_SYMBOL_GPL(bpf_prog_put_rcu);
-#endif
diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c
index a4bb0b34375a..e0918d180f08 100644
--- a/kernel/bpf/cpumap.c
+++ b/kernel/bpf/cpumap.c
@@ -19,6 +19,7 @@
#include <linux/bpf.h>
#include <linux/filter.h>
#include <linux/ptr_ring.h>
+#include <net/xdp.h>
#include <linux/sched.h>
#include <linux/workqueue.h>
@@ -137,27 +138,6 @@ free_cmap:
return ERR_PTR(err);
}
-static void __cpu_map_queue_destructor(void *ptr)
-{
- /* The tear-down procedure should have made sure that queue is
- * empty. See __cpu_map_entry_replace() and work-queue
- * invoked cpu_map_kthread_stop(). Catch any broken behaviour
- * gracefully and warn once.
- */
- if (WARN_ON_ONCE(ptr))
- page_frag_free(ptr);
-}
-
-static void put_cpu_map_entry(struct bpf_cpu_map_entry *rcpu)
-{
- if (atomic_dec_and_test(&rcpu->refcnt)) {
- /* The queue should be empty at this point */
- ptr_ring_cleanup(rcpu->queue, __cpu_map_queue_destructor);
- kfree(rcpu->queue);
- kfree(rcpu);
- }
-}
-
static void get_cpu_map_entry(struct bpf_cpu_map_entry *rcpu)
{
atomic_inc(&rcpu->refcnt);
@@ -179,45 +159,8 @@ static void cpu_map_kthread_stop(struct work_struct *work)
kthread_stop(rcpu->kthread);
}
-/* For now, xdp_pkt is a cpumap internal data structure, with info
- * carried between enqueue to dequeue. It is mapped into the top
- * headroom of the packet, to avoid allocating separate mem.
- */
-struct xdp_pkt {
- void *data;
- u16 len;
- u16 headroom;
- u16 metasize;
- struct net_device *dev_rx;
-};
-
-/* Convert xdp_buff to xdp_pkt */
-static struct xdp_pkt *convert_to_xdp_pkt(struct xdp_buff *xdp)
-{
- struct xdp_pkt *xdp_pkt;
- int metasize;
- int headroom;
-
- /* Assure headroom is available for storing info */
- headroom = xdp->data - xdp->data_hard_start;
- metasize = xdp->data - xdp->data_meta;
- metasize = metasize > 0 ? metasize : 0;
- if (unlikely((headroom - metasize) < sizeof(*xdp_pkt)))
- return NULL;
-
- /* Store info in top of packet */
- xdp_pkt = xdp->data_hard_start;
-
- xdp_pkt->data = xdp->data;
- xdp_pkt->len = xdp->data_end - xdp->data;
- xdp_pkt->headroom = headroom - sizeof(*xdp_pkt);
- xdp_pkt->metasize = metasize;
-
- return xdp_pkt;
-}
-
static struct sk_buff *cpu_map_build_skb(struct bpf_cpu_map_entry *rcpu,
- struct xdp_pkt *xdp_pkt)
+ struct xdp_frame *xdpf)
{
unsigned int frame_size;
void *pkt_data_start;
@@ -232,7 +175,7 @@ static struct sk_buff *cpu_map_build_skb(struct bpf_cpu_map_entry *rcpu,
* would be preferred to set frame_size to 2048 or 4096
* depending on the driver.
* frame_size = 2048;
- * frame_len = frame_size - sizeof(*xdp_pkt);
+ * frame_len = frame_size - sizeof(*xdp_frame);
*
* Instead, with info avail, skb_shared_info in placed after
* packet len. This, unfortunately fakes the truesize.
@@ -240,21 +183,21 @@ static struct sk_buff *cpu_map_build_skb(struct bpf_cpu_map_entry *rcpu,
* is not at a fixed memory location, with mixed length
* packets, which is bad for cache-line hotness.
*/
- frame_size = SKB_DATA_ALIGN(xdp_pkt->len) + xdp_pkt->headroom +
+ frame_size = SKB_DATA_ALIGN(xdpf->len) + xdpf->headroom +
SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
- pkt_data_start = xdp_pkt->data - xdp_pkt->headroom;
+ pkt_data_start = xdpf->data - xdpf->headroom;
skb = build_skb(pkt_data_start, frame_size);
if (!skb)
return NULL;
- skb_reserve(skb, xdp_pkt->headroom);
- __skb_put(skb, xdp_pkt->len);
- if (xdp_pkt->metasize)
- skb_metadata_set(skb, xdp_pkt->metasize);
+ skb_reserve(skb, xdpf->headroom);
+ __skb_put(skb, xdpf->len);
+ if (xdpf->metasize)
+ skb_metadata_set(skb, xdpf->metasize);
/* Essential SKB info: protocol and skb->dev */
- skb->protocol = eth_type_trans(skb, xdp_pkt->dev_rx);
+ skb->protocol = eth_type_trans(skb, xdpf->dev_rx);
/* Optional SKB info, currently missing:
* - HW checksum info (skb->ip_summed)
@@ -265,6 +208,31 @@ static struct sk_buff *cpu_map_build_skb(struct bpf_cpu_map_entry *rcpu,
return skb;
}
+static void __cpu_map_ring_cleanup(struct ptr_ring *ring)
+{
+ /* The tear-down procedure should have made sure that queue is
+ * empty. See __cpu_map_entry_replace() and work-queue
+ * invoked cpu_map_kthread_stop(). Catch any broken behaviour
+ * gracefully and warn once.
+ */
+ struct xdp_frame *xdpf;
+
+ while ((xdpf = ptr_ring_consume(ring)))
+ if (WARN_ON_ONCE(xdpf))
+ xdp_return_frame(xdpf);
+}
+
+static void put_cpu_map_entry(struct bpf_cpu_map_entry *rcpu)
+{
+ if (atomic_dec_and_test(&rcpu->refcnt)) {
+ /* The queue should be empty at this point */
+ __cpu_map_ring_cleanup(rcpu->queue);
+ ptr_ring_cleanup(rcpu->queue, NULL);
+ kfree(rcpu->queue);
+ kfree(rcpu);
+ }
+}
+
static int cpu_map_kthread_run(void *data)
{
struct bpf_cpu_map_entry *rcpu = data;
@@ -278,7 +246,7 @@ static int cpu_map_kthread_run(void *data)
*/
while (!kthread_should_stop() || !__ptr_ring_empty(rcpu->queue)) {
unsigned int processed = 0, drops = 0, sched = 0;
- struct xdp_pkt *xdp_pkt;
+ struct xdp_frame *xdpf;
/* Release CPU reschedule checks */
if (__ptr_ring_empty(rcpu->queue)) {
@@ -301,13 +269,13 @@ static int cpu_map_kthread_run(void *data)
* kthread CPU pinned. Lockless access to ptr_ring
* consume side valid as no-resize allowed of queue.
*/
- while ((xdp_pkt = __ptr_ring_consume(rcpu->queue))) {
+ while ((xdpf = __ptr_ring_consume(rcpu->queue))) {
struct sk_buff *skb;
int ret;
- skb = cpu_map_build_skb(rcpu, xdp_pkt);
+ skb = cpu_map_build_skb(rcpu, xdpf);
if (!skb) {
- page_frag_free(xdp_pkt);
+ xdp_return_frame(xdpf);
continue;
}
@@ -604,13 +572,13 @@ static int bq_flush_to_queue(struct bpf_cpu_map_entry *rcpu,
spin_lock(&q->producer_lock);
for (i = 0; i < bq->count; i++) {
- void *xdp_pkt = bq->q[i];
+ struct xdp_frame *xdpf = bq->q[i];
int err;
- err = __ptr_ring_produce(q, xdp_pkt);
+ err = __ptr_ring_produce(q, xdpf);
if (err) {
drops++;
- page_frag_free(xdp_pkt); /* Free xdp_pkt */
+ xdp_return_frame_rx_napi(xdpf);
}
processed++;
}
@@ -625,7 +593,7 @@ static int bq_flush_to_queue(struct bpf_cpu_map_entry *rcpu,
/* Runs under RCU-read-side, plus in softirq under NAPI protection.
* Thus, safe percpu variable access.
*/
-static int bq_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_pkt *xdp_pkt)
+static int bq_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_frame *xdpf)
{
struct xdp_bulk_queue *bq = this_cpu_ptr(rcpu->bulkq);
@@ -636,28 +604,28 @@ static int bq_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_pkt *xdp_pkt)
* driver to code invoking us to finished, due to driver
* (e.g. ixgbe) recycle tricks based on page-refcnt.
*
- * Thus, incoming xdp_pkt is always queued here (else we race
+ * Thus, incoming xdp_frame is always queued here (else we race
* with another CPU on page-refcnt and remaining driver code).
* Queue time is very short, as driver will invoke flush
* operation, when completing napi->poll call.
*/
- bq->q[bq->count++] = xdp_pkt;
+ bq->q[bq->count++] = xdpf;
return 0;
}
int cpu_map_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_buff *xdp,
struct net_device *dev_rx)
{
- struct xdp_pkt *xdp_pkt;
+ struct xdp_frame *xdpf;
- xdp_pkt = convert_to_xdp_pkt(xdp);
- if (unlikely(!xdp_pkt))
+ xdpf = convert_to_xdp_frame(xdp);
+ if (unlikely(!xdpf))
return -EOVERFLOW;
/* Info needed when constructing SKB on remote CPU */
- xdp_pkt->dev_rx = dev_rx;
+ xdpf->dev_rx = dev_rx;
- bq_enqueue(rcpu, xdp_pkt);
+ bq_enqueue(rcpu, xdpf);
return 0;
}
diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c
index 565f9ece9115..a7cc7b3494a9 100644
--- a/kernel/bpf/devmap.c
+++ b/kernel/bpf/devmap.c
@@ -48,15 +48,25 @@
* calls will fail at this point.
*/
#include <linux/bpf.h>
+#include <net/xdp.h>
#include <linux/filter.h>
+#include <trace/events/xdp.h>
#define DEV_CREATE_FLAG_MASK \
(BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY)
+#define DEV_MAP_BULK_SIZE 16
+struct xdp_bulk_queue {
+ struct xdp_frame *q[DEV_MAP_BULK_SIZE];
+ struct net_device *dev_rx;
+ unsigned int count;
+};
+
struct bpf_dtab_netdev {
- struct net_device *dev;
+ struct net_device *dev; /* must be first member, due to tracepoint */
struct bpf_dtab *dtab;
unsigned int bit;
+ struct xdp_bulk_queue __percpu *bulkq;
struct rcu_head rcu;
};
@@ -206,6 +216,50 @@ void __dev_map_insert_ctx(struct bpf_map *map, u32 bit)
__set_bit(bit, bitmap);
}
+static int bq_xmit_all(struct bpf_dtab_netdev *obj,
+ struct xdp_bulk_queue *bq, u32 flags)
+{
+ struct net_device *dev = obj->dev;
+ int sent = 0, drops = 0, err = 0;
+ int i;
+
+ if (unlikely(!bq->count))
+ return 0;
+
+ for (i = 0; i < bq->count; i++) {
+ struct xdp_frame *xdpf = bq->q[i];
+
+ prefetch(xdpf);
+ }
+
+ sent = dev->netdev_ops->ndo_xdp_xmit(dev, bq->count, bq->q, flags);
+ if (sent < 0) {
+ err = sent;
+ sent = 0;
+ goto error;
+ }
+ drops = bq->count - sent;
+out:
+ bq->count = 0;
+
+ trace_xdp_devmap_xmit(&obj->dtab->map, obj->bit,
+ sent, drops, bq->dev_rx, dev, err);
+ bq->dev_rx = NULL;
+ return 0;
+error:
+ /* If ndo_xdp_xmit fails with an errno, no frames have been
+ * xmit'ed and it's our responsibility to them free all.
+ */
+ for (i = 0; i < bq->count; i++) {
+ struct xdp_frame *xdpf = bq->q[i];
+
+ /* RX path under NAPI protection, can return frames faster */
+ xdp_return_frame_rx_napi(xdpf);
+ drops++;
+ }
+ goto out;
+}
+
/* __dev_map_flush is called from xdp_do_flush_map() which _must_ be signaled
* from the driver before returning from its napi->poll() routine. The poll()
* routine is called either from busy_poll context or net_rx_action signaled
@@ -221,7 +275,7 @@ void __dev_map_flush(struct bpf_map *map)
for_each_set_bit(bit, bitmap, map->max_entries) {
struct bpf_dtab_netdev *dev = READ_ONCE(dtab->netdev_map[bit]);
- struct net_device *netdev;
+ struct xdp_bulk_queue *bq;
/* This is possible if the dev entry is removed by user space
* between xdp redirect and flush op.
@@ -230,9 +284,9 @@ void __dev_map_flush(struct bpf_map *map)
continue;
__clear_bit(bit, bitmap);
- netdev = dev->dev;
- if (likely(netdev->netdev_ops->ndo_xdp_flush))
- netdev->netdev_ops->ndo_xdp_flush(netdev);
+
+ bq = this_cpu_ptr(dev->bulkq);
+ bq_xmit_all(dev, bq, XDP_XMIT_FLUSH);
}
}
@@ -240,37 +294,79 @@ void __dev_map_flush(struct bpf_map *map)
* update happens in parallel here a dev_put wont happen until after reading the
* ifindex.
*/
-struct net_device *__dev_map_lookup_elem(struct bpf_map *map, u32 key)
+struct bpf_dtab_netdev *__dev_map_lookup_elem(struct bpf_map *map, u32 key)
{
struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
- struct bpf_dtab_netdev *dev;
+ struct bpf_dtab_netdev *obj;
if (key >= map->max_entries)
return NULL;
- dev = READ_ONCE(dtab->netdev_map[key]);
- return dev ? dev->dev : NULL;
+ obj = READ_ONCE(dtab->netdev_map[key]);
+ return obj;
+}
+
+/* Runs under RCU-read-side, plus in softirq under NAPI protection.
+ * Thus, safe percpu variable access.
+ */
+static int bq_enqueue(struct bpf_dtab_netdev *obj, struct xdp_frame *xdpf,
+ struct net_device *dev_rx)
+
+{
+ struct xdp_bulk_queue *bq = this_cpu_ptr(obj->bulkq);
+
+ if (unlikely(bq->count == DEV_MAP_BULK_SIZE))
+ bq_xmit_all(obj, bq, 0);
+
+ /* Ingress dev_rx will be the same for all xdp_frame's in
+ * bulk_queue, because bq stored per-CPU and must be flushed
+ * from net_device drivers NAPI func end.
+ */
+ if (!bq->dev_rx)
+ bq->dev_rx = dev_rx;
+
+ bq->q[bq->count++] = xdpf;
+ return 0;
+}
+
+int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp,
+ struct net_device *dev_rx)
+{
+ struct net_device *dev = dst->dev;
+ struct xdp_frame *xdpf;
+
+ if (!dev->netdev_ops->ndo_xdp_xmit)
+ return -EOPNOTSUPP;
+
+ xdpf = convert_to_xdp_frame(xdp);
+ if (unlikely(!xdpf))
+ return -EOVERFLOW;
+
+ return bq_enqueue(dst, xdpf, dev_rx);
}
static void *dev_map_lookup_elem(struct bpf_map *map, void *key)
{
- struct net_device *dev = __dev_map_lookup_elem(map, *(u32 *)key);
+ struct bpf_dtab_netdev *obj = __dev_map_lookup_elem(map, *(u32 *)key);
+ struct net_device *dev = obj ? obj->dev : NULL;
return dev ? &dev->ifindex : NULL;
}
static void dev_map_flush_old(struct bpf_dtab_netdev *dev)
{
- if (dev->dev->netdev_ops->ndo_xdp_flush) {
- struct net_device *fl = dev->dev;
+ if (dev->dev->netdev_ops->ndo_xdp_xmit) {
+ struct xdp_bulk_queue *bq;
unsigned long *bitmap;
+
int cpu;
for_each_online_cpu(cpu) {
bitmap = per_cpu_ptr(dev->dtab->flush_needed, cpu);
__clear_bit(dev->bit, bitmap);
- fl->netdev_ops->ndo_xdp_flush(dev->dev);
+ bq = per_cpu_ptr(dev->bulkq, cpu);
+ bq_xmit_all(dev, bq, XDP_XMIT_FLUSH);
}
}
}
@@ -281,6 +377,7 @@ static void __dev_map_entry_free(struct rcu_head *rcu)
dev = container_of(rcu, struct bpf_dtab_netdev, rcu);
dev_map_flush_old(dev);
+ free_percpu(dev->bulkq);
dev_put(dev->dev);
kfree(dev);
}
@@ -313,6 +410,7 @@ static int dev_map_update_elem(struct bpf_map *map, void *key, void *value,
{
struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
struct net *net = current->nsproxy->net_ns;
+ gfp_t gfp = GFP_ATOMIC | __GFP_NOWARN;
struct bpf_dtab_netdev *dev, *old_dev;
u32 i = *(u32 *)key;
u32 ifindex = *(u32 *)value;
@@ -327,13 +425,20 @@ static int dev_map_update_elem(struct bpf_map *map, void *key, void *value,
if (!ifindex) {
dev = NULL;
} else {
- dev = kmalloc_node(sizeof(*dev), GFP_ATOMIC | __GFP_NOWARN,
- map->numa_node);
+ dev = kmalloc_node(sizeof(*dev), gfp, map->numa_node);
if (!dev)
return -ENOMEM;
+ dev->bulkq = __alloc_percpu_gfp(sizeof(*dev->bulkq),
+ sizeof(void *), gfp);
+ if (!dev->bulkq) {
+ kfree(dev);
+ return -ENOMEM;
+ }
+
dev->dev = dev_get_by_index(net, ifindex);
if (!dev->dev) {
+ free_percpu(dev->bulkq);
kfree(dev);
return -EINVAL;
}
@@ -405,6 +510,9 @@ static struct notifier_block dev_map_notifier = {
static int __init dev_map_init(void)
{
+ /* Assure tracepoint shadow struct _bpf_dtab_netdev is in sync */
+ BUILD_BUG_ON(offsetof(struct bpf_dtab_netdev, dev) !=
+ offsetof(struct _bpf_dtab_netdev, dev));
register_netdevice_notifier(&dev_map_notifier);
return 0;
}
diff --git a/kernel/bpf/disasm.c b/kernel/bpf/disasm.c
index 8740406df2cd..d6b76377cb6e 100644
--- a/kernel/bpf/disasm.c
+++ b/kernel/bpf/disasm.c
@@ -113,16 +113,16 @@ static const char *const bpf_jmp_string[16] = {
};
static void print_bpf_end_insn(bpf_insn_print_t verbose,
- struct bpf_verifier_env *env,
+ void *private_data,
const struct bpf_insn *insn)
{
- verbose(env, "(%02x) r%d = %s%d r%d\n", insn->code, insn->dst_reg,
+ verbose(private_data, "(%02x) r%d = %s%d r%d\n",
+ insn->code, insn->dst_reg,
BPF_SRC(insn->code) == BPF_TO_BE ? "be" : "le",
insn->imm, insn->dst_reg);
}
void print_bpf_insn(const struct bpf_insn_cbs *cbs,
- struct bpf_verifier_env *env,
const struct bpf_insn *insn,
bool allow_ptr_leaks)
{
@@ -132,23 +132,23 @@ void print_bpf_insn(const struct bpf_insn_cbs *cbs,
if (class == BPF_ALU || class == BPF_ALU64) {
if (BPF_OP(insn->code) == BPF_END) {
if (class == BPF_ALU64)
- verbose(env, "BUG_alu64_%02x\n", insn->code);
+ verbose(cbs->private_data, "BUG_alu64_%02x\n", insn->code);
else
- print_bpf_end_insn(verbose, env, insn);
+ print_bpf_end_insn(verbose, cbs->private_data, insn);
} else if (BPF_OP(insn->code) == BPF_NEG) {
- verbose(env, "(%02x) r%d = %s-r%d\n",
+ verbose(cbs->private_data, "(%02x) r%d = %s-r%d\n",
insn->code, insn->dst_reg,
class == BPF_ALU ? "(u32) " : "",
insn->dst_reg);
} else if (BPF_SRC(insn->code) == BPF_X) {
- verbose(env, "(%02x) %sr%d %s %sr%d\n",
+ verbose(cbs->private_data, "(%02x) %sr%d %s %sr%d\n",
insn->code, class == BPF_ALU ? "(u32) " : "",
insn->dst_reg,
bpf_alu_string[BPF_OP(insn->code) >> 4],
class == BPF_ALU ? "(u32) " : "",
insn->src_reg);
} else {
- verbose(env, "(%02x) %sr%d %s %s%d\n",
+ verbose(cbs->private_data, "(%02x) %sr%d %s %s%d\n",
insn->code, class == BPF_ALU ? "(u32) " : "",
insn->dst_reg,
bpf_alu_string[BPF_OP(insn->code) >> 4],
@@ -157,46 +157,46 @@ void print_bpf_insn(const struct bpf_insn_cbs *cbs,
}
} else if (class == BPF_STX) {
if (BPF_MODE(insn->code) == BPF_MEM)
- verbose(env, "(%02x) *(%s *)(r%d %+d) = r%d\n",
+ verbose(cbs->private_data, "(%02x) *(%s *)(r%d %+d) = r%d\n",
insn->code,
bpf_ldst_string[BPF_SIZE(insn->code) >> 3],
insn->dst_reg,
insn->off, insn->src_reg);
else if (BPF_MODE(insn->code) == BPF_XADD)
- verbose(env, "(%02x) lock *(%s *)(r%d %+d) += r%d\n",
+ verbose(cbs->private_data, "(%02x) lock *(%s *)(r%d %+d) += r%d\n",
insn->code,
bpf_ldst_string[BPF_SIZE(insn->code) >> 3],
insn->dst_reg, insn->off,
insn->src_reg);
else
- verbose(env, "BUG_%02x\n", insn->code);
+ verbose(cbs->private_data, "BUG_%02x\n", insn->code);
} else if (class == BPF_ST) {
if (BPF_MODE(insn->code) != BPF_MEM) {
- verbose(env, "BUG_st_%02x\n", insn->code);
+ verbose(cbs->private_data, "BUG_st_%02x\n", insn->code);
return;
}
- verbose(env, "(%02x) *(%s *)(r%d %+d) = %d\n",
+ verbose(cbs->private_data, "(%02x) *(%s *)(r%d %+d) = %d\n",
insn->code,
bpf_ldst_string[BPF_SIZE(insn->code) >> 3],
insn->dst_reg,
insn->off, insn->imm);
} else if (class == BPF_LDX) {
if (BPF_MODE(insn->code) != BPF_MEM) {
- verbose(env, "BUG_ldx_%02x\n", insn->code);
+ verbose(cbs->private_data, "BUG_ldx_%02x\n", insn->code);
return;
}
- verbose(env, "(%02x) r%d = *(%s *)(r%d %+d)\n",
+ verbose(cbs->private_data, "(%02x) r%d = *(%s *)(r%d %+d)\n",
insn->code, insn->dst_reg,
bpf_ldst_string[BPF_SIZE(insn->code) >> 3],
insn->src_reg, insn->off);
} else if (class == BPF_LD) {
if (BPF_MODE(insn->code) == BPF_ABS) {
- verbose(env, "(%02x) r0 = *(%s *)skb[%d]\n",
+ verbose(cbs->private_data, "(%02x) r0 = *(%s *)skb[%d]\n",
insn->code,
bpf_ldst_string[BPF_SIZE(insn->code) >> 3],
insn->imm);
} else if (BPF_MODE(insn->code) == BPF_IND) {
- verbose(env, "(%02x) r0 = *(%s *)skb[r%d + %d]\n",
+ verbose(cbs->private_data, "(%02x) r0 = *(%s *)skb[r%d + %d]\n",
insn->code,
bpf_ldst_string[BPF_SIZE(insn->code) >> 3],
insn->src_reg, insn->imm);
@@ -212,12 +212,12 @@ void print_bpf_insn(const struct bpf_insn_cbs *cbs,
if (map_ptr && !allow_ptr_leaks)
imm = 0;
- verbose(env, "(%02x) r%d = %s\n",
+ verbose(cbs->private_data, "(%02x) r%d = %s\n",
insn->code, insn->dst_reg,
__func_imm_name(cbs, insn, imm,
tmp, sizeof(tmp)));
} else {
- verbose(env, "BUG_ld_%02x\n", insn->code);
+ verbose(cbs->private_data, "BUG_ld_%02x\n", insn->code);
return;
}
} else if (class == BPF_JMP) {
@@ -227,35 +227,35 @@ void print_bpf_insn(const struct bpf_insn_cbs *cbs,
char tmp[64];
if (insn->src_reg == BPF_PSEUDO_CALL) {
- verbose(env, "(%02x) call pc%s\n",
+ verbose(cbs->private_data, "(%02x) call pc%s\n",
insn->code,
__func_get_name(cbs, insn,
tmp, sizeof(tmp)));
} else {
strcpy(tmp, "unknown");
- verbose(env, "(%02x) call %s#%d\n", insn->code,
+ verbose(cbs->private_data, "(%02x) call %s#%d\n", insn->code,
__func_get_name(cbs, insn,
tmp, sizeof(tmp)),
insn->imm);
}
} else if (insn->code == (BPF_JMP | BPF_JA)) {
- verbose(env, "(%02x) goto pc%+d\n",
+ verbose(cbs->private_data, "(%02x) goto pc%+d\n",
insn->code, insn->off);
} else if (insn->code == (BPF_JMP | BPF_EXIT)) {
- verbose(env, "(%02x) exit\n", insn->code);
+ verbose(cbs->private_data, "(%02x) exit\n", insn->code);
} else if (BPF_SRC(insn->code) == BPF_X) {
- verbose(env, "(%02x) if r%d %s r%d goto pc%+d\n",
+ verbose(cbs->private_data, "(%02x) if r%d %s r%d goto pc%+d\n",
insn->code, insn->dst_reg,
bpf_jmp_string[BPF_OP(insn->code) >> 4],
insn->src_reg, insn->off);
} else {
- verbose(env, "(%02x) if r%d %s 0x%x goto pc%+d\n",
+ verbose(cbs->private_data, "(%02x) if r%d %s 0x%x goto pc%+d\n",
insn->code, insn->dst_reg,
bpf_jmp_string[BPF_OP(insn->code) >> 4],
insn->imm, insn->off);
}
} else {
- verbose(env, "(%02x) %s\n",
+ verbose(cbs->private_data, "(%02x) %s\n",
insn->code, bpf_class_string[class]);
}
}
diff --git a/kernel/bpf/disasm.h b/kernel/bpf/disasm.h
index 266fe8ee542b..e1324a834a24 100644
--- a/kernel/bpf/disasm.h
+++ b/kernel/bpf/disasm.h
@@ -22,14 +22,12 @@
#include <string.h>
#endif
-struct bpf_verifier_env;
-
extern const char *const bpf_alu_string[16];
extern const char *const bpf_class_string[8];
const char *func_id_name(int id);
-typedef __printf(2, 3) void (*bpf_insn_print_t)(struct bpf_verifier_env *env,
+typedef __printf(2, 3) void (*bpf_insn_print_t)(void *private_data,
const char *, ...);
typedef const char *(*bpf_insn_revmap_call_t)(void *private_data,
const struct bpf_insn *insn);
@@ -45,7 +43,6 @@ struct bpf_insn_cbs {
};
void print_bpf_insn(const struct bpf_insn_cbs *cbs,
- struct bpf_verifier_env *env,
const struct bpf_insn *insn,
bool allow_ptr_leaks);
#endif
diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
index b76828f23b49..3ca2198a6d22 100644
--- a/kernel/bpf/hashtab.c
+++ b/kernel/bpf/hashtab.c
@@ -503,7 +503,9 @@ static u32 htab_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf)
struct bpf_insn *insn = insn_buf;
const int ret = BPF_REG_0;
- *insn++ = BPF_EMIT_CALL((u64 (*)(u64, u64, u64, u64, u64))__htab_map_lookup_elem);
+ BUILD_BUG_ON(!__same_type(&__htab_map_lookup_elem,
+ (void *(*)(struct bpf_map *map, void *key))NULL));
+ *insn++ = BPF_EMIT_CALL(BPF_CAST_CALL(__htab_map_lookup_elem));
*insn++ = BPF_JMP_IMM(BPF_JEQ, ret, 0, 1);
*insn++ = BPF_ALU64_IMM(BPF_ADD, ret,
offsetof(struct htab_elem, key) +
@@ -530,7 +532,9 @@ static u32 htab_lru_map_gen_lookup(struct bpf_map *map,
const int ret = BPF_REG_0;
const int ref_reg = BPF_REG_1;
- *insn++ = BPF_EMIT_CALL((u64 (*)(u64, u64, u64, u64, u64))__htab_map_lookup_elem);
+ BUILD_BUG_ON(!__same_type(&__htab_map_lookup_elem,
+ (void *(*)(struct bpf_map *map, void *key))NULL));
+ *insn++ = BPF_EMIT_CALL(BPF_CAST_CALL(__htab_map_lookup_elem));
*insn++ = BPF_JMP_IMM(BPF_JEQ, ret, 0, 4);
*insn++ = BPF_LDX_MEM(BPF_B, ref_reg, ret,
offsetof(struct htab_elem, lru_node) +
@@ -1369,7 +1373,9 @@ static u32 htab_of_map_gen_lookup(struct bpf_map *map,
struct bpf_insn *insn = insn_buf;
const int ret = BPF_REG_0;
- *insn++ = BPF_EMIT_CALL((u64 (*)(u64, u64, u64, u64, u64))__htab_map_lookup_elem);
+ BUILD_BUG_ON(!__same_type(&__htab_map_lookup_elem,
+ (void *(*)(struct bpf_map *map, void *key))NULL));
+ *insn++ = BPF_EMIT_CALL(BPF_CAST_CALL(__htab_map_lookup_elem));
*insn++ = BPF_JMP_IMM(BPF_JEQ, ret, 0, 2);
*insn++ = BPF_ALU64_IMM(BPF_ADD, ret,
offsetof(struct htab_elem, key) +
diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index 3d24e238221e..73065e2d23c2 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -179,3 +179,18 @@ const struct bpf_func_proto bpf_get_current_comm_proto = {
.arg1_type = ARG_PTR_TO_UNINIT_MEM,
.arg2_type = ARG_CONST_SIZE,
};
+
+#ifdef CONFIG_CGROUPS
+BPF_CALL_0(bpf_get_current_cgroup_id)
+{
+ struct cgroup *cgrp = task_dfl_cgroup(current);
+
+ return cgrp->kn->id.id;
+}
+
+const struct bpf_func_proto bpf_get_current_cgroup_id_proto = {
+ .func = bpf_get_current_cgroup_id,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+};
+#endif
diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c
index 81e2f6995adb..ed13645bd80c 100644
--- a/kernel/bpf/inode.c
+++ b/kernel/bpf/inode.c
@@ -150,8 +150,154 @@ static int bpf_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
return 0;
}
+struct map_iter {
+ void *key;
+ bool done;
+};
+
+static struct map_iter *map_iter(struct seq_file *m)
+{
+ return m->private;
+}
+
+static struct bpf_map *seq_file_to_map(struct seq_file *m)
+{
+ return file_inode(m->file)->i_private;
+}
+
+static void map_iter_free(struct map_iter *iter)
+{
+ if (iter) {
+ kfree(iter->key);
+ kfree(iter);
+ }
+}
+
+static struct map_iter *map_iter_alloc(struct bpf_map *map)
+{
+ struct map_iter *iter;
+
+ iter = kzalloc(sizeof(*iter), GFP_KERNEL | __GFP_NOWARN);
+ if (!iter)
+ goto error;
+
+ iter->key = kzalloc(map->key_size, GFP_KERNEL | __GFP_NOWARN);
+ if (!iter->key)
+ goto error;
+
+ return iter;
+
+error:
+ map_iter_free(iter);
+ return NULL;
+}
+
+static void *map_seq_next(struct seq_file *m, void *v, loff_t *pos)
+{
+ struct bpf_map *map = seq_file_to_map(m);
+ void *key = map_iter(m)->key;
+
+ if (map_iter(m)->done)
+ return NULL;
+
+ if (unlikely(v == SEQ_START_TOKEN))
+ goto done;
+
+ if (map->ops->map_get_next_key(map, key, key)) {
+ map_iter(m)->done = true;
+ return NULL;
+ }
+
+done:
+ ++(*pos);
+ return key;
+}
+
+static void *map_seq_start(struct seq_file *m, loff_t *pos)
+{
+ if (map_iter(m)->done)
+ return NULL;
+
+ return *pos ? map_iter(m)->key : SEQ_START_TOKEN;
+}
+
+static void map_seq_stop(struct seq_file *m, void *v)
+{
+}
+
+static int map_seq_show(struct seq_file *m, void *v)
+{
+ struct bpf_map *map = seq_file_to_map(m);
+ void *key = map_iter(m)->key;
+
+ if (unlikely(v == SEQ_START_TOKEN)) {
+ seq_puts(m, "# WARNING!! The output is for debug purpose only\n");
+ seq_puts(m, "# WARNING!! The output format will change\n");
+ } else {
+ map->ops->map_seq_show_elem(map, key, m);
+ }
+
+ return 0;
+}
+
+static const struct seq_operations bpffs_map_seq_ops = {
+ .start = map_seq_start,
+ .next = map_seq_next,
+ .show = map_seq_show,
+ .stop = map_seq_stop,
+};
+
+static int bpffs_map_open(struct inode *inode, struct file *file)
+{
+ struct bpf_map *map = inode->i_private;
+ struct map_iter *iter;
+ struct seq_file *m;
+ int err;
+
+ iter = map_iter_alloc(map);
+ if (!iter)
+ return -ENOMEM;
+
+ err = seq_open(file, &bpffs_map_seq_ops);
+ if (err) {
+ map_iter_free(iter);
+ return err;
+ }
+
+ m = file->private_data;
+ m->private = iter;
+
+ return 0;
+}
+
+static int bpffs_map_release(struct inode *inode, struct file *file)
+{
+ struct seq_file *m = file->private_data;
+
+ map_iter_free(map_iter(m));
+
+ return seq_release(inode, file);
+}
+
+/* bpffs_map_fops should only implement the basic
+ * read operation for a BPF map. The purpose is to
+ * provide a simple user intuitive way to do
+ * "cat bpffs/pathto/a-pinned-map".
+ *
+ * Other operations (e.g. write, lookup...) should be realized by
+ * the userspace tools (e.g. bpftool) through the
+ * BPF_OBJ_GET_INFO_BY_FD and the map's lookup/update
+ * interface.
+ */
+static const struct file_operations bpffs_map_fops = {
+ .open = bpffs_map_open,
+ .read = seq_read,
+ .release = bpffs_map_release,
+};
+
static int bpf_mkobj_ops(struct dentry *dentry, umode_t mode, void *raw,
- const struct inode_operations *iops)
+ const struct inode_operations *iops,
+ const struct file_operations *fops)
{
struct inode *dir = dentry->d_parent->d_inode;
struct inode *inode = bpf_get_inode(dir->i_sb, dir, mode);
@@ -159,6 +305,7 @@ static int bpf_mkobj_ops(struct dentry *dentry, umode_t mode, void *raw,
return PTR_ERR(inode);
inode->i_op = iops;
+ inode->i_fop = fops;
inode->i_private = raw;
bpf_dentry_finalize(dentry, inode, dir);
@@ -167,17 +314,23 @@ static int bpf_mkobj_ops(struct dentry *dentry, umode_t mode, void *raw,
static int bpf_mkprog(struct dentry *dentry, umode_t mode, void *arg)
{
- return bpf_mkobj_ops(dentry, mode, arg, &bpf_prog_iops);
+ return bpf_mkobj_ops(dentry, mode, arg, &bpf_prog_iops, NULL);
}
static int bpf_mkmap(struct dentry *dentry, umode_t mode, void *arg)
{
- return bpf_mkobj_ops(dentry, mode, arg, &bpf_map_iops);
+ struct bpf_map *map = arg;
+
+ return bpf_mkobj_ops(dentry, mode, arg, &bpf_map_iops,
+ map->btf ? &bpffs_map_fops : NULL);
}
static struct dentry *
bpf_lookup(struct inode *dir, struct dentry *dentry, unsigned flags)
{
+ /* Dots in names (e.g. "/sys/fs/bpf/foo.bar") are reserved for future
+ * extensions.
+ */
if (strchr(dentry->d_name.name, '.'))
return ERR_PTR(-EPERM);
@@ -276,13 +429,6 @@ int bpf_obj_pin_user(u32 ufd, const char __user *pathname)
ret = bpf_obj_do_pin(pname, raw, type);
if (ret != 0)
bpf_any_put(raw, type);
- if ((trace_bpf_obj_pin_prog_enabled() ||
- trace_bpf_obj_pin_map_enabled()) && !ret) {
- if (type == BPF_TYPE_PROG)
- trace_bpf_obj_pin_prog(raw, ufd, pname);
- if (type == BPF_TYPE_MAP)
- trace_bpf_obj_pin_map(raw, ufd, pname);
- }
out:
putname(pname);
return ret;
@@ -349,15 +495,8 @@ int bpf_obj_get_user(const char __user *pathname, int flags)
else
goto out;
- if (ret < 0) {
+ if (ret < 0)
bpf_any_put(raw, type);
- } else if (trace_bpf_obj_get_prog_enabled() ||
- trace_bpf_obj_get_map_enabled()) {
- if (type == BPF_TYPE_PROG)
- trace_bpf_obj_get_prog(raw, ret, pname);
- if (type == BPF_TYPE_MAP)
- trace_bpf_obj_get_map(raw, ret, pname);
- }
out:
putname(pname);
return ret;
diff --git a/kernel/bpf/offload.c b/kernel/bpf/offload.c
index c9401075b58c..ac747d5cf7c6 100644
--- a/kernel/bpf/offload.c
+++ b/kernel/bpf/offload.c
@@ -1,5 +1,5 @@
/*
- * Copyright (C) 2017 Netronome Systems, Inc.
+ * Copyright (C) 2017-2018 Netronome Systems, Inc.
*
* This software is licensed under the GNU General License Version 2,
* June 1991 as shown in the file COPYING in the top-level directory of this
@@ -474,8 +474,10 @@ bool bpf_offload_dev_match(struct bpf_prog *prog, struct bpf_map *map)
struct bpf_prog_offload *offload;
bool ret;
- if (!bpf_prog_is_dev_bound(prog->aux) || !bpf_map_is_dev_bound(map))
+ if (!bpf_prog_is_dev_bound(prog->aux))
return false;
+ if (!bpf_map_is_dev_bound(map))
+ return bpf_map_offload_neutral(map);
down_read(&bpf_devs_lock);
offload = prog->aux->offload;
diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c
index a927e89dad6e..52a91d816c0e 100644
--- a/kernel/bpf/sockmap.c
+++ b/kernel/bpf/sockmap.c
@@ -38,17 +38,48 @@
#include <linux/skbuff.h>
#include <linux/workqueue.h>
#include <linux/list.h>
+#include <linux/mm.h>
#include <net/strparser.h>
#include <net/tcp.h>
+#include <linux/ptr_ring.h>
+#include <net/inet_common.h>
+#include <linux/sched/signal.h>
#define SOCK_CREATE_FLAG_MASK \
(BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY)
+struct bpf_sock_progs {
+ struct bpf_prog *bpf_tx_msg;
+ struct bpf_prog *bpf_parse;
+ struct bpf_prog *bpf_verdict;
+};
+
struct bpf_stab {
struct bpf_map map;
struct sock **sock_map;
- struct bpf_prog *bpf_parse;
- struct bpf_prog *bpf_verdict;
+ struct bpf_sock_progs progs;
+};
+
+struct bucket {
+ struct hlist_head head;
+ raw_spinlock_t lock;
+};
+
+struct bpf_htab {
+ struct bpf_map map;
+ struct bucket *buckets;
+ atomic_t count;
+ u32 n_buckets;
+ u32 elem_size;
+ struct bpf_sock_progs progs;
+};
+
+struct htab_elem {
+ struct rcu_head rcu;
+ struct hlist_node hash_node;
+ u32 hash;
+ struct sock *sk;
+ char key[0];
};
enum smap_psock_state {
@@ -58,12 +89,13 @@ enum smap_psock_state {
struct smap_psock_map_entry {
struct list_head list;
struct sock **entry;
+ struct htab_elem *hash_link;
+ struct bpf_htab *htab;
};
struct smap_psock {
struct rcu_head rcu;
- /* refcnt is used inside sk_callback_lock */
- u32 refcnt;
+ refcount_t refcnt;
/* datapath variables */
struct sk_buff_head rxqueue;
@@ -74,7 +106,17 @@ struct smap_psock {
int save_off;
struct sk_buff *save_skb;
+ /* datapath variables for tx_msg ULP */
+ struct sock *sk_redir;
+ int apply_bytes;
+ int cork_bytes;
+ int sg_size;
+ int eval;
+ struct sk_msg_buff *cork;
+ struct list_head ingress;
+
struct strparser strp;
+ struct bpf_prog *bpf_tx_msg;
struct bpf_prog *bpf_parse;
struct bpf_prog *bpf_verdict;
struct list_head maps;
@@ -92,11 +134,33 @@ struct smap_psock {
void (*save_write_space)(struct sock *sk);
};
+static void smap_release_sock(struct smap_psock *psock, struct sock *sock);
+static int bpf_tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
+ int nonblock, int flags, int *addr_len);
+static int bpf_tcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t size);
+static int bpf_tcp_sendpage(struct sock *sk, struct page *page,
+ int offset, size_t size, int flags);
+
static inline struct smap_psock *smap_psock_sk(const struct sock *sk)
{
return rcu_dereference_sk_user_data(sk);
}
+static bool bpf_tcp_stream_read(const struct sock *sk)
+{
+ struct smap_psock *psock;
+ bool empty = true;
+
+ rcu_read_lock();
+ psock = smap_psock_sk(sk);
+ if (unlikely(!psock))
+ goto out;
+ empty = list_empty(&psock->ingress);
+out:
+ rcu_read_unlock();
+ return !empty;
+}
+
static struct proto tcp_bpf_proto;
static int bpf_tcp_init(struct sock *sk)
{
@@ -116,31 +180,56 @@ static int bpf_tcp_init(struct sock *sk)
psock->save_close = sk->sk_prot->close;
psock->sk_proto = sk->sk_prot;
+
+ if (psock->bpf_tx_msg) {
+ tcp_bpf_proto.sendmsg = bpf_tcp_sendmsg;
+ tcp_bpf_proto.sendpage = bpf_tcp_sendpage;
+ tcp_bpf_proto.recvmsg = bpf_tcp_recvmsg;
+ tcp_bpf_proto.stream_memory_read = bpf_tcp_stream_read;
+ }
+
sk->sk_prot = &tcp_bpf_proto;
rcu_read_unlock();
return 0;
}
+static void smap_release_sock(struct smap_psock *psock, struct sock *sock);
+static int free_start_sg(struct sock *sk, struct sk_msg_buff *md);
+
static void bpf_tcp_release(struct sock *sk)
{
struct smap_psock *psock;
rcu_read_lock();
psock = smap_psock_sk(sk);
+ if (unlikely(!psock))
+ goto out;
- if (likely(psock)) {
+ if (psock->cork) {
+ free_start_sg(psock->sock, psock->cork);
+ kfree(psock->cork);
+ psock->cork = NULL;
+ }
+
+ if (psock->sk_proto) {
sk->sk_prot = psock->sk_proto;
psock->sk_proto = NULL;
}
+out:
rcu_read_unlock();
}
-static void smap_release_sock(struct smap_psock *psock, struct sock *sock);
+static void free_htab_elem(struct bpf_htab *htab, struct htab_elem *l)
+{
+ atomic_dec(&htab->count);
+ kfree_rcu(l, rcu);
+}
static void bpf_tcp_close(struct sock *sk, long timeout)
{
void (*close_fun)(struct sock *sk, long timeout);
struct smap_psock_map_entry *e, *tmp;
+ struct sk_msg_buff *md, *mtmp;
struct smap_psock *psock;
struct sock *osk;
@@ -159,11 +248,29 @@ static void bpf_tcp_close(struct sock *sk, long timeout)
close_fun = psock->save_close;
write_lock_bh(&sk->sk_callback_lock);
+ if (psock->cork) {
+ free_start_sg(psock->sock, psock->cork);
+ kfree(psock->cork);
+ psock->cork = NULL;
+ }
+
+ list_for_each_entry_safe(md, mtmp, &psock->ingress, list) {
+ list_del(&md->list);
+ free_start_sg(psock->sock, md);
+ kfree(md);
+ }
+
list_for_each_entry_safe(e, tmp, &psock->maps, list) {
- osk = cmpxchg(e->entry, sk, NULL);
- if (osk == sk) {
- list_del(&e->list);
- smap_release_sock(psock, sk);
+ if (e->entry) {
+ osk = cmpxchg(e->entry, sk, NULL);
+ if (osk == sk) {
+ list_del(&e->list);
+ smap_release_sock(psock, sk);
+ }
+ } else {
+ hlist_del_rcu(&e->hash_link->hash_node);
+ smap_release_sock(psock, e->hash_link->sk);
+ free_htab_elem(e->htab, e->hash_link);
}
}
write_unlock_bh(&sk->sk_callback_lock);
@@ -175,6 +282,7 @@ enum __sk_action {
__SK_DROP = 0,
__SK_PASS,
__SK_REDIRECT,
+ __SK_NONE,
};
static struct tcp_ulp_ops bpf_tcp_ulp_ops __read_mostly = {
@@ -186,10 +294,829 @@ static struct tcp_ulp_ops bpf_tcp_ulp_ops __read_mostly = {
.release = bpf_tcp_release,
};
+static int memcopy_from_iter(struct sock *sk,
+ struct sk_msg_buff *md,
+ struct iov_iter *from, int bytes)
+{
+ struct scatterlist *sg = md->sg_data;
+ int i = md->sg_curr, rc = -ENOSPC;
+
+ do {
+ int copy;
+ char *to;
+
+ if (md->sg_copybreak >= sg[i].length) {
+ md->sg_copybreak = 0;
+
+ if (++i == MAX_SKB_FRAGS)
+ i = 0;
+
+ if (i == md->sg_end)
+ break;
+ }
+
+ copy = sg[i].length - md->sg_copybreak;
+ to = sg_virt(&sg[i]) + md->sg_copybreak;
+ md->sg_copybreak += copy;
+
+ if (sk->sk_route_caps & NETIF_F_NOCACHE_COPY)
+ rc = copy_from_iter_nocache(to, copy, from);
+ else
+ rc = copy_from_iter(to, copy, from);
+
+ if (rc != copy) {
+ rc = -EFAULT;
+ goto out;
+ }
+
+ bytes -= copy;
+ if (!bytes)
+ break;
+
+ md->sg_copybreak = 0;
+ if (++i == MAX_SKB_FRAGS)
+ i = 0;
+ } while (i != md->sg_end);
+out:
+ md->sg_curr = i;
+ return rc;
+}
+
+static int bpf_tcp_push(struct sock *sk, int apply_bytes,
+ struct sk_msg_buff *md,
+ int flags, bool uncharge)
+{
+ bool apply = apply_bytes;
+ struct scatterlist *sg;
+ int offset, ret = 0;
+ struct page *p;
+ size_t size;
+
+ while (1) {
+ sg = md->sg_data + md->sg_start;
+ size = (apply && apply_bytes < sg->length) ?
+ apply_bytes : sg->length;
+ offset = sg->offset;
+
+ tcp_rate_check_app_limited(sk);
+ p = sg_page(sg);
+retry:
+ ret = do_tcp_sendpages(sk, p, offset, size, flags);
+ if (ret != size) {
+ if (ret > 0) {
+ if (apply)
+ apply_bytes -= ret;
+
+ sg->offset += ret;
+ sg->length -= ret;
+ size -= ret;
+ offset += ret;
+ if (uncharge)
+ sk_mem_uncharge(sk, ret);
+ goto retry;
+ }
+
+ return ret;
+ }
+
+ if (apply)
+ apply_bytes -= ret;
+ sg->offset += ret;
+ sg->length -= ret;
+ if (uncharge)
+ sk_mem_uncharge(sk, ret);
+
+ if (!sg->length) {
+ put_page(p);
+ md->sg_start++;
+ if (md->sg_start == MAX_SKB_FRAGS)
+ md->sg_start = 0;
+ sg_init_table(sg, 1);
+
+ if (md->sg_start == md->sg_end)
+ break;
+ }
+
+ if (apply && !apply_bytes)
+ break;
+ }
+ return 0;
+}
+
+static inline void bpf_compute_data_pointers_sg(struct sk_msg_buff *md)
+{
+ struct scatterlist *sg = md->sg_data + md->sg_start;
+
+ if (md->sg_copy[md->sg_start]) {
+ md->data = md->data_end = 0;
+ } else {
+ md->data = sg_virt(sg);
+ md->data_end = md->data + sg->length;
+ }
+}
+
+static void return_mem_sg(struct sock *sk, int bytes, struct sk_msg_buff *md)
+{
+ struct scatterlist *sg = md->sg_data;
+ int i = md->sg_start;
+
+ do {
+ int uncharge = (bytes < sg[i].length) ? bytes : sg[i].length;
+
+ sk_mem_uncharge(sk, uncharge);
+ bytes -= uncharge;
+ if (!bytes)
+ break;
+ i++;
+ if (i == MAX_SKB_FRAGS)
+ i = 0;
+ } while (i != md->sg_end);
+}
+
+static void free_bytes_sg(struct sock *sk, int bytes,
+ struct sk_msg_buff *md, bool charge)
+{
+ struct scatterlist *sg = md->sg_data;
+ int i = md->sg_start, free;
+
+ while (bytes && sg[i].length) {
+ free = sg[i].length;
+ if (bytes < free) {
+ sg[i].length -= bytes;
+ sg[i].offset += bytes;
+ if (charge)
+ sk_mem_uncharge(sk, bytes);
+ break;
+ }
+
+ if (charge)
+ sk_mem_uncharge(sk, sg[i].length);
+ put_page(sg_page(&sg[i]));
+ bytes -= sg[i].length;
+ sg[i].length = 0;
+ sg[i].page_link = 0;
+ sg[i].offset = 0;
+ i++;
+
+ if (i == MAX_SKB_FRAGS)
+ i = 0;
+ }
+ md->sg_start = i;
+}
+
+static int free_sg(struct sock *sk, int start, struct sk_msg_buff *md)
+{
+ struct scatterlist *sg = md->sg_data;
+ int i = start, free = 0;
+
+ while (sg[i].length) {
+ free += sg[i].length;
+ sk_mem_uncharge(sk, sg[i].length);
+ put_page(sg_page(&sg[i]));
+ sg[i].length = 0;
+ sg[i].page_link = 0;
+ sg[i].offset = 0;
+ i++;
+
+ if (i == MAX_SKB_FRAGS)
+ i = 0;
+ }
+
+ return free;
+}
+
+static int free_start_sg(struct sock *sk, struct sk_msg_buff *md)
+{
+ int free = free_sg(sk, md->sg_start, md);
+
+ md->sg_start = md->sg_end;
+ return free;
+}
+
+static int free_curr_sg(struct sock *sk, struct sk_msg_buff *md)
+{
+ return free_sg(sk, md->sg_curr, md);
+}
+
+static int bpf_map_msg_verdict(int _rc, struct sk_msg_buff *md)
+{
+ return ((_rc == SK_PASS) ?
+ (md->sk_redir ? __SK_REDIRECT : __SK_PASS) :
+ __SK_DROP);
+}
+
+static unsigned int smap_do_tx_msg(struct sock *sk,
+ struct smap_psock *psock,
+ struct sk_msg_buff *md)
+{
+ struct bpf_prog *prog;
+ unsigned int rc, _rc;
+
+ preempt_disable();
+ rcu_read_lock();
+
+ /* If the policy was removed mid-send then default to 'accept' */
+ prog = READ_ONCE(psock->bpf_tx_msg);
+ if (unlikely(!prog)) {
+ _rc = SK_PASS;
+ goto verdict;
+ }
+
+ bpf_compute_data_pointers_sg(md);
+ md->sk = sk;
+ rc = (*prog->bpf_func)(md, prog->insnsi);
+ psock->apply_bytes = md->apply_bytes;
+
+ /* Moving return codes from UAPI namespace into internal namespace */
+ _rc = bpf_map_msg_verdict(rc, md);
+
+ /* The psock has a refcount on the sock but not on the map and because
+ * we need to drop rcu read lock here its possible the map could be
+ * removed between here and when we need it to execute the sock
+ * redirect. So do the map lookup now for future use.
+ */
+ if (_rc == __SK_REDIRECT) {
+ if (psock->sk_redir)
+ sock_put(psock->sk_redir);
+ psock->sk_redir = do_msg_redirect_map(md);
+ if (!psock->sk_redir) {
+ _rc = __SK_DROP;
+ goto verdict;
+ }
+ sock_hold(psock->sk_redir);
+ }
+verdict:
+ rcu_read_unlock();
+ preempt_enable();
+
+ return _rc;
+}
+
+static int bpf_tcp_ingress(struct sock *sk, int apply_bytes,
+ struct smap_psock *psock,
+ struct sk_msg_buff *md, int flags)
+{
+ bool apply = apply_bytes;
+ size_t size, copied = 0;
+ struct sk_msg_buff *r;
+ int err = 0, i;
+
+ r = kzalloc(sizeof(struct sk_msg_buff), __GFP_NOWARN | GFP_KERNEL);
+ if (unlikely(!r))
+ return -ENOMEM;
+
+ lock_sock(sk);
+ r->sg_start = md->sg_start;
+ i = md->sg_start;
+
+ do {
+ size = (apply && apply_bytes < md->sg_data[i].length) ?
+ apply_bytes : md->sg_data[i].length;
+
+ if (!sk_wmem_schedule(sk, size)) {
+ if (!copied)
+ err = -ENOMEM;
+ break;
+ }
+
+ sk_mem_charge(sk, size);
+ r->sg_data[i] = md->sg_data[i];
+ r->sg_data[i].length = size;
+ md->sg_data[i].length -= size;
+ md->sg_data[i].offset += size;
+ copied += size;
+
+ if (md->sg_data[i].length) {
+ get_page(sg_page(&r->sg_data[i]));
+ r->sg_end = (i + 1) == MAX_SKB_FRAGS ? 0 : i + 1;
+ } else {
+ i++;
+ if (i == MAX_SKB_FRAGS)
+ i = 0;
+ r->sg_end = i;
+ }
+
+ if (apply) {
+ apply_bytes -= size;
+ if (!apply_bytes)
+ break;
+ }
+ } while (i != md->sg_end);
+
+ md->sg_start = i;
+
+ if (!err) {
+ list_add_tail(&r->list, &psock->ingress);
+ sk->sk_data_ready(sk);
+ } else {
+ free_start_sg(sk, r);
+ kfree(r);
+ }
+
+ release_sock(sk);
+ return err;
+}
+
+static int bpf_tcp_sendmsg_do_redirect(struct sock *sk, int send,
+ struct sk_msg_buff *md,
+ int flags)
+{
+ bool ingress = !!(md->flags & BPF_F_INGRESS);
+ struct smap_psock *psock;
+ struct scatterlist *sg;
+ int err = 0;
+
+ sg = md->sg_data;
+
+ rcu_read_lock();
+ psock = smap_psock_sk(sk);
+ if (unlikely(!psock))
+ goto out_rcu;
+
+ if (!refcount_inc_not_zero(&psock->refcnt))
+ goto out_rcu;
+
+ rcu_read_unlock();
+
+ if (ingress) {
+ err = bpf_tcp_ingress(sk, send, psock, md, flags);
+ } else {
+ lock_sock(sk);
+ err = bpf_tcp_push(sk, send, md, flags, false);
+ release_sock(sk);
+ }
+ smap_release_sock(psock, sk);
+ if (unlikely(err))
+ goto out;
+ return 0;
+out_rcu:
+ rcu_read_unlock();
+out:
+ free_bytes_sg(NULL, send, md, false);
+ return err;
+}
+
+static inline void bpf_md_init(struct smap_psock *psock)
+{
+ if (!psock->apply_bytes) {
+ psock->eval = __SK_NONE;
+ if (psock->sk_redir) {
+ sock_put(psock->sk_redir);
+ psock->sk_redir = NULL;
+ }
+ }
+}
+
+static void apply_bytes_dec(struct smap_psock *psock, int i)
+{
+ if (psock->apply_bytes) {
+ if (psock->apply_bytes < i)
+ psock->apply_bytes = 0;
+ else
+ psock->apply_bytes -= i;
+ }
+}
+
+static int bpf_exec_tx_verdict(struct smap_psock *psock,
+ struct sk_msg_buff *m,
+ struct sock *sk,
+ int *copied, int flags)
+{
+ bool cork = false, enospc = (m->sg_start == m->sg_end);
+ struct sock *redir;
+ int err = 0;
+ int send;
+
+more_data:
+ if (psock->eval == __SK_NONE)
+ psock->eval = smap_do_tx_msg(sk, psock, m);
+
+ if (m->cork_bytes &&
+ m->cork_bytes > psock->sg_size && !enospc) {
+ psock->cork_bytes = m->cork_bytes - psock->sg_size;
+ if (!psock->cork) {
+ psock->cork = kcalloc(1,
+ sizeof(struct sk_msg_buff),
+ GFP_ATOMIC | __GFP_NOWARN);
+
+ if (!psock->cork) {
+ err = -ENOMEM;
+ goto out_err;
+ }
+ }
+ memcpy(psock->cork, m, sizeof(*m));
+ goto out_err;
+ }
+
+ send = psock->sg_size;
+ if (psock->apply_bytes && psock->apply_bytes < send)
+ send = psock->apply_bytes;
+
+ switch (psock->eval) {
+ case __SK_PASS:
+ err = bpf_tcp_push(sk, send, m, flags, true);
+ if (unlikely(err)) {
+ *copied -= free_start_sg(sk, m);
+ break;
+ }
+
+ apply_bytes_dec(psock, send);
+ psock->sg_size -= send;
+ break;
+ case __SK_REDIRECT:
+ redir = psock->sk_redir;
+ apply_bytes_dec(psock, send);
+
+ if (psock->cork) {
+ cork = true;
+ psock->cork = NULL;
+ }
+
+ return_mem_sg(sk, send, m);
+ release_sock(sk);
+
+ err = bpf_tcp_sendmsg_do_redirect(redir, send, m, flags);
+ lock_sock(sk);
+
+ if (unlikely(err < 0)) {
+ free_start_sg(sk, m);
+ psock->sg_size = 0;
+ if (!cork)
+ *copied -= send;
+ } else {
+ psock->sg_size -= send;
+ }
+
+ if (cork) {
+ free_start_sg(sk, m);
+ psock->sg_size = 0;
+ kfree(m);
+ m = NULL;
+ err = 0;
+ }
+ break;
+ case __SK_DROP:
+ default:
+ free_bytes_sg(sk, send, m, true);
+ apply_bytes_dec(psock, send);
+ *copied -= send;
+ psock->sg_size -= send;
+ err = -EACCES;
+ break;
+ }
+
+ if (likely(!err)) {
+ bpf_md_init(psock);
+ if (m &&
+ m->sg_data[m->sg_start].page_link &&
+ m->sg_data[m->sg_start].length)
+ goto more_data;
+ }
+
+out_err:
+ return err;
+}
+
+static int bpf_wait_data(struct sock *sk,
+ struct smap_psock *psk, int flags,
+ long timeo, int *err)
+{
+ int rc;
+
+ DEFINE_WAIT_FUNC(wait, woken_wake_function);
+
+ add_wait_queue(sk_sleep(sk), &wait);
+ sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
+ rc = sk_wait_event(sk, &timeo,
+ !list_empty(&psk->ingress) ||
+ !skb_queue_empty(&sk->sk_receive_queue),
+ &wait);
+ sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
+ remove_wait_queue(sk_sleep(sk), &wait);
+
+ return rc;
+}
+
+static int bpf_tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
+ int nonblock, int flags, int *addr_len)
+{
+ struct iov_iter *iter = &msg->msg_iter;
+ struct smap_psock *psock;
+ int copied = 0;
+
+ if (unlikely(flags & MSG_ERRQUEUE))
+ return inet_recv_error(sk, msg, len, addr_len);
+
+ rcu_read_lock();
+ psock = smap_psock_sk(sk);
+ if (unlikely(!psock))
+ goto out;
+
+ if (unlikely(!refcount_inc_not_zero(&psock->refcnt)))
+ goto out;
+ rcu_read_unlock();
+
+ if (!skb_queue_empty(&sk->sk_receive_queue))
+ return tcp_recvmsg(sk, msg, len, nonblock, flags, addr_len);
+
+ lock_sock(sk);
+bytes_ready:
+ while (copied != len) {
+ struct scatterlist *sg;
+ struct sk_msg_buff *md;
+ int i;
+
+ md = list_first_entry_or_null(&psock->ingress,
+ struct sk_msg_buff, list);
+ if (unlikely(!md))
+ break;
+ i = md->sg_start;
+ do {
+ struct page *page;
+ int n, copy;
+
+ sg = &md->sg_data[i];
+ copy = sg->length;
+ page = sg_page(sg);
+
+ if (copied + copy > len)
+ copy = len - copied;
+
+ n = copy_page_to_iter(page, sg->offset, copy, iter);
+ if (n != copy) {
+ md->sg_start = i;
+ release_sock(sk);
+ smap_release_sock(psock, sk);
+ return -EFAULT;
+ }
+
+ copied += copy;
+ sg->offset += copy;
+ sg->length -= copy;
+ sk_mem_uncharge(sk, copy);
+
+ if (!sg->length) {
+ i++;
+ if (i == MAX_SKB_FRAGS)
+ i = 0;
+ if (!md->skb)
+ put_page(page);
+ }
+ if (copied == len)
+ break;
+ } while (i != md->sg_end);
+ md->sg_start = i;
+
+ if (!sg->length && md->sg_start == md->sg_end) {
+ list_del(&md->list);
+ if (md->skb)
+ consume_skb(md->skb);
+ kfree(md);
+ }
+ }
+
+ if (!copied) {
+ long timeo;
+ int data;
+ int err = 0;
+
+ timeo = sock_rcvtimeo(sk, nonblock);
+ data = bpf_wait_data(sk, psock, flags, timeo, &err);
+
+ if (data) {
+ if (!skb_queue_empty(&sk->sk_receive_queue)) {
+ release_sock(sk);
+ smap_release_sock(psock, sk);
+ copied = tcp_recvmsg(sk, msg, len, nonblock, flags, addr_len);
+ return copied;
+ }
+ goto bytes_ready;
+ }
+
+ if (err)
+ copied = err;
+ }
+
+ release_sock(sk);
+ smap_release_sock(psock, sk);
+ return copied;
+out:
+ rcu_read_unlock();
+ return tcp_recvmsg(sk, msg, len, nonblock, flags, addr_len);
+}
+
+
+static int bpf_tcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t size)
+{
+ int flags = msg->msg_flags | MSG_NO_SHARED_FRAGS;
+ struct sk_msg_buff md = {0};
+ unsigned int sg_copy = 0;
+ struct smap_psock *psock;
+ int copied = 0, err = 0;
+ struct scatterlist *sg;
+ long timeo;
+
+ /* Its possible a sock event or user removed the psock _but_ the ops
+ * have not been reprogrammed yet so we get here. In this case fallback
+ * to tcp_sendmsg. Note this only works because we _only_ ever allow
+ * a single ULP there is no hierarchy here.
+ */
+ rcu_read_lock();
+ psock = smap_psock_sk(sk);
+ if (unlikely(!psock)) {
+ rcu_read_unlock();
+ return tcp_sendmsg(sk, msg, size);
+ }
+
+ /* Increment the psock refcnt to ensure its not released while sending a
+ * message. Required because sk lookup and bpf programs are used in
+ * separate rcu critical sections. Its OK if we lose the map entry
+ * but we can't lose the sock reference.
+ */
+ if (!refcount_inc_not_zero(&psock->refcnt)) {
+ rcu_read_unlock();
+ return tcp_sendmsg(sk, msg, size);
+ }
+
+ sg = md.sg_data;
+ sg_init_marker(sg, MAX_SKB_FRAGS);
+ rcu_read_unlock();
+
+ lock_sock(sk);
+ timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT);
+
+ while (msg_data_left(msg)) {
+ struct sk_msg_buff *m;
+ bool enospc = false;
+ int copy;
+
+ if (sk->sk_err) {
+ err = sk->sk_err;
+ goto out_err;
+ }
+
+ copy = msg_data_left(msg);
+ if (!sk_stream_memory_free(sk))
+ goto wait_for_sndbuf;
+
+ m = psock->cork_bytes ? psock->cork : &md;
+ m->sg_curr = m->sg_copybreak ? m->sg_curr : m->sg_end;
+ err = sk_alloc_sg(sk, copy, m->sg_data,
+ m->sg_start, &m->sg_end, &sg_copy,
+ m->sg_end - 1);
+ if (err) {
+ if (err != -ENOSPC)
+ goto wait_for_memory;
+ enospc = true;
+ copy = sg_copy;
+ }
+
+ err = memcopy_from_iter(sk, m, &msg->msg_iter, copy);
+ if (err < 0) {
+ free_curr_sg(sk, m);
+ goto out_err;
+ }
+
+ psock->sg_size += copy;
+ copied += copy;
+ sg_copy = 0;
+
+ /* When bytes are being corked skip running BPF program and
+ * applying verdict unless there is no more buffer space. In
+ * the ENOSPC case simply run BPF prorgram with currently
+ * accumulated data. We don't have much choice at this point
+ * we could try extending the page frags or chaining complex
+ * frags but even in these cases _eventually_ we will hit an
+ * OOM scenario. More complex recovery schemes may be
+ * implemented in the future, but BPF programs must handle
+ * the case where apply_cork requests are not honored. The
+ * canonical method to verify this is to check data length.
+ */
+ if (psock->cork_bytes) {
+ if (copy > psock->cork_bytes)
+ psock->cork_bytes = 0;
+ else
+ psock->cork_bytes -= copy;
+
+ if (psock->cork_bytes && !enospc)
+ goto out_cork;
+
+ /* All cork bytes accounted for re-run filter */
+ psock->eval = __SK_NONE;
+ psock->cork_bytes = 0;
+ }
+
+ err = bpf_exec_tx_verdict(psock, m, sk, &copied, flags);
+ if (unlikely(err < 0))
+ goto out_err;
+ continue;
+wait_for_sndbuf:
+ set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
+wait_for_memory:
+ err = sk_stream_wait_memory(sk, &timeo);
+ if (err)
+ goto out_err;
+ }
+out_err:
+ if (err < 0)
+ err = sk_stream_error(sk, msg->msg_flags, err);
+out_cork:
+ release_sock(sk);
+ smap_release_sock(psock, sk);
+ return copied ? copied : err;
+}
+
+static int bpf_tcp_sendpage(struct sock *sk, struct page *page,
+ int offset, size_t size, int flags)
+{
+ struct sk_msg_buff md = {0}, *m = NULL;
+ int err = 0, copied = 0;
+ struct smap_psock *psock;
+ struct scatterlist *sg;
+ bool enospc = false;
+
+ rcu_read_lock();
+ psock = smap_psock_sk(sk);
+ if (unlikely(!psock))
+ goto accept;
+
+ if (!refcount_inc_not_zero(&psock->refcnt))
+ goto accept;
+ rcu_read_unlock();
+
+ lock_sock(sk);
+
+ if (psock->cork_bytes) {
+ m = psock->cork;
+ sg = &m->sg_data[m->sg_end];
+ } else {
+ m = &md;
+ sg = m->sg_data;
+ sg_init_marker(sg, MAX_SKB_FRAGS);
+ }
+
+ /* Catch case where ring is full and sendpage is stalled. */
+ if (unlikely(m->sg_end == m->sg_start &&
+ m->sg_data[m->sg_end].length))
+ goto out_err;
+
+ psock->sg_size += size;
+ sg_set_page(sg, page, size, offset);
+ get_page(page);
+ m->sg_copy[m->sg_end] = true;
+ sk_mem_charge(sk, size);
+ m->sg_end++;
+ copied = size;
+
+ if (m->sg_end == MAX_SKB_FRAGS)
+ m->sg_end = 0;
+
+ if (m->sg_end == m->sg_start)
+ enospc = true;
+
+ if (psock->cork_bytes) {
+ if (size > psock->cork_bytes)
+ psock->cork_bytes = 0;
+ else
+ psock->cork_bytes -= size;
+
+ if (psock->cork_bytes && !enospc)
+ goto out_err;
+
+ /* All cork bytes accounted for re-run filter */
+ psock->eval = __SK_NONE;
+ psock->cork_bytes = 0;
+ }
+
+ err = bpf_exec_tx_verdict(psock, m, sk, &copied, flags);
+out_err:
+ release_sock(sk);
+ smap_release_sock(psock, sk);
+ return copied ? copied : err;
+accept:
+ rcu_read_unlock();
+ return tcp_sendpage(sk, page, offset, size, flags);
+}
+
+static void bpf_tcp_msg_add(struct smap_psock *psock,
+ struct sock *sk,
+ struct bpf_prog *tx_msg)
+{
+ struct bpf_prog *orig_tx_msg;
+
+ orig_tx_msg = xchg(&psock->bpf_tx_msg, tx_msg);
+ if (orig_tx_msg)
+ bpf_prog_put(orig_tx_msg);
+}
+
static int bpf_tcp_ulp_register(void)
{
tcp_bpf_proto = tcp_prot;
tcp_bpf_proto.close = bpf_tcp_close;
+ /* Once BPF TX ULP is registered it is never unregistered. It
+ * will be in the ULP list for the lifetime of the system. Doing
+ * duplicate registers is not a problem.
+ */
return tcp_register_ulp(&bpf_tcp_ulp_ops);
}
@@ -206,7 +1133,7 @@ static int smap_verdict_func(struct smap_psock *psock, struct sk_buff *skb)
* when we orphan the skb so that we don't have the possibility
* to reference a stale map.
*/
- TCP_SKB_CB(skb)->bpf.map = NULL;
+ TCP_SKB_CB(skb)->bpf.sk_redir = NULL;
skb->sk = psock->sock;
bpf_compute_data_pointers(skb);
preempt_disable();
@@ -216,31 +1143,76 @@ static int smap_verdict_func(struct smap_psock *psock, struct sk_buff *skb)
/* Moving return codes from UAPI namespace into internal namespace */
return rc == SK_PASS ?
- (TCP_SKB_CB(skb)->bpf.map ? __SK_REDIRECT : __SK_PASS) :
+ (TCP_SKB_CB(skb)->bpf.sk_redir ? __SK_REDIRECT : __SK_PASS) :
__SK_DROP;
}
+static int smap_do_ingress(struct smap_psock *psock, struct sk_buff *skb)
+{
+ struct sock *sk = psock->sock;
+ int copied = 0, num_sg;
+ struct sk_msg_buff *r;
+
+ r = kzalloc(sizeof(struct sk_msg_buff), __GFP_NOWARN | GFP_ATOMIC);
+ if (unlikely(!r))
+ return -EAGAIN;
+
+ if (!sk_rmem_schedule(sk, skb, skb->len)) {
+ kfree(r);
+ return -EAGAIN;
+ }
+
+ sg_init_table(r->sg_data, MAX_SKB_FRAGS);
+ num_sg = skb_to_sgvec(skb, r->sg_data, 0, skb->len);
+ if (unlikely(num_sg < 0)) {
+ kfree(r);
+ return num_sg;
+ }
+ sk_mem_charge(sk, skb->len);
+ copied = skb->len;
+ r->sg_start = 0;
+ r->sg_end = num_sg == MAX_SKB_FRAGS ? 0 : num_sg;
+ r->skb = skb;
+ list_add_tail(&r->list, &psock->ingress);
+ sk->sk_data_ready(sk);
+ return copied;
+}
+
static void smap_do_verdict(struct smap_psock *psock, struct sk_buff *skb)
{
+ struct smap_psock *peer;
struct sock *sk;
+ __u32 in;
int rc;
rc = smap_verdict_func(psock, skb);
switch (rc) {
case __SK_REDIRECT:
sk = do_sk_redirect_map(skb);
- if (likely(sk)) {
- struct smap_psock *peer = smap_psock_sk(sk);
-
- if (likely(peer &&
- test_bit(SMAP_TX_RUNNING, &peer->state) &&
- !sock_flag(sk, SOCK_DEAD) &&
- sock_writeable(sk))) {
- skb_set_owner_w(skb, sk);
- skb_queue_tail(&peer->rxqueue, skb);
- schedule_work(&peer->tx_work);
- break;
- }
+ if (!sk) {
+ kfree_skb(skb);
+ break;
+ }
+
+ peer = smap_psock_sk(sk);
+ in = (TCP_SKB_CB(skb)->bpf.flags) & BPF_F_INGRESS;
+
+ if (unlikely(!peer || sock_flag(sk, SOCK_DEAD) ||
+ !test_bit(SMAP_TX_RUNNING, &peer->state))) {
+ kfree_skb(skb);
+ break;
+ }
+
+ if (!in && sock_writeable(sk)) {
+ skb_set_owner_w(skb, sk);
+ skb_queue_tail(&peer->rxqueue, skb);
+ schedule_work(&peer->tx_work);
+ break;
+ } else if (in &&
+ atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf) {
+ skb_queue_tail(&peer->rxqueue, skb);
+ schedule_work(&peer->tx_work);
+ break;
}
/* Fall through and free skb otherwise */
case __SK_DROP:
@@ -302,15 +1274,23 @@ static void smap_tx_work(struct work_struct *w)
}
while ((skb = skb_dequeue(&psock->rxqueue))) {
+ __u32 flags;
+
rem = skb->len;
off = 0;
start:
+ flags = (TCP_SKB_CB(skb)->bpf.flags) & BPF_F_INGRESS;
do {
- if (likely(psock->sock->sk_socket))
- n = skb_send_sock_locked(psock->sock,
- skb, off, rem);
- else
+ if (likely(psock->sock->sk_socket)) {
+ if (flags)
+ n = smap_do_ingress(psock, skb);
+ else
+ n = skb_send_sock_locked(psock->sock,
+ skb, off, rem);
+ } else {
n = -EINVAL;
+ }
+
if (n <= 0) {
if (n == -EAGAIN) {
/* Retry when space is available */
@@ -328,7 +1308,9 @@ start:
rem -= n;
off += n;
} while (rem);
- kfree_skb(skb);
+
+ if (!flags)
+ kfree_skb(skb);
}
out:
release_sock(psock->sock);
@@ -373,15 +1355,13 @@ static void smap_destroy_psock(struct rcu_head *rcu)
static void smap_release_sock(struct smap_psock *psock, struct sock *sock)
{
- psock->refcnt--;
- if (psock->refcnt)
- return;
-
- tcp_cleanup_ulp(sock);
- smap_stop_sock(psock, sock);
- clear_bit(SMAP_TX_RUNNING, &psock->state);
- rcu_assign_sk_user_data(sock, NULL);
- call_rcu_sched(&psock->rcu, smap_destroy_psock);
+ if (refcount_dec_and_test(&psock->refcnt)) {
+ tcp_cleanup_ulp(sock);
+ smap_stop_sock(psock, sock);
+ clear_bit(SMAP_TX_RUNNING, &psock->state);
+ rcu_assign_sk_user_data(sock, NULL);
+ call_rcu_sched(&psock->rcu, smap_destroy_psock);
+ }
}
static int smap_parse_func_strparser(struct strparser *strp,
@@ -415,7 +1395,6 @@ static int smap_parse_func_strparser(struct strparser *strp,
return rc;
}
-
static int smap_read_sock_done(struct strparser *strp, int err)
{
return err;
@@ -434,7 +1413,6 @@ static int smap_init_sock(struct smap_psock *psock,
}
static void smap_init_progs(struct smap_psock *psock,
- struct bpf_stab *stab,
struct bpf_prog *verdict,
struct bpf_prog *parse)
{
@@ -469,6 +1447,7 @@ static void sock_map_remove_complete(struct bpf_stab *stab)
static void smap_gc_work(struct work_struct *w)
{
struct smap_psock_map_entry *e, *tmp;
+ struct sk_msg_buff *md, *mtmp;
struct smap_psock *psock;
psock = container_of(w, struct smap_psock, gc_work);
@@ -485,33 +1464,50 @@ static void smap_gc_work(struct work_struct *w)
bpf_prog_put(psock->bpf_parse);
if (psock->bpf_verdict)
bpf_prog_put(psock->bpf_verdict);
+ if (psock->bpf_tx_msg)
+ bpf_prog_put(psock->bpf_tx_msg);
+
+ if (psock->cork) {
+ free_start_sg(psock->sock, psock->cork);
+ kfree(psock->cork);
+ }
+
+ list_for_each_entry_safe(md, mtmp, &psock->ingress, list) {
+ list_del(&md->list);
+ free_start_sg(psock->sock, md);
+ kfree(md);
+ }
list_for_each_entry_safe(e, tmp, &psock->maps, list) {
list_del(&e->list);
kfree(e);
}
+ if (psock->sk_redir)
+ sock_put(psock->sk_redir);
+
sock_put(psock->sock);
kfree(psock);
}
-static struct smap_psock *smap_init_psock(struct sock *sock,
- struct bpf_stab *stab)
+static struct smap_psock *smap_init_psock(struct sock *sock, int node)
{
struct smap_psock *psock;
psock = kzalloc_node(sizeof(struct smap_psock),
GFP_ATOMIC | __GFP_NOWARN,
- stab->map.numa_node);
+ node);
if (!psock)
return ERR_PTR(-ENOMEM);
+ psock->eval = __SK_NONE;
psock->sock = sock;
skb_queue_head_init(&psock->rxqueue);
INIT_WORK(&psock->tx_work, smap_tx_work);
INIT_WORK(&psock->gc_work, smap_gc_work);
INIT_LIST_HEAD(&psock->maps);
- psock->refcnt = 1;
+ INIT_LIST_HEAD(&psock->ingress);
+ refcount_set(&psock->refcnt, 1);
rcu_assign_sk_user_data(sock, psock);
sock_hold(sock);
@@ -532,9 +1528,6 @@ static struct bpf_map *sock_map_alloc(union bpf_attr *attr)
attr->value_size != 4 || attr->map_flags & ~SOCK_CREATE_FLAG_MASK)
return ERR_PTR(-EINVAL);
- if (attr->value_size > KMALLOC_MAX_SIZE)
- return ERR_PTR(-E2BIG);
-
err = bpf_tcp_ulp_register();
if (err && err != -EEXIST)
return ERR_PTR(err);
@@ -571,12 +1564,14 @@ free_stab:
return ERR_PTR(err);
}
-static void smap_list_remove(struct smap_psock *psock, struct sock **entry)
+static void smap_list_remove(struct smap_psock *psock,
+ struct sock **entry,
+ struct htab_elem *hash_link)
{
struct smap_psock_map_entry *e, *tmp;
list_for_each_entry_safe(e, tmp, &psock->maps, list) {
- if (e->entry == entry) {
+ if (e->entry == entry || e->hash_link == hash_link) {
list_del(&e->list);
break;
}
@@ -614,7 +1609,7 @@ static void sock_map_free(struct bpf_map *map)
* to be null and queued for garbage collection.
*/
if (likely(psock)) {
- smap_list_remove(psock, &stab->sock_map[i]);
+ smap_list_remove(psock, &stab->sock_map[i], NULL);
smap_release_sock(psock, sock);
}
write_unlock_bh(&sock->sk_callback_lock);
@@ -673,7 +1668,7 @@ static int sock_map_delete_elem(struct bpf_map *map, void *key)
if (psock->bpf_parse)
smap_stop_sock(psock, sock);
- smap_list_remove(psock, &stab->sock_map[k]);
+ smap_list_remove(psock, &stab->sock_map[k], NULL);
smap_release_sock(psock, sock);
out:
write_unlock_bh(&sock->sk_callback_lock);
@@ -708,38 +1703,26 @@ out:
* - sock_map must use READ_ONCE and (cmp)xchg operations
* - BPF verdict/parse programs must use READ_ONCE and xchg operations
*/
-static int sock_map_ctx_update_elem(struct bpf_sock_ops_kern *skops,
- struct bpf_map *map,
- void *key, u64 flags)
+
+static int __sock_map_ctx_update_elem(struct bpf_map *map,
+ struct bpf_sock_progs *progs,
+ struct sock *sock,
+ struct sock **map_link,
+ void *key)
{
- struct bpf_stab *stab = container_of(map, struct bpf_stab, map);
+ struct bpf_prog *verdict, *parse, *tx_msg;
struct smap_psock_map_entry *e = NULL;
- struct bpf_prog *verdict, *parse;
- struct sock *osock, *sock;
struct smap_psock *psock;
- u32 i = *(u32 *)key;
- int err;
-
- if (unlikely(flags > BPF_EXIST))
- return -EINVAL;
-
- if (unlikely(i >= stab->map.max_entries))
- return -E2BIG;
-
- sock = READ_ONCE(stab->sock_map[i]);
- if (flags == BPF_EXIST && !sock)
- return -ENOENT;
- else if (flags == BPF_NOEXIST && sock)
- return -EEXIST;
-
- sock = skops->sk;
+ bool new = false;
+ int err = 0;
/* 1. If sock map has BPF programs those will be inherited by the
* sock being added. If the sock is already attached to BPF programs
* this results in an error.
*/
- verdict = READ_ONCE(stab->bpf_verdict);
- parse = READ_ONCE(stab->bpf_parse);
+ verdict = READ_ONCE(progs->bpf_verdict);
+ parse = READ_ONCE(progs->bpf_parse);
+ tx_msg = READ_ONCE(progs->bpf_tx_msg);
if (parse && verdict) {
/* bpf prog refcnt may be zero if a concurrent attach operation
@@ -747,17 +1730,28 @@ static int sock_map_ctx_update_elem(struct bpf_sock_ops_kern *skops,
* we increment the refcnt. If this is the case abort with an
* error.
*/
- verdict = bpf_prog_inc_not_zero(stab->bpf_verdict);
+ verdict = bpf_prog_inc_not_zero(verdict);
if (IS_ERR(verdict))
return PTR_ERR(verdict);
- parse = bpf_prog_inc_not_zero(stab->bpf_parse);
+ parse = bpf_prog_inc_not_zero(parse);
if (IS_ERR(parse)) {
bpf_prog_put(verdict);
return PTR_ERR(parse);
}
}
+ if (tx_msg) {
+ tx_msg = bpf_prog_inc_not_zero(tx_msg);
+ if (IS_ERR(tx_msg)) {
+ if (parse && verdict) {
+ bpf_prog_put(parse);
+ bpf_prog_put(verdict);
+ }
+ return PTR_ERR(tx_msg);
+ }
+ }
+
write_lock_bh(&sock->sk_callback_lock);
psock = smap_psock_sk(sock);
@@ -772,36 +1766,49 @@ static int sock_map_ctx_update_elem(struct bpf_sock_ops_kern *skops,
err = -EBUSY;
goto out_progs;
}
- psock->refcnt++;
+ if (READ_ONCE(psock->bpf_tx_msg) && tx_msg) {
+ err = -EBUSY;
+ goto out_progs;
+ }
+ if (!refcount_inc_not_zero(&psock->refcnt)) {
+ err = -EAGAIN;
+ goto out_progs;
+ }
} else {
- psock = smap_init_psock(sock, stab);
+ psock = smap_init_psock(sock, map->numa_node);
if (IS_ERR(psock)) {
err = PTR_ERR(psock);
goto out_progs;
}
- err = tcp_set_ulp_id(sock, TCP_ULP_BPF);
- if (err)
- goto out_progs;
-
set_bit(SMAP_TX_RUNNING, &psock->state);
+ new = true;
}
- e = kzalloc(sizeof(*e), GFP_ATOMIC | __GFP_NOWARN);
- if (!e) {
- err = -ENOMEM;
- goto out_progs;
+ if (map_link) {
+ e = kzalloc(sizeof(*e), GFP_ATOMIC | __GFP_NOWARN);
+ if (!e) {
+ err = -ENOMEM;
+ goto out_progs;
+ }
}
- e->entry = &stab->sock_map[i];
/* 3. At this point we have a reference to a valid psock that is
* running. Attach any BPF programs needed.
*/
+ if (tx_msg)
+ bpf_tcp_msg_add(psock, sock, tx_msg);
+ if (new) {
+ err = tcp_set_ulp_id(sock, TCP_ULP_BPF);
+ if (err)
+ goto out_free;
+ }
+
if (parse && verdict && !psock->strp_enabled) {
err = smap_init_sock(psock, sock);
if (err)
goto out_free;
- smap_init_progs(psock, stab, verdict, parse);
+ smap_init_progs(psock, verdict, parse);
smap_start_sock(psock, sock);
}
@@ -810,47 +1817,93 @@ static int sock_map_ctx_update_elem(struct bpf_sock_ops_kern *skops,
* it with. Because we can only have a single set of programs if
* old_sock has a strp we can stop it.
*/
- list_add_tail(&e->list, &psock->maps);
+ if (map_link) {
+ e->entry = map_link;
+ list_add_tail(&e->list, &psock->maps);
+ }
+ write_unlock_bh(&sock->sk_callback_lock);
+ return err;
+out_free:
+ smap_release_sock(psock, sock);
+out_progs:
+ if (parse && verdict) {
+ bpf_prog_put(parse);
+ bpf_prog_put(verdict);
+ }
+ if (tx_msg)
+ bpf_prog_put(tx_msg);
write_unlock_bh(&sock->sk_callback_lock);
+ kfree(e);
+ return err;
+}
+
+static int sock_map_ctx_update_elem(struct bpf_sock_ops_kern *skops,
+ struct bpf_map *map,
+ void *key, u64 flags)
+{
+ struct bpf_stab *stab = container_of(map, struct bpf_stab, map);
+ struct bpf_sock_progs *progs = &stab->progs;
+ struct sock *osock, *sock;
+ u32 i = *(u32 *)key;
+ int err;
+
+ if (unlikely(flags > BPF_EXIST))
+ return -EINVAL;
+
+ if (unlikely(i >= stab->map.max_entries))
+ return -E2BIG;
+
+ sock = READ_ONCE(stab->sock_map[i]);
+ if (flags == BPF_EXIST && !sock)
+ return -ENOENT;
+ else if (flags == BPF_NOEXIST && sock)
+ return -EEXIST;
+
+ sock = skops->sk;
+ err = __sock_map_ctx_update_elem(map, progs, sock, &stab->sock_map[i],
+ key);
+ if (err)
+ goto out;
osock = xchg(&stab->sock_map[i], sock);
if (osock) {
struct smap_psock *opsock = smap_psock_sk(osock);
write_lock_bh(&osock->sk_callback_lock);
- if (osock != sock && parse)
- smap_stop_sock(opsock, osock);
- smap_list_remove(opsock, &stab->sock_map[i]);
+ smap_list_remove(opsock, &stab->sock_map[i], NULL);
smap_release_sock(opsock, osock);
write_unlock_bh(&osock->sk_callback_lock);
}
- return 0;
-out_free:
- smap_release_sock(psock, sock);
-out_progs:
- if (verdict)
- bpf_prog_put(verdict);
- if (parse)
- bpf_prog_put(parse);
- write_unlock_bh(&sock->sk_callback_lock);
- kfree(e);
+out:
return err;
}
int sock_map_prog(struct bpf_map *map, struct bpf_prog *prog, u32 type)
{
- struct bpf_stab *stab = container_of(map, struct bpf_stab, map);
+ struct bpf_sock_progs *progs;
struct bpf_prog *orig;
- if (unlikely(map->map_type != BPF_MAP_TYPE_SOCKMAP))
+ if (map->map_type == BPF_MAP_TYPE_SOCKMAP) {
+ struct bpf_stab *stab = container_of(map, struct bpf_stab, map);
+
+ progs = &stab->progs;
+ } else if (map->map_type == BPF_MAP_TYPE_SOCKHASH) {
+ struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
+
+ progs = &htab->progs;
+ } else {
return -EINVAL;
+ }
switch (type) {
+ case BPF_SK_MSG_VERDICT:
+ orig = xchg(&progs->bpf_tx_msg, prog);
+ break;
case BPF_SK_SKB_STREAM_PARSER:
- orig = xchg(&stab->bpf_parse, prog);
+ orig = xchg(&progs->bpf_parse, prog);
break;
case BPF_SK_SKB_STREAM_VERDICT:
- orig = xchg(&stab->bpf_verdict, prog);
+ orig = xchg(&progs->bpf_verdict, prog);
break;
default:
return -EOPNOTSUPP;
@@ -896,19 +1949,423 @@ static int sock_map_update_elem(struct bpf_map *map,
return err;
}
-static void sock_map_release(struct bpf_map *map, struct file *map_file)
+static void sock_map_release(struct bpf_map *map)
{
- struct bpf_stab *stab = container_of(map, struct bpf_stab, map);
+ struct bpf_sock_progs *progs;
struct bpf_prog *orig;
- orig = xchg(&stab->bpf_parse, NULL);
+ if (map->map_type == BPF_MAP_TYPE_SOCKMAP) {
+ struct bpf_stab *stab = container_of(map, struct bpf_stab, map);
+
+ progs = &stab->progs;
+ } else {
+ struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
+
+ progs = &htab->progs;
+ }
+
+ orig = xchg(&progs->bpf_parse, NULL);
+ if (orig)
+ bpf_prog_put(orig);
+ orig = xchg(&progs->bpf_verdict, NULL);
if (orig)
bpf_prog_put(orig);
- orig = xchg(&stab->bpf_verdict, NULL);
+
+ orig = xchg(&progs->bpf_tx_msg, NULL);
if (orig)
bpf_prog_put(orig);
}
+static struct bpf_map *sock_hash_alloc(union bpf_attr *attr)
+{
+ struct bpf_htab *htab;
+ int i, err;
+ u64 cost;
+
+ if (!capable(CAP_NET_ADMIN))
+ return ERR_PTR(-EPERM);
+
+ /* check sanity of attributes */
+ if (attr->max_entries == 0 || attr->value_size != 4 ||
+ attr->map_flags & ~SOCK_CREATE_FLAG_MASK)
+ return ERR_PTR(-EINVAL);
+
+ if (attr->key_size > MAX_BPF_STACK)
+ /* eBPF programs initialize keys on stack, so they cannot be
+ * larger than max stack size
+ */
+ return ERR_PTR(-E2BIG);
+
+ err = bpf_tcp_ulp_register();
+ if (err && err != -EEXIST)
+ return ERR_PTR(err);
+
+ htab = kzalloc(sizeof(*htab), GFP_USER);
+ if (!htab)
+ return ERR_PTR(-ENOMEM);
+
+ bpf_map_init_from_attr(&htab->map, attr);
+
+ htab->n_buckets = roundup_pow_of_two(htab->map.max_entries);
+ htab->elem_size = sizeof(struct htab_elem) +
+ round_up(htab->map.key_size, 8);
+ err = -EINVAL;
+ if (htab->n_buckets == 0 ||
+ htab->n_buckets > U32_MAX / sizeof(struct bucket))
+ goto free_htab;
+
+ cost = (u64) htab->n_buckets * sizeof(struct bucket) +
+ (u64) htab->elem_size * htab->map.max_entries;
+
+ if (cost >= U32_MAX - PAGE_SIZE)
+ goto free_htab;
+
+ htab->map.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT;
+ err = bpf_map_precharge_memlock(htab->map.pages);
+ if (err)
+ goto free_htab;
+
+ err = -ENOMEM;
+ htab->buckets = bpf_map_area_alloc(
+ htab->n_buckets * sizeof(struct bucket),
+ htab->map.numa_node);
+ if (!htab->buckets)
+ goto free_htab;
+
+ for (i = 0; i < htab->n_buckets; i++) {
+ INIT_HLIST_HEAD(&htab->buckets[i].head);
+ raw_spin_lock_init(&htab->buckets[i].lock);
+ }
+
+ return &htab->map;
+free_htab:
+ kfree(htab);
+ return ERR_PTR(err);
+}
+
+static inline struct bucket *__select_bucket(struct bpf_htab *htab, u32 hash)
+{
+ return &htab->buckets[hash & (htab->n_buckets - 1)];
+}
+
+static inline struct hlist_head *select_bucket(struct bpf_htab *htab, u32 hash)
+{
+ return &__select_bucket(htab, hash)->head;
+}
+
+static void sock_hash_free(struct bpf_map *map)
+{
+ struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
+ int i;
+
+ synchronize_rcu();
+
+ /* At this point no update, lookup or delete operations can happen.
+ * However, be aware we can still get a socket state event updates,
+ * and data ready callabacks that reference the psock from sk_user_data
+ * Also psock worker threads are still in-flight. So smap_release_sock
+ * will only free the psock after cancel_sync on the worker threads
+ * and a grace period expire to ensure psock is really safe to remove.
+ */
+ rcu_read_lock();
+ for (i = 0; i < htab->n_buckets; i++) {
+ struct hlist_head *head = select_bucket(htab, i);
+ struct hlist_node *n;
+ struct htab_elem *l;
+
+ hlist_for_each_entry_safe(l, n, head, hash_node) {
+ struct sock *sock = l->sk;
+ struct smap_psock *psock;
+
+ hlist_del_rcu(&l->hash_node);
+ write_lock_bh(&sock->sk_callback_lock);
+ psock = smap_psock_sk(sock);
+ /* This check handles a racing sock event that can get
+ * the sk_callback_lock before this case but after xchg
+ * causing the refcnt to hit zero and sock user data
+ * (psock) to be null and queued for garbage collection.
+ */
+ if (likely(psock)) {
+ smap_list_remove(psock, NULL, l);
+ smap_release_sock(psock, sock);
+ }
+ write_unlock_bh(&sock->sk_callback_lock);
+ kfree(l);
+ }
+ }
+ rcu_read_unlock();
+ bpf_map_area_free(htab->buckets);
+ kfree(htab);
+}
+
+static struct htab_elem *alloc_sock_hash_elem(struct bpf_htab *htab,
+ void *key, u32 key_size, u32 hash,
+ struct sock *sk,
+ struct htab_elem *old_elem)
+{
+ struct htab_elem *l_new;
+
+ if (atomic_inc_return(&htab->count) > htab->map.max_entries) {
+ if (!old_elem) {
+ atomic_dec(&htab->count);
+ return ERR_PTR(-E2BIG);
+ }
+ }
+ l_new = kmalloc_node(htab->elem_size, GFP_ATOMIC | __GFP_NOWARN,
+ htab->map.numa_node);
+ if (!l_new)
+ return ERR_PTR(-ENOMEM);
+
+ memcpy(l_new->key, key, key_size);
+ l_new->sk = sk;
+ l_new->hash = hash;
+ return l_new;
+}
+
+static struct htab_elem *lookup_elem_raw(struct hlist_head *head,
+ u32 hash, void *key, u32 key_size)
+{
+ struct htab_elem *l;
+
+ hlist_for_each_entry_rcu(l, head, hash_node) {
+ if (l->hash == hash && !memcmp(&l->key, key, key_size))
+ return l;
+ }
+
+ return NULL;
+}
+
+static inline u32 htab_map_hash(const void *key, u32 key_len)
+{
+ return jhash(key, key_len, 0);
+}
+
+static int sock_hash_get_next_key(struct bpf_map *map,
+ void *key, void *next_key)
+{
+ struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
+ struct htab_elem *l, *next_l;
+ struct hlist_head *h;
+ u32 hash, key_size;
+ int i = 0;
+
+ WARN_ON_ONCE(!rcu_read_lock_held());
+
+ key_size = map->key_size;
+ if (!key)
+ goto find_first_elem;
+ hash = htab_map_hash(key, key_size);
+ h = select_bucket(htab, hash);
+
+ l = lookup_elem_raw(h, hash, key, key_size);
+ if (!l)
+ goto find_first_elem;
+ next_l = hlist_entry_safe(
+ rcu_dereference_raw(hlist_next_rcu(&l->hash_node)),
+ struct htab_elem, hash_node);
+ if (next_l) {
+ memcpy(next_key, next_l->key, key_size);
+ return 0;
+ }
+
+ /* no more elements in this hash list, go to the next bucket */
+ i = hash & (htab->n_buckets - 1);
+ i++;
+
+find_first_elem:
+ /* iterate over buckets */
+ for (; i < htab->n_buckets; i++) {
+ h = select_bucket(htab, i);
+
+ /* pick first element in the bucket */
+ next_l = hlist_entry_safe(
+ rcu_dereference_raw(hlist_first_rcu(h)),
+ struct htab_elem, hash_node);
+ if (next_l) {
+ /* if it's not empty, just return it */
+ memcpy(next_key, next_l->key, key_size);
+ return 0;
+ }
+ }
+
+ /* iterated over all buckets and all elements */
+ return -ENOENT;
+}
+
+static int sock_hash_ctx_update_elem(struct bpf_sock_ops_kern *skops,
+ struct bpf_map *map,
+ void *key, u64 map_flags)
+{
+ struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
+ struct bpf_sock_progs *progs = &htab->progs;
+ struct htab_elem *l_new = NULL, *l_old;
+ struct smap_psock_map_entry *e = NULL;
+ struct hlist_head *head;
+ struct smap_psock *psock;
+ u32 key_size, hash;
+ struct sock *sock;
+ struct bucket *b;
+ int err;
+
+ sock = skops->sk;
+
+ if (sock->sk_type != SOCK_STREAM ||
+ sock->sk_protocol != IPPROTO_TCP)
+ return -EOPNOTSUPP;
+
+ if (unlikely(map_flags > BPF_EXIST))
+ return -EINVAL;
+
+ e = kzalloc(sizeof(*e), GFP_ATOMIC | __GFP_NOWARN);
+ if (!e)
+ return -ENOMEM;
+
+ WARN_ON_ONCE(!rcu_read_lock_held());
+ key_size = map->key_size;
+ hash = htab_map_hash(key, key_size);
+ b = __select_bucket(htab, hash);
+ head = &b->head;
+
+ err = __sock_map_ctx_update_elem(map, progs, sock, NULL, key);
+ if (err)
+ goto err;
+
+ /* bpf_map_update_elem() can be called in_irq() */
+ raw_spin_lock_bh(&b->lock);
+ l_old = lookup_elem_raw(head, hash, key, key_size);
+ if (l_old && map_flags == BPF_NOEXIST) {
+ err = -EEXIST;
+ goto bucket_err;
+ }
+ if (!l_old && map_flags == BPF_EXIST) {
+ err = -ENOENT;
+ goto bucket_err;
+ }
+
+ l_new = alloc_sock_hash_elem(htab, key, key_size, hash, sock, l_old);
+ if (IS_ERR(l_new)) {
+ err = PTR_ERR(l_new);
+ goto bucket_err;
+ }
+
+ psock = smap_psock_sk(sock);
+ if (unlikely(!psock)) {
+ err = -EINVAL;
+ goto bucket_err;
+ }
+
+ e->hash_link = l_new;
+ e->htab = container_of(map, struct bpf_htab, map);
+ list_add_tail(&e->list, &psock->maps);
+
+ /* add new element to the head of the list, so that
+ * concurrent search will find it before old elem
+ */
+ hlist_add_head_rcu(&l_new->hash_node, head);
+ if (l_old) {
+ psock = smap_psock_sk(l_old->sk);
+
+ hlist_del_rcu(&l_old->hash_node);
+ smap_list_remove(psock, NULL, l_old);
+ smap_release_sock(psock, l_old->sk);
+ free_htab_elem(htab, l_old);
+ }
+ raw_spin_unlock_bh(&b->lock);
+ return 0;
+bucket_err:
+ raw_spin_unlock_bh(&b->lock);
+err:
+ kfree(e);
+ psock = smap_psock_sk(sock);
+ if (psock)
+ smap_release_sock(psock, sock);
+ return err;
+}
+
+static int sock_hash_update_elem(struct bpf_map *map,
+ void *key, void *value, u64 flags)
+{
+ struct bpf_sock_ops_kern skops;
+ u32 fd = *(u32 *)value;
+ struct socket *socket;
+ int err;
+
+ socket = sockfd_lookup(fd, &err);
+ if (!socket)
+ return err;
+
+ skops.sk = socket->sk;
+ if (!skops.sk) {
+ fput(socket->file);
+ return -EINVAL;
+ }
+
+ err = sock_hash_ctx_update_elem(&skops, map, key, flags);
+ fput(socket->file);
+ return err;
+}
+
+static int sock_hash_delete_elem(struct bpf_map *map, void *key)
+{
+ struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
+ struct hlist_head *head;
+ struct bucket *b;
+ struct htab_elem *l;
+ u32 hash, key_size;
+ int ret = -ENOENT;
+
+ key_size = map->key_size;
+ hash = htab_map_hash(key, key_size);
+ b = __select_bucket(htab, hash);
+ head = &b->head;
+
+ raw_spin_lock_bh(&b->lock);
+ l = lookup_elem_raw(head, hash, key, key_size);
+ if (l) {
+ struct sock *sock = l->sk;
+ struct smap_psock *psock;
+
+ hlist_del_rcu(&l->hash_node);
+ write_lock_bh(&sock->sk_callback_lock);
+ psock = smap_psock_sk(sock);
+ /* This check handles a racing sock event that can get the
+ * sk_callback_lock before this case but after xchg happens
+ * causing the refcnt to hit zero and sock user data (psock)
+ * to be null and queued for garbage collection.
+ */
+ if (likely(psock)) {
+ smap_list_remove(psock, NULL, l);
+ smap_release_sock(psock, sock);
+ }
+ write_unlock_bh(&sock->sk_callback_lock);
+ free_htab_elem(htab, l);
+ ret = 0;
+ }
+ raw_spin_unlock_bh(&b->lock);
+ return ret;
+}
+
+struct sock *__sock_hash_lookup_elem(struct bpf_map *map, void *key)
+{
+ struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
+ struct hlist_head *head;
+ struct htab_elem *l;
+ u32 key_size, hash;
+ struct bucket *b;
+ struct sock *sk;
+
+ key_size = map->key_size;
+ hash = htab_map_hash(key, key_size);
+ b = __select_bucket(htab, hash);
+ head = &b->head;
+
+ raw_spin_lock_bh(&b->lock);
+ l = lookup_elem_raw(head, hash, key, key_size);
+ sk = l ? l->sk : NULL;
+ raw_spin_unlock_bh(&b->lock);
+ return sk;
+}
+
const struct bpf_map_ops sock_map_ops = {
.map_alloc = sock_map_alloc,
.map_free = sock_map_free,
@@ -916,7 +2373,16 @@ const struct bpf_map_ops sock_map_ops = {
.map_get_next_key = sock_map_get_next_key,
.map_update_elem = sock_map_update_elem,
.map_delete_elem = sock_map_delete_elem,
- .map_release = sock_map_release,
+ .map_release_uref = sock_map_release,
+};
+
+const struct bpf_map_ops sock_hash_ops = {
+ .map_alloc = sock_hash_alloc,
+ .map_free = sock_hash_free,
+ .map_lookup_elem = sock_map_lookup,
+ .map_get_next_key = sock_hash_get_next_key,
+ .map_update_elem = sock_hash_update_elem,
+ .map_delete_elem = sock_hash_delete_elem,
};
BPF_CALL_4(bpf_sock_map_update, struct bpf_sock_ops_kern *, bpf_sock,
@@ -936,3 +2402,21 @@ const struct bpf_func_proto bpf_sock_map_update_proto = {
.arg3_type = ARG_PTR_TO_MAP_KEY,
.arg4_type = ARG_ANYTHING,
};
+
+BPF_CALL_4(bpf_sock_hash_update, struct bpf_sock_ops_kern *, bpf_sock,
+ struct bpf_map *, map, void *, key, u64, flags)
+{
+ WARN_ON_ONCE(!rcu_read_lock_held());
+ return sock_hash_ctx_update_elem(bpf_sock, map, key, flags);
+}
+
+const struct bpf_func_proto bpf_sock_hash_update_proto = {
+ .func = bpf_sock_hash_update,
+ .gpl_only = false,
+ .pkt_access = true,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_CONST_MAP_PTR,
+ .arg3_type = ARG_PTR_TO_MAP_KEY,
+ .arg4_type = ARG_ANYTHING,
+};
diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c
index b0ecf43f5894..b675a3f3d141 100644
--- a/kernel/bpf/stackmap.c
+++ b/kernel/bpf/stackmap.c
@@ -9,16 +9,20 @@
#include <linux/filter.h>
#include <linux/stacktrace.h>
#include <linux/perf_event.h>
+#include <linux/elf.h>
+#include <linux/pagemap.h>
+#include <linux/irq_work.h>
#include "percpu_freelist.h"
-#define STACK_CREATE_FLAG_MASK \
- (BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY)
+#define STACK_CREATE_FLAG_MASK \
+ (BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY | \
+ BPF_F_STACK_BUILD_ID)
struct stack_map_bucket {
struct pcpu_freelist_node fnode;
u32 hash;
u32 nr;
- u64 ip[];
+ u64 data[];
};
struct bpf_stack_map {
@@ -29,6 +33,34 @@ struct bpf_stack_map {
struct stack_map_bucket *buckets[];
};
+/* irq_work to run up_read() for build_id lookup in nmi context */
+struct stack_map_irq_work {
+ struct irq_work irq_work;
+ struct rw_semaphore *sem;
+};
+
+static void do_up_read(struct irq_work *entry)
+{
+ struct stack_map_irq_work *work;
+
+ work = container_of(entry, struct stack_map_irq_work, irq_work);
+ up_read(work->sem);
+ work->sem = NULL;
+}
+
+static DEFINE_PER_CPU(struct stack_map_irq_work, up_read_work);
+
+static inline bool stack_map_use_build_id(struct bpf_map *map)
+{
+ return (map->map_flags & BPF_F_STACK_BUILD_ID);
+}
+
+static inline int stack_map_data_size(struct bpf_map *map)
+{
+ return stack_map_use_build_id(map) ?
+ sizeof(struct bpf_stack_build_id) : sizeof(u64);
+}
+
static int prealloc_elems_and_freelist(struct bpf_stack_map *smap)
{
u32 elem_size = sizeof(struct stack_map_bucket) + smap->map.value_size;
@@ -68,8 +100,16 @@ static struct bpf_map *stack_map_alloc(union bpf_attr *attr)
/* check sanity of attributes */
if (attr->max_entries == 0 || attr->key_size != 4 ||
- value_size < 8 || value_size % 8 ||
- value_size / 8 > sysctl_perf_event_max_stack)
+ value_size < 8 || value_size % 8)
+ return ERR_PTR(-EINVAL);
+
+ BUILD_BUG_ON(sizeof(struct bpf_stack_build_id) % sizeof(u64));
+ if (attr->map_flags & BPF_F_STACK_BUILD_ID) {
+ if (value_size % sizeof(struct bpf_stack_build_id) ||
+ value_size / sizeof(struct bpf_stack_build_id)
+ > sysctl_perf_event_max_stack)
+ return ERR_PTR(-EINVAL);
+ } else if (value_size / 8 > sysctl_perf_event_max_stack)
return ERR_PTR(-EINVAL);
/* hash table size must be power of 2 */
@@ -114,13 +154,194 @@ free_smap:
return ERR_PTR(err);
}
+#define BPF_BUILD_ID 3
+/*
+ * Parse build id from the note segment. This logic can be shared between
+ * 32-bit and 64-bit system, because Elf32_Nhdr and Elf64_Nhdr are
+ * identical.
+ */
+static inline int stack_map_parse_build_id(void *page_addr,
+ unsigned char *build_id,
+ void *note_start,
+ Elf32_Word note_size)
+{
+ Elf32_Word note_offs = 0, new_offs;
+
+ /* check for overflow */
+ if (note_start < page_addr || note_start + note_size < note_start)
+ return -EINVAL;
+
+ /* only supports note that fits in the first page */
+ if (note_start + note_size > page_addr + PAGE_SIZE)
+ return -EINVAL;
+
+ while (note_offs + sizeof(Elf32_Nhdr) < note_size) {
+ Elf32_Nhdr *nhdr = (Elf32_Nhdr *)(note_start + note_offs);
+
+ if (nhdr->n_type == BPF_BUILD_ID &&
+ nhdr->n_namesz == sizeof("GNU") &&
+ nhdr->n_descsz == BPF_BUILD_ID_SIZE) {
+ memcpy(build_id,
+ note_start + note_offs +
+ ALIGN(sizeof("GNU"), 4) + sizeof(Elf32_Nhdr),
+ BPF_BUILD_ID_SIZE);
+ return 0;
+ }
+ new_offs = note_offs + sizeof(Elf32_Nhdr) +
+ ALIGN(nhdr->n_namesz, 4) + ALIGN(nhdr->n_descsz, 4);
+ if (new_offs <= note_offs) /* overflow */
+ break;
+ note_offs = new_offs;
+ }
+ return -EINVAL;
+}
+
+/* Parse build ID from 32-bit ELF */
+static int stack_map_get_build_id_32(void *page_addr,
+ unsigned char *build_id)
+{
+ Elf32_Ehdr *ehdr = (Elf32_Ehdr *)page_addr;
+ Elf32_Phdr *phdr;
+ int i;
+
+ /* only supports phdr that fits in one page */
+ if (ehdr->e_phnum >
+ (PAGE_SIZE - sizeof(Elf32_Ehdr)) / sizeof(Elf32_Phdr))
+ return -EINVAL;
+
+ phdr = (Elf32_Phdr *)(page_addr + sizeof(Elf32_Ehdr));
+
+ for (i = 0; i < ehdr->e_phnum; ++i)
+ if (phdr[i].p_type == PT_NOTE)
+ return stack_map_parse_build_id(page_addr, build_id,
+ page_addr + phdr[i].p_offset,
+ phdr[i].p_filesz);
+ return -EINVAL;
+}
+
+/* Parse build ID from 64-bit ELF */
+static int stack_map_get_build_id_64(void *page_addr,
+ unsigned char *build_id)
+{
+ Elf64_Ehdr *ehdr = (Elf64_Ehdr *)page_addr;
+ Elf64_Phdr *phdr;
+ int i;
+
+ /* only supports phdr that fits in one page */
+ if (ehdr->e_phnum >
+ (PAGE_SIZE - sizeof(Elf64_Ehdr)) / sizeof(Elf64_Phdr))
+ return -EINVAL;
+
+ phdr = (Elf64_Phdr *)(page_addr + sizeof(Elf64_Ehdr));
+
+ for (i = 0; i < ehdr->e_phnum; ++i)
+ if (phdr[i].p_type == PT_NOTE)
+ return stack_map_parse_build_id(page_addr, build_id,
+ page_addr + phdr[i].p_offset,
+ phdr[i].p_filesz);
+ return -EINVAL;
+}
+
+/* Parse build ID of ELF file mapped to vma */
+static int stack_map_get_build_id(struct vm_area_struct *vma,
+ unsigned char *build_id)
+{
+ Elf32_Ehdr *ehdr;
+ struct page *page;
+ void *page_addr;
+ int ret;
+
+ /* only works for page backed storage */
+ if (!vma->vm_file)
+ return -EINVAL;
+
+ page = find_get_page(vma->vm_file->f_mapping, 0);
+ if (!page)
+ return -EFAULT; /* page not mapped */
+
+ ret = -EINVAL;
+ page_addr = page_address(page);
+ ehdr = (Elf32_Ehdr *)page_addr;
+
+ /* compare magic x7f "ELF" */
+ if (memcmp(ehdr->e_ident, ELFMAG, SELFMAG) != 0)
+ goto out;
+
+ /* only support executable file and shared object file */
+ if (ehdr->e_type != ET_EXEC && ehdr->e_type != ET_DYN)
+ goto out;
+
+ if (ehdr->e_ident[EI_CLASS] == ELFCLASS32)
+ ret = stack_map_get_build_id_32(page_addr, build_id);
+ else if (ehdr->e_ident[EI_CLASS] == ELFCLASS64)
+ ret = stack_map_get_build_id_64(page_addr, build_id);
+out:
+ put_page(page);
+ return ret;
+}
+
+static void stack_map_get_build_id_offset(struct bpf_stack_build_id *id_offs,
+ u64 *ips, u32 trace_nr, bool user)
+{
+ int i;
+ struct vm_area_struct *vma;
+ bool irq_work_busy = false;
+ struct stack_map_irq_work *work = NULL;
+
+ if (in_nmi()) {
+ work = this_cpu_ptr(&up_read_work);
+ if (work->irq_work.flags & IRQ_WORK_BUSY)
+ /* cannot queue more up_read, fallback */
+ irq_work_busy = true;
+ }
+
+ /*
+ * We cannot do up_read() in nmi context. To do build_id lookup
+ * in nmi context, we need to run up_read() in irq_work. We use
+ * a percpu variable to do the irq_work. If the irq_work is
+ * already used by another lookup, we fall back to report ips.
+ *
+ * Same fallback is used for kernel stack (!user) on a stackmap
+ * with build_id.
+ */
+ if (!user || !current || !current->mm || irq_work_busy ||
+ down_read_trylock(&current->mm->mmap_sem) == 0) {
+ /* cannot access current->mm, fall back to ips */
+ for (i = 0; i < trace_nr; i++) {
+ id_offs[i].status = BPF_STACK_BUILD_ID_IP;
+ id_offs[i].ip = ips[i];
+ }
+ return;
+ }
+
+ for (i = 0; i < trace_nr; i++) {
+ vma = find_vma(current->mm, ips[i]);
+ if (!vma || stack_map_get_build_id(vma, id_offs[i].build_id)) {
+ /* per entry fall back to ips */
+ id_offs[i].status = BPF_STACK_BUILD_ID_IP;
+ id_offs[i].ip = ips[i];
+ continue;
+ }
+ id_offs[i].offset = (vma->vm_pgoff << PAGE_SHIFT) + ips[i]
+ - vma->vm_start;
+ id_offs[i].status = BPF_STACK_BUILD_ID_VALID;
+ }
+
+ if (!work) {
+ up_read(&current->mm->mmap_sem);
+ } else {
+ work->sem = &current->mm->mmap_sem;
+ irq_work_queue(&work->irq_work);
+ }
+}
+
BPF_CALL_3(bpf_get_stackid, struct pt_regs *, regs, struct bpf_map *, map,
u64, flags)
{
struct bpf_stack_map *smap = container_of(map, struct bpf_stack_map, map);
struct perf_callchain_entry *trace;
struct stack_map_bucket *bucket, *new_bucket, *old_bucket;
- u32 max_depth = map->value_size / 8;
+ u32 max_depth = map->value_size / stack_map_data_size(map);
/* stack_map_alloc() checks that max_depth <= sysctl_perf_event_max_stack */
u32 init_nr = sysctl_perf_event_max_stack - max_depth;
u32 skip = flags & BPF_F_SKIP_FIELD_MASK;
@@ -128,6 +349,7 @@ BPF_CALL_3(bpf_get_stackid, struct pt_regs *, regs, struct bpf_map *, map,
bool user = flags & BPF_F_USER_STACK;
bool kernel = !user;
u64 *ips;
+ bool hash_matches;
if (unlikely(flags & ~(BPF_F_SKIP_FIELD_MASK | BPF_F_USER_STACK |
BPF_F_FAST_STACK_CMP | BPF_F_REUSE_STACKID)))
@@ -156,24 +378,45 @@ BPF_CALL_3(bpf_get_stackid, struct pt_regs *, regs, struct bpf_map *, map,
id = hash & (smap->n_buckets - 1);
bucket = READ_ONCE(smap->buckets[id]);
- if (bucket && bucket->hash == hash) {
- if (flags & BPF_F_FAST_STACK_CMP)
+ hash_matches = bucket && bucket->hash == hash;
+ /* fast cmp */
+ if (hash_matches && flags & BPF_F_FAST_STACK_CMP)
+ return id;
+
+ if (stack_map_use_build_id(map)) {
+ /* for build_id+offset, pop a bucket before slow cmp */
+ new_bucket = (struct stack_map_bucket *)
+ pcpu_freelist_pop(&smap->freelist);
+ if (unlikely(!new_bucket))
+ return -ENOMEM;
+ new_bucket->nr = trace_nr;
+ stack_map_get_build_id_offset(
+ (struct bpf_stack_build_id *)new_bucket->data,
+ ips, trace_nr, user);
+ trace_len = trace_nr * sizeof(struct bpf_stack_build_id);
+ if (hash_matches && bucket->nr == trace_nr &&
+ memcmp(bucket->data, new_bucket->data, trace_len) == 0) {
+ pcpu_freelist_push(&smap->freelist, &new_bucket->fnode);
return id;
- if (bucket->nr == trace_nr &&
- memcmp(bucket->ip, ips, trace_len) == 0)
+ }
+ if (bucket && !(flags & BPF_F_REUSE_STACKID)) {
+ pcpu_freelist_push(&smap->freelist, &new_bucket->fnode);
+ return -EEXIST;
+ }
+ } else {
+ if (hash_matches && bucket->nr == trace_nr &&
+ memcmp(bucket->data, ips, trace_len) == 0)
return id;
+ if (bucket && !(flags & BPF_F_REUSE_STACKID))
+ return -EEXIST;
+
+ new_bucket = (struct stack_map_bucket *)
+ pcpu_freelist_pop(&smap->freelist);
+ if (unlikely(!new_bucket))
+ return -ENOMEM;
+ memcpy(new_bucket->data, ips, trace_len);
}
- /* this call stack is not in the map, try to add it */
- if (bucket && !(flags & BPF_F_REUSE_STACKID))
- return -EEXIST;
-
- new_bucket = (struct stack_map_bucket *)
- pcpu_freelist_pop(&smap->freelist);
- if (unlikely(!new_bucket))
- return -ENOMEM;
-
- memcpy(new_bucket->ip, ips, trace_len);
new_bucket->hash = hash;
new_bucket->nr = trace_nr;
@@ -192,6 +435,73 @@ const struct bpf_func_proto bpf_get_stackid_proto = {
.arg3_type = ARG_ANYTHING,
};
+BPF_CALL_4(bpf_get_stack, struct pt_regs *, regs, void *, buf, u32, size,
+ u64, flags)
+{
+ u32 init_nr, trace_nr, copy_len, elem_size, num_elem;
+ bool user_build_id = flags & BPF_F_USER_BUILD_ID;
+ u32 skip = flags & BPF_F_SKIP_FIELD_MASK;
+ bool user = flags & BPF_F_USER_STACK;
+ struct perf_callchain_entry *trace;
+ bool kernel = !user;
+ int err = -EINVAL;
+ u64 *ips;
+
+ if (unlikely(flags & ~(BPF_F_SKIP_FIELD_MASK | BPF_F_USER_STACK |
+ BPF_F_USER_BUILD_ID)))
+ goto clear;
+ if (kernel && user_build_id)
+ goto clear;
+
+ elem_size = (user && user_build_id) ? sizeof(struct bpf_stack_build_id)
+ : sizeof(u64);
+ if (unlikely(size % elem_size))
+ goto clear;
+
+ num_elem = size / elem_size;
+ if (sysctl_perf_event_max_stack < num_elem)
+ init_nr = 0;
+ else
+ init_nr = sysctl_perf_event_max_stack - num_elem;
+ trace = get_perf_callchain(regs, init_nr, kernel, user,
+ sysctl_perf_event_max_stack, false, false);
+ if (unlikely(!trace))
+ goto err_fault;
+
+ trace_nr = trace->nr - init_nr;
+ if (trace_nr < skip)
+ goto err_fault;
+
+ trace_nr -= skip;
+ trace_nr = (trace_nr <= num_elem) ? trace_nr : num_elem;
+ copy_len = trace_nr * elem_size;
+ ips = trace->ip + skip + init_nr;
+ if (user && user_build_id)
+ stack_map_get_build_id_offset(buf, ips, trace_nr, user);
+ else
+ memcpy(buf, ips, copy_len);
+
+ if (size > copy_len)
+ memset(buf + copy_len, 0, size - copy_len);
+ return copy_len;
+
+err_fault:
+ err = -EFAULT;
+clear:
+ memset(buf, 0, size);
+ return err;
+}
+
+const struct bpf_func_proto bpf_get_stack_proto = {
+ .func = bpf_get_stack,
+ .gpl_only = true,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_PTR_TO_UNINIT_MEM,
+ .arg3_type = ARG_CONST_SIZE_OR_ZERO,
+ .arg4_type = ARG_ANYTHING,
+};
+
/* Called from eBPF program */
static void *stack_map_lookup_elem(struct bpf_map *map, void *key)
{
@@ -212,8 +522,8 @@ int bpf_stackmap_copy(struct bpf_map *map, void *key, void *value)
if (!bucket)
return -ENOENT;
- trace_len = bucket->nr * sizeof(u64);
- memcpy(value, bucket->ip, trace_len);
+ trace_len = bucket->nr * stack_map_data_size(map);
+ memcpy(value, bucket->data, trace_len);
memset(value + trace_len, 0, map->value_size - trace_len);
old_bucket = xchg(&smap->buckets[id], bucket);
@@ -298,3 +608,16 @@ const struct bpf_map_ops stack_map_ops = {
.map_update_elem = stack_map_update_elem,
.map_delete_elem = stack_map_delete_elem,
};
+
+static int __init stack_map_init(void)
+{
+ int cpu;
+ struct stack_map_irq_work *work;
+
+ for_each_possible_cpu(cpu) {
+ work = per_cpu_ptr(&up_read_work, cpu);
+ init_irq_work(&work->irq_work, do_up_read);
+ }
+ return 0;
+}
+subsys_initcall(stack_map_init);
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index e24aa3241387..0fa20624707f 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -11,13 +11,17 @@
*/
#include <linux/bpf.h>
#include <linux/bpf_trace.h>
+#include <linux/bpf_lirc.h>
+#include <linux/btf.h>
#include <linux/syscalls.h>
#include <linux/slab.h>
#include <linux/sched/signal.h>
#include <linux/vmalloc.h>
#include <linux/mmzone.h>
#include <linux/anon_inodes.h>
+#include <linux/fdtable.h>
#include <linux/file.h>
+#include <linux/fs.h>
#include <linux/license.h>
#include <linux/filter.h>
#include <linux/version.h>
@@ -26,6 +30,8 @@
#include <linux/cred.h>
#include <linux/timekeeping.h>
#include <linux/ctype.h>
+#include <linux/btf.h>
+#include <linux/nospec.h>
#define IS_FD_ARRAY(map) ((map)->map_type == BPF_MAP_TYPE_PROG_ARRAY || \
(map)->map_type == BPF_MAP_TYPE_PERF_EVENT_ARRAY || \
@@ -62,9 +68,9 @@ static const struct bpf_map_ops * const bpf_map_types[] = {
* copy_from_user() call. However, this is not a concern since this function is
* meant to be a future-proofing of bits.
*/
-static int check_uarg_tail_zero(void __user *uaddr,
- size_t expected_size,
- size_t actual_size)
+int bpf_check_uarg_tail_zero(void __user *uaddr,
+ size_t expected_size,
+ size_t actual_size)
{
unsigned char __user *addr;
unsigned char __user *end;
@@ -102,12 +108,14 @@ const struct bpf_map_ops bpf_map_offload_ops = {
static struct bpf_map *find_and_alloc_map(union bpf_attr *attr)
{
const struct bpf_map_ops *ops;
+ u32 type = attr->map_type;
struct bpf_map *map;
int err;
- if (attr->map_type >= ARRAY_SIZE(bpf_map_types))
+ if (type >= ARRAY_SIZE(bpf_map_types))
return ERR_PTR(-EINVAL);
- ops = bpf_map_types[attr->map_type];
+ type = array_index_nospec(type, ARRAY_SIZE(bpf_map_types));
+ ops = bpf_map_types[type];
if (!ops)
return ERR_PTR(-EINVAL);
@@ -122,7 +130,7 @@ static struct bpf_map *find_and_alloc_map(union bpf_attr *attr)
if (IS_ERR(map))
return map;
map->ops = ops;
- map->map_type = attr->map_type;
+ map->map_type = type;
return map;
}
@@ -203,11 +211,13 @@ static int bpf_map_alloc_id(struct bpf_map *map)
{
int id;
+ idr_preload(GFP_KERNEL);
spin_lock_bh(&map_idr_lock);
id = idr_alloc_cyclic(&map_idr, map, 1, INT_MAX, GFP_ATOMIC);
if (id > 0)
map->id = id;
spin_unlock_bh(&map_idr_lock);
+ idr_preload_end();
if (WARN_ON_ONCE(!id))
return -ENOSPC;
@@ -255,8 +265,8 @@ static void bpf_map_free_deferred(struct work_struct *work)
static void bpf_map_put_uref(struct bpf_map *map)
{
if (atomic_dec_and_test(&map->usercnt)) {
- if (map->map_type == BPF_MAP_TYPE_PROG_ARRAY)
- bpf_fd_array_map_clear(map);
+ if (map->ops->map_release_uref)
+ map->ops->map_release_uref(map);
}
}
@@ -268,6 +278,7 @@ static void __bpf_map_put(struct bpf_map *map, bool do_idr_lock)
if (atomic_dec_and_test(&map->refcnt)) {
/* bpf_map_free_id() must be called first */
bpf_map_free_id(map, do_idr_lock);
+ btf_put(map->btf);
INIT_WORK(&map->work, bpf_map_free_deferred);
schedule_work(&map->work);
}
@@ -277,6 +288,7 @@ void bpf_map_put(struct bpf_map *map)
{
__bpf_map_put(map, true);
}
+EXPORT_SYMBOL_GPL(bpf_map_put);
void bpf_map_put_with_uref(struct bpf_map *map)
{
@@ -315,13 +327,15 @@ static void bpf_map_show_fdinfo(struct seq_file *m, struct file *filp)
"value_size:\t%u\n"
"max_entries:\t%u\n"
"map_flags:\t%#x\n"
- "memlock:\t%llu\n",
+ "memlock:\t%llu\n"
+ "map_id:\t%u\n",
map->map_type,
map->key_size,
map->value_size,
map->max_entries,
map->map_flags,
- map->pages * 1ULL << PAGE_SHIFT);
+ map->pages * 1ULL << PAGE_SHIFT,
+ map->id);
if (owner_prog_type) {
seq_printf(m, "owner_prog_type:\t%u\n",
@@ -413,7 +427,7 @@ static int bpf_obj_name_cpy(char *dst, const char *src)
return 0;
}
-#define BPF_MAP_CREATE_LAST_FIELD map_ifindex
+#define BPF_MAP_CREATE_LAST_FIELD btf_value_type_id
/* called via syscall */
static int map_create(union bpf_attr *attr)
{
@@ -447,6 +461,33 @@ static int map_create(union bpf_attr *attr)
atomic_set(&map->refcnt, 1);
atomic_set(&map->usercnt, 1);
+ if (bpf_map_support_seq_show(map) &&
+ (attr->btf_key_type_id || attr->btf_value_type_id)) {
+ struct btf *btf;
+
+ if (!attr->btf_key_type_id || !attr->btf_value_type_id) {
+ err = -EINVAL;
+ goto free_map_nouncharge;
+ }
+
+ btf = btf_get_by_fd(attr->btf_fd);
+ if (IS_ERR(btf)) {
+ err = PTR_ERR(btf);
+ goto free_map_nouncharge;
+ }
+
+ err = map->ops->map_check_btf(map, btf, attr->btf_key_type_id,
+ attr->btf_value_type_id);
+ if (err) {
+ btf_put(btf);
+ goto free_map_nouncharge;
+ }
+
+ map->btf = btf;
+ map->btf_key_type_id = attr->btf_key_type_id;
+ map->btf_value_type_id = attr->btf_value_type_id;
+ }
+
err = security_bpf_map_alloc(map);
if (err)
goto free_map_nouncharge;
@@ -471,7 +512,6 @@ static int map_create(union bpf_attr *attr)
return err;
}
- trace_bpf_map_create(map, err);
return err;
free_map:
@@ -479,6 +519,7 @@ free_map:
free_map_sec:
security_bpf_map_free(map);
free_map_nouncharge:
+ btf_put(map->btf);
map->ops->map_free(map);
return err;
}
@@ -511,6 +552,7 @@ struct bpf_map *bpf_map_inc(struct bpf_map *map, bool uref)
atomic_inc(&map->usercnt);
return map;
}
+EXPORT_SYMBOL_GPL(bpf_map_inc);
struct bpf_map *bpf_map_get_with_uref(u32 ufd)
{
@@ -630,7 +672,6 @@ static int map_lookup_elem(union bpf_attr *attr)
if (copy_to_user(uvalue, value, value_size) != 0)
goto free_value;
- trace_bpf_map_lookup_elem(map, ufd, key, value);
err = 0;
free_value:
@@ -727,8 +768,6 @@ static int map_update_elem(union bpf_attr *attr)
__this_cpu_dec(bpf_prog_active);
preempt_enable();
out:
- if (!err)
- trace_bpf_map_update_elem(map, ufd, key, value);
free_value:
kfree(value);
free_key:
@@ -781,8 +820,6 @@ static int map_delete_elem(union bpf_attr *attr)
__this_cpu_dec(bpf_prog_active);
preempt_enable();
out:
- if (!err)
- trace_bpf_map_delete_elem(map, ufd, key);
kfree(key);
err_put:
fdput(f);
@@ -846,7 +883,6 @@ out:
if (copy_to_user(unext_key, next_key, map->key_size) != 0)
goto free_next_key;
- trace_bpf_map_next_key(map, ufd, key, next_key);
err = 0;
free_next_key:
@@ -869,11 +905,17 @@ static const struct bpf_prog_ops * const bpf_prog_types[] = {
static int find_prog_type(enum bpf_prog_type type, struct bpf_prog *prog)
{
- if (type >= ARRAY_SIZE(bpf_prog_types) || !bpf_prog_types[type])
+ const struct bpf_prog_ops *ops;
+
+ if (type >= ARRAY_SIZE(bpf_prog_types))
+ return -EINVAL;
+ type = array_index_nospec(type, ARRAY_SIZE(bpf_prog_types));
+ ops = bpf_prog_types[type];
+ if (!ops)
return -EINVAL;
if (!bpf_prog_is_dev_bound(prog->aux))
- prog->aux->ops = bpf_prog_types[type];
+ prog->aux->ops = ops;
else
prog->aux->ops = &bpf_offload_prog_ops;
prog->type = type;
@@ -940,11 +982,13 @@ static int bpf_prog_alloc_id(struct bpf_prog *prog)
{
int id;
+ idr_preload(GFP_KERNEL);
spin_lock_bh(&prog_idr_lock);
id = idr_alloc_cyclic(&prog_idr, prog, 1, INT_MAX, GFP_ATOMIC);
if (id > 0)
prog->aux->id = id;
spin_unlock_bh(&prog_idr_lock);
+ idr_preload_end();
/* id is in [1, INT_MAX) */
if (WARN_ON_ONCE(!id))
@@ -992,7 +1036,6 @@ static void __bpf_prog_put(struct bpf_prog *prog, bool do_idr_lock)
if (atomic_dec_and_test(&prog->aux->refcnt)) {
int i;
- trace_bpf_prog_put_rcu(prog);
/* bpf_prog_free_id() must be called first */
bpf_prog_free_id(prog, do_idr_lock);
@@ -1029,11 +1072,13 @@ static void bpf_prog_show_fdinfo(struct seq_file *m, struct file *filp)
"prog_type:\t%u\n"
"prog_jited:\t%u\n"
"prog_tag:\t%s\n"
- "memlock:\t%llu\n",
+ "memlock:\t%llu\n"
+ "prog_id:\t%u\n",
prog->type,
prog->jited,
prog_tag,
- prog->pages * 1ULL << PAGE_SHIFT);
+ prog->pages * 1ULL << PAGE_SHIFT,
+ prog->aux->id);
}
#endif
@@ -1159,16 +1204,69 @@ struct bpf_prog *bpf_prog_get(u32 ufd)
struct bpf_prog *bpf_prog_get_type_dev(u32 ufd, enum bpf_prog_type type,
bool attach_drv)
{
- struct bpf_prog *prog = __bpf_prog_get(ufd, &type, attach_drv);
-
- if (!IS_ERR(prog))
- trace_bpf_prog_get_type(prog);
- return prog;
+ return __bpf_prog_get(ufd, &type, attach_drv);
}
EXPORT_SYMBOL_GPL(bpf_prog_get_type_dev);
+/* Initially all BPF programs could be loaded w/o specifying
+ * expected_attach_type. Later for some of them specifying expected_attach_type
+ * at load time became required so that program could be validated properly.
+ * Programs of types that are allowed to be loaded both w/ and w/o (for
+ * backward compatibility) expected_attach_type, should have the default attach
+ * type assigned to expected_attach_type for the latter case, so that it can be
+ * validated later at attach time.
+ *
+ * bpf_prog_load_fixup_attach_type() sets expected_attach_type in @attr if
+ * prog type requires it but has some attach types that have to be backward
+ * compatible.
+ */
+static void bpf_prog_load_fixup_attach_type(union bpf_attr *attr)
+{
+ switch (attr->prog_type) {
+ case BPF_PROG_TYPE_CGROUP_SOCK:
+ /* Unfortunately BPF_ATTACH_TYPE_UNSPEC enumeration doesn't
+ * exist so checking for non-zero is the way to go here.
+ */
+ if (!attr->expected_attach_type)
+ attr->expected_attach_type =
+ BPF_CGROUP_INET_SOCK_CREATE;
+ break;
+ }
+}
+
+static int
+bpf_prog_load_check_attach_type(enum bpf_prog_type prog_type,
+ enum bpf_attach_type expected_attach_type)
+{
+ switch (prog_type) {
+ case BPF_PROG_TYPE_CGROUP_SOCK:
+ switch (expected_attach_type) {
+ case BPF_CGROUP_INET_SOCK_CREATE:
+ case BPF_CGROUP_INET4_POST_BIND:
+ case BPF_CGROUP_INET6_POST_BIND:
+ return 0;
+ default:
+ return -EINVAL;
+ }
+ case BPF_PROG_TYPE_CGROUP_SOCK_ADDR:
+ switch (expected_attach_type) {
+ case BPF_CGROUP_INET4_BIND:
+ case BPF_CGROUP_INET6_BIND:
+ case BPF_CGROUP_INET4_CONNECT:
+ case BPF_CGROUP_INET6_CONNECT:
+ case BPF_CGROUP_UDP4_SENDMSG:
+ case BPF_CGROUP_UDP6_SENDMSG:
+ return 0;
+ default:
+ return -EINVAL;
+ }
+ default:
+ return 0;
+ }
+}
+
/* last field in 'union bpf_attr' used by this command */
-#define BPF_PROG_LOAD_LAST_FIELD prog_ifindex
+#define BPF_PROG_LOAD_LAST_FIELD expected_attach_type
static int bpf_prog_load(union bpf_attr *attr)
{
@@ -1205,11 +1303,17 @@ static int bpf_prog_load(union bpf_attr *attr)
!capable(CAP_SYS_ADMIN))
return -EPERM;
+ bpf_prog_load_fixup_attach_type(attr);
+ if (bpf_prog_load_check_attach_type(type, attr->expected_attach_type))
+ return -EINVAL;
+
/* plain bpf_prog allocation */
prog = bpf_prog_alloc(bpf_prog_size(attr->insn_cnt), GFP_USER);
if (!prog)
return -ENOMEM;
+ prog->expected_attach_type = attr->expected_attach_type;
+
prog->aux->offload_requested = !!attr->prog_ifindex;
err = security_bpf_prog_alloc(prog->aux);
@@ -1277,7 +1381,6 @@ static int bpf_prog_load(union bpf_attr *attr)
}
bpf_prog_kallsyms_add(prog);
- trace_bpf_prog_load(prog, err);
return err;
free_used_maps:
@@ -1311,11 +1414,99 @@ static int bpf_obj_get(const union bpf_attr *attr)
attr->file_flags);
}
+struct bpf_raw_tracepoint {
+ struct bpf_raw_event_map *btp;
+ struct bpf_prog *prog;
+};
+
+static int bpf_raw_tracepoint_release(struct inode *inode, struct file *filp)
+{
+ struct bpf_raw_tracepoint *raw_tp = filp->private_data;
+
+ if (raw_tp->prog) {
+ bpf_probe_unregister(raw_tp->btp, raw_tp->prog);
+ bpf_prog_put(raw_tp->prog);
+ }
+ kfree(raw_tp);
+ return 0;
+}
+
+static const struct file_operations bpf_raw_tp_fops = {
+ .release = bpf_raw_tracepoint_release,
+ .read = bpf_dummy_read,
+ .write = bpf_dummy_write,
+};
+
+#define BPF_RAW_TRACEPOINT_OPEN_LAST_FIELD raw_tracepoint.prog_fd
+
+static int bpf_raw_tracepoint_open(const union bpf_attr *attr)
+{
+ struct bpf_raw_tracepoint *raw_tp;
+ struct bpf_raw_event_map *btp;
+ struct bpf_prog *prog;
+ char tp_name[128];
+ int tp_fd, err;
+
+ if (strncpy_from_user(tp_name, u64_to_user_ptr(attr->raw_tracepoint.name),
+ sizeof(tp_name) - 1) < 0)
+ return -EFAULT;
+ tp_name[sizeof(tp_name) - 1] = 0;
+
+ btp = bpf_find_raw_tracepoint(tp_name);
+ if (!btp)
+ return -ENOENT;
+
+ raw_tp = kzalloc(sizeof(*raw_tp), GFP_USER);
+ if (!raw_tp)
+ return -ENOMEM;
+ raw_tp->btp = btp;
+
+ prog = bpf_prog_get_type(attr->raw_tracepoint.prog_fd,
+ BPF_PROG_TYPE_RAW_TRACEPOINT);
+ if (IS_ERR(prog)) {
+ err = PTR_ERR(prog);
+ goto out_free_tp;
+ }
+
+ err = bpf_probe_register(raw_tp->btp, prog);
+ if (err)
+ goto out_put_prog;
+
+ raw_tp->prog = prog;
+ tp_fd = anon_inode_getfd("bpf-raw-tracepoint", &bpf_raw_tp_fops, raw_tp,
+ O_CLOEXEC);
+ if (tp_fd < 0) {
+ bpf_probe_unregister(raw_tp->btp, prog);
+ err = tp_fd;
+ goto out_put_prog;
+ }
+ return tp_fd;
+
+out_put_prog:
+ bpf_prog_put(prog);
+out_free_tp:
+ kfree(raw_tp);
+ return err;
+}
+
#ifdef CONFIG_CGROUP_BPF
+static int bpf_prog_attach_check_attach_type(const struct bpf_prog *prog,
+ enum bpf_attach_type attach_type)
+{
+ switch (prog->type) {
+ case BPF_PROG_TYPE_CGROUP_SOCK:
+ case BPF_PROG_TYPE_CGROUP_SOCK_ADDR:
+ return attach_type == prog->expected_attach_type ? 0 : -EINVAL;
+ default:
+ return 0;
+ }
+}
+
#define BPF_PROG_ATTACH_LAST_FIELD attach_flags
-static int sockmap_get_from_fd(const union bpf_attr *attr, bool attach)
+static int sockmap_get_from_fd(const union bpf_attr *attr,
+ int type, bool attach)
{
struct bpf_prog *prog = NULL;
int ufd = attr->target_fd;
@@ -1329,8 +1520,7 @@ static int sockmap_get_from_fd(const union bpf_attr *attr, bool attach)
return PTR_ERR(map);
if (attach) {
- prog = bpf_prog_get_type(attr->attach_bpf_fd,
- BPF_PROG_TYPE_SK_SKB);
+ prog = bpf_prog_get_type(attr->attach_bpf_fd, type);
if (IS_ERR(prog)) {
fdput(f);
return PTR_ERR(prog);
@@ -1374,17 +1564,31 @@ static int bpf_prog_attach(const union bpf_attr *attr)
ptype = BPF_PROG_TYPE_CGROUP_SKB;
break;
case BPF_CGROUP_INET_SOCK_CREATE:
+ case BPF_CGROUP_INET4_POST_BIND:
+ case BPF_CGROUP_INET6_POST_BIND:
ptype = BPF_PROG_TYPE_CGROUP_SOCK;
break;
+ case BPF_CGROUP_INET4_BIND:
+ case BPF_CGROUP_INET6_BIND:
+ case BPF_CGROUP_INET4_CONNECT:
+ case BPF_CGROUP_INET6_CONNECT:
+ case BPF_CGROUP_UDP4_SENDMSG:
+ case BPF_CGROUP_UDP6_SENDMSG:
+ ptype = BPF_PROG_TYPE_CGROUP_SOCK_ADDR;
+ break;
case BPF_CGROUP_SOCK_OPS:
ptype = BPF_PROG_TYPE_SOCK_OPS;
break;
case BPF_CGROUP_DEVICE:
ptype = BPF_PROG_TYPE_CGROUP_DEVICE;
break;
+ case BPF_SK_MSG_VERDICT:
+ return sockmap_get_from_fd(attr, BPF_PROG_TYPE_SK_MSG, true);
case BPF_SK_SKB_STREAM_PARSER:
case BPF_SK_SKB_STREAM_VERDICT:
- return sockmap_get_from_fd(attr, true);
+ return sockmap_get_from_fd(attr, BPF_PROG_TYPE_SK_SKB, true);
+ case BPF_LIRC_MODE2:
+ return lirc_prog_attach(attr);
default:
return -EINVAL;
}
@@ -1393,6 +1597,11 @@ static int bpf_prog_attach(const union bpf_attr *attr)
if (IS_ERR(prog))
return PTR_ERR(prog);
+ if (bpf_prog_attach_check_attach_type(prog, attr->attach_type)) {
+ bpf_prog_put(prog);
+ return -EINVAL;
+ }
+
cgrp = cgroup_get_from_fd(attr->target_fd);
if (IS_ERR(cgrp)) {
bpf_prog_put(prog);
@@ -1429,17 +1638,31 @@ static int bpf_prog_detach(const union bpf_attr *attr)
ptype = BPF_PROG_TYPE_CGROUP_SKB;
break;
case BPF_CGROUP_INET_SOCK_CREATE:
+ case BPF_CGROUP_INET4_POST_BIND:
+ case BPF_CGROUP_INET6_POST_BIND:
ptype = BPF_PROG_TYPE_CGROUP_SOCK;
break;
+ case BPF_CGROUP_INET4_BIND:
+ case BPF_CGROUP_INET6_BIND:
+ case BPF_CGROUP_INET4_CONNECT:
+ case BPF_CGROUP_INET6_CONNECT:
+ case BPF_CGROUP_UDP4_SENDMSG:
+ case BPF_CGROUP_UDP6_SENDMSG:
+ ptype = BPF_PROG_TYPE_CGROUP_SOCK_ADDR;
+ break;
case BPF_CGROUP_SOCK_OPS:
ptype = BPF_PROG_TYPE_SOCK_OPS;
break;
case BPF_CGROUP_DEVICE:
ptype = BPF_PROG_TYPE_CGROUP_DEVICE;
break;
+ case BPF_SK_MSG_VERDICT:
+ return sockmap_get_from_fd(attr, BPF_PROG_TYPE_SK_MSG, false);
case BPF_SK_SKB_STREAM_PARSER:
case BPF_SK_SKB_STREAM_VERDICT:
- return sockmap_get_from_fd(attr, false);
+ return sockmap_get_from_fd(attr, BPF_PROG_TYPE_SK_SKB, false);
+ case BPF_LIRC_MODE2:
+ return lirc_prog_detach(attr);
default:
return -EINVAL;
}
@@ -1478,9 +1701,19 @@ static int bpf_prog_query(const union bpf_attr *attr,
case BPF_CGROUP_INET_INGRESS:
case BPF_CGROUP_INET_EGRESS:
case BPF_CGROUP_INET_SOCK_CREATE:
+ case BPF_CGROUP_INET4_BIND:
+ case BPF_CGROUP_INET6_BIND:
+ case BPF_CGROUP_INET4_POST_BIND:
+ case BPF_CGROUP_INET6_POST_BIND:
+ case BPF_CGROUP_INET4_CONNECT:
+ case BPF_CGROUP_INET6_CONNECT:
+ case BPF_CGROUP_UDP4_SENDMSG:
+ case BPF_CGROUP_UDP6_SENDMSG:
case BPF_CGROUP_SOCK_OPS:
case BPF_CGROUP_DEVICE:
break;
+ case BPF_LIRC_MODE2:
+ return lirc_prog_query(attr, uattr);
default:
return -EINVAL;
}
@@ -1687,7 +1920,7 @@ static int bpf_prog_get_info_by_fd(struct bpf_prog *prog,
u32 ulen;
int err;
- err = check_uarg_tail_zero(uinfo, sizeof(info), info_len);
+ err = bpf_check_uarg_tail_zero(uinfo, sizeof(info), info_len);
if (err)
return err;
info_len = min_t(u32, sizeof(info), info_len);
@@ -1700,6 +1933,7 @@ static int bpf_prog_get_info_by_fd(struct bpf_prog *prog,
info.load_time = prog->aux->load_time;
info.created_by_uid = from_kuid_munged(current_user_ns(),
prog->aux->user->uid);
+ info.gpl_compatible = prog->gpl_compatible;
memcpy(info.tag, prog->tag, sizeof(prog->tag));
memcpy(info.name, prog->aux->name, sizeof(prog->aux->name));
@@ -1720,6 +1954,7 @@ static int bpf_prog_get_info_by_fd(struct bpf_prog *prog,
if (!capable(CAP_SYS_ADMIN)) {
info.jited_prog_len = 0;
info.xlated_prog_len = 0;
+ info.nr_jited_ksyms = 0;
goto done;
}
@@ -1756,18 +1991,93 @@ static int bpf_prog_get_info_by_fd(struct bpf_prog *prog,
* for offload.
*/
ulen = info.jited_prog_len;
- info.jited_prog_len = prog->jited_len;
+ if (prog->aux->func_cnt) {
+ u32 i;
+
+ info.jited_prog_len = 0;
+ for (i = 0; i < prog->aux->func_cnt; i++)
+ info.jited_prog_len += prog->aux->func[i]->jited_len;
+ } else {
+ info.jited_prog_len = prog->jited_len;
+ }
+
if (info.jited_prog_len && ulen) {
if (bpf_dump_raw_ok()) {
uinsns = u64_to_user_ptr(info.jited_prog_insns);
ulen = min_t(u32, info.jited_prog_len, ulen);
- if (copy_to_user(uinsns, prog->bpf_func, ulen))
- return -EFAULT;
+
+ /* for multi-function programs, copy the JITed
+ * instructions for all the functions
+ */
+ if (prog->aux->func_cnt) {
+ u32 len, free, i;
+ u8 *img;
+
+ free = ulen;
+ for (i = 0; i < prog->aux->func_cnt; i++) {
+ len = prog->aux->func[i]->jited_len;
+ len = min_t(u32, len, free);
+ img = (u8 *) prog->aux->func[i]->bpf_func;
+ if (copy_to_user(uinsns, img, len))
+ return -EFAULT;
+ uinsns += len;
+ free -= len;
+ if (!free)
+ break;
+ }
+ } else {
+ if (copy_to_user(uinsns, prog->bpf_func, ulen))
+ return -EFAULT;
+ }
} else {
info.jited_prog_insns = 0;
}
}
+ ulen = info.nr_jited_ksyms;
+ info.nr_jited_ksyms = prog->aux->func_cnt;
+ if (info.nr_jited_ksyms && ulen) {
+ if (bpf_dump_raw_ok()) {
+ u64 __user *user_ksyms;
+ ulong ksym_addr;
+ u32 i;
+
+ /* copy the address of the kernel symbol
+ * corresponding to each function
+ */
+ ulen = min_t(u32, info.nr_jited_ksyms, ulen);
+ user_ksyms = u64_to_user_ptr(info.jited_ksyms);
+ for (i = 0; i < ulen; i++) {
+ ksym_addr = (ulong) prog->aux->func[i]->bpf_func;
+ ksym_addr &= PAGE_MASK;
+ if (put_user((u64) ksym_addr, &user_ksyms[i]))
+ return -EFAULT;
+ }
+ } else {
+ info.jited_ksyms = 0;
+ }
+ }
+
+ ulen = info.nr_jited_func_lens;
+ info.nr_jited_func_lens = prog->aux->func_cnt;
+ if (info.nr_jited_func_lens && ulen) {
+ if (bpf_dump_raw_ok()) {
+ u32 __user *user_lens;
+ u32 func_len, i;
+
+ /* copy the JITed image lengths for each function */
+ ulen = min_t(u32, info.nr_jited_func_lens, ulen);
+ user_lens = u64_to_user_ptr(info.jited_func_lens);
+ for (i = 0; i < ulen; i++) {
+ func_len = prog->aux->func[i]->jited_len;
+ if (put_user(func_len, &user_lens[i]))
+ return -EFAULT;
+ }
+ } else {
+ info.jited_func_lens = 0;
+ }
+ }
+
done:
if (copy_to_user(uinfo, &info, info_len) ||
put_user(info_len, &uattr->info.info_len))
@@ -1785,7 +2095,7 @@ static int bpf_map_get_info_by_fd(struct bpf_map *map,
u32 info_len = attr->info.info_len;
int err;
- err = check_uarg_tail_zero(uinfo, sizeof(info), info_len);
+ err = bpf_check_uarg_tail_zero(uinfo, sizeof(info), info_len);
if (err)
return err;
info_len = min_t(u32, sizeof(info), info_len);
@@ -1798,6 +2108,12 @@ static int bpf_map_get_info_by_fd(struct bpf_map *map,
info.map_flags = map->map_flags;
memcpy(info.name, map->name, sizeof(map->name));
+ if (map->btf) {
+ info.btf_id = btf_id(map->btf);
+ info.btf_key_type_id = map->btf_key_type_id;
+ info.btf_value_type_id = map->btf_value_type_id;
+ }
+
if (bpf_map_is_dev_bound(map)) {
err = bpf_map_offload_info_fill(&info, map);
if (err)
@@ -1811,6 +2127,21 @@ static int bpf_map_get_info_by_fd(struct bpf_map *map,
return 0;
}
+static int bpf_btf_get_info_by_fd(struct btf *btf,
+ const union bpf_attr *attr,
+ union bpf_attr __user *uattr)
+{
+ struct bpf_btf_info __user *uinfo = u64_to_user_ptr(attr->info.info);
+ u32 info_len = attr->info.info_len;
+ int err;
+
+ err = bpf_check_uarg_tail_zero(uinfo, sizeof(*uinfo), info_len);
+ if (err)
+ return err;
+
+ return btf_get_info_by_fd(btf, attr, uattr);
+}
+
#define BPF_OBJ_GET_INFO_BY_FD_LAST_FIELD info.info
static int bpf_obj_get_info_by_fd(const union bpf_attr *attr,
@@ -1833,6 +2164,8 @@ static int bpf_obj_get_info_by_fd(const union bpf_attr *attr,
else if (f.file->f_op == &bpf_map_fops)
err = bpf_map_get_info_by_fd(f.file->private_data, attr,
uattr);
+ else if (f.file->f_op == &btf_fops)
+ err = bpf_btf_get_info_by_fd(f.file->private_data, attr, uattr);
else
err = -EINVAL;
@@ -1840,15 +2173,167 @@ static int bpf_obj_get_info_by_fd(const union bpf_attr *attr,
return err;
}
+#define BPF_BTF_LOAD_LAST_FIELD btf_log_level
+
+static int bpf_btf_load(const union bpf_attr *attr)
+{
+ if (CHECK_ATTR(BPF_BTF_LOAD))
+ return -EINVAL;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ return btf_new_fd(attr);
+}
+
+#define BPF_BTF_GET_FD_BY_ID_LAST_FIELD btf_id
+
+static int bpf_btf_get_fd_by_id(const union bpf_attr *attr)
+{
+ if (CHECK_ATTR(BPF_BTF_GET_FD_BY_ID))
+ return -EINVAL;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ return btf_get_fd_by_id(attr->btf_id);
+}
+
+static int bpf_task_fd_query_copy(const union bpf_attr *attr,
+ union bpf_attr __user *uattr,
+ u32 prog_id, u32 fd_type,
+ const char *buf, u64 probe_offset,
+ u64 probe_addr)
+{
+ char __user *ubuf = u64_to_user_ptr(attr->task_fd_query.buf);
+ u32 len = buf ? strlen(buf) : 0, input_len;
+ int err = 0;
+
+ if (put_user(len, &uattr->task_fd_query.buf_len))
+ return -EFAULT;
+ input_len = attr->task_fd_query.buf_len;
+ if (input_len && ubuf) {
+ if (!len) {
+ /* nothing to copy, just make ubuf NULL terminated */
+ char zero = '\0';
+
+ if (put_user(zero, ubuf))
+ return -EFAULT;
+ } else if (input_len >= len + 1) {
+ /* ubuf can hold the string with NULL terminator */
+ if (copy_to_user(ubuf, buf, len + 1))
+ return -EFAULT;
+ } else {
+ /* ubuf cannot hold the string with NULL terminator,
+ * do a partial copy with NULL terminator.
+ */
+ char zero = '\0';
+
+ err = -ENOSPC;
+ if (copy_to_user(ubuf, buf, input_len - 1))
+ return -EFAULT;
+ if (put_user(zero, ubuf + input_len - 1))
+ return -EFAULT;
+ }
+ }
+
+ if (put_user(prog_id, &uattr->task_fd_query.prog_id) ||
+ put_user(fd_type, &uattr->task_fd_query.fd_type) ||
+ put_user(probe_offset, &uattr->task_fd_query.probe_offset) ||
+ put_user(probe_addr, &uattr->task_fd_query.probe_addr))
+ return -EFAULT;
+
+ return err;
+}
+
+#define BPF_TASK_FD_QUERY_LAST_FIELD task_fd_query.probe_addr
+
+static int bpf_task_fd_query(const union bpf_attr *attr,
+ union bpf_attr __user *uattr)
+{
+ pid_t pid = attr->task_fd_query.pid;
+ u32 fd = attr->task_fd_query.fd;
+ const struct perf_event *event;
+ struct files_struct *files;
+ struct task_struct *task;
+ struct file *file;
+ int err;
+
+ if (CHECK_ATTR(BPF_TASK_FD_QUERY))
+ return -EINVAL;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ if (attr->task_fd_query.flags != 0)
+ return -EINVAL;
+
+ task = get_pid_task(find_vpid(pid), PIDTYPE_PID);
+ if (!task)
+ return -ENOENT;
+
+ files = get_files_struct(task);
+ put_task_struct(task);
+ if (!files)
+ return -ENOENT;
+
+ err = 0;
+ spin_lock(&files->file_lock);
+ file = fcheck_files(files, fd);
+ if (!file)
+ err = -EBADF;
+ else
+ get_file(file);
+ spin_unlock(&files->file_lock);
+ put_files_struct(files);
+
+ if (err)
+ goto out;
+
+ if (file->f_op == &bpf_raw_tp_fops) {
+ struct bpf_raw_tracepoint *raw_tp = file->private_data;
+ struct bpf_raw_event_map *btp = raw_tp->btp;
+
+ err = bpf_task_fd_query_copy(attr, uattr,
+ raw_tp->prog->aux->id,
+ BPF_FD_TYPE_RAW_TRACEPOINT,
+ btp->tp->name, 0, 0);
+ goto put_file;
+ }
+
+ event = perf_get_event(file);
+ if (!IS_ERR(event)) {
+ u64 probe_offset, probe_addr;
+ u32 prog_id, fd_type;
+ const char *buf;
+
+ err = bpf_get_perf_event_info(event, &prog_id, &fd_type,
+ &buf, &probe_offset,
+ &probe_addr);
+ if (!err)
+ err = bpf_task_fd_query_copy(attr, uattr, prog_id,
+ fd_type, buf,
+ probe_offset,
+ probe_addr);
+ goto put_file;
+ }
+
+ err = -ENOTSUPP;
+put_file:
+ fput(file);
+out:
+ return err;
+}
+
SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, size)
{
union bpf_attr attr = {};
int err;
- if (!capable(CAP_SYS_ADMIN) && sysctl_unprivileged_bpf_disabled)
+ if (sysctl_unprivileged_bpf_disabled && !capable(CAP_SYS_ADMIN))
return -EPERM;
- err = check_uarg_tail_zero(uattr, sizeof(attr), size);
+ err = bpf_check_uarg_tail_zero(uattr, sizeof(attr), size);
if (err)
return err;
size = min_t(u32, size, sizeof(attr));
@@ -1917,6 +2402,18 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz
case BPF_OBJ_GET_INFO_BY_FD:
err = bpf_obj_get_info_by_fd(&attr, uattr);
break;
+ case BPF_RAW_TRACEPOINT_OPEN:
+ err = bpf_raw_tracepoint_open(&attr);
+ break;
+ case BPF_BTF_LOAD:
+ err = bpf_btf_load(&attr);
+ break;
+ case BPF_BTF_GET_FD_BY_ID:
+ err = bpf_btf_get_fd_by_id(&attr);
+ break;
+ case BPF_TASK_FD_QUERY:
+ err = bpf_task_fd_query(&attr, uattr);
+ break;
default:
err = -EINVAL;
break;
diff --git a/kernel/bpf/tnum.c b/kernel/bpf/tnum.c
index 1f4bf68c12db..938d41211be7 100644
--- a/kernel/bpf/tnum.c
+++ b/kernel/bpf/tnum.c
@@ -43,6 +43,16 @@ struct tnum tnum_rshift(struct tnum a, u8 shift)
return TNUM(a.value >> shift, a.mask >> shift);
}
+struct tnum tnum_arshift(struct tnum a, u8 min_shift)
+{
+ /* if a.value is negative, arithmetic shifting by minimum shift
+ * will have larger negative offset compared to more shifting.
+ * If a.value is nonnegative, arithmetic shifting by minimum shift
+ * will have larger positive offset compare to more shifting.
+ */
+ return TNUM((s64)a.value >> min_shift, (s64)a.mask >> min_shift);
+}
+
struct tnum tnum_add(struct tnum a, struct tnum b)
{
u64 sm, sv, sigma, chi, mu;
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index c6eff108aa99..d6403b5166f4 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -22,6 +22,7 @@
#include <linux/stringify.h>
#include <linux/bsearch.h>
#include <linux/sort.h>
+#include <linux/perf_event.h>
#include "disasm.h"
@@ -156,7 +157,29 @@ struct bpf_verifier_stack_elem {
#define BPF_COMPLEXITY_LIMIT_INSNS 131072
#define BPF_COMPLEXITY_LIMIT_STACK 1024
-#define BPF_MAP_PTR_POISON ((void *)0xeB9F + POISON_POINTER_DELTA)
+#define BPF_MAP_PTR_UNPRIV 1UL
+#define BPF_MAP_PTR_POISON ((void *)((0xeB9FUL << 1) + \
+ POISON_POINTER_DELTA))
+#define BPF_MAP_PTR(X) ((struct bpf_map *)((X) & ~BPF_MAP_PTR_UNPRIV))
+
+static bool bpf_map_ptr_poisoned(const struct bpf_insn_aux_data *aux)
+{
+ return BPF_MAP_PTR(aux->map_state) == BPF_MAP_PTR_POISON;
+}
+
+static bool bpf_map_ptr_unpriv(const struct bpf_insn_aux_data *aux)
+{
+ return aux->map_state & BPF_MAP_PTR_UNPRIV;
+}
+
+static void bpf_map_ptr_store(struct bpf_insn_aux_data *aux,
+ const struct bpf_map *map, bool unpriv)
+{
+ BUILD_BUG_ON((unsigned long)BPF_MAP_PTR_POISON & BPF_MAP_PTR_UNPRIV);
+ unpriv |= bpf_map_ptr_unpriv(aux);
+ aux->map_state = (unsigned long)map |
+ (unpriv ? BPF_MAP_PTR_UNPRIV : 0UL);
+}
struct bpf_call_arg_meta {
struct bpf_map *map_ptr;
@@ -164,27 +187,18 @@ struct bpf_call_arg_meta {
bool pkt_access;
int regno;
int access_size;
+ s64 msize_smax_value;
+ u64 msize_umax_value;
};
static DEFINE_MUTEX(bpf_verifier_lock);
-/* log_level controls verbosity level of eBPF verifier.
- * bpf_verifier_log_write() is used to dump the verification trace to the log,
- * so the user can figure out what's wrong with the program
- */
-__printf(2, 3) void bpf_verifier_log_write(struct bpf_verifier_env *env,
- const char *fmt, ...)
+void bpf_verifier_vlog(struct bpf_verifier_log *log, const char *fmt,
+ va_list args)
{
- struct bpf_verifer_log *log = &env->log;
unsigned int n;
- va_list args;
- if (!log->level || !log->ubuf || bpf_verifier_log_full(log))
- return;
-
- va_start(args, fmt);
n = vscnprintf(log->kbuf, BPF_VERIFIER_TMP_LOG_SIZE, fmt, args);
- va_end(args);
WARN_ONCE(n >= BPF_VERIFIER_TMP_LOG_SIZE - 1,
"verifier log line truncated - local buffer too short\n");
@@ -197,14 +211,37 @@ __printf(2, 3) void bpf_verifier_log_write(struct bpf_verifier_env *env,
else
log->ubuf = NULL;
}
-EXPORT_SYMBOL_GPL(bpf_verifier_log_write);
-/* Historically bpf_verifier_log_write was called verbose, but the name was too
- * generic for symbol export. The function was renamed, but not the calls in
- * the verifier to avoid complicating backports. Hence the alias below.
+
+/* log_level controls verbosity level of eBPF verifier.
+ * bpf_verifier_log_write() is used to dump the verification trace to the log,
+ * so the user can figure out what's wrong with the program
*/
-static __printf(2, 3) void verbose(struct bpf_verifier_env *env,
- const char *fmt, ...)
- __attribute__((alias("bpf_verifier_log_write")));
+__printf(2, 3) void bpf_verifier_log_write(struct bpf_verifier_env *env,
+ const char *fmt, ...)
+{
+ va_list args;
+
+ if (!bpf_verifier_log_needed(&env->log))
+ return;
+
+ va_start(args, fmt);
+ bpf_verifier_vlog(&env->log, fmt, args);
+ va_end(args);
+}
+EXPORT_SYMBOL_GPL(bpf_verifier_log_write);
+
+__printf(2, 3) static void verbose(void *private_data, const char *fmt, ...)
+{
+ struct bpf_verifier_env *env = private_data;
+ va_list args;
+
+ if (!bpf_verifier_log_needed(&env->log))
+ return;
+
+ va_start(args, fmt);
+ bpf_verifier_vlog(&env->log, fmt, args);
+ va_end(args);
+}
static bool type_is_pkt_pointer(enum bpf_reg_type type)
{
@@ -508,10 +545,6 @@ err:
static const int caller_saved[CALLER_SAVED_REGS] = {
BPF_REG_0, BPF_REG_1, BPF_REG_2, BPF_REG_3, BPF_REG_4, BPF_REG_5
};
-#define CALLEE_SAVED_REGS 5
-static const int callee_saved[CALLEE_SAVED_REGS] = {
- BPF_REG_6, BPF_REG_7, BPF_REG_8, BPF_REG_9
-};
static void __mark_reg_not_init(struct bpf_reg_state *reg);
@@ -730,18 +763,19 @@ enum reg_arg_type {
static int cmp_subprogs(const void *a, const void *b)
{
- return *(int *)a - *(int *)b;
+ return ((struct bpf_subprog_info *)a)->start -
+ ((struct bpf_subprog_info *)b)->start;
}
static int find_subprog(struct bpf_verifier_env *env, int off)
{
- u32 *p;
+ struct bpf_subprog_info *p;
- p = bsearch(&off, env->subprog_starts, env->subprog_cnt,
- sizeof(env->subprog_starts[0]), cmp_subprogs);
+ p = bsearch(&off, env->subprog_info, env->subprog_cnt,
+ sizeof(env->subprog_info[0]), cmp_subprogs);
if (!p)
return -ENOENT;
- return p - env->subprog_starts;
+ return p - env->subprog_info;
}
@@ -761,18 +795,24 @@ static int add_subprog(struct bpf_verifier_env *env, int off)
verbose(env, "too many subprograms\n");
return -E2BIG;
}
- env->subprog_starts[env->subprog_cnt++] = off;
- sort(env->subprog_starts, env->subprog_cnt,
- sizeof(env->subprog_starts[0]), cmp_subprogs, NULL);
+ env->subprog_info[env->subprog_cnt++].start = off;
+ sort(env->subprog_info, env->subprog_cnt,
+ sizeof(env->subprog_info[0]), cmp_subprogs, NULL);
return 0;
}
static int check_subprogs(struct bpf_verifier_env *env)
{
int i, ret, subprog_start, subprog_end, off, cur_subprog = 0;
+ struct bpf_subprog_info *subprog = env->subprog_info;
struct bpf_insn *insn = env->prog->insnsi;
int insn_cnt = env->prog->len;
+ /* Add entry function. */
+ ret = add_subprog(env, 0);
+ if (ret < 0)
+ return ret;
+
/* determine subprog starts. The end is one before the next starts */
for (i = 0; i < insn_cnt; i++) {
if (insn[i].code != (BPF_JMP | BPF_CALL))
@@ -792,16 +832,18 @@ static int check_subprogs(struct bpf_verifier_env *env)
return ret;
}
+ /* Add a fake 'exit' subprog which could simplify subprog iteration
+ * logic. 'subprog_cnt' should not be increased.
+ */
+ subprog[env->subprog_cnt].start = insn_cnt;
+
if (env->log.level > 1)
for (i = 0; i < env->subprog_cnt; i++)
- verbose(env, "func#%d @%d\n", i, env->subprog_starts[i]);
+ verbose(env, "func#%d @%d\n", i, subprog[i].start);
/* now check that all jumps are within the same subprog */
- subprog_start = 0;
- if (env->subprog_cnt == cur_subprog)
- subprog_end = insn_cnt;
- else
- subprog_end = env->subprog_starts[cur_subprog++];
+ subprog_start = subprog[cur_subprog].start;
+ subprog_end = subprog[cur_subprog + 1].start;
for (i = 0; i < insn_cnt; i++) {
u8 code = insn[i].code;
@@ -826,10 +868,9 @@ next:
return -EINVAL;
}
subprog_start = subprog_end;
- if (env->subprog_cnt == cur_subprog)
- subprog_end = insn_cnt;
- else
- subprog_end = env->subprog_starts[cur_subprog++];
+ cur_subprog++;
+ if (cur_subprog < env->subprog_cnt)
+ subprog_end = subprog[cur_subprog + 1].start;
}
}
return 0;
@@ -970,7 +1011,7 @@ static bool register_is_null(struct bpf_reg_state *reg)
*/
static int check_stack_write(struct bpf_verifier_env *env,
struct bpf_func_state *state, /* func where register points to */
- int off, int size, int value_regno)
+ int off, int size, int value_regno, int insn_idx)
{
struct bpf_func_state *cur; /* state of the current function */
int i, slot = -off - 1, spi = slot / BPF_REG_SIZE, err;
@@ -1009,8 +1050,33 @@ static int check_stack_write(struct bpf_verifier_env *env,
state->stack[spi].spilled_ptr = cur->regs[value_regno];
state->stack[spi].spilled_ptr.live |= REG_LIVE_WRITTEN;
- for (i = 0; i < BPF_REG_SIZE; i++)
+ for (i = 0; i < BPF_REG_SIZE; i++) {
+ if (state->stack[spi].slot_type[i] == STACK_MISC &&
+ !env->allow_ptr_leaks) {
+ int *poff = &env->insn_aux_data[insn_idx].sanitize_stack_off;
+ int soff = (-spi - 1) * BPF_REG_SIZE;
+
+ /* detected reuse of integer stack slot with a pointer
+ * which means either llvm is reusing stack slot or
+ * an attacker is trying to exploit CVE-2018-3639
+ * (speculative store bypass)
+ * Have to sanitize that slot with preemptive
+ * store of zero.
+ */
+ if (*poff && *poff != soff) {
+ /* disallow programs where single insn stores
+ * into two different stack slots, since verifier
+ * cannot sanitize them
+ */
+ verbose(env,
+ "insn %d cannot access two stack slots fp%d and fp%d",
+ insn_idx, *poff, soff);
+ return -EINVAL;
+ }
+ *poff = soff;
+ }
state->stack[spi].slot_type[i] = STACK_SPILL;
+ }
} else {
u8 type = STACK_MISC;
@@ -1243,6 +1309,7 @@ static bool may_access_direct_pkt_data(struct bpf_verifier_env *env,
switch (env->prog->type) {
case BPF_PROG_TYPE_LWT_IN:
case BPF_PROG_TYPE_LWT_OUT:
+ case BPF_PROG_TYPE_LWT_SEG6LOCAL:
/* dst_input() and dst_output() can't write for now */
if (t == BPF_WRITE)
return false;
@@ -1252,6 +1319,7 @@ static bool may_access_direct_pkt_data(struct bpf_verifier_env *env,
case BPF_PROG_TYPE_XDP:
case BPF_PROG_TYPE_LWT_XMIT:
case BPF_PROG_TYPE_SK_SKB:
+ case BPF_PROG_TYPE_SK_MSG:
if (meta)
return meta->pkt_access;
@@ -1314,7 +1382,7 @@ static int check_ctx_access(struct bpf_verifier_env *env, int insn_idx, int off,
};
if (env->ops->is_valid_access &&
- env->ops->is_valid_access(off, size, t, &info)) {
+ env->ops->is_valid_access(off, size, t, env->prog, &info)) {
/* A non zero info.ctx_field_size indicates that this field is a
* candidate for later verifier transformation to load the whole
* field and then apply a mask when accessed with a narrower
@@ -1461,13 +1529,13 @@ static int update_stack_depth(struct bpf_verifier_env *env,
const struct bpf_func_state *func,
int off)
{
- u16 stack = env->subprog_stack_depth[func->subprogno];
+ u16 stack = env->subprog_info[func->subprogno].stack_depth;
if (stack >= -off)
return 0;
/* update known max for given subprogram */
- env->subprog_stack_depth[func->subprogno] = -off;
+ env->subprog_info[func->subprogno].stack_depth = -off;
return 0;
}
@@ -1479,9 +1547,9 @@ static int update_stack_depth(struct bpf_verifier_env *env,
*/
static int check_max_stack_depth(struct bpf_verifier_env *env)
{
- int depth = 0, frame = 0, subprog = 0, i = 0, subprog_end;
+ int depth = 0, frame = 0, idx = 0, i = 0, subprog_end;
+ struct bpf_subprog_info *subprog = env->subprog_info;
struct bpf_insn *insn = env->prog->insnsi;
- int insn_cnt = env->prog->len;
int ret_insn[MAX_CALL_FRAMES];
int ret_prog[MAX_CALL_FRAMES];
@@ -1489,17 +1557,14 @@ process_func:
/* round up to 32-bytes, since this is granularity
* of interpreter stack size
*/
- depth += round_up(max_t(u32, env->subprog_stack_depth[subprog], 1), 32);
+ depth += round_up(max_t(u32, subprog[idx].stack_depth, 1), 32);
if (depth > MAX_BPF_STACK) {
verbose(env, "combined stack size of %d calls is %d. Too large\n",
frame + 1, depth);
return -EACCES;
}
continue_func:
- if (env->subprog_cnt == subprog)
- subprog_end = insn_cnt;
- else
- subprog_end = env->subprog_starts[subprog];
+ subprog_end = subprog[idx + 1].start;
for (; i < subprog_end; i++) {
if (insn[i].code != (BPF_JMP | BPF_CALL))
continue;
@@ -1507,17 +1572,16 @@ continue_func:
continue;
/* remember insn and function to return to */
ret_insn[frame] = i + 1;
- ret_prog[frame] = subprog;
+ ret_prog[frame] = idx;
/* find the callee */
i = i + insn[i].imm + 1;
- subprog = find_subprog(env, i);
- if (subprog < 0) {
+ idx = find_subprog(env, i);
+ if (idx < 0) {
WARN_ONCE(1, "verifier bug. No program starts at insn %d\n",
i);
return -EFAULT;
}
- subprog++;
frame++;
if (frame >= MAX_CALL_FRAMES) {
WARN_ONCE(1, "verifier bug. Call stack is too deep\n");
@@ -1530,10 +1594,10 @@ continue_func:
*/
if (frame == 0)
return 0;
- depth -= round_up(max_t(u32, env->subprog_stack_depth[subprog], 1), 32);
+ depth -= round_up(max_t(u32, subprog[idx].stack_depth, 1), 32);
frame--;
i = ret_insn[frame];
- subprog = ret_prog[frame];
+ idx = ret_prog[frame];
goto continue_func;
}
@@ -1549,8 +1613,7 @@ static int get_callee_stack_depth(struct bpf_verifier_env *env,
start);
return -EFAULT;
}
- subprog++;
- return env->subprog_stack_depth[subprog];
+ return env->subprog_info[subprog].stack_depth;
}
#endif
@@ -1685,7 +1748,7 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
if (t == BPF_WRITE)
err = check_stack_write(env, state, off, size,
- value_regno);
+ value_regno, insn_idx);
else
err = check_stack_read(env, state, off, size,
value_regno);
@@ -1905,7 +1968,7 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno,
if (arg_type == ARG_PTR_TO_MAP_KEY ||
arg_type == ARG_PTR_TO_MAP_VALUE) {
expected_type = PTR_TO_STACK;
- if (!type_is_pkt_pointer(type) &&
+ if (!type_is_pkt_pointer(type) && type != PTR_TO_MAP_VALUE &&
type != expected_type)
goto err_type;
} else if (arg_type == ARG_CONST_SIZE ||
@@ -1957,14 +2020,9 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno,
verbose(env, "invalid map_ptr to access map->key\n");
return -EACCES;
}
- if (type_is_pkt_pointer(type))
- err = check_packet_access(env, regno, reg->off,
- meta->map_ptr->key_size,
- false);
- else
- err = check_stack_boundary(env, regno,
- meta->map_ptr->key_size,
- false, NULL);
+ err = check_helper_mem_access(env, regno,
+ meta->map_ptr->key_size, false,
+ NULL);
} else if (arg_type == ARG_PTR_TO_MAP_VALUE) {
/* bpf_map_xxx(..., map_ptr, ..., value) call:
* check [value, value + map->value_size) validity
@@ -1974,17 +2032,18 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno,
verbose(env, "invalid map_ptr to access map->value\n");
return -EACCES;
}
- if (type_is_pkt_pointer(type))
- err = check_packet_access(env, regno, reg->off,
- meta->map_ptr->value_size,
- false);
- else
- err = check_stack_boundary(env, regno,
- meta->map_ptr->value_size,
- false, NULL);
+ err = check_helper_mem_access(env, regno,
+ meta->map_ptr->value_size, false,
+ NULL);
} else if (arg_type_is_mem_size(arg_type)) {
bool zero_size_allowed = (arg_type == ARG_CONST_SIZE_OR_ZERO);
+ /* remember the mem_size which may be used later
+ * to refine return values.
+ */
+ meta->msize_smax_value = reg->smax_value;
+ meta->msize_umax_value = reg->umax_value;
+
/* The register is SCALAR_VALUE; the access check
* happens using its boundaries.
*/
@@ -2062,8 +2121,11 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env,
if (func_id != BPF_FUNC_redirect_map)
goto error;
break;
- /* Restrict bpf side of cpumap, open when use-cases appear */
+ /* Restrict bpf side of cpumap and xskmap, open when use-cases
+ * appear.
+ */
case BPF_MAP_TYPE_CPUMAP:
+ case BPF_MAP_TYPE_XSKMAP:
if (func_id != BPF_FUNC_redirect_map)
goto error;
break;
@@ -2075,7 +2137,15 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env,
case BPF_MAP_TYPE_SOCKMAP:
if (func_id != BPF_FUNC_sk_redirect_map &&
func_id != BPF_FUNC_sock_map_update &&
- func_id != BPF_FUNC_map_delete_elem)
+ func_id != BPF_FUNC_map_delete_elem &&
+ func_id != BPF_FUNC_msg_redirect_map)
+ goto error;
+ break;
+ case BPF_MAP_TYPE_SOCKHASH:
+ if (func_id != BPF_FUNC_sk_redirect_hash &&
+ func_id != BPF_FUNC_sock_hash_update &&
+ func_id != BPF_FUNC_map_delete_elem &&
+ func_id != BPF_FUNC_msg_redirect_hash)
goto error;
break;
default:
@@ -2087,7 +2157,7 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env,
case BPF_FUNC_tail_call:
if (map->map_type != BPF_MAP_TYPE_PROG_ARRAY)
goto error;
- if (env->subprog_cnt) {
+ if (env->subprog_cnt > 1) {
verbose(env, "tail_calls are not allowed in programs with bpf-to-bpf calls\n");
return -EINVAL;
}
@@ -2109,15 +2179,20 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env,
break;
case BPF_FUNC_redirect_map:
if (map->map_type != BPF_MAP_TYPE_DEVMAP &&
- map->map_type != BPF_MAP_TYPE_CPUMAP)
+ map->map_type != BPF_MAP_TYPE_CPUMAP &&
+ map->map_type != BPF_MAP_TYPE_XSKMAP)
goto error;
break;
case BPF_FUNC_sk_redirect_map:
+ case BPF_FUNC_msg_redirect_map:
+ case BPF_FUNC_sock_map_update:
if (map->map_type != BPF_MAP_TYPE_SOCKMAP)
goto error;
break;
- case BPF_FUNC_sock_map_update:
- if (map->map_type != BPF_MAP_TYPE_SOCKMAP)
+ case BPF_FUNC_sk_redirect_hash:
+ case BPF_FUNC_msg_redirect_hash:
+ case BPF_FUNC_sock_hash_update:
+ if (map->map_type != BPF_MAP_TYPE_SOCKHASH)
goto error;
break;
default:
@@ -2258,7 +2333,7 @@ static int check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
/* remember the callsite, it will be used by bpf_exit */
*insn_idx /* callsite */,
state->curframe + 1 /* frameno within this callchain */,
- subprog + 1 /* subprog number within this prog */);
+ subprog /* subprog number within this prog */);
/* copy r1 - r5 args that callee can access */
for (i = BPF_REG_1; i <= BPF_REG_5; i++)
@@ -2322,6 +2397,49 @@ static int prepare_func_exit(struct bpf_verifier_env *env, int *insn_idx)
return 0;
}
+static void do_refine_retval_range(struct bpf_reg_state *regs, int ret_type,
+ int func_id,
+ struct bpf_call_arg_meta *meta)
+{
+ struct bpf_reg_state *ret_reg = &regs[BPF_REG_0];
+
+ if (ret_type != RET_INTEGER ||
+ (func_id != BPF_FUNC_get_stack &&
+ func_id != BPF_FUNC_probe_read_str))
+ return;
+
+ ret_reg->smax_value = meta->msize_smax_value;
+ ret_reg->umax_value = meta->msize_umax_value;
+ __reg_deduce_bounds(ret_reg);
+ __reg_bound_offset(ret_reg);
+}
+
+static int
+record_func_map(struct bpf_verifier_env *env, struct bpf_call_arg_meta *meta,
+ int func_id, int insn_idx)
+{
+ struct bpf_insn_aux_data *aux = &env->insn_aux_data[insn_idx];
+
+ if (func_id != BPF_FUNC_tail_call &&
+ func_id != BPF_FUNC_map_lookup_elem &&
+ func_id != BPF_FUNC_map_update_elem &&
+ func_id != BPF_FUNC_map_delete_elem)
+ return 0;
+
+ if (meta->map_ptr == NULL) {
+ verbose(env, "kernel subsystem misconfigured verifier\n");
+ return -EINVAL;
+ }
+
+ if (!BPF_MAP_PTR(aux->map_state))
+ bpf_map_ptr_store(aux, meta->map_ptr,
+ meta->map_ptr->unpriv_array);
+ else if (BPF_MAP_PTR(aux->map_state) != meta->map_ptr)
+ bpf_map_ptr_store(aux, BPF_MAP_PTR_POISON,
+ meta->map_ptr->unpriv_array);
+ return 0;
+}
+
static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn_idx)
{
const struct bpf_func_proto *fn = NULL;
@@ -2338,7 +2456,7 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn
}
if (env->ops->get_func_proto)
- fn = env->ops->get_func_proto(func_id);
+ fn = env->ops->get_func_proto(func_id, env->prog);
if (!fn) {
verbose(env, "unknown func %s#%d\n", func_id_name(func_id),
func_id);
@@ -2347,7 +2465,7 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn
/* eBPF programs must be GPL compatible to use GPL-ed functions */
if (!env->prog->gpl_compatible && fn->gpl_only) {
- verbose(env, "cannot call GPL only function from proprietary program\n");
+ verbose(env, "cannot call GPL-restricted function from non-GPL compatible program\n");
return -EINVAL;
}
@@ -2376,13 +2494,6 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn
err = check_func_arg(env, BPF_REG_2, fn->arg2_type, &meta);
if (err)
return err;
- if (func_id == BPF_FUNC_tail_call) {
- if (meta.map_ptr == NULL) {
- verbose(env, "verifier bug\n");
- return -EINVAL;
- }
- env->insn_aux_data[insn_idx].map_ptr = meta.map_ptr;
- }
err = check_func_arg(env, BPF_REG_3, fn->arg3_type, &meta);
if (err)
return err;
@@ -2393,6 +2504,10 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn
if (err)
return err;
+ err = record_func_map(env, &meta, func_id, insn_idx);
+ if (err)
+ return err;
+
/* Mark slots with STACK_MISC in case of raw mode, stack offset
* is inferred from register state.
*/
@@ -2417,8 +2532,6 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn
} else if (fn->ret_type == RET_VOID) {
regs[BPF_REG_0].type = NOT_INIT;
} else if (fn->ret_type == RET_PTR_TO_MAP_VALUE_OR_NULL) {
- struct bpf_insn_aux_data *insn_aux;
-
regs[BPF_REG_0].type = PTR_TO_MAP_VALUE_OR_NULL;
/* There is no offset yet applied, variable or fixed */
mark_reg_known_zero(env, regs, BPF_REG_0);
@@ -2434,21 +2547,36 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn
}
regs[BPF_REG_0].map_ptr = meta.map_ptr;
regs[BPF_REG_0].id = ++env->id_gen;
- insn_aux = &env->insn_aux_data[insn_idx];
- if (!insn_aux->map_ptr)
- insn_aux->map_ptr = meta.map_ptr;
- else if (insn_aux->map_ptr != meta.map_ptr)
- insn_aux->map_ptr = BPF_MAP_PTR_POISON;
} else {
verbose(env, "unknown return type %d of func %s#%d\n",
fn->ret_type, func_id_name(func_id), func_id);
return -EINVAL;
}
+ do_refine_retval_range(regs, fn->ret_type, func_id, &meta);
+
err = check_map_func_compatibility(env, meta.map_ptr, func_id);
if (err)
return err;
+ if (func_id == BPF_FUNC_get_stack && !env->prog->has_callchain_buf) {
+ const char *err_str;
+
+#ifdef CONFIG_PERF_EVENTS
+ err = get_callchain_buffers(sysctl_perf_event_max_stack);
+ err_str = "cannot get callchain buffer for func %s#%d\n";
+#else
+ err = -ENOTSUPP;
+ err_str = "func %s#%d not supported without CONFIG_PERF_EVENTS\n";
+#endif
+ if (err) {
+ verbose(env, err_str, func_id_name(func_id), func_id);
+ return err;
+ }
+
+ env->prog->has_callchain_buf = true;
+ }
+
if (changes_data)
clear_all_pkt_pointers(env);
return 0;
@@ -2893,10 +3021,7 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env,
dst_reg->umin_value <<= umin_val;
dst_reg->umax_value <<= umax_val;
}
- if (src_known)
- dst_reg->var_off = tnum_lshift(dst_reg->var_off, umin_val);
- else
- dst_reg->var_off = tnum_lshift(tnum_unknown, umin_val);
+ dst_reg->var_off = tnum_lshift(dst_reg->var_off, umin_val);
/* We may learn something more from the var_off */
__update_reg_bounds(dst_reg);
break;
@@ -2924,16 +3049,35 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env,
*/
dst_reg->smin_value = S64_MIN;
dst_reg->smax_value = S64_MAX;
- if (src_known)
- dst_reg->var_off = tnum_rshift(dst_reg->var_off,
- umin_val);
- else
- dst_reg->var_off = tnum_rshift(tnum_unknown, umin_val);
+ dst_reg->var_off = tnum_rshift(dst_reg->var_off, umin_val);
dst_reg->umin_value >>= umax_val;
dst_reg->umax_value >>= umin_val;
/* We may learn something more from the var_off */
__update_reg_bounds(dst_reg);
break;
+ case BPF_ARSH:
+ if (umax_val >= insn_bitness) {
+ /* Shifts greater than 31 or 63 are undefined.
+ * This includes shifts by a negative number.
+ */
+ mark_reg_unknown(env, regs, insn->dst_reg);
+ break;
+ }
+
+ /* Upon reaching here, src_known is true and
+ * umax_val is equal to umin_val.
+ */
+ dst_reg->smin_value >>= umin_val;
+ dst_reg->smax_value >>= umin_val;
+ dst_reg->var_off = tnum_arshift(dst_reg->var_off, umin_val);
+
+ /* blow away the dst_reg umin_value/umax_value and rely on
+ * dst_reg var_off to refine the result.
+ */
+ dst_reg->umin_value = 0;
+ dst_reg->umax_value = U64_MAX;
+ __update_reg_bounds(dst_reg);
+ break;
default:
mark_reg_unknown(env, regs, insn->dst_reg);
break;
@@ -3817,7 +3961,12 @@ static int check_ld_abs(struct bpf_verifier_env *env, struct bpf_insn *insn)
return -EINVAL;
}
- if (env->subprog_cnt) {
+ if (!env->ops->gen_ld_abs) {
+ verbose(env, "bpf verifier is misconfigured\n");
+ return -EINVAL;
+ }
+
+ if (env->subprog_cnt > 1) {
/* when program has LD_ABS insn JITs and interpreter assume
* that r1 == ctx == skb which is not the case for callees
* that can have arbitrary arguments. It's problematic
@@ -3876,6 +4025,7 @@ static int check_return_code(struct bpf_verifier_env *env)
switch (env->prog->type) {
case BPF_PROG_TYPE_CGROUP_SKB:
case BPF_PROG_TYPE_CGROUP_SOCK:
+ case BPF_PROG_TYPE_CGROUP_SOCK_ADDR:
case BPF_PROG_TYPE_SOCK_OPS:
case BPF_PROG_TYPE_CGROUP_DEVICE:
break;
@@ -4601,10 +4751,11 @@ static int do_check(struct bpf_verifier_env *env)
if (env->log.level) {
const struct bpf_insn_cbs cbs = {
.cb_print = verbose,
+ .private_data = env,
};
verbose(env, "%d: ", insn_idx);
- print_bpf_insn(&cbs, env, insn, env->allow_ptr_leaks);
+ print_bpf_insn(&cbs, insn, env->allow_ptr_leaks);
}
if (bpf_prog_is_dev_bound(env->prog->aux)) {
@@ -4846,15 +4997,15 @@ process_bpf_exit:
verbose(env, "processed %d insns (limit %d), stack depth ",
insn_processed, BPF_COMPLEXITY_LIMIT_INSNS);
- for (i = 0; i < env->subprog_cnt + 1; i++) {
- u32 depth = env->subprog_stack_depth[i];
+ for (i = 0; i < env->subprog_cnt; i++) {
+ u32 depth = env->subprog_info[i].stack_depth;
verbose(env, "%d", depth);
- if (i + 1 < env->subprog_cnt + 1)
+ if (i + 1 < env->subprog_cnt)
verbose(env, "+");
}
verbose(env, "\n");
- env->prog->aux->stack_depth = env->subprog_stack_depth[0];
+ env->prog->aux->stack_depth = env->subprog_info[0].stack_depth;
return 0;
}
@@ -4978,7 +5129,7 @@ static int replace_map_fd_with_map_ptr(struct bpf_verifier_env *env)
/* hold the map. If the program is rejected by verifier,
* the map will be released by release_maps() or it
* will be used by the valid program until it's unloaded
- * and all maps are released in free_bpf_prog_info()
+ * and all maps are released in free_used_maps()
*/
map = bpf_map_inc(map, false);
if (IS_ERR(map)) {
@@ -5060,10 +5211,11 @@ static void adjust_subprog_starts(struct bpf_verifier_env *env, u32 off, u32 len
if (len == 1)
return;
- for (i = 0; i < env->subprog_cnt; i++) {
- if (env->subprog_starts[i] < off)
+ /* NOTE: fake 'exit' subprog should be updated as well. */
+ for (i = 0; i <= env->subprog_cnt; i++) {
+ if (env->subprog_info[i].start < off)
continue;
- env->subprog_starts[i] += len - 1;
+ env->subprog_info[i].start += len - 1;
}
}
@@ -5137,7 +5289,7 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env)
}
}
- if (!ops->convert_ctx_access)
+ if (!ops->convert_ctx_access || bpf_prog_is_dev_bound(env->prog->aux))
return 0;
insn = env->prog->insnsi + delta;
@@ -5156,6 +5308,34 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env)
else
continue;
+ if (type == BPF_WRITE &&
+ env->insn_aux_data[i + delta].sanitize_stack_off) {
+ struct bpf_insn patch[] = {
+ /* Sanitize suspicious stack slot with zero.
+ * There are no memory dependencies for this store,
+ * since it's only using frame pointer and immediate
+ * constant of zero
+ */
+ BPF_ST_MEM(BPF_DW, BPF_REG_FP,
+ env->insn_aux_data[i + delta].sanitize_stack_off,
+ 0),
+ /* the original STX instruction will immediately
+ * overwrite the same stack slot with appropriate value
+ */
+ *insn,
+ };
+
+ cnt = ARRAY_SIZE(patch);
+ new_prog = bpf_patch_insn_data(env, i + delta, patch, cnt);
+ if (!new_prog)
+ return -ENOMEM;
+
+ delta += cnt - 1;
+ env->prog = new_prog;
+ insn = new_prog->insnsi + i + delta;
+ continue;
+ }
+
if (env->insn_aux_data[i + delta].ptr_type != PTR_TO_CTX)
continue;
@@ -5169,6 +5349,7 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env)
*/
is_narrower_load = size < ctx_field_size;
if (is_narrower_load) {
+ u32 size_default = bpf_ctx_off_adjust_machine(ctx_field_size);
u32 off = insn->off;
u8 size_code;
@@ -5183,7 +5364,7 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env)
else if (ctx_field_size == 8)
size_code = BPF_DW;
- insn->off = off & ~(ctx_field_size - 1);
+ insn->off = off & ~(size_default - 1);
insn->code = BPF_LDX | BPF_MEM | size_code;
}
@@ -5227,7 +5408,7 @@ static int jit_subprogs(struct bpf_verifier_env *env)
void *old_bpf_func;
int err = -ENOMEM;
- if (env->subprog_cnt == 0)
+ if (env->subprog_cnt <= 1)
return 0;
for (i = 0, insn = prog->insnsi; i < prog->len; i++, insn++) {
@@ -5243,7 +5424,7 @@ static int jit_subprogs(struct bpf_verifier_env *env)
/* temporarily remember subprog id inside insn instead of
* aux_data, since next loop will split up all insns into funcs
*/
- insn->off = subprog + 1;
+ insn->off = subprog;
/* remember original imm in case JIT fails and fallback
* to interpreter will be needed
*/
@@ -5252,16 +5433,13 @@ static int jit_subprogs(struct bpf_verifier_env *env)
insn->imm = 1;
}
- func = kzalloc(sizeof(prog) * (env->subprog_cnt + 1), GFP_KERNEL);
+ func = kzalloc(sizeof(prog) * env->subprog_cnt, GFP_KERNEL);
if (!func)
return -ENOMEM;
- for (i = 0; i <= env->subprog_cnt; i++) {
+ for (i = 0; i < env->subprog_cnt; i++) {
subprog_start = subprog_end;
- if (env->subprog_cnt == i)
- subprog_end = prog->len;
- else
- subprog_end = env->subprog_starts[i];
+ subprog_end = env->subprog_info[i + 1].start;
len = subprog_end - subprog_start;
func[i] = bpf_prog_alloc(bpf_prog_size(len), GFP_USER);
@@ -5278,7 +5456,7 @@ static int jit_subprogs(struct bpf_verifier_env *env)
* Long term would need debug info to populate names
*/
func[i]->aux->name[0] = 'F';
- func[i]->aux->stack_depth = env->subprog_stack_depth[i];
+ func[i]->aux->stack_depth = env->subprog_info[i].stack_depth;
func[i]->jit_requested = 1;
func[i] = bpf_int_jit_compile(func[i]);
if (!func[i]->jited) {
@@ -5291,20 +5469,33 @@ static int jit_subprogs(struct bpf_verifier_env *env)
* now populate all bpf_calls with correct addresses and
* run last pass of JIT
*/
- for (i = 0; i <= env->subprog_cnt; i++) {
+ for (i = 0; i < env->subprog_cnt; i++) {
insn = func[i]->insnsi;
for (j = 0; j < func[i]->len; j++, insn++) {
if (insn->code != (BPF_JMP | BPF_CALL) ||
insn->src_reg != BPF_PSEUDO_CALL)
continue;
subprog = insn->off;
- insn->off = 0;
insn->imm = (u64 (*)(u64, u64, u64, u64, u64))
func[subprog]->bpf_func -
__bpf_call_base;
}
+
+ /* we use the aux data to keep a list of the start addresses
+ * of the JITed images for each function in the program
+ *
+ * for some architectures, such as powerpc64, the imm field
+ * might not be large enough to hold the offset of the start
+ * address of the callee's JITed image from __bpf_call_base
+ *
+ * in such cases, we can lookup the start address of a callee
+ * by using its subprog id, available from the off field of
+ * the call instruction, as an index for this list
+ */
+ func[i]->aux->func = func;
+ func[i]->aux->func_cnt = env->subprog_cnt;
}
- for (i = 0; i <= env->subprog_cnt; i++) {
+ for (i = 0; i < env->subprog_cnt; i++) {
old_bpf_func = func[i]->bpf_func;
tmp = bpf_int_jit_compile(func[i]);
if (tmp != func[i] || func[i]->bpf_func != old_bpf_func) {
@@ -5318,7 +5509,7 @@ static int jit_subprogs(struct bpf_verifier_env *env)
/* finally lock prog and jit images for all functions and
* populate kallsysm
*/
- for (i = 0; i <= env->subprog_cnt; i++) {
+ for (i = 0; i < env->subprog_cnt; i++) {
bpf_prog_lock_ro(func[i]);
bpf_prog_kallsyms_add(func[i]);
}
@@ -5328,26 +5519,21 @@ static int jit_subprogs(struct bpf_verifier_env *env)
* later look the same as if they were interpreted only.
*/
for (i = 0, insn = prog->insnsi; i < prog->len; i++, insn++) {
- unsigned long addr;
-
if (insn->code != (BPF_JMP | BPF_CALL) ||
insn->src_reg != BPF_PSEUDO_CALL)
continue;
insn->off = env->insn_aux_data[i].call_imm;
subprog = find_subprog(env, i + insn->off + 1);
- addr = (unsigned long)func[subprog + 1]->bpf_func;
- addr &= PAGE_MASK;
- insn->imm = (u64 (*)(u64, u64, u64, u64, u64))
- addr - __bpf_call_base;
+ insn->imm = subprog;
}
prog->jited = 1;
prog->bpf_func = func[0]->bpf_func;
prog->aux->func = func;
- prog->aux->func_cnt = env->subprog_cnt + 1;
+ prog->aux->func_cnt = env->subprog_cnt;
return 0;
out_free:
- for (i = 0; i <= env->subprog_cnt; i++)
+ for (i = 0; i < env->subprog_cnt; i++)
if (func[i])
bpf_jit_free(func[i]);
kfree(func);
@@ -5404,6 +5590,8 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env)
struct bpf_insn *insn = prog->insnsi;
const struct bpf_func_proto *fn;
const int insn_cnt = prog->len;
+ const struct bpf_map_ops *ops;
+ struct bpf_insn_aux_data *aux;
struct bpf_insn insn_buf[16];
struct bpf_prog *new_prog;
struct bpf_map *map_ptr;
@@ -5450,6 +5638,25 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env)
continue;
}
+ if (BPF_CLASS(insn->code) == BPF_LD &&
+ (BPF_MODE(insn->code) == BPF_ABS ||
+ BPF_MODE(insn->code) == BPF_IND)) {
+ cnt = env->ops->gen_ld_abs(insn, insn_buf);
+ if (cnt == 0 || cnt >= ARRAY_SIZE(insn_buf)) {
+ verbose(env, "bpf verifier is misconfigured\n");
+ return -EINVAL;
+ }
+
+ new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt);
+ if (!new_prog)
+ return -ENOMEM;
+
+ delta += cnt - 1;
+ env->prog = prog = new_prog;
+ insn = new_prog->insnsi + i + delta;
+ continue;
+ }
+
if (insn->code != (BPF_JMP | BPF_CALL))
continue;
if (insn->src_reg == BPF_PSEUDO_CALL)
@@ -5478,19 +5685,22 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env)
insn->imm = 0;
insn->code = BPF_JMP | BPF_TAIL_CALL;
+ aux = &env->insn_aux_data[i + delta];
+ if (!bpf_map_ptr_unpriv(aux))
+ continue;
+
/* instead of changing every JIT dealing with tail_call
* emit two extra insns:
* if (index >= max_entries) goto out;
* index &= array->index_mask;
* to avoid out-of-bounds cpu speculation
*/
- map_ptr = env->insn_aux_data[i + delta].map_ptr;
- if (map_ptr == BPF_MAP_PTR_POISON) {
+ if (bpf_map_ptr_poisoned(aux)) {
verbose(env, "tail_call abusing map_ptr\n");
return -EINVAL;
}
- if (!map_ptr->unpriv_array)
- continue;
+
+ map_ptr = BPF_MAP_PTR(aux->map_state);
insn_buf[0] = BPF_JMP_IMM(BPF_JGE, BPF_REG_3,
map_ptr->max_entries, 2);
insn_buf[1] = BPF_ALU32_IMM(BPF_AND, BPF_REG_3,
@@ -5510,32 +5720,61 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env)
}
/* BPF_EMIT_CALL() assumptions in some of the map_gen_lookup
- * handlers are currently limited to 64 bit only.
+ * and other inlining handlers are currently limited to 64 bit
+ * only.
*/
if (prog->jit_requested && BITS_PER_LONG == 64 &&
- insn->imm == BPF_FUNC_map_lookup_elem) {
- map_ptr = env->insn_aux_data[i + delta].map_ptr;
- if (map_ptr == BPF_MAP_PTR_POISON ||
- !map_ptr->ops->map_gen_lookup)
+ (insn->imm == BPF_FUNC_map_lookup_elem ||
+ insn->imm == BPF_FUNC_map_update_elem ||
+ insn->imm == BPF_FUNC_map_delete_elem)) {
+ aux = &env->insn_aux_data[i + delta];
+ if (bpf_map_ptr_poisoned(aux))
goto patch_call_imm;
- cnt = map_ptr->ops->map_gen_lookup(map_ptr, insn_buf);
- if (cnt == 0 || cnt >= ARRAY_SIZE(insn_buf)) {
- verbose(env, "bpf verifier is misconfigured\n");
- return -EINVAL;
- }
+ map_ptr = BPF_MAP_PTR(aux->map_state);
+ ops = map_ptr->ops;
+ if (insn->imm == BPF_FUNC_map_lookup_elem &&
+ ops->map_gen_lookup) {
+ cnt = ops->map_gen_lookup(map_ptr, insn_buf);
+ if (cnt == 0 || cnt >= ARRAY_SIZE(insn_buf)) {
+ verbose(env, "bpf verifier is misconfigured\n");
+ return -EINVAL;
+ }
- new_prog = bpf_patch_insn_data(env, i + delta, insn_buf,
- cnt);
- if (!new_prog)
- return -ENOMEM;
+ new_prog = bpf_patch_insn_data(env, i + delta,
+ insn_buf, cnt);
+ if (!new_prog)
+ return -ENOMEM;
- delta += cnt - 1;
+ delta += cnt - 1;
+ env->prog = prog = new_prog;
+ insn = new_prog->insnsi + i + delta;
+ continue;
+ }
- /* keep walking new program and skip insns we just inserted */
- env->prog = prog = new_prog;
- insn = new_prog->insnsi + i + delta;
- continue;
+ BUILD_BUG_ON(!__same_type(ops->map_lookup_elem,
+ (void *(*)(struct bpf_map *map, void *key))NULL));
+ BUILD_BUG_ON(!__same_type(ops->map_delete_elem,
+ (int (*)(struct bpf_map *map, void *key))NULL));
+ BUILD_BUG_ON(!__same_type(ops->map_update_elem,
+ (int (*)(struct bpf_map *map, void *key, void *value,
+ u64 flags))NULL));
+ switch (insn->imm) {
+ case BPF_FUNC_map_lookup_elem:
+ insn->imm = BPF_CAST_CALL(ops->map_lookup_elem) -
+ __bpf_call_base;
+ continue;
+ case BPF_FUNC_map_update_elem:
+ insn->imm = BPF_CAST_CALL(ops->map_update_elem) -
+ __bpf_call_base;
+ continue;
+ case BPF_FUNC_map_delete_elem:
+ insn->imm = BPF_CAST_CALL(ops->map_delete_elem) -
+ __bpf_call_base;
+ continue;
+ }
+
+ goto patch_call_imm;
}
if (insn->imm == BPF_FUNC_redirect_map) {
@@ -5560,7 +5799,7 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env)
insn = new_prog->insnsi + i + delta;
}
patch_call_imm:
- fn = env->ops->get_func_proto(insn->imm);
+ fn = env->ops->get_func_proto(insn->imm, env->prog);
/* all functions that have prototype and verifier allowed
* programs to call them, must be real in-kernel functions
*/
@@ -5602,7 +5841,7 @@ static void free_states(struct bpf_verifier_env *env)
int bpf_check(struct bpf_prog **prog, union bpf_attr *attr)
{
struct bpf_verifier_env *env;
- struct bpf_verifer_log *log;
+ struct bpf_verifier_log *log;
int ret = -EINVAL;
/* no program is valid */
@@ -5647,16 +5886,16 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr)
if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS))
env->strict_alignment = true;
+ ret = replace_map_fd_with_map_ptr(env);
+ if (ret < 0)
+ goto skip_full_check;
+
if (bpf_prog_is_dev_bound(env->prog->aux)) {
ret = bpf_prog_offload_verifier_prep(env);
if (ret)
- goto err_unlock;
+ goto skip_full_check;
}
- ret = replace_map_fd_with_map_ptr(env);
- if (ret < 0)
- goto skip_full_check;
-
env->explored_states = kcalloc(env->prog->len,
sizeof(struct bpf_verifier_state_list *),
GFP_USER);
@@ -5727,7 +5966,7 @@ skip_full_check:
err_release_maps:
if (!env->prog->aux->used_maps)
/* if we didn't copy map pointers into bpf_prog_info, release
- * them now. Otherwise free_bpf_prog_info() will release them.
+ * them now. Otherwise free_used_maps() will release them.
*/
release_maps(env);
*prog = env->prog;
diff --git a/kernel/bpf/xskmap.c b/kernel/bpf/xskmap.c
new file mode 100644
index 000000000000..b3c557476a8d
--- /dev/null
+++ b/kernel/bpf/xskmap.c
@@ -0,0 +1,232 @@
+// SPDX-License-Identifier: GPL-2.0
+/* XSKMAP used for AF_XDP sockets
+ * Copyright(c) 2018 Intel Corporation.
+ */
+
+#include <linux/bpf.h>
+#include <linux/capability.h>
+#include <net/xdp_sock.h>
+#include <linux/slab.h>
+#include <linux/sched.h>
+
+struct xsk_map {
+ struct bpf_map map;
+ struct xdp_sock **xsk_map;
+ struct list_head __percpu *flush_list;
+};
+
+static struct bpf_map *xsk_map_alloc(union bpf_attr *attr)
+{
+ int cpu, err = -EINVAL;
+ struct xsk_map *m;
+ u64 cost;
+
+ if (!capable(CAP_NET_ADMIN))
+ return ERR_PTR(-EPERM);
+
+ if (attr->max_entries == 0 || attr->key_size != 4 ||
+ attr->value_size != 4 ||
+ attr->map_flags & ~(BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY))
+ return ERR_PTR(-EINVAL);
+
+ m = kzalloc(sizeof(*m), GFP_USER);
+ if (!m)
+ return ERR_PTR(-ENOMEM);
+
+ bpf_map_init_from_attr(&m->map, attr);
+
+ cost = (u64)m->map.max_entries * sizeof(struct xdp_sock *);
+ cost += sizeof(struct list_head) * num_possible_cpus();
+ if (cost >= U32_MAX - PAGE_SIZE)
+ goto free_m;
+
+ m->map.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT;
+
+ /* Notice returns -EPERM on if map size is larger than memlock limit */
+ err = bpf_map_precharge_memlock(m->map.pages);
+ if (err)
+ goto free_m;
+
+ err = -ENOMEM;
+
+ m->flush_list = alloc_percpu(struct list_head);
+ if (!m->flush_list)
+ goto free_m;
+
+ for_each_possible_cpu(cpu)
+ INIT_LIST_HEAD(per_cpu_ptr(m->flush_list, cpu));
+
+ m->xsk_map = bpf_map_area_alloc(m->map.max_entries *
+ sizeof(struct xdp_sock *),
+ m->map.numa_node);
+ if (!m->xsk_map)
+ goto free_percpu;
+ return &m->map;
+
+free_percpu:
+ free_percpu(m->flush_list);
+free_m:
+ kfree(m);
+ return ERR_PTR(err);
+}
+
+static void xsk_map_free(struct bpf_map *map)
+{
+ struct xsk_map *m = container_of(map, struct xsk_map, map);
+ int i;
+
+ synchronize_net();
+
+ for (i = 0; i < map->max_entries; i++) {
+ struct xdp_sock *xs;
+
+ xs = m->xsk_map[i];
+ if (!xs)
+ continue;
+
+ sock_put((struct sock *)xs);
+ }
+
+ free_percpu(m->flush_list);
+ bpf_map_area_free(m->xsk_map);
+ kfree(m);
+}
+
+static int xsk_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
+{
+ struct xsk_map *m = container_of(map, struct xsk_map, map);
+ u32 index = key ? *(u32 *)key : U32_MAX;
+ u32 *next = next_key;
+
+ if (index >= m->map.max_entries) {
+ *next = 0;
+ return 0;
+ }
+
+ if (index == m->map.max_entries - 1)
+ return -ENOENT;
+ *next = index + 1;
+ return 0;
+}
+
+struct xdp_sock *__xsk_map_lookup_elem(struct bpf_map *map, u32 key)
+{
+ struct xsk_map *m = container_of(map, struct xsk_map, map);
+ struct xdp_sock *xs;
+
+ if (key >= map->max_entries)
+ return NULL;
+
+ xs = READ_ONCE(m->xsk_map[key]);
+ return xs;
+}
+
+int __xsk_map_redirect(struct bpf_map *map, struct xdp_buff *xdp,
+ struct xdp_sock *xs)
+{
+ struct xsk_map *m = container_of(map, struct xsk_map, map);
+ struct list_head *flush_list = this_cpu_ptr(m->flush_list);
+ int err;
+
+ err = xsk_rcv(xs, xdp);
+ if (err)
+ return err;
+
+ if (!xs->flush_node.prev)
+ list_add(&xs->flush_node, flush_list);
+
+ return 0;
+}
+
+void __xsk_map_flush(struct bpf_map *map)
+{
+ struct xsk_map *m = container_of(map, struct xsk_map, map);
+ struct list_head *flush_list = this_cpu_ptr(m->flush_list);
+ struct xdp_sock *xs, *tmp;
+
+ list_for_each_entry_safe(xs, tmp, flush_list, flush_node) {
+ xsk_flush(xs);
+ __list_del(xs->flush_node.prev, xs->flush_node.next);
+ xs->flush_node.prev = NULL;
+ }
+}
+
+static void *xsk_map_lookup_elem(struct bpf_map *map, void *key)
+{
+ return NULL;
+}
+
+static int xsk_map_update_elem(struct bpf_map *map, void *key, void *value,
+ u64 map_flags)
+{
+ struct xsk_map *m = container_of(map, struct xsk_map, map);
+ u32 i = *(u32 *)key, fd = *(u32 *)value;
+ struct xdp_sock *xs, *old_xs;
+ struct socket *sock;
+ int err;
+
+ if (unlikely(map_flags > BPF_EXIST))
+ return -EINVAL;
+ if (unlikely(i >= m->map.max_entries))
+ return -E2BIG;
+ if (unlikely(map_flags == BPF_NOEXIST))
+ return -EEXIST;
+
+ sock = sockfd_lookup(fd, &err);
+ if (!sock)
+ return err;
+
+ if (sock->sk->sk_family != PF_XDP) {
+ sockfd_put(sock);
+ return -EOPNOTSUPP;
+ }
+
+ xs = (struct xdp_sock *)sock->sk;
+
+ if (!xsk_is_setup_for_bpf_map(xs)) {
+ sockfd_put(sock);
+ return -EOPNOTSUPP;
+ }
+
+ sock_hold(sock->sk);
+
+ old_xs = xchg(&m->xsk_map[i], xs);
+ if (old_xs) {
+ /* Make sure we've flushed everything. */
+ synchronize_net();
+ sock_put((struct sock *)old_xs);
+ }
+
+ sockfd_put(sock);
+ return 0;
+}
+
+static int xsk_map_delete_elem(struct bpf_map *map, void *key)
+{
+ struct xsk_map *m = container_of(map, struct xsk_map, map);
+ struct xdp_sock *old_xs;
+ int k = *(u32 *)key;
+
+ if (k >= map->max_entries)
+ return -EINVAL;
+
+ old_xs = xchg(&m->xsk_map[k], NULL);
+ if (old_xs) {
+ /* Make sure we've flushed everything. */
+ synchronize_net();
+ sock_put((struct sock *)old_xs);
+ }
+
+ return 0;
+}
+
+const struct bpf_map_ops xsk_map_ops = {
+ .map_alloc = xsk_map_alloc,
+ .map_free = xsk_map_free,
+ .map_get_next_key = xsk_map_get_next_key,
+ .map_lookup_elem = xsk_map_lookup_elem,
+ .map_update_elem = xsk_map_update_elem,
+ .map_delete_elem = xsk_map_delete_elem,
+};
+
+
diff --git a/kernel/cgroup/Makefile b/kernel/cgroup/Makefile
index 2be89a003185..bfcdae896122 100644
--- a/kernel/cgroup/Makefile
+++ b/kernel/cgroup/Makefile
@@ -1,5 +1,5 @@
# SPDX-License-Identifier: GPL-2.0
-obj-y := cgroup.o stat.o namespace.o cgroup-v1.o
+obj-y := cgroup.o rstat.o namespace.o cgroup-v1.o
obj-$(CONFIG_CGROUP_FREEZER) += freezer.o
obj-$(CONFIG_CGROUP_PIDS) += pids.o
diff --git a/kernel/cgroup/cgroup-internal.h b/kernel/cgroup/cgroup-internal.h
index b928b27050c6..77ff1cd6a252 100644
--- a/kernel/cgroup/cgroup-internal.h
+++ b/kernel/cgroup/cgroup-internal.h
@@ -201,13 +201,12 @@ int cgroup_show_path(struct seq_file *sf, struct kernfs_node *kf_node,
int cgroup_task_count(const struct cgroup *cgrp);
/*
- * stat.c
+ * rstat.c
*/
-void cgroup_stat_flush(struct cgroup *cgrp);
-int cgroup_stat_init(struct cgroup *cgrp);
-void cgroup_stat_exit(struct cgroup *cgrp);
-void cgroup_stat_show_cputime(struct seq_file *seq);
-void cgroup_stat_boot(void);
+int cgroup_rstat_init(struct cgroup *cgrp);
+void cgroup_rstat_exit(struct cgroup *cgrp);
+void cgroup_rstat_boot(void);
+void cgroup_base_stat_cputime_show(struct seq_file *seq);
/*
* namespace.c
@@ -218,9 +217,9 @@ extern const struct proc_ns_operations cgroupns_operations;
* cgroup-v1.c
*/
extern struct cftype cgroup1_base_files[];
-extern const struct file_operations proc_cgroupstats_operations;
extern struct kernfs_syscall_ops cgroup1_kf_syscall_ops;
+int proc_cgroupstats_show(struct seq_file *m, void *v);
bool cgroup1_ssid_disabled(int ssid);
void cgroup1_pidlist_destroy_all(struct cgroup *cgrp);
void cgroup1_release_agent(struct work_struct *work);
diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c
index a2c05d2476ac..e06c97f3ed1a 100644
--- a/kernel/cgroup/cgroup-v1.c
+++ b/kernel/cgroup/cgroup-v1.c
@@ -682,7 +682,7 @@ struct cftype cgroup1_base_files[] = {
};
/* Display information about each subsystem and each hierarchy */
-static int proc_cgroupstats_show(struct seq_file *m, void *v)
+int proc_cgroupstats_show(struct seq_file *m, void *v)
{
struct cgroup_subsys *ss;
int i;
@@ -705,18 +705,6 @@ static int proc_cgroupstats_show(struct seq_file *m, void *v)
return 0;
}
-static int cgroupstats_open(struct inode *inode, struct file *file)
-{
- return single_open(file, proc_cgroupstats_show, NULL);
-}
-
-const struct file_operations proc_cgroupstats_operations = {
- .open = cgroupstats_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = single_release,
-};
-
/**
* cgroupstats_build - build and fill cgroupstats
* @stats: cgroupstats to fill information into
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index 8cda3bc3ae22..077370bf8964 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -54,6 +54,7 @@
#include <linux/proc_ns.h>
#include <linux/nsproxy.h>
#include <linux/file.h>
+#include <linux/sched/cputime.h>
#include <net/sock.h>
#define CREATE_TRACE_POINTS
@@ -61,6 +62,8 @@
#define CGROUP_FILE_NAME_MAX (MAX_CGROUP_TYPE_NAMELEN + \
MAX_CFTYPE_NAME + 2)
+/* let's not notify more than 100 times per second */
+#define CGROUP_FILE_NOTIFY_MIN_INTV DIV_ROUND_UP(HZ, 100)
/*
* cgroup_mutex is the master lock. Any modification to cgroup or its
@@ -142,14 +145,14 @@ static struct static_key_true *cgroup_subsys_on_dfl_key[] = {
};
#undef SUBSYS
-static DEFINE_PER_CPU(struct cgroup_cpu_stat, cgrp_dfl_root_cpu_stat);
+static DEFINE_PER_CPU(struct cgroup_rstat_cpu, cgrp_dfl_root_rstat_cpu);
/*
* The default hierarchy, reserved for the subsystems that are otherwise
* unattached - it never has more than a single cgroup, and all tasks are
* part of that cgroup.
*/
-struct cgroup_root cgrp_dfl_root = { .cgrp.cpu_stat = &cgrp_dfl_root_cpu_stat };
+struct cgroup_root cgrp_dfl_root = { .cgrp.rstat_cpu = &cgrp_dfl_root_rstat_cpu };
EXPORT_SYMBOL_GPL(cgrp_dfl_root);
/*
@@ -1554,6 +1557,8 @@ static void cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft)
spin_lock_irq(&cgroup_file_kn_lock);
cfile->kn = NULL;
spin_unlock_irq(&cgroup_file_kn_lock);
+
+ del_timer_sync(&cfile->notify_timer);
}
kernfs_remove_by_name(cgrp->kn, cgroup_file_name(cgrp, cft, name));
@@ -1573,8 +1578,17 @@ static void css_clear_dir(struct cgroup_subsys_state *css)
css->flags &= ~CSS_VISIBLE;
- list_for_each_entry(cfts, &css->ss->cfts, node)
+ if (!css->ss) {
+ if (cgroup_on_dfl(cgrp))
+ cfts = cgroup_base_files;
+ else
+ cfts = cgroup1_base_files;
+
cgroup_addrm_files(css, cgrp, cfts, false);
+ } else {
+ list_for_each_entry(cfts, &css->ss->cfts, node)
+ cgroup_addrm_files(css, cgrp, cfts, false);
+ }
}
/**
@@ -1598,14 +1612,16 @@ static int css_populate_dir(struct cgroup_subsys_state *css)
else
cfts = cgroup1_base_files;
- return cgroup_addrm_files(&cgrp->self, cgrp, cfts, true);
- }
-
- list_for_each_entry(cfts, &css->ss->cfts, node) {
- ret = cgroup_addrm_files(css, cgrp, cfts, true);
- if (ret < 0) {
- failed_cfts = cfts;
- goto err;
+ ret = cgroup_addrm_files(&cgrp->self, cgrp, cfts, true);
+ if (ret < 0)
+ return ret;
+ } else {
+ list_for_each_entry(cfts, &css->ss->cfts, node) {
+ ret = cgroup_addrm_files(css, cgrp, cfts, true);
+ if (ret < 0) {
+ failed_cfts = cfts;
+ goto err;
+ }
}
}
@@ -1782,13 +1798,6 @@ static void cgroup_enable_task_cg_lists(void)
{
struct task_struct *p, *g;
- spin_lock_irq(&css_set_lock);
-
- if (use_task_css_set_links)
- goto out_unlock;
-
- use_task_css_set_links = true;
-
/*
* We need tasklist_lock because RCU is not safe against
* while_each_thread(). Besides, a forking task that has passed
@@ -1797,6 +1806,13 @@ static void cgroup_enable_task_cg_lists(void)
* tasklist if we walk through it with RCU.
*/
read_lock(&tasklist_lock);
+ spin_lock_irq(&css_set_lock);
+
+ if (use_task_css_set_links)
+ goto out_unlock;
+
+ use_task_css_set_links = true;
+
do_each_thread(g, p) {
WARN_ON_ONCE(!list_empty(&p->cg_list) ||
task_css_set(p) != &init_css_set);
@@ -1824,9 +1840,9 @@ static void cgroup_enable_task_cg_lists(void)
}
spin_unlock(&p->sighand->siglock);
} while_each_thread(g, p);
- read_unlock(&tasklist_lock);
out_unlock:
spin_unlock_irq(&css_set_lock);
+ read_unlock(&tasklist_lock);
}
static void init_cgroup_housekeeping(struct cgroup *cgrp)
@@ -1844,6 +1860,8 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp)
cgrp->dom_cgrp = cgrp;
cgrp->max_descendants = INT_MAX;
cgrp->max_depth = INT_MAX;
+ INIT_LIST_HEAD(&cgrp->rstat_css_list);
+ prev_cputime_init(&cgrp->prev_cputime);
for_each_subsys(ss, ssid)
INIT_LIST_HEAD(&cgrp->e_csets[ssid]);
@@ -3183,6 +3201,16 @@ static int cgroup_enable_threaded(struct cgroup *cgrp)
if (cgroup_is_threaded(cgrp))
return 0;
+ /*
+ * If @cgroup is populated or has domain controllers enabled, it
+ * can't be switched. While the below cgroup_can_be_thread_root()
+ * test can catch the same conditions, that's only when @parent is
+ * not mixable, so let's check it explicitly.
+ */
+ if (cgroup_is_populated(cgrp) ||
+ cgrp->subtree_control & ~cgrp_dfl_threaded_ss_mask)
+ return -EOPNOTSUPP;
+
/* we're joining the parent's domain, ensure its validity */
if (!cgroup_is_valid_domain(dom_cgrp) ||
!cgroup_can_be_thread_root(dom_cgrp))
@@ -3371,7 +3399,7 @@ static int cpu_stat_show(struct seq_file *seq, void *v)
struct cgroup __maybe_unused *cgrp = seq_css(seq)->cgroup;
int ret = 0;
- cgroup_stat_show_cputime(seq);
+ cgroup_base_stat_cputime_show(seq);
#ifdef CONFIG_CGROUP_SCHED
ret = cgroup_extra_stat_show(seq, cgrp, cpu_cgrp_id);
#endif
@@ -3511,6 +3539,12 @@ static int cgroup_kn_set_ugid(struct kernfs_node *kn)
return kernfs_setattr(kn, &iattr);
}
+static void cgroup_file_notify_timer(struct timer_list *timer)
+{
+ cgroup_file_notify(container_of(timer, struct cgroup_file,
+ notify_timer));
+}
+
static int cgroup_add_file(struct cgroup_subsys_state *css, struct cgroup *cgrp,
struct cftype *cft)
{
@@ -3537,6 +3571,8 @@ static int cgroup_add_file(struct cgroup_subsys_state *css, struct cgroup *cgrp,
if (cft->file_offset) {
struct cgroup_file *cfile = (void *)css + cft->file_offset;
+ timer_setup(&cfile->notify_timer, cgroup_file_notify_timer, 0);
+
spin_lock_irq(&cgroup_file_kn_lock);
cfile->kn = kn;
spin_unlock_irq(&cgroup_file_kn_lock);
@@ -3786,8 +3822,17 @@ void cgroup_file_notify(struct cgroup_file *cfile)
unsigned long flags;
spin_lock_irqsave(&cgroup_file_kn_lock, flags);
- if (cfile->kn)
- kernfs_notify(cfile->kn);
+ if (cfile->kn) {
+ unsigned long last = cfile->notified_at;
+ unsigned long next = last + CGROUP_FILE_NOTIFY_MIN_INTV;
+
+ if (time_in_range(jiffies, last, next)) {
+ timer_reduce(&cfile->notify_timer, next);
+ } else {
+ kernfs_notify(cfile->kn);
+ cfile->notified_at = jiffies;
+ }
+ }
spin_unlock_irqrestore(&cgroup_file_kn_lock, flags);
}
@@ -4514,10 +4559,10 @@ static struct cftype cgroup_base_files[] = {
* and thus involve punting to css->destroy_work adding two additional
* steps to the already complex sequence.
*/
-static void css_free_work_fn(struct work_struct *work)
+static void css_free_rwork_fn(struct work_struct *work)
{
- struct cgroup_subsys_state *css =
- container_of(work, struct cgroup_subsys_state, destroy_work);
+ struct cgroup_subsys_state *css = container_of(to_rcu_work(work),
+ struct cgroup_subsys_state, destroy_rwork);
struct cgroup_subsys *ss = css->ss;
struct cgroup *cgrp = css->cgroup;
@@ -4550,7 +4595,7 @@ static void css_free_work_fn(struct work_struct *work)
cgroup_put(cgroup_parent(cgrp));
kernfs_put(cgrp->kn);
if (cgroup_on_dfl(cgrp))
- cgroup_stat_exit(cgrp);
+ cgroup_rstat_exit(cgrp);
kfree(cgrp);
} else {
/*
@@ -4563,15 +4608,6 @@ static void css_free_work_fn(struct work_struct *work)
}
}
-static void css_free_rcu_fn(struct rcu_head *rcu_head)
-{
- struct cgroup_subsys_state *css =
- container_of(rcu_head, struct cgroup_subsys_state, rcu_head);
-
- INIT_WORK(&css->destroy_work, css_free_work_fn);
- queue_work(cgroup_destroy_wq, &css->destroy_work);
-}
-
static void css_release_work_fn(struct work_struct *work)
{
struct cgroup_subsys_state *css =
@@ -4586,6 +4622,11 @@ static void css_release_work_fn(struct work_struct *work)
if (ss) {
/* css release path */
+ if (!list_empty(&css->rstat_css_node)) {
+ cgroup_rstat_flush(cgrp);
+ list_del_rcu(&css->rstat_css_node);
+ }
+
cgroup_idr_replace(&ss->css_idr, NULL, css->id);
if (ss->css_released)
ss->css_released(css);
@@ -4596,7 +4637,7 @@ static void css_release_work_fn(struct work_struct *work)
trace_cgroup_release(cgrp);
if (cgroup_on_dfl(cgrp))
- cgroup_stat_flush(cgrp);
+ cgroup_rstat_flush(cgrp);
for (tcgrp = cgroup_parent(cgrp); tcgrp;
tcgrp = cgroup_parent(tcgrp))
@@ -4621,7 +4662,8 @@ static void css_release_work_fn(struct work_struct *work)
mutex_unlock(&cgroup_mutex);
- call_rcu(&css->rcu_head, css_free_rcu_fn);
+ INIT_RCU_WORK(&css->destroy_rwork, css_free_rwork_fn);
+ queue_rcu_work(cgroup_destroy_wq, &css->destroy_rwork);
}
static void css_release(struct percpu_ref *ref)
@@ -4646,6 +4688,7 @@ static void init_and_link_css(struct cgroup_subsys_state *css,
css->id = -1;
INIT_LIST_HEAD(&css->sibling);
INIT_LIST_HEAD(&css->children);
+ INIT_LIST_HEAD(&css->rstat_css_node);
css->serial_nr = css_serial_nr_next++;
atomic_set(&css->online_cnt, 0);
@@ -4654,6 +4697,9 @@ static void init_and_link_css(struct cgroup_subsys_state *css,
css_get(css->parent);
}
+ if (cgroup_on_dfl(cgrp) && ss->css_rstat_flush)
+ list_add_rcu(&css->rstat_css_node, &cgrp->rstat_css_list);
+
BUG_ON(cgroup_css(cgrp, ss));
}
@@ -4755,7 +4801,9 @@ static struct cgroup_subsys_state *css_create(struct cgroup *cgrp,
err_list_del:
list_del_rcu(&css->sibling);
err_free_css:
- call_rcu(&css->rcu_head, css_free_rcu_fn);
+ list_del_rcu(&css->rstat_css_node);
+ INIT_RCU_WORK(&css->destroy_rwork, css_free_rwork_fn);
+ queue_rcu_work(cgroup_destroy_wq, &css->destroy_rwork);
return ERR_PTR(err);
}
@@ -4772,8 +4820,8 @@ static struct cgroup *cgroup_create(struct cgroup *parent)
int ret;
/* allocate the cgroup and its ID, 0 is reserved for the root */
- cgrp = kzalloc(sizeof(*cgrp) +
- sizeof(cgrp->ancestor_ids[0]) * (level + 1), GFP_KERNEL);
+ cgrp = kzalloc(struct_size(cgrp, ancestor_ids, (level + 1)),
+ GFP_KERNEL);
if (!cgrp)
return ERR_PTR(-ENOMEM);
@@ -4782,7 +4830,7 @@ static struct cgroup *cgroup_create(struct cgroup *parent)
goto out_free_cgrp;
if (cgroup_on_dfl(parent)) {
- ret = cgroup_stat_init(cgrp);
+ ret = cgroup_rstat_init(cgrp);
if (ret)
goto out_cancel_ref;
}
@@ -4847,7 +4895,7 @@ out_idr_free:
cgroup_idr_remove(&root->cgroup_idr, cgrp->id);
out_stat_exit:
if (cgroup_on_dfl(parent))
- cgroup_stat_exit(cgrp);
+ cgroup_rstat_exit(cgrp);
out_cancel_ref:
percpu_ref_exit(&cgrp->self.refcnt);
out_free_cgrp:
@@ -5087,10 +5135,8 @@ static int cgroup_destroy_locked(struct cgroup *cgrp)
for_each_css(css, ssid, cgrp)
kill_css(css);
- /*
- * Remove @cgrp directory along with the base files. @cgrp has an
- * extra ref on its kn.
- */
+ /* clear and remove @cgrp dir, @cgrp has an extra ref on its kn */
+ css_clear_dir(&cgrp->self);
kernfs_remove(cgrp->kn);
if (parent && cgroup_is_threaded(cgrp))
@@ -5242,7 +5288,7 @@ int __init cgroup_init(void)
BUG_ON(cgroup_init_cftypes(NULL, cgroup_base_files));
BUG_ON(cgroup_init_cftypes(NULL, cgroup1_base_files));
- cgroup_stat_boot();
+ cgroup_rstat_boot();
/*
* The latency of the synchronize_sched() is too high for cgroups,
@@ -5332,7 +5378,7 @@ int __init cgroup_init(void)
WARN_ON(sysfs_create_mount_point(fs_kobj, "cgroup"));
WARN_ON(register_filesystem(&cgroup_fs_type));
WARN_ON(register_filesystem(&cgroup2_fs_type));
- WARN_ON(!proc_create("cgroups", 0, NULL, &proc_cgroupstats_operations));
+ WARN_ON(!proc_create_single("cgroups", 0, NULL, proc_cgroupstats_show));
return 0;
}
diff --git a/kernel/cgroup/rdma.c b/kernel/cgroup/rdma.c
index defad3c5e7dc..d3bbb757ee49 100644
--- a/kernel/cgroup/rdma.c
+++ b/kernel/cgroup/rdma.c
@@ -362,35 +362,32 @@ EXPORT_SYMBOL(rdmacg_unregister_device);
static int parse_resource(char *c, int *intval)
{
substring_t argstr;
- const char **table = &rdmacg_resource_names[0];
char *name, *value = c;
size_t len;
- int ret, i = 0;
+ int ret, i;
name = strsep(&value, "=");
if (!name || !value)
return -EINVAL;
- len = strlen(value);
+ i = match_string(rdmacg_resource_names, RDMACG_RESOURCE_MAX, name);
+ if (i < 0)
+ return i;
- for (i = 0; i < RDMACG_RESOURCE_MAX; i++) {
- if (strcmp(table[i], name))
- continue;
+ len = strlen(value);
- argstr.from = value;
- argstr.to = value + len;
+ argstr.from = value;
+ argstr.to = value + len;
- ret = match_int(&argstr, intval);
- if (ret >= 0) {
- if (*intval < 0)
- break;
- return i;
- }
- if (strncmp(value, RDMACG_MAX_STR, len) == 0) {
- *intval = S32_MAX;
- return i;
- }
- break;
+ ret = match_int(&argstr, intval);
+ if (ret >= 0) {
+ if (*intval < 0)
+ return -EINVAL;
+ return i;
+ }
+ if (strncmp(value, RDMACG_MAX_STR, len) == 0) {
+ *intval = S32_MAX;
+ return i;
}
return -EINVAL;
}
diff --git a/kernel/cgroup/rstat.c b/kernel/cgroup/rstat.c
new file mode 100644
index 000000000000..d503d1a9007c
--- /dev/null
+++ b/kernel/cgroup/rstat.c
@@ -0,0 +1,416 @@
+#include "cgroup-internal.h"
+
+#include <linux/sched/cputime.h>
+
+static DEFINE_SPINLOCK(cgroup_rstat_lock);
+static DEFINE_PER_CPU(raw_spinlock_t, cgroup_rstat_cpu_lock);
+
+static void cgroup_base_stat_flush(struct cgroup *cgrp, int cpu);
+
+static struct cgroup_rstat_cpu *cgroup_rstat_cpu(struct cgroup *cgrp, int cpu)
+{
+ return per_cpu_ptr(cgrp->rstat_cpu, cpu);
+}
+
+/**
+ * cgroup_rstat_updated - keep track of updated rstat_cpu
+ * @cgrp: target cgroup
+ * @cpu: cpu on which rstat_cpu was updated
+ *
+ * @cgrp's rstat_cpu on @cpu was updated. Put it on the parent's matching
+ * rstat_cpu->updated_children list. See the comment on top of
+ * cgroup_rstat_cpu definition for details.
+ */
+void cgroup_rstat_updated(struct cgroup *cgrp, int cpu)
+{
+ raw_spinlock_t *cpu_lock = per_cpu_ptr(&cgroup_rstat_cpu_lock, cpu);
+ struct cgroup *parent;
+ unsigned long flags;
+
+ /* nothing to do for root */
+ if (!cgroup_parent(cgrp))
+ return;
+
+ /*
+ * Paired with the one in cgroup_rstat_cpu_pop_upated(). Either we
+ * see NULL updated_next or they see our updated stat.
+ */
+ smp_mb();
+
+ /*
+ * Because @parent's updated_children is terminated with @parent
+ * instead of NULL, we can tell whether @cgrp is on the list by
+ * testing the next pointer for NULL.
+ */
+ if (cgroup_rstat_cpu(cgrp, cpu)->updated_next)
+ return;
+
+ raw_spin_lock_irqsave(cpu_lock, flags);
+
+ /* put @cgrp and all ancestors on the corresponding updated lists */
+ for (parent = cgroup_parent(cgrp); parent;
+ cgrp = parent, parent = cgroup_parent(cgrp)) {
+ struct cgroup_rstat_cpu *rstatc = cgroup_rstat_cpu(cgrp, cpu);
+ struct cgroup_rstat_cpu *prstatc = cgroup_rstat_cpu(parent, cpu);
+
+ /*
+ * Both additions and removals are bottom-up. If a cgroup
+ * is already in the tree, all ancestors are.
+ */
+ if (rstatc->updated_next)
+ break;
+
+ rstatc->updated_next = prstatc->updated_children;
+ prstatc->updated_children = cgrp;
+ }
+
+ raw_spin_unlock_irqrestore(cpu_lock, flags);
+}
+EXPORT_SYMBOL_GPL(cgroup_rstat_updated);
+
+/**
+ * cgroup_rstat_cpu_pop_updated - iterate and dismantle rstat_cpu updated tree
+ * @pos: current position
+ * @root: root of the tree to traversal
+ * @cpu: target cpu
+ *
+ * Walks the udpated rstat_cpu tree on @cpu from @root. %NULL @pos starts
+ * the traversal and %NULL return indicates the end. During traversal,
+ * each returned cgroup is unlinked from the tree. Must be called with the
+ * matching cgroup_rstat_cpu_lock held.
+ *
+ * The only ordering guarantee is that, for a parent and a child pair
+ * covered by a given traversal, if a child is visited, its parent is
+ * guaranteed to be visited afterwards.
+ */
+static struct cgroup *cgroup_rstat_cpu_pop_updated(struct cgroup *pos,
+ struct cgroup *root, int cpu)
+{
+ struct cgroup_rstat_cpu *rstatc;
+ struct cgroup *parent;
+
+ if (pos == root)
+ return NULL;
+
+ /*
+ * We're gonna walk down to the first leaf and visit/remove it. We
+ * can pick whatever unvisited node as the starting point.
+ */
+ if (!pos)
+ pos = root;
+ else
+ pos = cgroup_parent(pos);
+
+ /* walk down to the first leaf */
+ while (true) {
+ rstatc = cgroup_rstat_cpu(pos, cpu);
+ if (rstatc->updated_children == pos)
+ break;
+ pos = rstatc->updated_children;
+ }
+
+ /*
+ * Unlink @pos from the tree. As the updated_children list is
+ * singly linked, we have to walk it to find the removal point.
+ * However, due to the way we traverse, @pos will be the first
+ * child in most cases. The only exception is @root.
+ */
+ parent = cgroup_parent(pos);
+ if (parent && rstatc->updated_next) {
+ struct cgroup_rstat_cpu *prstatc = cgroup_rstat_cpu(parent, cpu);
+ struct cgroup_rstat_cpu *nrstatc;
+ struct cgroup **nextp;
+
+ nextp = &prstatc->updated_children;
+ while (true) {
+ nrstatc = cgroup_rstat_cpu(*nextp, cpu);
+ if (*nextp == pos)
+ break;
+
+ WARN_ON_ONCE(*nextp == parent);
+ nextp = &nrstatc->updated_next;
+ }
+
+ *nextp = rstatc->updated_next;
+ rstatc->updated_next = NULL;
+
+ /*
+ * Paired with the one in cgroup_rstat_cpu_updated().
+ * Either they see NULL updated_next or we see their
+ * updated stat.
+ */
+ smp_mb();
+ }
+
+ return pos;
+}
+
+/* see cgroup_rstat_flush() */
+static void cgroup_rstat_flush_locked(struct cgroup *cgrp, bool may_sleep)
+ __releases(&cgroup_rstat_lock) __acquires(&cgroup_rstat_lock)
+{
+ int cpu;
+
+ lockdep_assert_held(&cgroup_rstat_lock);
+
+ for_each_possible_cpu(cpu) {
+ raw_spinlock_t *cpu_lock = per_cpu_ptr(&cgroup_rstat_cpu_lock,
+ cpu);
+ struct cgroup *pos = NULL;
+
+ raw_spin_lock(cpu_lock);
+ while ((pos = cgroup_rstat_cpu_pop_updated(pos, cgrp, cpu))) {
+ struct cgroup_subsys_state *css;
+
+ cgroup_base_stat_flush(pos, cpu);
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(css, &pos->rstat_css_list,
+ rstat_css_node)
+ css->ss->css_rstat_flush(css, cpu);
+ rcu_read_unlock();
+ }
+ raw_spin_unlock(cpu_lock);
+
+ /* if @may_sleep, play nice and yield if necessary */
+ if (may_sleep && (need_resched() ||
+ spin_needbreak(&cgroup_rstat_lock))) {
+ spin_unlock_irq(&cgroup_rstat_lock);
+ if (!cond_resched())
+ cpu_relax();
+ spin_lock_irq(&cgroup_rstat_lock);
+ }
+ }
+}
+
+/**
+ * cgroup_rstat_flush - flush stats in @cgrp's subtree
+ * @cgrp: target cgroup
+ *
+ * Collect all per-cpu stats in @cgrp's subtree into the global counters
+ * and propagate them upwards. After this function returns, all cgroups in
+ * the subtree have up-to-date ->stat.
+ *
+ * This also gets all cgroups in the subtree including @cgrp off the
+ * ->updated_children lists.
+ *
+ * This function may block.
+ */
+void cgroup_rstat_flush(struct cgroup *cgrp)
+{
+ might_sleep();
+
+ spin_lock_irq(&cgroup_rstat_lock);
+ cgroup_rstat_flush_locked(cgrp, true);
+ spin_unlock_irq(&cgroup_rstat_lock);
+}
+
+/**
+ * cgroup_rstat_flush_irqsafe - irqsafe version of cgroup_rstat_flush()
+ * @cgrp: target cgroup
+ *
+ * This function can be called from any context.
+ */
+void cgroup_rstat_flush_irqsafe(struct cgroup *cgrp)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&cgroup_rstat_lock, flags);
+ cgroup_rstat_flush_locked(cgrp, false);
+ spin_unlock_irqrestore(&cgroup_rstat_lock, flags);
+}
+
+/**
+ * cgroup_rstat_flush_begin - flush stats in @cgrp's subtree and hold
+ * @cgrp: target cgroup
+ *
+ * Flush stats in @cgrp's subtree and prevent further flushes. Must be
+ * paired with cgroup_rstat_flush_release().
+ *
+ * This function may block.
+ */
+void cgroup_rstat_flush_hold(struct cgroup *cgrp)
+ __acquires(&cgroup_rstat_lock)
+{
+ might_sleep();
+ spin_lock_irq(&cgroup_rstat_lock);
+ cgroup_rstat_flush_locked(cgrp, true);
+}
+
+/**
+ * cgroup_rstat_flush_release - release cgroup_rstat_flush_hold()
+ */
+void cgroup_rstat_flush_release(void)
+ __releases(&cgroup_rstat_lock)
+{
+ spin_unlock_irq(&cgroup_rstat_lock);
+}
+
+int cgroup_rstat_init(struct cgroup *cgrp)
+{
+ int cpu;
+
+ /* the root cgrp has rstat_cpu preallocated */
+ if (!cgrp->rstat_cpu) {
+ cgrp->rstat_cpu = alloc_percpu(struct cgroup_rstat_cpu);
+ if (!cgrp->rstat_cpu)
+ return -ENOMEM;
+ }
+
+ /* ->updated_children list is self terminated */
+ for_each_possible_cpu(cpu) {
+ struct cgroup_rstat_cpu *rstatc = cgroup_rstat_cpu(cgrp, cpu);
+
+ rstatc->updated_children = cgrp;
+ u64_stats_init(&rstatc->bsync);
+ }
+
+ return 0;
+}
+
+void cgroup_rstat_exit(struct cgroup *cgrp)
+{
+ int cpu;
+
+ cgroup_rstat_flush(cgrp);
+
+ /* sanity check */
+ for_each_possible_cpu(cpu) {
+ struct cgroup_rstat_cpu *rstatc = cgroup_rstat_cpu(cgrp, cpu);
+
+ if (WARN_ON_ONCE(rstatc->updated_children != cgrp) ||
+ WARN_ON_ONCE(rstatc->updated_next))
+ return;
+ }
+
+ free_percpu(cgrp->rstat_cpu);
+ cgrp->rstat_cpu = NULL;
+}
+
+void __init cgroup_rstat_boot(void)
+{
+ int cpu;
+
+ for_each_possible_cpu(cpu)
+ raw_spin_lock_init(per_cpu_ptr(&cgroup_rstat_cpu_lock, cpu));
+
+ BUG_ON(cgroup_rstat_init(&cgrp_dfl_root.cgrp));
+}
+
+/*
+ * Functions for cgroup basic resource statistics implemented on top of
+ * rstat.
+ */
+static void cgroup_base_stat_accumulate(struct cgroup_base_stat *dst_bstat,
+ struct cgroup_base_stat *src_bstat)
+{
+ dst_bstat->cputime.utime += src_bstat->cputime.utime;
+ dst_bstat->cputime.stime += src_bstat->cputime.stime;
+ dst_bstat->cputime.sum_exec_runtime += src_bstat->cputime.sum_exec_runtime;
+}
+
+static void cgroup_base_stat_flush(struct cgroup *cgrp, int cpu)
+{
+ struct cgroup *parent = cgroup_parent(cgrp);
+ struct cgroup_rstat_cpu *rstatc = cgroup_rstat_cpu(cgrp, cpu);
+ struct task_cputime *last_cputime = &rstatc->last_bstat.cputime;
+ struct task_cputime cputime;
+ struct cgroup_base_stat delta;
+ unsigned seq;
+
+ /* fetch the current per-cpu values */
+ do {
+ seq = __u64_stats_fetch_begin(&rstatc->bsync);
+ cputime = rstatc->bstat.cputime;
+ } while (__u64_stats_fetch_retry(&rstatc->bsync, seq));
+
+ /* calculate the delta to propgate */
+ delta.cputime.utime = cputime.utime - last_cputime->utime;
+ delta.cputime.stime = cputime.stime - last_cputime->stime;
+ delta.cputime.sum_exec_runtime = cputime.sum_exec_runtime -
+ last_cputime->sum_exec_runtime;
+ *last_cputime = cputime;
+
+ /* transfer the pending stat into delta */
+ cgroup_base_stat_accumulate(&delta, &cgrp->pending_bstat);
+ memset(&cgrp->pending_bstat, 0, sizeof(cgrp->pending_bstat));
+
+ /* propagate delta into the global stat and the parent's pending */
+ cgroup_base_stat_accumulate(&cgrp->bstat, &delta);
+ if (parent)
+ cgroup_base_stat_accumulate(&parent->pending_bstat, &delta);
+}
+
+static struct cgroup_rstat_cpu *
+cgroup_base_stat_cputime_account_begin(struct cgroup *cgrp)
+{
+ struct cgroup_rstat_cpu *rstatc;
+
+ rstatc = get_cpu_ptr(cgrp->rstat_cpu);
+ u64_stats_update_begin(&rstatc->bsync);
+ return rstatc;
+}
+
+static void cgroup_base_stat_cputime_account_end(struct cgroup *cgrp,
+ struct cgroup_rstat_cpu *rstatc)
+{
+ u64_stats_update_end(&rstatc->bsync);
+ cgroup_rstat_updated(cgrp, smp_processor_id());
+ put_cpu_ptr(rstatc);
+}
+
+void __cgroup_account_cputime(struct cgroup *cgrp, u64 delta_exec)
+{
+ struct cgroup_rstat_cpu *rstatc;
+
+ rstatc = cgroup_base_stat_cputime_account_begin(cgrp);
+ rstatc->bstat.cputime.sum_exec_runtime += delta_exec;
+ cgroup_base_stat_cputime_account_end(cgrp, rstatc);
+}
+
+void __cgroup_account_cputime_field(struct cgroup *cgrp,
+ enum cpu_usage_stat index, u64 delta_exec)
+{
+ struct cgroup_rstat_cpu *rstatc;
+
+ rstatc = cgroup_base_stat_cputime_account_begin(cgrp);
+
+ switch (index) {
+ case CPUTIME_USER:
+ case CPUTIME_NICE:
+ rstatc->bstat.cputime.utime += delta_exec;
+ break;
+ case CPUTIME_SYSTEM:
+ case CPUTIME_IRQ:
+ case CPUTIME_SOFTIRQ:
+ rstatc->bstat.cputime.stime += delta_exec;
+ break;
+ default:
+ break;
+ }
+
+ cgroup_base_stat_cputime_account_end(cgrp, rstatc);
+}
+
+void cgroup_base_stat_cputime_show(struct seq_file *seq)
+{
+ struct cgroup *cgrp = seq_css(seq)->cgroup;
+ u64 usage, utime, stime;
+
+ if (!cgroup_parent(cgrp))
+ return;
+
+ cgroup_rstat_flush_hold(cgrp);
+ usage = cgrp->bstat.cputime.sum_exec_runtime;
+ cputime_adjust(&cgrp->bstat.cputime, &cgrp->prev_cputime, &utime, &stime);
+ cgroup_rstat_flush_release();
+
+ do_div(usage, NSEC_PER_USEC);
+ do_div(utime, NSEC_PER_USEC);
+ do_div(stime, NSEC_PER_USEC);
+
+ seq_printf(seq, "usage_usec %llu\n"
+ "user_usec %llu\n"
+ "system_usec %llu\n",
+ usage, utime, stime);
+}
diff --git a/kernel/cgroup/stat.c b/kernel/cgroup/stat.c
deleted file mode 100644
index 1e111dd455c4..000000000000
--- a/kernel/cgroup/stat.c
+++ /dev/null
@@ -1,338 +0,0 @@
-#include "cgroup-internal.h"
-
-#include <linux/sched/cputime.h>
-
-static DEFINE_MUTEX(cgroup_stat_mutex);
-static DEFINE_PER_CPU(raw_spinlock_t, cgroup_cpu_stat_lock);
-
-static struct cgroup_cpu_stat *cgroup_cpu_stat(struct cgroup *cgrp, int cpu)
-{
- return per_cpu_ptr(cgrp->cpu_stat, cpu);
-}
-
-/**
- * cgroup_cpu_stat_updated - keep track of updated cpu_stat
- * @cgrp: target cgroup
- * @cpu: cpu on which cpu_stat was updated
- *
- * @cgrp's cpu_stat on @cpu was updated. Put it on the parent's matching
- * cpu_stat->updated_children list. See the comment on top of
- * cgroup_cpu_stat definition for details.
- */
-static void cgroup_cpu_stat_updated(struct cgroup *cgrp, int cpu)
-{
- raw_spinlock_t *cpu_lock = per_cpu_ptr(&cgroup_cpu_stat_lock, cpu);
- struct cgroup *parent;
- unsigned long flags;
-
- /*
- * Speculative already-on-list test. This may race leading to
- * temporary inaccuracies, which is fine.
- *
- * Because @parent's updated_children is terminated with @parent
- * instead of NULL, we can tell whether @cgrp is on the list by
- * testing the next pointer for NULL.
- */
- if (cgroup_cpu_stat(cgrp, cpu)->updated_next)
- return;
-
- raw_spin_lock_irqsave(cpu_lock, flags);
-
- /* put @cgrp and all ancestors on the corresponding updated lists */
- for (parent = cgroup_parent(cgrp); parent;
- cgrp = parent, parent = cgroup_parent(cgrp)) {
- struct cgroup_cpu_stat *cstat = cgroup_cpu_stat(cgrp, cpu);
- struct cgroup_cpu_stat *pcstat = cgroup_cpu_stat(parent, cpu);
-
- /*
- * Both additions and removals are bottom-up. If a cgroup
- * is already in the tree, all ancestors are.
- */
- if (cstat->updated_next)
- break;
-
- cstat->updated_next = pcstat->updated_children;
- pcstat->updated_children = cgrp;
- }
-
- raw_spin_unlock_irqrestore(cpu_lock, flags);
-}
-
-/**
- * cgroup_cpu_stat_pop_updated - iterate and dismantle cpu_stat updated tree
- * @pos: current position
- * @root: root of the tree to traversal
- * @cpu: target cpu
- *
- * Walks the udpated cpu_stat tree on @cpu from @root. %NULL @pos starts
- * the traversal and %NULL return indicates the end. During traversal,
- * each returned cgroup is unlinked from the tree. Must be called with the
- * matching cgroup_cpu_stat_lock held.
- *
- * The only ordering guarantee is that, for a parent and a child pair
- * covered by a given traversal, if a child is visited, its parent is
- * guaranteed to be visited afterwards.
- */
-static struct cgroup *cgroup_cpu_stat_pop_updated(struct cgroup *pos,
- struct cgroup *root, int cpu)
-{
- struct cgroup_cpu_stat *cstat;
- struct cgroup *parent;
-
- if (pos == root)
- return NULL;
-
- /*
- * We're gonna walk down to the first leaf and visit/remove it. We
- * can pick whatever unvisited node as the starting point.
- */
- if (!pos)
- pos = root;
- else
- pos = cgroup_parent(pos);
-
- /* walk down to the first leaf */
- while (true) {
- cstat = cgroup_cpu_stat(pos, cpu);
- if (cstat->updated_children == pos)
- break;
- pos = cstat->updated_children;
- }
-
- /*
- * Unlink @pos from the tree. As the updated_children list is
- * singly linked, we have to walk it to find the removal point.
- * However, due to the way we traverse, @pos will be the first
- * child in most cases. The only exception is @root.
- */
- parent = cgroup_parent(pos);
- if (parent && cstat->updated_next) {
- struct cgroup_cpu_stat *pcstat = cgroup_cpu_stat(parent, cpu);
- struct cgroup_cpu_stat *ncstat;
- struct cgroup **nextp;
-
- nextp = &pcstat->updated_children;
- while (true) {
- ncstat = cgroup_cpu_stat(*nextp, cpu);
- if (*nextp == pos)
- break;
-
- WARN_ON_ONCE(*nextp == parent);
- nextp = &ncstat->updated_next;
- }
-
- *nextp = cstat->updated_next;
- cstat->updated_next = NULL;
- }
-
- return pos;
-}
-
-static void cgroup_stat_accumulate(struct cgroup_stat *dst_stat,
- struct cgroup_stat *src_stat)
-{
- dst_stat->cputime.utime += src_stat->cputime.utime;
- dst_stat->cputime.stime += src_stat->cputime.stime;
- dst_stat->cputime.sum_exec_runtime += src_stat->cputime.sum_exec_runtime;
-}
-
-static void cgroup_cpu_stat_flush_one(struct cgroup *cgrp, int cpu)
-{
- struct cgroup *parent = cgroup_parent(cgrp);
- struct cgroup_cpu_stat *cstat = cgroup_cpu_stat(cgrp, cpu);
- struct task_cputime *last_cputime = &cstat->last_cputime;
- struct task_cputime cputime;
- struct cgroup_stat delta;
- unsigned seq;
-
- lockdep_assert_held(&cgroup_stat_mutex);
-
- /* fetch the current per-cpu values */
- do {
- seq = __u64_stats_fetch_begin(&cstat->sync);
- cputime = cstat->cputime;
- } while (__u64_stats_fetch_retry(&cstat->sync, seq));
-
- /* accumulate the deltas to propgate */
- delta.cputime.utime = cputime.utime - last_cputime->utime;
- delta.cputime.stime = cputime.stime - last_cputime->stime;
- delta.cputime.sum_exec_runtime = cputime.sum_exec_runtime -
- last_cputime->sum_exec_runtime;
- *last_cputime = cputime;
-
- /* transfer the pending stat into delta */
- cgroup_stat_accumulate(&delta, &cgrp->pending_stat);
- memset(&cgrp->pending_stat, 0, sizeof(cgrp->pending_stat));
-
- /* propagate delta into the global stat and the parent's pending */
- cgroup_stat_accumulate(&cgrp->stat, &delta);
- if (parent)
- cgroup_stat_accumulate(&parent->pending_stat, &delta);
-}
-
-/* see cgroup_stat_flush() */
-static void cgroup_stat_flush_locked(struct cgroup *cgrp)
-{
- int cpu;
-
- lockdep_assert_held(&cgroup_stat_mutex);
-
- for_each_possible_cpu(cpu) {
- raw_spinlock_t *cpu_lock = per_cpu_ptr(&cgroup_cpu_stat_lock, cpu);
- struct cgroup *pos = NULL;
-
- raw_spin_lock_irq(cpu_lock);
- while ((pos = cgroup_cpu_stat_pop_updated(pos, cgrp, cpu)))
- cgroup_cpu_stat_flush_one(pos, cpu);
- raw_spin_unlock_irq(cpu_lock);
- }
-}
-
-/**
- * cgroup_stat_flush - flush stats in @cgrp's subtree
- * @cgrp: target cgroup
- *
- * Collect all per-cpu stats in @cgrp's subtree into the global counters
- * and propagate them upwards. After this function returns, all cgroups in
- * the subtree have up-to-date ->stat.
- *
- * This also gets all cgroups in the subtree including @cgrp off the
- * ->updated_children lists.
- */
-void cgroup_stat_flush(struct cgroup *cgrp)
-{
- mutex_lock(&cgroup_stat_mutex);
- cgroup_stat_flush_locked(cgrp);
- mutex_unlock(&cgroup_stat_mutex);
-}
-
-static struct cgroup_cpu_stat *cgroup_cpu_stat_account_begin(struct cgroup *cgrp)
-{
- struct cgroup_cpu_stat *cstat;
-
- cstat = get_cpu_ptr(cgrp->cpu_stat);
- u64_stats_update_begin(&cstat->sync);
- return cstat;
-}
-
-static void cgroup_cpu_stat_account_end(struct cgroup *cgrp,
- struct cgroup_cpu_stat *cstat)
-{
- u64_stats_update_end(&cstat->sync);
- cgroup_cpu_stat_updated(cgrp, smp_processor_id());
- put_cpu_ptr(cstat);
-}
-
-void __cgroup_account_cputime(struct cgroup *cgrp, u64 delta_exec)
-{
- struct cgroup_cpu_stat *cstat;
-
- cstat = cgroup_cpu_stat_account_begin(cgrp);
- cstat->cputime.sum_exec_runtime += delta_exec;
- cgroup_cpu_stat_account_end(cgrp, cstat);
-}
-
-void __cgroup_account_cputime_field(struct cgroup *cgrp,
- enum cpu_usage_stat index, u64 delta_exec)
-{
- struct cgroup_cpu_stat *cstat;
-
- cstat = cgroup_cpu_stat_account_begin(cgrp);
-
- switch (index) {
- case CPUTIME_USER:
- case CPUTIME_NICE:
- cstat->cputime.utime += delta_exec;
- break;
- case CPUTIME_SYSTEM:
- case CPUTIME_IRQ:
- case CPUTIME_SOFTIRQ:
- cstat->cputime.stime += delta_exec;
- break;
- default:
- break;
- }
-
- cgroup_cpu_stat_account_end(cgrp, cstat);
-}
-
-void cgroup_stat_show_cputime(struct seq_file *seq)
-{
- struct cgroup *cgrp = seq_css(seq)->cgroup;
- u64 usage, utime, stime;
-
- if (!cgroup_parent(cgrp))
- return;
-
- mutex_lock(&cgroup_stat_mutex);
-
- cgroup_stat_flush_locked(cgrp);
-
- usage = cgrp->stat.cputime.sum_exec_runtime;
- cputime_adjust(&cgrp->stat.cputime, &cgrp->stat.prev_cputime,
- &utime, &stime);
-
- mutex_unlock(&cgroup_stat_mutex);
-
- do_div(usage, NSEC_PER_USEC);
- do_div(utime, NSEC_PER_USEC);
- do_div(stime, NSEC_PER_USEC);
-
- seq_printf(seq, "usage_usec %llu\n"
- "user_usec %llu\n"
- "system_usec %llu\n",
- usage, utime, stime);
-}
-
-int cgroup_stat_init(struct cgroup *cgrp)
-{
- int cpu;
-
- /* the root cgrp has cpu_stat preallocated */
- if (!cgrp->cpu_stat) {
- cgrp->cpu_stat = alloc_percpu(struct cgroup_cpu_stat);
- if (!cgrp->cpu_stat)
- return -ENOMEM;
- }
-
- /* ->updated_children list is self terminated */
- for_each_possible_cpu(cpu) {
- struct cgroup_cpu_stat *cstat = cgroup_cpu_stat(cgrp, cpu);
-
- cstat->updated_children = cgrp;
- u64_stats_init(&cstat->sync);
- }
-
- prev_cputime_init(&cgrp->stat.prev_cputime);
-
- return 0;
-}
-
-void cgroup_stat_exit(struct cgroup *cgrp)
-{
- int cpu;
-
- cgroup_stat_flush(cgrp);
-
- /* sanity check */
- for_each_possible_cpu(cpu) {
- struct cgroup_cpu_stat *cstat = cgroup_cpu_stat(cgrp, cpu);
-
- if (WARN_ON_ONCE(cstat->updated_children != cgrp) ||
- WARN_ON_ONCE(cstat->updated_next))
- return;
- }
-
- free_percpu(cgrp->cpu_stat);
- cgrp->cpu_stat = NULL;
-}
-
-void __init cgroup_stat_boot(void)
-{
- int cpu;
-
- for_each_possible_cpu(cpu)
- raw_spin_lock_init(per_cpu_ptr(&cgroup_cpu_stat_lock, cpu));
-
- BUG_ON(cgroup_stat_init(&cgrp_dfl_root.cgrp));
-}
diff --git a/kernel/compat.c b/kernel/compat.c
index 3f5fa8902e7d..702aa846ddac 100644
--- a/kernel/compat.c
+++ b/kernel/compat.c
@@ -34,6 +34,7 @@ int compat_get_timex(struct timex *txc, const struct compat_timex __user *utp)
{
struct compat_timex tx32;
+ memset(txc, 0, sizeof(struct timex));
if (copy_from_user(&tx32, utp, sizeof(struct compat_timex)))
return -EFAULT;
@@ -120,50 +121,6 @@ static int __compat_put_timespec(const struct timespec *ts, struct compat_timesp
__put_user(ts->tv_nsec, &cts->tv_nsec)) ? -EFAULT : 0;
}
-static int __compat_get_timespec64(struct timespec64 *ts64,
- const struct compat_timespec __user *cts)
-{
- struct compat_timespec ts;
- int ret;
-
- ret = copy_from_user(&ts, cts, sizeof(ts));
- if (ret)
- return -EFAULT;
-
- ts64->tv_sec = ts.tv_sec;
- ts64->tv_nsec = ts.tv_nsec;
-
- return 0;
-}
-
-static int __compat_put_timespec64(const struct timespec64 *ts64,
- struct compat_timespec __user *cts)
-{
- struct compat_timespec ts = {
- .tv_sec = ts64->tv_sec,
- .tv_nsec = ts64->tv_nsec
- };
- return copy_to_user(cts, &ts, sizeof(ts)) ? -EFAULT : 0;
-}
-
-int compat_get_timespec64(struct timespec64 *ts, const void __user *uts)
-{
- if (COMPAT_USE_64BIT_TIME)
- return copy_from_user(ts, uts, sizeof(*ts)) ? -EFAULT : 0;
- else
- return __compat_get_timespec64(ts, uts);
-}
-EXPORT_SYMBOL_GPL(compat_get_timespec64);
-
-int compat_put_timespec64(const struct timespec64 *ts, void __user *uts)
-{
- if (COMPAT_USE_64BIT_TIME)
- return copy_to_user(uts, ts, sizeof(*ts)) ? -EFAULT : 0;
- else
- return __compat_put_timespec64(ts, uts);
-}
-EXPORT_SYMBOL_GPL(compat_put_timespec64);
-
int compat_get_timeval(struct timeval *tv, const void __user *utv)
{
if (COMPAT_USE_64BIT_TIME)
@@ -367,6 +324,14 @@ COMPAT_SYSCALL_DEFINE3(sched_getaffinity, compat_pid_t, pid, unsigned int, len,
return ret;
}
+/* Todo: Delete these extern declarations when get/put_compat_itimerspec64()
+ * are moved to kernel/time/time.c .
+ */
+extern int __compat_get_timespec64(struct timespec64 *ts64,
+ const struct compat_timespec __user *cts);
+extern int __compat_put_timespec64(const struct timespec64 *ts64,
+ struct compat_timespec __user *cts);
+
int get_compat_itimerspec64(struct itimerspec64 *its,
const struct compat_itimerspec __user *uits)
{
@@ -488,61 +453,6 @@ get_compat_sigset(sigset_t *set, const compat_sigset_t __user *compat)
}
EXPORT_SYMBOL_GPL(get_compat_sigset);
-#ifdef CONFIG_NUMA
-COMPAT_SYSCALL_DEFINE6(move_pages, pid_t, pid, compat_ulong_t, nr_pages,
- compat_uptr_t __user *, pages32,
- const int __user *, nodes,
- int __user *, status,
- int, flags)
-{
- const void __user * __user *pages;
- int i;
-
- pages = compat_alloc_user_space(nr_pages * sizeof(void *));
- for (i = 0; i < nr_pages; i++) {
- compat_uptr_t p;
-
- if (get_user(p, pages32 + i) ||
- put_user(compat_ptr(p), pages + i))
- return -EFAULT;
- }
- return sys_move_pages(pid, nr_pages, pages, nodes, status, flags);
-}
-
-COMPAT_SYSCALL_DEFINE4(migrate_pages, compat_pid_t, pid,
- compat_ulong_t, maxnode,
- const compat_ulong_t __user *, old_nodes,
- const compat_ulong_t __user *, new_nodes)
-{
- unsigned long __user *old = NULL;
- unsigned long __user *new = NULL;
- nodemask_t tmp_mask;
- unsigned long nr_bits;
- unsigned long size;
-
- nr_bits = min_t(unsigned long, maxnode - 1, MAX_NUMNODES);
- size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
- if (old_nodes) {
- if (compat_get_bitmap(nodes_addr(tmp_mask), old_nodes, nr_bits))
- return -EFAULT;
- old = compat_alloc_user_space(new_nodes ? size * 2 : size);
- if (new_nodes)
- new = old + size / sizeof(unsigned long);
- if (copy_to_user(old, nodes_addr(tmp_mask), size))
- return -EFAULT;
- }
- if (new_nodes) {
- if (compat_get_bitmap(nodes_addr(tmp_mask), new_nodes, nr_bits))
- return -EFAULT;
- if (new == NULL)
- new = compat_alloc_user_space(size);
- if (copy_to_user(new, nodes_addr(tmp_mask), size))
- return -EFAULT;
- }
- return sys_migrate_pages(pid, nr_bits + 1, old, new);
-}
-#endif
-
/*
* Allocate user-space memory for the duration of a single system call,
* in order to marshall parameters inside a compat thunk.
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 53f7dc65f9a3..0db8938fbb23 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -124,24 +124,11 @@ struct cpuhp_step {
};
static DEFINE_MUTEX(cpuhp_state_mutex);
-static struct cpuhp_step cpuhp_bp_states[];
-static struct cpuhp_step cpuhp_ap_states[];
-
-static bool cpuhp_is_ap_state(enum cpuhp_state state)
-{
- /*
- * The extra check for CPUHP_TEARDOWN_CPU is only for documentation
- * purposes as that state is handled explicitly in cpu_down.
- */
- return state > CPUHP_BRINGUP_CPU && state != CPUHP_TEARDOWN_CPU;
-}
+static struct cpuhp_step cpuhp_hp_states[];
static struct cpuhp_step *cpuhp_get_step(enum cpuhp_state state)
{
- struct cpuhp_step *sp;
-
- sp = cpuhp_is_ap_state(state) ? cpuhp_ap_states : cpuhp_bp_states;
- return sp + state;
+ return cpuhp_hp_states + state;
}
/**
@@ -239,6 +226,15 @@ err:
}
#ifdef CONFIG_SMP
+static bool cpuhp_is_ap_state(enum cpuhp_state state)
+{
+ /*
+ * The extra check for CPUHP_TEARDOWN_CPU is only for documentation
+ * purposes as that state is handled explicitly in cpu_down.
+ */
+ return state > CPUHP_BRINGUP_CPU && state != CPUHP_TEARDOWN_CPU;
+}
+
static inline void wait_for_ap_thread(struct cpuhp_cpu_state *st, bool bringup)
{
struct completion *done = bringup ? &st->done_up : &st->done_down;
@@ -1224,7 +1220,7 @@ int __boot_cpu_id;
#endif /* CONFIG_SMP */
/* Boot processor state steps */
-static struct cpuhp_step cpuhp_bp_states[] = {
+static struct cpuhp_step cpuhp_hp_states[] = {
[CPUHP_OFFLINE] = {
.name = "offline",
.startup.single = NULL,
@@ -1289,24 +1285,6 @@ static struct cpuhp_step cpuhp_bp_states[] = {
.teardown.single = NULL,
.cant_stop = true,
},
- /*
- * Handled on controll processor until the plugged processor manages
- * this itself.
- */
- [CPUHP_TEARDOWN_CPU] = {
- .name = "cpu:teardown",
- .startup.single = NULL,
- .teardown.single = takedown_cpu,
- .cant_stop = true,
- },
-#else
- [CPUHP_BRINGUP_CPU] = { },
-#endif
-};
-
-/* Application processor state steps */
-static struct cpuhp_step cpuhp_ap_states[] = {
-#ifdef CONFIG_SMP
/* Final state before CPU kills itself */
[CPUHP_AP_IDLE_DEAD] = {
.name = "idle:dead",
@@ -1340,6 +1318,16 @@ static struct cpuhp_step cpuhp_ap_states[] = {
[CPUHP_AP_ONLINE] = {
.name = "ap:online",
},
+ /*
+ * Handled on controll processor until the plugged processor manages
+ * this itself.
+ */
+ [CPUHP_TEARDOWN_CPU] = {
+ .name = "cpu:teardown",
+ .startup.single = NULL,
+ .teardown.single = takedown_cpu,
+ .cant_stop = true,
+ },
/* Handle smpboot threads park/unpark */
[CPUHP_AP_SMPBOOT_THREADS] = {
.name = "smpboot/threads:online",
@@ -1408,11 +1396,11 @@ static int cpuhp_reserve_state(enum cpuhp_state state)
switch (state) {
case CPUHP_AP_ONLINE_DYN:
- step = cpuhp_ap_states + CPUHP_AP_ONLINE_DYN;
+ step = cpuhp_hp_states + CPUHP_AP_ONLINE_DYN;
end = CPUHP_AP_ONLINE_DYN_END;
break;
case CPUHP_BP_PREPARE_DYN:
- step = cpuhp_bp_states + CPUHP_BP_PREPARE_DYN;
+ step = cpuhp_hp_states + CPUHP_BP_PREPARE_DYN;
end = CPUHP_BP_PREPARE_DYN_END;
break;
default:
diff --git a/kernel/crash_core.c b/kernel/crash_core.c
index 4f63597c824d..f7674d676889 100644
--- a/kernel/crash_core.c
+++ b/kernel/crash_core.c
@@ -376,6 +376,7 @@ phys_addr_t __weak paddr_vmcoreinfo_note(void)
{
return __pa(vmcoreinfo_note);
}
+EXPORT_SYMBOL(paddr_vmcoreinfo_note);
static int __init crash_save_vmcoreinfo_init(void)
{
@@ -453,6 +454,7 @@ static int __init crash_save_vmcoreinfo_init(void)
VMCOREINFO_NUMBER(PG_lru);
VMCOREINFO_NUMBER(PG_private);
VMCOREINFO_NUMBER(PG_swapcache);
+ VMCOREINFO_NUMBER(PG_swapbacked);
VMCOREINFO_NUMBER(PG_slab);
#ifdef CONFIG_MEMORY_FAILURE
VMCOREINFO_NUMBER(PG_hwpoison);
diff --git a/kernel/debug/kdb/kdb_bp.c b/kernel/debug/kdb/kdb_bp.c
index 90ff129c88a2..62c301ad0773 100644
--- a/kernel/debug/kdb/kdb_bp.c
+++ b/kernel/debug/kdb/kdb_bp.c
@@ -242,11 +242,11 @@ static void kdb_printbp(kdb_bp_t *bp, int i)
kdb_symbol_print(bp->bp_addr, NULL, KDB_SP_DEFAULT);
if (bp->bp_enabled)
- kdb_printf("\n is enabled");
+ kdb_printf("\n is enabled ");
else
kdb_printf("\n is disabled");
- kdb_printf("\taddr at %016lx, hardtype=%d installed=%d\n",
+ kdb_printf(" addr at %016lx, hardtype=%d installed=%d\n",
bp->bp_addr, bp->bp_type, bp->bp_installed);
kdb_printf("\n");
diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c
index dbb0781a0533..e405677ee08d 100644
--- a/kernel/debug/kdb/kdb_main.c
+++ b/kernel/debug/kdb/kdb_main.c
@@ -1150,6 +1150,16 @@ void kdb_set_current_task(struct task_struct *p)
kdb_current_regs = NULL;
}
+static void drop_newline(char *buf)
+{
+ size_t len = strlen(buf);
+
+ if (len == 0)
+ return;
+ if (*(buf + len - 1) == '\n')
+ *(buf + len - 1) = '\0';
+}
+
/*
* kdb_local - The main code for kdb. This routine is invoked on a
* specific processor, it is not global. The main kdb() routine
@@ -1327,6 +1337,7 @@ do_full_getstr:
cmdptr = cmd_head;
diag = kdb_parse(cmdbuf);
if (diag == KDB_NOTFOUND) {
+ drop_newline(cmdbuf);
kdb_printf("Unknown kdb command: '%s'\n", cmdbuf);
diag = 0;
}
@@ -1566,6 +1577,7 @@ static int kdb_md(int argc, const char **argv)
int symbolic = 0;
int valid = 0;
int phys = 0;
+ int raw = 0;
kdbgetintenv("MDCOUNT", &mdcount);
kdbgetintenv("RADIX", &radix);
@@ -1575,9 +1587,10 @@ static int kdb_md(int argc, const char **argv)
repeat = mdcount * 16 / bytesperword;
if (strcmp(argv[0], "mdr") == 0) {
- if (argc != 2)
+ if (argc == 2 || (argc == 0 && last_addr != 0))
+ valid = raw = 1;
+ else
return KDB_ARGCOUNT;
- valid = 1;
} else if (isdigit(argv[0][2])) {
bytesperword = (int)(argv[0][2] - '0');
if (bytesperword == 0) {
@@ -1613,7 +1626,10 @@ static int kdb_md(int argc, const char **argv)
radix = last_radix;
bytesperword = last_bytesperword;
repeat = last_repeat;
- mdcount = ((repeat * bytesperword) + 15) / 16;
+ if (raw)
+ mdcount = repeat;
+ else
+ mdcount = ((repeat * bytesperword) + 15) / 16;
}
if (argc) {
@@ -1630,7 +1646,10 @@ static int kdb_md(int argc, const char **argv)
diag = kdbgetularg(argv[nextarg], &val);
if (!diag) {
mdcount = (int) val;
- repeat = mdcount * 16 / bytesperword;
+ if (raw)
+ repeat = mdcount;
+ else
+ repeat = mdcount * 16 / bytesperword;
}
}
if (argc >= nextarg+1) {
@@ -1640,8 +1659,15 @@ static int kdb_md(int argc, const char **argv)
}
}
- if (strcmp(argv[0], "mdr") == 0)
- return kdb_mdr(addr, mdcount);
+ if (strcmp(argv[0], "mdr") == 0) {
+ int ret;
+ last_addr = addr;
+ ret = kdb_mdr(addr, mdcount);
+ last_addr += mdcount;
+ last_repeat = mdcount;
+ last_bytesperword = bytesperword; // to make REPEAT happy
+ return ret;
+ }
switch (radix) {
case 10:
@@ -2473,41 +2499,6 @@ static int kdb_kill(int argc, const char **argv)
return 0;
}
-struct kdb_tm {
- int tm_sec; /* seconds */
- int tm_min; /* minutes */
- int tm_hour; /* hours */
- int tm_mday; /* day of the month */
- int tm_mon; /* month */
- int tm_year; /* year */
-};
-
-static void kdb_gmtime(struct timespec *tv, struct kdb_tm *tm)
-{
- /* This will work from 1970-2099, 2100 is not a leap year */
- static int mon_day[] = { 31, 29, 31, 30, 31, 30, 31,
- 31, 30, 31, 30, 31 };
- memset(tm, 0, sizeof(*tm));
- tm->tm_sec = tv->tv_sec % (24 * 60 * 60);
- tm->tm_mday = tv->tv_sec / (24 * 60 * 60) +
- (2 * 365 + 1); /* shift base from 1970 to 1968 */
- tm->tm_min = tm->tm_sec / 60 % 60;
- tm->tm_hour = tm->tm_sec / 60 / 60;
- tm->tm_sec = tm->tm_sec % 60;
- tm->tm_year = 68 + 4*(tm->tm_mday / (4*365+1));
- tm->tm_mday %= (4*365+1);
- mon_day[1] = 29;
- while (tm->tm_mday >= mon_day[tm->tm_mon]) {
- tm->tm_mday -= mon_day[tm->tm_mon];
- if (++tm->tm_mon == 12) {
- tm->tm_mon = 0;
- ++tm->tm_year;
- mon_day[1] = 28;
- }
- }
- ++tm->tm_mday;
-}
-
/*
* Most of this code has been lifted from kernel/timer.c::sys_sysinfo().
* I cannot call that code directly from kdb, it has an unconditional
@@ -2515,10 +2506,10 @@ static void kdb_gmtime(struct timespec *tv, struct kdb_tm *tm)
*/
static void kdb_sysinfo(struct sysinfo *val)
{
- struct timespec uptime;
- ktime_get_ts(&uptime);
+ u64 uptime = ktime_get_mono_fast_ns();
+
memset(val, 0, sizeof(*val));
- val->uptime = uptime.tv_sec;
+ val->uptime = div_u64(uptime, NSEC_PER_SEC);
val->loads[0] = avenrun[0];
val->loads[1] = avenrun[1];
val->loads[2] = avenrun[2];
@@ -2533,8 +2524,8 @@ static void kdb_sysinfo(struct sysinfo *val)
*/
static int kdb_summary(int argc, const char **argv)
{
- struct timespec now;
- struct kdb_tm tm;
+ time64_t now;
+ struct tm tm;
struct sysinfo val;
if (argc)
@@ -2548,9 +2539,9 @@ static int kdb_summary(int argc, const char **argv)
kdb_printf("domainname %s\n", init_uts_ns.name.domainname);
kdb_printf("ccversion %s\n", __stringify(CCVERSION));
- now = __current_kernel_time();
- kdb_gmtime(&now, &tm);
- kdb_printf("date %04d-%02d-%02d %02d:%02d:%02d "
+ now = __ktime_get_real_seconds();
+ time64_to_tm(now, 0, &tm);
+ kdb_printf("date %04ld-%02d-%02d %02d:%02d:%02d "
"tz_minuteswest %d\n",
1900+tm.tm_year, tm.tm_mon+1, tm.tm_mday,
tm.tm_hour, tm.tm_min, tm.tm_sec,
diff --git a/kernel/debug/kdb/kdb_support.c b/kernel/debug/kdb/kdb_support.c
index d35cc2d3a4cc..990b3cc526c8 100644
--- a/kernel/debug/kdb/kdb_support.c
+++ b/kernel/debug/kdb/kdb_support.c
@@ -129,13 +129,13 @@ int kdbnearsym(unsigned long addr, kdb_symtab_t *symtab)
}
if (i >= ARRAY_SIZE(kdb_name_table)) {
debug_kfree(kdb_name_table[0]);
- memcpy(kdb_name_table, kdb_name_table+1,
+ memmove(kdb_name_table, kdb_name_table+1,
sizeof(kdb_name_table[0]) *
(ARRAY_SIZE(kdb_name_table)-1));
} else {
debug_kfree(knt1);
knt1 = kdb_name_table[i];
- memcpy(kdb_name_table+i, kdb_name_table+i+1,
+ memmove(kdb_name_table+i, kdb_name_table+i+1,
sizeof(kdb_name_table[0]) *
(ARRAY_SIZE(kdb_name_table)-i-1));
}
diff --git a/kernel/delayacct.c b/kernel/delayacct.c
index e2764d767f18..ca8ac2824f0b 100644
--- a/kernel/delayacct.c
+++ b/kernel/delayacct.c
@@ -44,23 +44,24 @@ void __delayacct_tsk_init(struct task_struct *tsk)
{
tsk->delays = kmem_cache_zalloc(delayacct_cache, GFP_KERNEL);
if (tsk->delays)
- spin_lock_init(&tsk->delays->lock);
+ raw_spin_lock_init(&tsk->delays->lock);
}
/*
* Finish delay accounting for a statistic using its timestamps (@start),
* accumalator (@total) and @count
*/
-static void delayacct_end(spinlock_t *lock, u64 *start, u64 *total, u32 *count)
+static void delayacct_end(raw_spinlock_t *lock, u64 *start, u64 *total,
+ u32 *count)
{
s64 ns = ktime_get_ns() - *start;
unsigned long flags;
if (ns > 0) {
- spin_lock_irqsave(lock, flags);
+ raw_spin_lock_irqsave(lock, flags);
*total += ns;
(*count)++;
- spin_unlock_irqrestore(lock, flags);
+ raw_spin_unlock_irqrestore(lock, flags);
}
}
@@ -127,7 +128,7 @@ int __delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk)
/* zero XXX_total, non-zero XXX_count implies XXX stat overflowed */
- spin_lock_irqsave(&tsk->delays->lock, flags);
+ raw_spin_lock_irqsave(&tsk->delays->lock, flags);
tmp = d->blkio_delay_total + tsk->delays->blkio_delay;
d->blkio_delay_total = (tmp < d->blkio_delay_total) ? 0 : tmp;
tmp = d->swapin_delay_total + tsk->delays->swapin_delay;
@@ -137,7 +138,7 @@ int __delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk)
d->blkio_count += tsk->delays->blkio_count;
d->swapin_count += tsk->delays->swapin_count;
d->freepages_count += tsk->delays->freepages_count;
- spin_unlock_irqrestore(&tsk->delays->lock, flags);
+ raw_spin_unlock_irqrestore(&tsk->delays->lock, flags);
return 0;
}
@@ -147,10 +148,10 @@ __u64 __delayacct_blkio_ticks(struct task_struct *tsk)
__u64 ret;
unsigned long flags;
- spin_lock_irqsave(&tsk->delays->lock, flags);
+ raw_spin_lock_irqsave(&tsk->delays->lock, flags);
ret = nsec_to_clock_t(tsk->delays->blkio_delay +
tsk->delays->swapin_delay);
- spin_unlock_irqrestore(&tsk->delays->lock, flags);
+ raw_spin_unlock_irqrestore(&tsk->delays->lock, flags);
return ret;
}
diff --git a/kernel/dma.c b/kernel/dma.c
index 3506fc34a712..40f152936316 100644
--- a/kernel/dma.c
+++ b/kernel/dma.c
@@ -135,21 +135,9 @@ static int proc_dma_show(struct seq_file *m, void *v)
}
#endif /* MAX_DMA_CHANNELS */
-static int proc_dma_open(struct inode *inode, struct file *file)
-{
- return single_open(file, proc_dma_show, NULL);
-}
-
-static const struct file_operations proc_dma_operations = {
- .open = proc_dma_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = single_release,
-};
-
static int __init proc_dma_init(void)
{
- proc_create("dma", 0, NULL, &proc_dma_operations);
+ proc_create_single("dma", 0, NULL, proc_dma_show);
return 0;
}
diff --git a/kernel/events/callchain.c b/kernel/events/callchain.c
index 772a43fea825..c187aa3df3c8 100644
--- a/kernel/events/callchain.c
+++ b/kernel/events/callchain.c
@@ -119,23 +119,20 @@ int get_callchain_buffers(int event_max_stack)
goto exit;
}
- if (count > 1) {
- /* If the allocation failed, give up */
- if (!callchain_cpus_entries)
- err = -ENOMEM;
- /*
- * If requesting per event more than the global cap,
- * return a different error to help userspace figure
- * this out.
- *
- * And also do it here so that we have &callchain_mutex held.
- */
- if (event_max_stack > sysctl_perf_event_max_stack)
- err = -EOVERFLOW;
+ /*
+ * If requesting per event more than the global cap,
+ * return a different error to help userspace figure
+ * this out.
+ *
+ * And also do it here so that we have &callchain_mutex held.
+ */
+ if (event_max_stack > sysctl_perf_event_max_stack) {
+ err = -EOVERFLOW;
goto exit;
}
- err = alloc_callchain_buffers();
+ if (count == 1)
+ err = alloc_callchain_buffers();
exit:
if (err)
atomic_dec(&nr_callchain_events);
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 4b838470fac4..80cca2b30c4f 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -430,7 +430,7 @@ static void update_perf_cpu_limits(void)
WRITE_ONCE(perf_sample_allowed_ns, tmp);
}
-static int perf_rotate_context(struct perf_cpu_context *cpuctx);
+static bool perf_rotate_context(struct perf_cpu_context *cpuctx);
int perf_proc_update_handler(struct ctl_table *table, int write,
void __user *buffer, size_t *lenp,
@@ -643,7 +643,7 @@ static void perf_event_update_sibling_time(struct perf_event *leader)
{
struct perf_event *sibling;
- list_for_each_entry(sibling, &leader->sibling_list, group_entry)
+ for_each_sibling_event(sibling, leader)
perf_event_update_time(sibling);
}
@@ -724,9 +724,15 @@ static inline void __update_cgrp_time(struct perf_cgroup *cgrp)
static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx)
{
- struct perf_cgroup *cgrp_out = cpuctx->cgrp;
- if (cgrp_out)
- __update_cgrp_time(cgrp_out);
+ struct perf_cgroup *cgrp = cpuctx->cgrp;
+ struct cgroup_subsys_state *css;
+
+ if (cgrp) {
+ for (css = &cgrp->css; css; css = css->parent) {
+ cgrp = container_of(css, struct perf_cgroup, css);
+ __update_cgrp_time(cgrp);
+ }
+ }
}
static inline void update_cgrp_time_from_event(struct perf_event *event)
@@ -754,6 +760,7 @@ perf_cgroup_set_timestamp(struct task_struct *task,
{
struct perf_cgroup *cgrp;
struct perf_cgroup_info *info;
+ struct cgroup_subsys_state *css;
/*
* ctx->lock held by caller
@@ -764,8 +771,12 @@ perf_cgroup_set_timestamp(struct task_struct *task,
return;
cgrp = perf_cgroup_from_task(task, ctx);
- info = this_cpu_ptr(cgrp->info);
- info->timestamp = ctx->timestamp;
+
+ for (css = &cgrp->css; css; css = css->parent) {
+ cgrp = container_of(css, struct perf_cgroup, css);
+ info = this_cpu_ptr(cgrp->info);
+ info->timestamp = ctx->timestamp;
+ }
}
static DEFINE_PER_CPU(struct list_head, cgrp_cpuctx_list);
@@ -937,27 +948,39 @@ list_update_cgroup_event(struct perf_event *event,
if (!is_cgroup_event(event))
return;
- if (add && ctx->nr_cgroups++)
- return;
- else if (!add && --ctx->nr_cgroups)
- return;
/*
* Because cgroup events are always per-cpu events,
* this will always be called from the right CPU.
*/
cpuctx = __get_cpu_context(ctx);
- cpuctx_entry = &cpuctx->cgrp_cpuctx_entry;
- /* cpuctx->cgrp is NULL unless a cgroup event is active in this CPU .*/
- if (add) {
+
+ /*
+ * Since setting cpuctx->cgrp is conditional on the current @cgrp
+ * matching the event's cgroup, we must do this for every new event,
+ * because if the first would mismatch, the second would not try again
+ * and we would leave cpuctx->cgrp unset.
+ */
+ if (add && !cpuctx->cgrp) {
struct perf_cgroup *cgrp = perf_cgroup_from_task(current, ctx);
- list_add(cpuctx_entry, this_cpu_ptr(&cgrp_cpuctx_list));
if (cgroup_is_descendant(cgrp->css.cgroup, event->cgrp->css.cgroup))
cpuctx->cgrp = cgrp;
- } else {
- list_del(cpuctx_entry);
- cpuctx->cgrp = NULL;
}
+
+ if (add && ctx->nr_cgroups++)
+ return;
+ else if (!add && --ctx->nr_cgroups)
+ return;
+
+ /* no cgroup running */
+ if (!add)
+ cpuctx->cgrp = NULL;
+
+ cpuctx_entry = &cpuctx->cgrp_cpuctx_entry;
+ if (add)
+ list_add(cpuctx_entry, this_cpu_ptr(&cgrp_cpuctx_list));
+ else
+ list_del(cpuctx_entry);
}
#else /* !CONFIG_CGROUP_PERF */
@@ -1041,7 +1064,7 @@ list_update_cgroup_event(struct perf_event *event,
static enum hrtimer_restart perf_mux_hrtimer_handler(struct hrtimer *hr)
{
struct perf_cpu_context *cpuctx;
- int rotations = 0;
+ bool rotations;
lockdep_assert_irqs_disabled();
@@ -1460,8 +1483,21 @@ static enum event_type_t get_event_type(struct perf_event *event)
return event_type;
}
-static struct list_head *
-ctx_group_list(struct perf_event *event, struct perf_event_context *ctx)
+/*
+ * Helper function to initialize event group nodes.
+ */
+static void init_event_group(struct perf_event *event)
+{
+ RB_CLEAR_NODE(&event->group_node);
+ event->group_index = 0;
+}
+
+/*
+ * Extract pinned or flexible groups from the context
+ * based on event attrs bits.
+ */
+static struct perf_event_groups *
+get_event_groups(struct perf_event *event, struct perf_event_context *ctx)
{
if (event->attr.pinned)
return &ctx->pinned_groups;
@@ -1470,6 +1506,156 @@ ctx_group_list(struct perf_event *event, struct perf_event_context *ctx)
}
/*
+ * Helper function to initializes perf_event_group trees.
+ */
+static void perf_event_groups_init(struct perf_event_groups *groups)
+{
+ groups->tree = RB_ROOT;
+ groups->index = 0;
+}
+
+/*
+ * Compare function for event groups;
+ *
+ * Implements complex key that first sorts by CPU and then by virtual index
+ * which provides ordering when rotating groups for the same CPU.
+ */
+static bool
+perf_event_groups_less(struct perf_event *left, struct perf_event *right)
+{
+ if (left->cpu < right->cpu)
+ return true;
+ if (left->cpu > right->cpu)
+ return false;
+
+ if (left->group_index < right->group_index)
+ return true;
+ if (left->group_index > right->group_index)
+ return false;
+
+ return false;
+}
+
+/*
+ * Insert @event into @groups' tree; using {@event->cpu, ++@groups->index} for
+ * key (see perf_event_groups_less). This places it last inside the CPU
+ * subtree.
+ */
+static void
+perf_event_groups_insert(struct perf_event_groups *groups,
+ struct perf_event *event)
+{
+ struct perf_event *node_event;
+ struct rb_node *parent;
+ struct rb_node **node;
+
+ event->group_index = ++groups->index;
+
+ node = &groups->tree.rb_node;
+ parent = *node;
+
+ while (*node) {
+ parent = *node;
+ node_event = container_of(*node, struct perf_event, group_node);
+
+ if (perf_event_groups_less(event, node_event))
+ node = &parent->rb_left;
+ else
+ node = &parent->rb_right;
+ }
+
+ rb_link_node(&event->group_node, parent, node);
+ rb_insert_color(&event->group_node, &groups->tree);
+}
+
+/*
+ * Helper function to insert event into the pinned or flexible groups.
+ */
+static void
+add_event_to_groups(struct perf_event *event, struct perf_event_context *ctx)
+{
+ struct perf_event_groups *groups;
+
+ groups = get_event_groups(event, ctx);
+ perf_event_groups_insert(groups, event);
+}
+
+/*
+ * Delete a group from a tree.
+ */
+static void
+perf_event_groups_delete(struct perf_event_groups *groups,
+ struct perf_event *event)
+{
+ WARN_ON_ONCE(RB_EMPTY_NODE(&event->group_node) ||
+ RB_EMPTY_ROOT(&groups->tree));
+
+ rb_erase(&event->group_node, &groups->tree);
+ init_event_group(event);
+}
+
+/*
+ * Helper function to delete event from its groups.
+ */
+static void
+del_event_from_groups(struct perf_event *event, struct perf_event_context *ctx)
+{
+ struct perf_event_groups *groups;
+
+ groups = get_event_groups(event, ctx);
+ perf_event_groups_delete(groups, event);
+}
+
+/*
+ * Get the leftmost event in the @cpu subtree.
+ */
+static struct perf_event *
+perf_event_groups_first(struct perf_event_groups *groups, int cpu)
+{
+ struct perf_event *node_event = NULL, *match = NULL;
+ struct rb_node *node = groups->tree.rb_node;
+
+ while (node) {
+ node_event = container_of(node, struct perf_event, group_node);
+
+ if (cpu < node_event->cpu) {
+ node = node->rb_left;
+ } else if (cpu > node_event->cpu) {
+ node = node->rb_right;
+ } else {
+ match = node_event;
+ node = node->rb_left;
+ }
+ }
+
+ return match;
+}
+
+/*
+ * Like rb_entry_next_safe() for the @cpu subtree.
+ */
+static struct perf_event *
+perf_event_groups_next(struct perf_event *event)
+{
+ struct perf_event *next;
+
+ next = rb_entry_safe(rb_next(&event->group_node), typeof(*event), group_node);
+ if (next && next->cpu == event->cpu)
+ return next;
+
+ return NULL;
+}
+
+/*
+ * Iterate through the whole groups tree.
+ */
+#define perf_event_groups_for_each(event, groups) \
+ for (event = rb_entry_safe(rb_first(&((groups)->tree)), \
+ typeof(*event), group_node); event; \
+ event = rb_entry_safe(rb_next(&event->group_node), \
+ typeof(*event), group_node))
+
+/*
* Add a event from the lists for its context.
* Must be called with ctx->mutex and ctx->lock held.
*/
@@ -1489,12 +1675,8 @@ list_add_event(struct perf_event *event, struct perf_event_context *ctx)
* perf_group_detach can, at all times, locate all siblings.
*/
if (event->group_leader == event) {
- struct list_head *list;
-
event->group_caps = event->event_caps;
-
- list = ctx_group_list(event, ctx);
- list_add_tail(&event->group_entry, list);
+ add_event_to_groups(event, ctx);
}
list_update_cgroup_event(event, ctx, true);
@@ -1652,12 +1834,12 @@ static void perf_group_attach(struct perf_event *event)
group_leader->group_caps &= event->event_caps;
- list_add_tail(&event->group_entry, &group_leader->sibling_list);
+ list_add_tail(&event->sibling_list, &group_leader->sibling_list);
group_leader->nr_siblings++;
perf_event__header_size(group_leader);
- list_for_each_entry(pos, &group_leader->sibling_list, group_entry)
+ for_each_sibling_event(pos, group_leader)
perf_event__header_size(pos);
}
@@ -1688,7 +1870,7 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx)
list_del_rcu(&event->event_entry);
if (event->group_leader == event)
- list_del_init(&event->group_entry);
+ del_event_from_groups(event, ctx);
/*
* If event was in error state, then keep it
@@ -1706,9 +1888,9 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx)
static void perf_group_detach(struct perf_event *event)
{
struct perf_event *sibling, *tmp;
- struct list_head *list = NULL;
+ struct perf_event_context *ctx = event->ctx;
- lockdep_assert_held(&event->ctx->lock);
+ lockdep_assert_held(&ctx->lock);
/*
* We can have double detach due to exit/hot-unplug + close.
@@ -1722,34 +1904,42 @@ static void perf_group_detach(struct perf_event *event)
* If this is a sibling, remove it from its group.
*/
if (event->group_leader != event) {
- list_del_init(&event->group_entry);
+ list_del_init(&event->sibling_list);
event->group_leader->nr_siblings--;
goto out;
}
- if (!list_empty(&event->group_entry))
- list = &event->group_entry;
-
/*
* If this was a group event with sibling events then
* upgrade the siblings to singleton events by adding them
* to whatever list we are on.
*/
- list_for_each_entry_safe(sibling, tmp, &event->sibling_list, group_entry) {
- if (list)
- list_move_tail(&sibling->group_entry, list);
+ list_for_each_entry_safe(sibling, tmp, &event->sibling_list, sibling_list) {
+
sibling->group_leader = sibling;
+ list_del_init(&sibling->sibling_list);
/* Inherit group flags from the previous leader */
sibling->group_caps = event->group_caps;
+ if (!RB_EMPTY_NODE(&event->group_node)) {
+ add_event_to_groups(sibling, event->ctx);
+
+ if (sibling->state == PERF_EVENT_STATE_ACTIVE) {
+ struct list_head *list = sibling->attr.pinned ?
+ &ctx->pinned_active : &ctx->flexible_active;
+
+ list_add_tail(&sibling->active_list, list);
+ }
+ }
+
WARN_ON_ONCE(sibling->ctx != event->ctx);
}
out:
perf_event__header_size(event->group_leader);
- list_for_each_entry(tmp, &event->group_leader->sibling_list, group_entry)
+ for_each_sibling_event(tmp, event->group_leader)
perf_event__header_size(tmp);
}
@@ -1772,13 +1962,13 @@ static inline int __pmu_filter_match(struct perf_event *event)
*/
static inline int pmu_filter_match(struct perf_event *event)
{
- struct perf_event *child;
+ struct perf_event *sibling;
if (!__pmu_filter_match(event))
return 0;
- list_for_each_entry(child, &event->sibling_list, group_entry) {
- if (!__pmu_filter_match(child))
+ for_each_sibling_event(sibling, event) {
+ if (!__pmu_filter_match(sibling))
return 0;
}
@@ -1805,6 +1995,13 @@ event_sched_out(struct perf_event *event,
if (event->state != PERF_EVENT_STATE_ACTIVE)
return;
+ /*
+ * Asymmetry; we only schedule events _IN_ through ctx_sched_in(), but
+ * we can schedule events _OUT_ individually through things like
+ * __perf_remove_from_context().
+ */
+ list_del_init(&event->active_list);
+
perf_pmu_disable(event->pmu);
event->pmu->del(event, 0);
@@ -1845,7 +2042,7 @@ group_sched_out(struct perf_event *group_event,
/*
* Schedule out siblings (if any):
*/
- list_for_each_entry(event, &group_event->sibling_list, group_entry)
+ for_each_sibling_event(event, group_event)
event_sched_out(event, cpuctx, ctx);
perf_pmu_enable(ctx->pmu);
@@ -2124,7 +2321,7 @@ group_sched_in(struct perf_event *group_event,
/*
* Schedule in siblings as one group (if any):
*/
- list_for_each_entry(event, &group_event->sibling_list, group_entry) {
+ for_each_sibling_event(event, group_event) {
if (event_sched_in(event, cpuctx, ctx)) {
partial_group = event;
goto group_error;
@@ -2140,7 +2337,7 @@ group_error:
* partial group before returning:
* The events up to the failed event are scheduled out normally.
*/
- list_for_each_entry(event, &group_event->sibling_list, group_entry) {
+ for_each_sibling_event(event, group_event) {
if (event == partial_group)
break;
@@ -2317,6 +2514,18 @@ static int __perf_install_in_context(void *info)
raw_spin_lock(&task_ctx->lock);
}
+#ifdef CONFIG_CGROUP_PERF
+ if (is_cgroup_event(event)) {
+ /*
+ * If the current cgroup doesn't match the event's
+ * cgroup, we should not try to schedule it.
+ */
+ struct perf_cgroup *cgrp = perf_cgroup_from_task(current, ctx);
+ reprogram = cgroup_is_descendant(cgrp->css.cgroup,
+ event->cgrp->css.cgroup);
+ }
+#endif
+
if (reprogram) {
ctx_sched_out(ctx, cpuctx, EVENT_TIME);
add_event_to_ctx(event, ctx);
@@ -2650,12 +2859,47 @@ int perf_event_refresh(struct perf_event *event, int refresh)
}
EXPORT_SYMBOL_GPL(perf_event_refresh);
+static int perf_event_modify_breakpoint(struct perf_event *bp,
+ struct perf_event_attr *attr)
+{
+ int err;
+
+ _perf_event_disable(bp);
+
+ err = modify_user_hw_breakpoint_check(bp, attr, true);
+ if (err) {
+ if (!bp->attr.disabled)
+ _perf_event_enable(bp);
+
+ return err;
+ }
+
+ if (!attr->disabled)
+ _perf_event_enable(bp);
+ return 0;
+}
+
+static int perf_event_modify_attr(struct perf_event *event,
+ struct perf_event_attr *attr)
+{
+ if (event->attr.type != attr->type)
+ return -EINVAL;
+
+ switch (event->attr.type) {
+ case PERF_TYPE_BREAKPOINT:
+ return perf_event_modify_breakpoint(event, attr);
+ default:
+ /* Place holder for future additions. */
+ return -EOPNOTSUPP;
+ }
+}
+
static void ctx_sched_out(struct perf_event_context *ctx,
struct perf_cpu_context *cpuctx,
enum event_type_t event_type)
{
+ struct perf_event *event, *tmp;
int is_active = ctx->is_active;
- struct perf_event *event;
lockdep_assert_held(&ctx->lock);
@@ -2702,12 +2946,12 @@ static void ctx_sched_out(struct perf_event_context *ctx,
perf_pmu_disable(ctx->pmu);
if (is_active & EVENT_PINNED) {
- list_for_each_entry(event, &ctx->pinned_groups, group_entry)
+ list_for_each_entry_safe(event, tmp, &ctx->pinned_active, active_list)
group_sched_out(event, cpuctx, ctx);
}
if (is_active & EVENT_FLEXIBLE) {
- list_for_each_entry(event, &ctx->flexible_groups, group_entry)
+ list_for_each_entry_safe(event, tmp, &ctx->flexible_active, active_list)
group_sched_out(event, cpuctx, ctx);
}
perf_pmu_enable(ctx->pmu);
@@ -2994,53 +3238,116 @@ static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx,
ctx_sched_out(&cpuctx->ctx, cpuctx, event_type);
}
-static void
-ctx_pinned_sched_in(struct perf_event_context *ctx,
- struct perf_cpu_context *cpuctx)
+static int visit_groups_merge(struct perf_event_groups *groups, int cpu,
+ int (*func)(struct perf_event *, void *), void *data)
{
- struct perf_event *event;
+ struct perf_event **evt, *evt1, *evt2;
+ int ret;
- list_for_each_entry(event, &ctx->pinned_groups, group_entry) {
- if (event->state <= PERF_EVENT_STATE_OFF)
- continue;
- if (!event_filter_match(event))
- continue;
+ evt1 = perf_event_groups_first(groups, -1);
+ evt2 = perf_event_groups_first(groups, cpu);
+
+ while (evt1 || evt2) {
+ if (evt1 && evt2) {
+ if (evt1->group_index < evt2->group_index)
+ evt = &evt1;
+ else
+ evt = &evt2;
+ } else if (evt1) {
+ evt = &evt1;
+ } else {
+ evt = &evt2;
+ }
- if (group_can_go_on(event, cpuctx, 1))
- group_sched_in(event, cpuctx, ctx);
+ ret = func(*evt, data);
+ if (ret)
+ return ret;
- /*
- * If this pinned group hasn't been scheduled,
- * put it in error state.
- */
- if (event->state == PERF_EVENT_STATE_INACTIVE)
- perf_event_set_state(event, PERF_EVENT_STATE_ERROR);
+ *evt = perf_event_groups_next(*evt);
+ }
+
+ return 0;
+}
+
+struct sched_in_data {
+ struct perf_event_context *ctx;
+ struct perf_cpu_context *cpuctx;
+ int can_add_hw;
+};
+
+static int pinned_sched_in(struct perf_event *event, void *data)
+{
+ struct sched_in_data *sid = data;
+
+ if (event->state <= PERF_EVENT_STATE_OFF)
+ return 0;
+
+ if (!event_filter_match(event))
+ return 0;
+
+ if (group_can_go_on(event, sid->cpuctx, sid->can_add_hw)) {
+ if (!group_sched_in(event, sid->cpuctx, sid->ctx))
+ list_add_tail(&event->active_list, &sid->ctx->pinned_active);
+ }
+
+ /*
+ * If this pinned group hasn't been scheduled,
+ * put it in error state.
+ */
+ if (event->state == PERF_EVENT_STATE_INACTIVE)
+ perf_event_set_state(event, PERF_EVENT_STATE_ERROR);
+
+ return 0;
+}
+
+static int flexible_sched_in(struct perf_event *event, void *data)
+{
+ struct sched_in_data *sid = data;
+
+ if (event->state <= PERF_EVENT_STATE_OFF)
+ return 0;
+
+ if (!event_filter_match(event))
+ return 0;
+
+ if (group_can_go_on(event, sid->cpuctx, sid->can_add_hw)) {
+ if (!group_sched_in(event, sid->cpuctx, sid->ctx))
+ list_add_tail(&event->active_list, &sid->ctx->flexible_active);
+ else
+ sid->can_add_hw = 0;
}
+
+ return 0;
+}
+
+static void
+ctx_pinned_sched_in(struct perf_event_context *ctx,
+ struct perf_cpu_context *cpuctx)
+{
+ struct sched_in_data sid = {
+ .ctx = ctx,
+ .cpuctx = cpuctx,
+ .can_add_hw = 1,
+ };
+
+ visit_groups_merge(&ctx->pinned_groups,
+ smp_processor_id(),
+ pinned_sched_in, &sid);
}
static void
ctx_flexible_sched_in(struct perf_event_context *ctx,
struct perf_cpu_context *cpuctx)
{
- struct perf_event *event;
- int can_add_hw = 1;
-
- list_for_each_entry(event, &ctx->flexible_groups, group_entry) {
- /* Ignore events in OFF or ERROR state */
- if (event->state <= PERF_EVENT_STATE_OFF)
- continue;
- /*
- * Listen to the 'cpu' scheduling filter constraint
- * of events:
- */
- if (!event_filter_match(event))
- continue;
+ struct sched_in_data sid = {
+ .ctx = ctx,
+ .cpuctx = cpuctx,
+ .can_add_hw = 1,
+ };
- if (group_can_go_on(event, cpuctx, can_add_hw)) {
- if (group_sched_in(event, cpuctx, ctx))
- can_add_hw = 0;
- }
- }
+ visit_groups_merge(&ctx->flexible_groups,
+ smp_processor_id(),
+ flexible_sched_in, &sid);
}
static void
@@ -3121,7 +3428,7 @@ static void perf_event_context_sched_in(struct perf_event_context *ctx,
* However, if task's ctx is not carrying any pinned
* events, no need to flip the cpuctx's events around.
*/
- if (!list_empty(&ctx->pinned_groups))
+ if (!RB_EMPTY_ROOT(&ctx->pinned_groups.tree))
cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
perf_event_sched_in(cpuctx, ctx, task);
perf_pmu_enable(ctx->pmu);
@@ -3350,55 +3657,81 @@ static void perf_adjust_freq_unthr_context(struct perf_event_context *ctx,
}
/*
- * Round-robin a context's events:
+ * Move @event to the tail of the @ctx's elegible events.
*/
-static void rotate_ctx(struct perf_event_context *ctx)
+static void rotate_ctx(struct perf_event_context *ctx, struct perf_event *event)
{
/*
* Rotate the first entry last of non-pinned groups. Rotation might be
* disabled by the inheritance code.
*/
- if (!ctx->rotate_disable)
- list_rotate_left(&ctx->flexible_groups);
+ if (ctx->rotate_disable)
+ return;
+
+ perf_event_groups_delete(&ctx->flexible_groups, event);
+ perf_event_groups_insert(&ctx->flexible_groups, event);
+}
+
+static inline struct perf_event *
+ctx_first_active(struct perf_event_context *ctx)
+{
+ return list_first_entry_or_null(&ctx->flexible_active,
+ struct perf_event, active_list);
}
-static int perf_rotate_context(struct perf_cpu_context *cpuctx)
+static bool perf_rotate_context(struct perf_cpu_context *cpuctx)
{
+ struct perf_event *cpu_event = NULL, *task_event = NULL;
+ bool cpu_rotate = false, task_rotate = false;
struct perf_event_context *ctx = NULL;
- int rotate = 0;
+
+ /*
+ * Since we run this from IRQ context, nobody can install new
+ * events, thus the event count values are stable.
+ */
if (cpuctx->ctx.nr_events) {
if (cpuctx->ctx.nr_events != cpuctx->ctx.nr_active)
- rotate = 1;
+ cpu_rotate = true;
}
ctx = cpuctx->task_ctx;
if (ctx && ctx->nr_events) {
if (ctx->nr_events != ctx->nr_active)
- rotate = 1;
+ task_rotate = true;
}
- if (!rotate)
- goto done;
+ if (!(cpu_rotate || task_rotate))
+ return false;
perf_ctx_lock(cpuctx, cpuctx->task_ctx);
perf_pmu_disable(cpuctx->ctx.pmu);
- cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
- if (ctx)
+ if (task_rotate)
+ task_event = ctx_first_active(ctx);
+ if (cpu_rotate)
+ cpu_event = ctx_first_active(&cpuctx->ctx);
+
+ /*
+ * As per the order given at ctx_resched() first 'pop' task flexible
+ * and then, if needed CPU flexible.
+ */
+ if (task_event || (ctx && cpu_event))
ctx_sched_out(ctx, cpuctx, EVENT_FLEXIBLE);
+ if (cpu_event)
+ cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
- rotate_ctx(&cpuctx->ctx);
- if (ctx)
- rotate_ctx(ctx);
+ if (task_event)
+ rotate_ctx(ctx, task_event);
+ if (cpu_event)
+ rotate_ctx(&cpuctx->ctx, cpu_event);
perf_event_sched_in(cpuctx, ctx, current);
perf_pmu_enable(cpuctx->ctx.pmu);
perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
-done:
- return rotate;
+ return true;
}
void perf_event_task_tick(void)
@@ -3543,7 +3876,7 @@ static void __perf_event_read(void *info)
pmu->read(event);
- list_for_each_entry(sub, &event->sibling_list, group_entry) {
+ for_each_sibling_event(sub, event) {
if (sub->state == PERF_EVENT_STATE_ACTIVE) {
/*
* Use sibling's PMU rather than @event's since
@@ -3717,9 +4050,11 @@ static void __perf_event_init_context(struct perf_event_context *ctx)
raw_spin_lock_init(&ctx->lock);
mutex_init(&ctx->mutex);
INIT_LIST_HEAD(&ctx->active_ctx_list);
- INIT_LIST_HEAD(&ctx->pinned_groups);
- INIT_LIST_HEAD(&ctx->flexible_groups);
+ perf_event_groups_init(&ctx->pinned_groups);
+ perf_event_groups_init(&ctx->flexible_groups);
INIT_LIST_HEAD(&ctx->event_list);
+ INIT_LIST_HEAD(&ctx->pinned_active);
+ INIT_LIST_HEAD(&ctx->flexible_active);
atomic_set(&ctx->refcount, 1);
}
@@ -4112,6 +4447,9 @@ static void _free_event(struct perf_event *event)
if (event->ctx)
put_ctx(event->ctx);
+ if (event->hw.target)
+ put_task_struct(event->hw.target);
+
exclusive_event_destroy(event);
module_put(event->pmu->module);
@@ -4389,7 +4727,7 @@ static int __perf_read_group_add(struct perf_event *leader,
if (read_format & PERF_FORMAT_ID)
values[n++] = primary_event_id(leader);
- list_for_each_entry(sub, &leader->sibling_list, group_entry) {
+ for_each_sibling_event(sub, leader) {
values[n++] += perf_event_count(sub);
if (read_format & PERF_FORMAT_ID)
values[n++] = primary_event_id(sub);
@@ -4583,7 +4921,7 @@ static void perf_event_for_each(struct perf_event *event,
event = event->group_leader;
perf_event_for_each_child(event, func);
- list_for_each_entry(sibling, &event->sibling_list, group_entry)
+ for_each_sibling_event(sibling, event)
perf_event_for_each_child(sibling, func);
}
@@ -4665,6 +5003,8 @@ static int perf_event_set_output(struct perf_event *event,
struct perf_event *output_event);
static int perf_event_set_filter(struct perf_event *event, void __user *arg);
static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd);
+static int perf_copy_attr(struct perf_event_attr __user *uattr,
+ struct perf_event_attr *attr);
static long _perf_ioctl(struct perf_event *event, unsigned int cmd, unsigned long arg)
{
@@ -4737,6 +5077,17 @@ static long _perf_ioctl(struct perf_event *event, unsigned int cmd, unsigned lon
case PERF_EVENT_IOC_QUERY_BPF:
return perf_event_query_prog_array(event, (void __user *)arg);
+
+ case PERF_EVENT_IOC_MODIFY_ATTRIBUTES: {
+ struct perf_event_attr new_attr;
+ int err = perf_copy_attr((struct perf_event_attr __user *)arg,
+ &new_attr);
+
+ if (err)
+ return err;
+
+ return perf_event_modify_attr(event, &new_attr);
+ }
default:
return -ENOTTY;
}
@@ -4769,6 +5120,8 @@ static long perf_compat_ioctl(struct file *file, unsigned int cmd,
switch (_IOC_NR(cmd)) {
case _IOC_NR(PERF_EVENT_IOC_SET_FILTER):
case _IOC_NR(PERF_EVENT_IOC_ID):
+ case _IOC_NR(PERF_EVENT_IOC_QUERY_BPF):
+ case _IOC_NR(PERF_EVENT_IOC_MODIFY_ATTRIBUTES):
/* Fix up pointer size (usually 4 -> 8 in 32-on-64-bit case */
if (_IOC_SIZE(cmd) == sizeof(compat_uptr_t)) {
cmd &= ~IOCSIZE_MASK;
@@ -5732,7 +6085,8 @@ static void perf_output_read_group(struct perf_output_handle *handle,
if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
values[n++] = running;
- if (leader != event)
+ if ((leader != event) &&
+ (leader->state == PERF_EVENT_STATE_ACTIVE))
leader->pmu->read(leader);
values[n++] = perf_event_count(leader);
@@ -5741,7 +6095,7 @@ static void perf_output_read_group(struct perf_output_handle *handle,
__output_copy(handle, values, n * sizeof(u64));
- list_for_each_entry(sub, &leader->sibling_list, group_entry) {
+ for_each_sibling_event(sub, leader) {
n = 0;
if ((sub != event) &&
@@ -6316,7 +6670,7 @@ static void perf_event_addr_filters_exec(struct perf_event *event, void *data)
raw_spin_lock_irqsave(&ifh->lock, flags);
list_for_each_entry(filter, &ifh->list, entry) {
- if (filter->inode) {
+ if (filter->path.dentry) {
event->addr_filters_offs[count] = 0;
restart++;
}
@@ -6981,7 +7335,7 @@ static bool perf_addr_filter_match(struct perf_addr_filter *filter,
struct file *file, unsigned long offset,
unsigned long size)
{
- if (filter->inode != file_inode(file))
+ if (d_inode(filter->path.dentry) != file_inode(file))
return false;
if (filter->offset > offset + size)
@@ -7235,6 +7589,10 @@ static void perf_event_switch(struct task_struct *task,
},
};
+ if (!sched_in && task->state == TASK_RUNNING)
+ switch_event.event_id.header.misc |=
+ PERF_RECORD_MISC_SWITCH_OUT_PREEMPT;
+
perf_iterate_sb(perf_event_switch_output,
&switch_event,
NULL);
@@ -7998,9 +8356,127 @@ static struct pmu perf_tracepoint = {
.read = perf_swevent_read,
};
+#if defined(CONFIG_KPROBE_EVENTS) || defined(CONFIG_UPROBE_EVENTS)
+/*
+ * Flags in config, used by dynamic PMU kprobe and uprobe
+ * The flags should match following PMU_FORMAT_ATTR().
+ *
+ * PERF_PROBE_CONFIG_IS_RETPROBE if set, create kretprobe/uretprobe
+ * if not set, create kprobe/uprobe
+ */
+enum perf_probe_config {
+ PERF_PROBE_CONFIG_IS_RETPROBE = 1U << 0, /* [k,u]retprobe */
+};
+
+PMU_FORMAT_ATTR(retprobe, "config:0");
+
+static struct attribute *probe_attrs[] = {
+ &format_attr_retprobe.attr,
+ NULL,
+};
+
+static struct attribute_group probe_format_group = {
+ .name = "format",
+ .attrs = probe_attrs,
+};
+
+static const struct attribute_group *probe_attr_groups[] = {
+ &probe_format_group,
+ NULL,
+};
+#endif
+
+#ifdef CONFIG_KPROBE_EVENTS
+static int perf_kprobe_event_init(struct perf_event *event);
+static struct pmu perf_kprobe = {
+ .task_ctx_nr = perf_sw_context,
+ .event_init = perf_kprobe_event_init,
+ .add = perf_trace_add,
+ .del = perf_trace_del,
+ .start = perf_swevent_start,
+ .stop = perf_swevent_stop,
+ .read = perf_swevent_read,
+ .attr_groups = probe_attr_groups,
+};
+
+static int perf_kprobe_event_init(struct perf_event *event)
+{
+ int err;
+ bool is_retprobe;
+
+ if (event->attr.type != perf_kprobe.type)
+ return -ENOENT;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EACCES;
+
+ /*
+ * no branch sampling for probe events
+ */
+ if (has_branch_stack(event))
+ return -EOPNOTSUPP;
+
+ is_retprobe = event->attr.config & PERF_PROBE_CONFIG_IS_RETPROBE;
+ err = perf_kprobe_init(event, is_retprobe);
+ if (err)
+ return err;
+
+ event->destroy = perf_kprobe_destroy;
+
+ return 0;
+}
+#endif /* CONFIG_KPROBE_EVENTS */
+
+#ifdef CONFIG_UPROBE_EVENTS
+static int perf_uprobe_event_init(struct perf_event *event);
+static struct pmu perf_uprobe = {
+ .task_ctx_nr = perf_sw_context,
+ .event_init = perf_uprobe_event_init,
+ .add = perf_trace_add,
+ .del = perf_trace_del,
+ .start = perf_swevent_start,
+ .stop = perf_swevent_stop,
+ .read = perf_swevent_read,
+ .attr_groups = probe_attr_groups,
+};
+
+static int perf_uprobe_event_init(struct perf_event *event)
+{
+ int err;
+ bool is_retprobe;
+
+ if (event->attr.type != perf_uprobe.type)
+ return -ENOENT;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EACCES;
+
+ /*
+ * no branch sampling for probe events
+ */
+ if (has_branch_stack(event))
+ return -EOPNOTSUPP;
+
+ is_retprobe = event->attr.config & PERF_PROBE_CONFIG_IS_RETPROBE;
+ err = perf_uprobe_init(event, is_retprobe);
+ if (err)
+ return err;
+
+ event->destroy = perf_uprobe_destroy;
+
+ return 0;
+}
+#endif /* CONFIG_UPROBE_EVENTS */
+
static inline void perf_tp_register(void)
{
perf_pmu_register(&perf_tracepoint, "tracepoint", PERF_TYPE_TRACEPOINT);
+#ifdef CONFIG_KPROBE_EVENTS
+ perf_pmu_register(&perf_kprobe, "kprobe", -1);
+#endif
+#ifdef CONFIG_UPROBE_EVENTS
+ perf_pmu_register(&perf_uprobe, "uprobe", -1);
+#endif
}
static void perf_event_free_filter(struct perf_event *event)
@@ -8077,13 +8553,32 @@ static void perf_event_free_bpf_handler(struct perf_event *event)
}
#endif
+/*
+ * returns true if the event is a tracepoint, or a kprobe/upprobe created
+ * with perf_event_open()
+ */
+static inline bool perf_event_is_tracing(struct perf_event *event)
+{
+ if (event->pmu == &perf_tracepoint)
+ return true;
+#ifdef CONFIG_KPROBE_EVENTS
+ if (event->pmu == &perf_kprobe)
+ return true;
+#endif
+#ifdef CONFIG_UPROBE_EVENTS
+ if (event->pmu == &perf_uprobe)
+ return true;
+#endif
+ return false;
+}
+
static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd)
{
bool is_kprobe, is_tracepoint, is_syscall_tp;
struct bpf_prog *prog;
int ret;
- if (event->attr.type != PERF_TYPE_TRACEPOINT)
+ if (!perf_event_is_tracing(event))
return perf_event_set_bpf_handler(event, prog_fd);
is_kprobe = event->tp_event->flags & TRACE_EVENT_FL_UKPROBE;
@@ -8129,7 +8624,7 @@ static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd)
static void perf_event_free_bpf_prog(struct perf_event *event)
{
- if (event->attr.type != PERF_TYPE_TRACEPOINT) {
+ if (!perf_event_is_tracing(event)) {
perf_event_free_bpf_handler(event);
return;
}
@@ -8193,8 +8688,7 @@ static void free_filters_list(struct list_head *filters)
struct perf_addr_filter *filter, *iter;
list_for_each_entry_safe(filter, iter, filters, entry) {
- if (filter->inode)
- iput(filter->inode);
+ path_put(&filter->path);
list_del(&filter->entry);
kfree(filter);
}
@@ -8291,7 +8785,7 @@ static void perf_event_addr_filters_apply(struct perf_event *event)
* Adjust base offset if the filter is associated to a binary
* that needs to be mapped:
*/
- if (filter->inode)
+ if (filter->path.dentry)
event->addr_filters_offs[count] =
perf_addr_filter_apply(filter, mm);
@@ -8325,7 +8819,8 @@ restart:
* * for kernel addresses: <start address>[/<size>]
* * for object files: <start address>[/<size>]@</path/to/object/file>
*
- * if <size> is not specified, the range is treated as a single address.
+ * if <size> is not specified or is zero, the range is treated as a single
+ * address; not valid for ACTION=="filter".
*/
enum {
IF_ACT_NONE = -1,
@@ -8364,7 +8859,6 @@ perf_event_parse_addr_filter(struct perf_event *event, char *fstr,
{
struct perf_addr_filter *filter = NULL;
char *start, *orig, *filename = NULL;
- struct path path;
substring_t args[MAX_OPT_ARGS];
int state = IF_STATE_ACTION, token;
unsigned int kernel = 0;
@@ -8375,6 +8869,11 @@ perf_event_parse_addr_filter(struct perf_event *event, char *fstr,
return -ENOMEM;
while ((start = strsep(&fstr, " ,\n")) != NULL) {
+ static const enum perf_addr_filter_action_t actions[] = {
+ [IF_ACT_FILTER] = PERF_ADDR_FILTER_ACTION_FILTER,
+ [IF_ACT_START] = PERF_ADDR_FILTER_ACTION_START,
+ [IF_ACT_STOP] = PERF_ADDR_FILTER_ACTION_STOP,
+ };
ret = -EINVAL;
if (!*start)
@@ -8391,12 +8890,11 @@ perf_event_parse_addr_filter(struct perf_event *event, char *fstr,
switch (token) {
case IF_ACT_FILTER:
case IF_ACT_START:
- filter->filter = 1;
-
case IF_ACT_STOP:
if (state != IF_STATE_ACTION)
goto fail;
+ filter->action = actions[token];
state = IF_STATE_SOURCE;
break;
@@ -8409,15 +8907,12 @@ perf_event_parse_addr_filter(struct perf_event *event, char *fstr,
if (state != IF_STATE_SOURCE)
goto fail;
- if (token == IF_SRC_FILE || token == IF_SRC_KERNEL)
- filter->range = 1;
-
*args[0].to = 0;
ret = kstrtoul(args[0].from, 0, &filter->offset);
if (ret)
goto fail;
- if (filter->range) {
+ if (token == IF_SRC_KERNEL || token == IF_SRC_FILE) {
*args[1].to = 0;
ret = kstrtoul(args[1].from, 0, &filter->size);
if (ret)
@@ -8425,7 +8920,7 @@ perf_event_parse_addr_filter(struct perf_event *event, char *fstr,
}
if (token == IF_SRC_FILE || token == IF_SRC_FILEADDR) {
- int fpos = filter->range ? 2 : 1;
+ int fpos = token == IF_SRC_FILE ? 2 : 1;
filename = match_strdup(&args[fpos]);
if (!filename) {
@@ -8451,6 +8946,14 @@ perf_event_parse_addr_filter(struct perf_event *event, char *fstr,
if (kernel && event->attr.exclude_kernel)
goto fail;
+ /*
+ * ACTION "filter" must have a non-zero length region
+ * specified.
+ */
+ if (filter->action == PERF_ADDR_FILTER_ACTION_FILTER &&
+ !filter->size)
+ goto fail;
+
if (!kernel) {
if (!filename)
goto fail;
@@ -8468,19 +8971,18 @@ perf_event_parse_addr_filter(struct perf_event *event, char *fstr,
goto fail_free_name;
/* look up the path and grab its inode */
- ret = kern_path(filename, LOOKUP_FOLLOW, &path);
+ ret = kern_path(filename, LOOKUP_FOLLOW,
+ &filter->path);
if (ret)
goto fail_free_name;
- filter->inode = igrab(d_inode(path.dentry));
- path_put(&path);
kfree(filename);
filename = NULL;
ret = -EINVAL;
- if (!filter->inode ||
- !S_ISREG(filter->inode->i_mode))
- /* free_filters_list() will iput() */
+ if (!filter->path.dentry ||
+ !S_ISREG(d_inode(filter->path.dentry)
+ ->i_mode))
goto fail;
event->addr_filters.nr_file_filters++;
@@ -8548,47 +9050,36 @@ fail_clear_files:
return ret;
}
-static int
-perf_tracepoint_set_filter(struct perf_event *event, char *filter_str)
-{
- struct perf_event_context *ctx = event->ctx;
- int ret;
-
- /*
- * Beware, here be dragons!!
- *
- * the tracepoint muck will deadlock against ctx->mutex, but the tracepoint
- * stuff does not actually need it. So temporarily drop ctx->mutex. As per
- * perf_event_ctx_lock() we already have a reference on ctx.
- *
- * This can result in event getting moved to a different ctx, but that
- * does not affect the tracepoint state.
- */
- mutex_unlock(&ctx->mutex);
- ret = ftrace_profile_set_filter(event, event->attr.config, filter_str);
- mutex_lock(&ctx->mutex);
-
- return ret;
-}
-
static int perf_event_set_filter(struct perf_event *event, void __user *arg)
{
- char *filter_str;
int ret = -EINVAL;
-
- if ((event->attr.type != PERF_TYPE_TRACEPOINT ||
- !IS_ENABLED(CONFIG_EVENT_TRACING)) &&
- !has_addr_filter(event))
- return -EINVAL;
+ char *filter_str;
filter_str = strndup_user(arg, PAGE_SIZE);
if (IS_ERR(filter_str))
return PTR_ERR(filter_str);
- if (IS_ENABLED(CONFIG_EVENT_TRACING) &&
- event->attr.type == PERF_TYPE_TRACEPOINT)
- ret = perf_tracepoint_set_filter(event, filter_str);
- else if (has_addr_filter(event))
+#ifdef CONFIG_EVENT_TRACING
+ if (perf_event_is_tracing(event)) {
+ struct perf_event_context *ctx = event->ctx;
+
+ /*
+ * Beware, here be dragons!!
+ *
+ * the tracepoint muck will deadlock against ctx->mutex, but
+ * the tracepoint stuff does not actually need it. So
+ * temporarily drop ctx->mutex. As per perf_event_ctx_lock() we
+ * already have a reference on ctx.
+ *
+ * This can result in event getting moved to a different ctx,
+ * but that does not affect the tracepoint state.
+ */
+ mutex_unlock(&ctx->mutex);
+ ret = ftrace_profile_set_filter(event, event->attr.config, filter_str);
+ mutex_lock(&ctx->mutex);
+ } else
+#endif
+ if (has_addr_filter(event))
ret = perf_event_set_addr_filter(event, filter_str);
kfree(filter_str);
@@ -9441,9 +9932,10 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
mutex_init(&event->child_mutex);
INIT_LIST_HEAD(&event->child_list);
- INIT_LIST_HEAD(&event->group_entry);
INIT_LIST_HEAD(&event->event_entry);
INIT_LIST_HEAD(&event->sibling_list);
+ INIT_LIST_HEAD(&event->active_list);
+ init_event_group(event);
INIT_LIST_HEAD(&event->rb_entry);
INIT_LIST_HEAD(&event->active_entry);
INIT_LIST_HEAD(&event->addr_filters.list);
@@ -9477,6 +9969,7 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
* and we cannot use the ctx information because we need the
* pmu before we get a ctx.
*/
+ get_task_struct(task);
event->hw.target = task;
}
@@ -9592,6 +10085,8 @@ err_ns:
perf_detach_cgroup(event);
if (event->ns)
put_pid_ns(event->ns);
+ if (event->hw.target)
+ put_task_struct(event->hw.target);
kfree(event);
return ERR_PTR(err);
@@ -9713,11 +10208,14 @@ static int perf_copy_attr(struct perf_event_attr __user *uattr,
* __u16 sample size limit.
*/
if (attr->sample_stack_user >= USHRT_MAX)
- ret = -EINVAL;
+ return -EINVAL;
else if (!IS_ALIGNED(attr->sample_stack_user, sizeof(u64)))
- ret = -EINVAL;
+ return -EINVAL;
}
+ if (!attr->sample_max_stack)
+ attr->sample_max_stack = sysctl_perf_event_max_stack;
+
if (attr->sample_type & PERF_SAMPLE_REGS_INTR)
ret = perf_reg_validate(attr->sample_regs_intr);
out:
@@ -9931,9 +10429,6 @@ SYSCALL_DEFINE5(perf_event_open,
perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN))
return -EACCES;
- if (!attr.sample_max_stack)
- attr.sample_max_stack = sysctl_perf_event_max_stack;
-
/*
* In cgroup mode, the pid argument is used to pass the fd
* opened to the cgroup directory in cgroupfs. The cpu argument
@@ -10025,19 +10520,20 @@ SYSCALL_DEFINE5(perf_event_open,
if (pmu->task_ctx_nr == perf_sw_context)
event->event_caps |= PERF_EV_CAP_SOFTWARE;
- if (group_leader &&
- (is_software_event(event) != is_software_event(group_leader))) {
- if (is_software_event(event)) {
+ if (group_leader) {
+ if (is_software_event(event) &&
+ !in_software_context(group_leader)) {
/*
- * If event and group_leader are not both a software
- * event, and event is, then group leader is not.
+ * If the event is a sw event, but the group_leader
+ * is on hw context.
*
- * Allow the addition of software events to !software
- * groups, this is safe because software events never
- * fail to schedule.
+ * Allow the addition of software events to hw
+ * groups, this is safe because software events
+ * never fail to schedule.
*/
- pmu = group_leader->pmu;
- } else if (is_software_event(group_leader) &&
+ pmu = group_leader->ctx->pmu;
+ } else if (!is_software_event(event) &&
+ is_software_event(group_leader) &&
(group_leader->group_caps & PERF_EV_CAP_SOFTWARE)) {
/*
* In case the group is a pure software group, and we
@@ -10207,8 +10703,7 @@ SYSCALL_DEFINE5(perf_event_open,
perf_remove_from_context(group_leader, 0);
put_ctx(gctx);
- list_for_each_entry(sibling, &group_leader->sibling_list,
- group_entry) {
+ for_each_sibling_event(sibling, group_leader) {
perf_remove_from_context(sibling, 0);
put_ctx(gctx);
}
@@ -10229,8 +10724,7 @@ SYSCALL_DEFINE5(perf_event_open,
* By installing siblings first we NO-OP because they're not
* reachable through the group lists.
*/
- list_for_each_entry(sibling, &group_leader->sibling_list,
- group_entry) {
+ for_each_sibling_event(sibling, group_leader) {
perf_event__state_init(sibling);
perf_install_in_context(ctx, sibling, sibling->cpu);
get_ctx(ctx);
@@ -10718,6 +11212,14 @@ struct file *perf_event_get(unsigned int fd)
return file;
}
+const struct perf_event *perf_get_event(struct file *file)
+{
+ if (file->f_op != &perf_fops)
+ return ERR_PTR(-EINVAL);
+
+ return file->private_data;
+}
+
const struct perf_event_attr *perf_event_attrs(struct perf_event *event)
{
if (!event)
@@ -10869,7 +11371,7 @@ static int inherit_group(struct perf_event *parent_event,
* case inherit_event() will create individual events, similar to what
* perf_group_detach() would do anyway.
*/
- list_for_each_entry(sub, &parent_event->sibling_list, group_entry) {
+ for_each_sibling_event(sub, parent_event) {
child_ctr = inherit_event(sub, parent, parent_ctx,
child, leader, child_ctx);
if (IS_ERR(child_ctr))
@@ -10968,7 +11470,7 @@ static int perf_event_init_context(struct task_struct *child, int ctxn)
* We dont have to disable NMIs - we are only looking at
* the list, not manipulating it:
*/
- list_for_each_entry(event, &parent_ctx->pinned_groups, group_entry) {
+ perf_event_groups_for_each(event, &parent_ctx->pinned_groups) {
ret = inherit_task_group(event, parent, parent_ctx,
child, ctxn, &inherited_all);
if (ret)
@@ -10984,7 +11486,7 @@ static int perf_event_init_context(struct task_struct *child, int ctxn)
parent_ctx->rotate_disable = 1;
raw_spin_unlock_irqrestore(&parent_ctx->lock, flags);
- list_for_each_entry(event, &parent_ctx->flexible_groups, group_entry) {
+ perf_event_groups_for_each(event, &parent_ctx->flexible_groups) {
ret = inherit_task_group(event, parent, parent_ctx,
child, ctxn, &inherited_all);
if (ret)
diff --git a/kernel/events/hw_breakpoint.c b/kernel/events/hw_breakpoint.c
index 3f8cb1e14588..6e28d2866be5 100644
--- a/kernel/events/hw_breakpoint.c
+++ b/kernel/events/hw_breakpoint.c
@@ -44,6 +44,7 @@
#include <linux/list.h>
#include <linux/cpu.h>
#include <linux/smp.h>
+#include <linux/bug.h>
#include <linux/hw_breakpoint.h>
/*
@@ -85,9 +86,9 @@ __weak int hw_breakpoint_weight(struct perf_event *bp)
return 1;
}
-static inline enum bp_type_idx find_slot_idx(struct perf_event *bp)
+static inline enum bp_type_idx find_slot_idx(u64 bp_type)
{
- if (bp->attr.bp_type & HW_BREAKPOINT_RW)
+ if (bp_type & HW_BREAKPOINT_RW)
return TYPE_DATA;
return TYPE_INST;
@@ -122,7 +123,7 @@ static int task_bp_pinned(int cpu, struct perf_event *bp, enum bp_type_idx type)
list_for_each_entry(iter, &bp_task_head, hw.bp_list) {
if (iter->hw.target == tsk &&
- find_slot_idx(iter) == type &&
+ find_slot_idx(iter->attr.bp_type) == type &&
(iter->cpu < 0 || cpu == iter->cpu))
count += hw_breakpoint_weight(iter);
}
@@ -277,7 +278,7 @@ __weak void arch_unregister_hw_breakpoint(struct perf_event *bp)
* ((per_cpu(info->flexible, *) > 1) + max(per_cpu(info->cpu_pinned, *))
* + max(per_cpu(info->tsk_pinned, *))) < HBP_NUM
*/
-static int __reserve_bp_slot(struct perf_event *bp)
+static int __reserve_bp_slot(struct perf_event *bp, u64 bp_type)
{
struct bp_busy_slots slots = {0};
enum bp_type_idx type;
@@ -288,11 +289,11 @@ static int __reserve_bp_slot(struct perf_event *bp)
return -ENOMEM;
/* Basic checks */
- if (bp->attr.bp_type == HW_BREAKPOINT_EMPTY ||
- bp->attr.bp_type == HW_BREAKPOINT_INVALID)
+ if (bp_type == HW_BREAKPOINT_EMPTY ||
+ bp_type == HW_BREAKPOINT_INVALID)
return -EINVAL;
- type = find_slot_idx(bp);
+ type = find_slot_idx(bp_type);
weight = hw_breakpoint_weight(bp);
fetch_bp_busy_slots(&slots, bp, type);
@@ -317,19 +318,19 @@ int reserve_bp_slot(struct perf_event *bp)
mutex_lock(&nr_bp_mutex);
- ret = __reserve_bp_slot(bp);
+ ret = __reserve_bp_slot(bp, bp->attr.bp_type);
mutex_unlock(&nr_bp_mutex);
return ret;
}
-static void __release_bp_slot(struct perf_event *bp)
+static void __release_bp_slot(struct perf_event *bp, u64 bp_type)
{
enum bp_type_idx type;
int weight;
- type = find_slot_idx(bp);
+ type = find_slot_idx(bp_type);
weight = hw_breakpoint_weight(bp);
toggle_bp_slot(bp, false, type, weight);
}
@@ -339,11 +340,43 @@ void release_bp_slot(struct perf_event *bp)
mutex_lock(&nr_bp_mutex);
arch_unregister_hw_breakpoint(bp);
- __release_bp_slot(bp);
+ __release_bp_slot(bp, bp->attr.bp_type);
mutex_unlock(&nr_bp_mutex);
}
+static int __modify_bp_slot(struct perf_event *bp, u64 old_type)
+{
+ int err;
+
+ __release_bp_slot(bp, old_type);
+
+ err = __reserve_bp_slot(bp, bp->attr.bp_type);
+ if (err) {
+ /*
+ * Reserve the old_type slot back in case
+ * there's no space for the new type.
+ *
+ * This must succeed, because we just released
+ * the old_type slot in the __release_bp_slot
+ * call above. If not, something is broken.
+ */
+ WARN_ON(__reserve_bp_slot(bp, old_type));
+ }
+
+ return err;
+}
+
+static int modify_bp_slot(struct perf_event *bp, u64 old_type)
+{
+ int ret;
+
+ mutex_lock(&nr_bp_mutex);
+ ret = __modify_bp_slot(bp, old_type);
+ mutex_unlock(&nr_bp_mutex);
+ return ret;
+}
+
/*
* Allow the kernel debugger to reserve breakpoint slots without
* taking a lock using the dbg_* variant of for the reserve and
@@ -354,7 +387,7 @@ int dbg_reserve_bp_slot(struct perf_event *bp)
if (mutex_is_locked(&nr_bp_mutex))
return -1;
- return __reserve_bp_slot(bp);
+ return __reserve_bp_slot(bp, bp->attr.bp_type);
}
int dbg_release_bp_slot(struct perf_event *bp)
@@ -362,7 +395,7 @@ int dbg_release_bp_slot(struct perf_event *bp)
if (mutex_is_locked(&nr_bp_mutex))
return -1;
- __release_bp_slot(bp);
+ __release_bp_slot(bp, bp->attr.bp_type);
return 0;
}
@@ -423,20 +456,45 @@ register_user_hw_breakpoint(struct perf_event_attr *attr,
}
EXPORT_SYMBOL_GPL(register_user_hw_breakpoint);
+int
+modify_user_hw_breakpoint_check(struct perf_event *bp, struct perf_event_attr *attr,
+ bool check)
+{
+ u64 old_addr = bp->attr.bp_addr;
+ u64 old_len = bp->attr.bp_len;
+ int old_type = bp->attr.bp_type;
+ bool modify = attr->bp_type != old_type;
+ int err = 0;
+
+ bp->attr.bp_addr = attr->bp_addr;
+ bp->attr.bp_type = attr->bp_type;
+ bp->attr.bp_len = attr->bp_len;
+
+ if (check && memcmp(&bp->attr, attr, sizeof(*attr)))
+ return -EINVAL;
+
+ err = validate_hw_breakpoint(bp);
+ if (!err && modify)
+ err = modify_bp_slot(bp, old_type);
+
+ if (err) {
+ bp->attr.bp_addr = old_addr;
+ bp->attr.bp_type = old_type;
+ bp->attr.bp_len = old_len;
+ return err;
+ }
+
+ bp->attr.disabled = attr->disabled;
+ return 0;
+}
+
/**
* modify_user_hw_breakpoint - modify a user-space hardware breakpoint
* @bp: the breakpoint structure to modify
* @attr: new breakpoint attributes
- * @triggered: callback to trigger when we hit the breakpoint
- * @tsk: pointer to 'task_struct' of the process to which the address belongs
*/
int modify_user_hw_breakpoint(struct perf_event *bp, struct perf_event_attr *attr)
{
- u64 old_addr = bp->attr.bp_addr;
- u64 old_len = bp->attr.bp_len;
- int old_type = bp->attr.bp_type;
- int err = 0;
-
/*
* modify_user_hw_breakpoint can be invoked with IRQs disabled and hence it
* will not be possible to raise IPIs that invoke __perf_event_disable.
@@ -448,30 +506,14 @@ int modify_user_hw_breakpoint(struct perf_event *bp, struct perf_event_attr *att
else
perf_event_disable(bp);
- bp->attr.bp_addr = attr->bp_addr;
- bp->attr.bp_type = attr->bp_type;
- bp->attr.bp_len = attr->bp_len;
-
- if (attr->disabled)
- goto end;
+ if (!attr->disabled) {
+ int err = modify_user_hw_breakpoint_check(bp, attr, false);
- err = validate_hw_breakpoint(bp);
- if (!err)
+ if (err)
+ return err;
perf_event_enable(bp);
-
- if (err) {
- bp->attr.bp_addr = old_addr;
- bp->attr.bp_type = old_type;
- bp->attr.bp_len = old_len;
- if (!bp->attr.disabled)
- perf_event_enable(bp);
-
- return err;
+ bp->attr.disabled = 0;
}
-
-end:
- bp->attr.disabled = attr->disabled;
-
return 0;
}
EXPORT_SYMBOL_GPL(modify_user_hw_breakpoint);
diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c
index 6c6b3c48db71..1d8ca9ea9979 100644
--- a/kernel/events/ring_buffer.c
+++ b/kernel/events/ring_buffer.c
@@ -14,6 +14,7 @@
#include <linux/slab.h>
#include <linux/circ_buf.h>
#include <linux/poll.h>
+#include <linux/nospec.h>
#include "internal.h"
@@ -867,8 +868,10 @@ perf_mmap_to_page(struct ring_buffer *rb, unsigned long pgoff)
return NULL;
/* AUX space */
- if (pgoff >= rb->aux_pgoff)
- return virt_to_page(rb->aux_pages[pgoff - rb->aux_pgoff]);
+ if (pgoff >= rb->aux_pgoff) {
+ int aux_pgoff = array_index_nospec(pgoff - rb->aux_pgoff, rb->aux_nr_pages);
+ return virt_to_page(rb->aux_pages[aux_pgoff]);
+ }
}
return __perf_mmap_to_page(rb, pgoff);
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index ce6848e46e94..1725b902983f 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -491,7 +491,7 @@ static struct uprobe *alloc_uprobe(struct inode *inode, loff_t offset)
if (!uprobe)
return NULL;
- uprobe->inode = igrab(inode);
+ uprobe->inode = inode;
uprobe->offset = offset;
init_rwsem(&uprobe->register_rwsem);
init_rwsem(&uprobe->consumer_rwsem);
@@ -502,7 +502,6 @@ static struct uprobe *alloc_uprobe(struct inode *inode, loff_t offset)
if (cur_uprobe) {
kfree(uprobe);
uprobe = cur_uprobe;
- iput(inode);
}
return uprobe;
@@ -701,7 +700,6 @@ static void delete_uprobe(struct uprobe *uprobe)
rb_erase(&uprobe->rb_node, &uprobes_tree);
spin_unlock(&uprobes_treelock);
RB_CLEAR_NODE(&uprobe->rb_node); /* for uprobe_is_active() */
- iput(uprobe->inode);
put_uprobe(uprobe);
}
@@ -873,7 +871,8 @@ static void __uprobe_unregister(struct uprobe *uprobe, struct uprobe_consumer *u
* tuple). Creation refcount stops uprobe_unregister from freeing the
* @uprobe even before the register operation is complete. Creation
* refcount is released when the last @uc for the @uprobe
- * unregisters.
+ * unregisters. Caller of uprobe_register() is required to keep @inode
+ * (and the containing mount) referenced.
*
* Return errno if it cannot successully install probes
* else return 0 (success)
diff --git a/kernel/exec_domain.c b/kernel/exec_domain.c
index 0975b0268545..33f07c5f2515 100644
--- a/kernel/exec_domain.c
+++ b/kernel/exec_domain.c
@@ -19,7 +19,6 @@
#include <linux/syscalls.h>
#include <linux/sysctl.h>
#include <linux/types.h>
-#include <linux/fs_struct.h>
#ifdef CONFIG_PROC_FS
static int execdomains_proc_show(struct seq_file *m, void *v)
@@ -28,21 +27,9 @@ static int execdomains_proc_show(struct seq_file *m, void *v)
return 0;
}
-static int execdomains_proc_open(struct inode *inode, struct file *file)
-{
- return single_open(file, execdomains_proc_show, NULL);
-}
-
-static const struct file_operations execdomains_proc_fops = {
- .open = execdomains_proc_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = single_release,
-};
-
static int __init proc_execdomains_init(void)
{
- proc_create("execdomains", 0, NULL, &execdomains_proc_fops);
+ proc_create_single("execdomains", 0, NULL, execdomains_proc_show);
return 0;
}
module_init(proc_execdomains_init);
diff --git a/kernel/exit.c b/kernel/exit.c
index 995453d9fb55..c3c7ac560114 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -1691,7 +1691,7 @@ SYSCALL_DEFINE4(wait4, pid_t, upid, int __user *, stat_addr,
*/
SYSCALL_DEFINE3(waitpid, pid_t, pid, int __user *, stat_addr, int, options)
{
- return sys_wait4(pid, stat_addr, options, NULL);
+ return kernel_wait4(pid, stat_addr, options, NULL);
}
#endif
diff --git a/kernel/fail_function.c b/kernel/fail_function.c
index 21b0122cb39c..1d5632d8bbcc 100644
--- a/kernel/fail_function.c
+++ b/kernel/fail_function.c
@@ -14,6 +14,15 @@
static int fei_kprobe_handler(struct kprobe *kp, struct pt_regs *regs);
+static void fei_post_handler(struct kprobe *kp, struct pt_regs *regs,
+ unsigned long flags)
+{
+ /*
+ * A dummy post handler is required to prohibit optimizing, because
+ * jump optimization does not support execution path overriding.
+ */
+}
+
struct fei_attr {
struct list_head list;
struct kprobe kp;
@@ -56,6 +65,7 @@ static struct fei_attr *fei_attr_new(const char *sym, unsigned long addr)
return NULL;
}
attr->kp.pre_handler = fei_kprobe_handler;
+ attr->kp.post_handler = fei_post_handler;
attr->retval = adjust_error_retval(addr, 0);
INIT_LIST_HEAD(&attr->list);
}
diff --git a/kernel/fork.c b/kernel/fork.c
index e5d9d405ae4e..80b48a8fb47b 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -216,10 +216,9 @@ static unsigned long *alloc_thread_stack_node(struct task_struct *tsk, int node)
if (!s)
continue;
-#ifdef CONFIG_DEBUG_KMEMLEAK
/* Clear stale pointers from reused stack. */
memset(s->addr, 0, THREAD_SIZE);
-#endif
+
tsk->stack_vm_area = s;
return s->addr;
}
@@ -595,6 +594,8 @@ static void check_mm(struct mm_struct *mm)
void __mmdrop(struct mm_struct *mm)
{
BUG_ON(mm == &init_mm);
+ WARN_ON_ONCE(mm == current->mm);
+ WARN_ON_ONCE(mm == current->active_mm);
mm_free_pgd(mm);
destroy_context(mm);
hmm_mm_destroy(mm);
@@ -1198,8 +1199,8 @@ void mm_release(struct task_struct *tsk, struct mm_struct *mm)
* not set up a proper pointer then tough luck.
*/
put_user(0, tsk->clear_child_tid);
- sys_futex(tsk->clear_child_tid, FUTEX_WAKE,
- 1, NULL, NULL, 0);
+ do_futex(tsk->clear_child_tid, FUTEX_WAKE,
+ 1, NULL, NULL, 0, 0);
}
tsk->clear_child_tid = NULL;
}
@@ -1711,7 +1712,7 @@ static __latent_entropy struct task_struct *copy_process(
p->start_time = ktime_get_ns();
p->real_start_time = ktime_get_boot_ns();
p->io_context = NULL;
- p->audit_context = NULL;
+ audit_set_context(p, NULL);
cgroup_fork(p);
#ifdef CONFIG_NUMA
p->mempolicy = mpol_dup(p->mempolicy);
@@ -2354,7 +2355,7 @@ static int unshare_fd(unsigned long unshare_flags, struct files_struct **new_fdp
* constructed. Here we are modifying the current, active,
* task_struct.
*/
-SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags)
+int ksys_unshare(unsigned long unshare_flags)
{
struct fs_struct *fs, *new_fs = NULL;
struct files_struct *fd, *new_fd = NULL;
@@ -2470,6 +2471,11 @@ bad_unshare_out:
return err;
}
+SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags)
+{
+ return ksys_unshare(unshare_flags);
+}
+
/*
* Helper to unshare the files of the current task.
* We don't want to expose copy_files internals to
diff --git a/kernel/irq/Kconfig b/kernel/irq/Kconfig
index 6fc87ccda1d7..c6766f326072 100644
--- a/kernel/irq/Kconfig
+++ b/kernel/irq/Kconfig
@@ -132,3 +132,9 @@ config GENERIC_IRQ_DEBUGFS
If you don't know what to do here, say N.
endmenu
+
+config GENERIC_IRQ_MULTI_HANDLER
+ depends on !MULTI_IRQ_HANDLER
+ bool
+ help
+ Allow to specify the low level IRQ handler at run time.
diff --git a/kernel/irq/affinity.c b/kernel/irq/affinity.c
index a37a3b4b6342..f4f29b9d90ee 100644
--- a/kernel/irq/affinity.c
+++ b/kernel/irq/affinity.c
@@ -39,7 +39,7 @@ static void irq_spread_init_one(struct cpumask *irqmsk, struct cpumask *nmsk,
}
}
-static cpumask_var_t *alloc_node_to_possible_cpumask(void)
+static cpumask_var_t *alloc_node_to_cpumask(void)
{
cpumask_var_t *masks;
int node;
@@ -62,7 +62,7 @@ out_unwind:
return NULL;
}
-static void free_node_to_possible_cpumask(cpumask_var_t *masks)
+static void free_node_to_cpumask(cpumask_var_t *masks)
{
int node;
@@ -71,7 +71,7 @@ static void free_node_to_possible_cpumask(cpumask_var_t *masks)
kfree(masks);
}
-static void build_node_to_possible_cpumask(cpumask_var_t *masks)
+static void build_node_to_cpumask(cpumask_var_t *masks)
{
int cpu;
@@ -79,14 +79,14 @@ static void build_node_to_possible_cpumask(cpumask_var_t *masks)
cpumask_set_cpu(cpu, masks[cpu_to_node(cpu)]);
}
-static int get_nodes_in_cpumask(cpumask_var_t *node_to_possible_cpumask,
+static int get_nodes_in_cpumask(cpumask_var_t *node_to_cpumask,
const struct cpumask *mask, nodemask_t *nodemsk)
{
int n, nodes = 0;
/* Calculate the number of nodes in the supplied affinity mask */
for_each_node(n) {
- if (cpumask_intersects(mask, node_to_possible_cpumask[n])) {
+ if (cpumask_intersects(mask, node_to_cpumask[n])) {
node_set(n, *nodemsk);
nodes++;
}
@@ -94,73 +94,46 @@ static int get_nodes_in_cpumask(cpumask_var_t *node_to_possible_cpumask,
return nodes;
}
-/**
- * irq_create_affinity_masks - Create affinity masks for multiqueue spreading
- * @nvecs: The total number of vectors
- * @affd: Description of the affinity requirements
- *
- * Returns the masks pointer or NULL if allocation failed.
- */
-struct cpumask *
-irq_create_affinity_masks(int nvecs, const struct irq_affinity *affd)
+static int irq_build_affinity_masks(const struct irq_affinity *affd,
+ int startvec, int numvecs,
+ cpumask_var_t *node_to_cpumask,
+ const struct cpumask *cpu_mask,
+ struct cpumask *nmsk,
+ struct cpumask *masks)
{
- int n, nodes, cpus_per_vec, extra_vecs, curvec;
- int affv = nvecs - affd->pre_vectors - affd->post_vectors;
- int last_affv = affv + affd->pre_vectors;
+ int n, nodes, cpus_per_vec, extra_vecs, done = 0;
+ int last_affv = affd->pre_vectors + numvecs;
+ int curvec = startvec;
nodemask_t nodemsk = NODE_MASK_NONE;
- struct cpumask *masks;
- cpumask_var_t nmsk, *node_to_possible_cpumask;
-
- /*
- * If there aren't any vectors left after applying the pre/post
- * vectors don't bother with assigning affinity.
- */
- if (!affv)
- return NULL;
-
- if (!zalloc_cpumask_var(&nmsk, GFP_KERNEL))
- return NULL;
-
- masks = kcalloc(nvecs, sizeof(*masks), GFP_KERNEL);
- if (!masks)
- goto out;
- node_to_possible_cpumask = alloc_node_to_possible_cpumask();
- if (!node_to_possible_cpumask)
- goto out;
+ if (!cpumask_weight(cpu_mask))
+ return 0;
- /* Fill out vectors at the beginning that don't need affinity */
- for (curvec = 0; curvec < affd->pre_vectors; curvec++)
- cpumask_copy(masks + curvec, irq_default_affinity);
-
- /* Stabilize the cpumasks */
- get_online_cpus();
- build_node_to_possible_cpumask(node_to_possible_cpumask);
- nodes = get_nodes_in_cpumask(node_to_possible_cpumask, cpu_possible_mask,
- &nodemsk);
+ nodes = get_nodes_in_cpumask(node_to_cpumask, cpu_mask, &nodemsk);
/*
* If the number of nodes in the mask is greater than or equal the
* number of vectors we just spread the vectors across the nodes.
*/
- if (affv <= nodes) {
+ if (numvecs <= nodes) {
for_each_node_mask(n, nodemsk) {
- cpumask_copy(masks + curvec,
- node_to_possible_cpumask[n]);
- if (++curvec == last_affv)
+ cpumask_copy(masks + curvec, node_to_cpumask[n]);
+ if (++done == numvecs)
break;
+ if (++curvec == last_affv)
+ curvec = affd->pre_vectors;
}
- goto done;
+ goto out;
}
for_each_node_mask(n, nodemsk) {
int ncpus, v, vecs_to_assign, vecs_per_node;
/* Spread the vectors per node */
- vecs_per_node = (affv - (curvec - affd->pre_vectors)) / nodes;
+ vecs_per_node = (numvecs - (curvec - affd->pre_vectors)) / nodes;
/* Get the cpus on this node which are in the mask */
- cpumask_and(nmsk, cpu_possible_mask, node_to_possible_cpumask[n]);
+ cpumask_and(nmsk, cpu_mask, node_to_cpumask[n]);
/* Calculate the number of cpus per vector */
ncpus = cpumask_weight(nmsk);
@@ -181,19 +154,96 @@ irq_create_affinity_masks(int nvecs, const struct irq_affinity *affd)
irq_spread_init_one(masks + curvec, nmsk, cpus_per_vec);
}
- if (curvec >= last_affv)
+ done += v;
+ if (done >= numvecs)
break;
+ if (curvec >= last_affv)
+ curvec = affd->pre_vectors;
--nodes;
}
-done:
+out:
+ return done;
+}
+
+/**
+ * irq_create_affinity_masks - Create affinity masks for multiqueue spreading
+ * @nvecs: The total number of vectors
+ * @affd: Description of the affinity requirements
+ *
+ * Returns the masks pointer or NULL if allocation failed.
+ */
+struct cpumask *
+irq_create_affinity_masks(int nvecs, const struct irq_affinity *affd)
+{
+ int affvecs = nvecs - affd->pre_vectors - affd->post_vectors;
+ int curvec, usedvecs;
+ cpumask_var_t nmsk, npresmsk, *node_to_cpumask;
+ struct cpumask *masks = NULL;
+
+ /*
+ * If there aren't any vectors left after applying the pre/post
+ * vectors don't bother with assigning affinity.
+ */
+ if (nvecs == affd->pre_vectors + affd->post_vectors)
+ return NULL;
+
+ if (!zalloc_cpumask_var(&nmsk, GFP_KERNEL))
+ return NULL;
+
+ if (!zalloc_cpumask_var(&npresmsk, GFP_KERNEL))
+ goto outcpumsk;
+
+ node_to_cpumask = alloc_node_to_cpumask();
+ if (!node_to_cpumask)
+ goto outnpresmsk;
+
+ masks = kcalloc(nvecs, sizeof(*masks), GFP_KERNEL);
+ if (!masks)
+ goto outnodemsk;
+
+ /* Fill out vectors at the beginning that don't need affinity */
+ for (curvec = 0; curvec < affd->pre_vectors; curvec++)
+ cpumask_copy(masks + curvec, irq_default_affinity);
+
+ /* Stabilize the cpumasks */
+ get_online_cpus();
+ build_node_to_cpumask(node_to_cpumask);
+
+ /* Spread on present CPUs starting from affd->pre_vectors */
+ usedvecs = irq_build_affinity_masks(affd, curvec, affvecs,
+ node_to_cpumask, cpu_present_mask,
+ nmsk, masks);
+
+ /*
+ * Spread on non present CPUs starting from the next vector to be
+ * handled. If the spreading of present CPUs already exhausted the
+ * vector space, assign the non present CPUs to the already spread
+ * out vectors.
+ */
+ if (usedvecs >= affvecs)
+ curvec = affd->pre_vectors;
+ else
+ curvec = affd->pre_vectors + usedvecs;
+ cpumask_andnot(npresmsk, cpu_possible_mask, cpu_present_mask);
+ usedvecs += irq_build_affinity_masks(affd, curvec, affvecs,
+ node_to_cpumask, npresmsk,
+ nmsk, masks);
put_online_cpus();
/* Fill out vectors at the end that don't need affinity */
+ if (usedvecs >= affvecs)
+ curvec = affd->pre_vectors + affvecs;
+ else
+ curvec = affd->pre_vectors + usedvecs;
for (; curvec < nvecs; curvec++)
cpumask_copy(masks + curvec, irq_default_affinity);
- free_node_to_possible_cpumask(node_to_possible_cpumask);
-out:
+
+outnodemsk:
+ free_node_to_cpumask(node_to_cpumask);
+outnpresmsk:
+ free_cpumask_var(npresmsk);
+outcpumsk:
free_cpumask_var(nmsk);
return masks;
}
diff --git a/kernel/irq/autoprobe.c b/kernel/irq/autoprobe.c
index 8c82ea26e837..16cbf6beb276 100644
--- a/kernel/irq/autoprobe.c
+++ b/kernel/irq/autoprobe.c
@@ -1,7 +1,5 @@
// SPDX-License-Identifier: GPL-2.0
/*
- * linux/kernel/irq/autoprobe.c
- *
* Copyright (C) 1992, 1998-2004 Linus Torvalds, Ingo Molnar
*
* This file contains the interrupt probing code and driver APIs.
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index c69357a43849..a2b3d9de999c 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -1,13 +1,11 @@
+// SPDX-License-Identifier: GPL-2.0
/*
- * linux/kernel/irq/chip.c
- *
* Copyright (C) 1992, 1998-2006 Linus Torvalds, Ingo Molnar
* Copyright (C) 2005-2006, Thomas Gleixner, Russell King
*
- * This file contains the core interrupt handling code, for irq-chip
- * based architectures.
- *
- * Detailed information is available in Documentation/core-api/genericirq.rst
+ * This file contains the core interrupt handling code, for irq-chip based
+ * architectures. Detailed information is available in
+ * Documentation/core-api/genericirq.rst
*/
#include <linux/irq.h>
diff --git a/kernel/irq/cpuhotplug.c b/kernel/irq/cpuhotplug.c
index 9eb09aef0313..5b1072e394b2 100644
--- a/kernel/irq/cpuhotplug.c
+++ b/kernel/irq/cpuhotplug.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* Generic cpu hotunplug interrupt migration code copied from the
* arch/arm implementation
diff --git a/kernel/irq/debugfs.c b/kernel/irq/debugfs.c
index acfaaef8672a..4dadeb3d6666 100644
--- a/kernel/irq/debugfs.c
+++ b/kernel/irq/debugfs.c
@@ -1,8 +1,6 @@
-/*
- * Copyright 2017 Thomas Gleixner <tglx@linutronix.de>
- *
- * This file is licensed under the GPL V2.
- */
+// SPDX-License-Identifier: GPL-2.0
+// Copyright 2017 Thomas Gleixner <tglx@linutronix.de>
+
#include <linux/irqdomain.h>
#include <linux/irq.h>
#include <linux/uaccess.h>
diff --git a/kernel/irq/devres.c b/kernel/irq/devres.c
index 194c506d9d20..6a682c229e10 100644
--- a/kernel/irq/devres.c
+++ b/kernel/irq/devres.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
#include <linux/module.h>
#include <linux/interrupt.h>
#include <linux/device.h>
diff --git a/kernel/irq/dummychip.c b/kernel/irq/dummychip.c
index 326a67f2410b..0b0cdf206dc4 100644
--- a/kernel/irq/dummychip.c
+++ b/kernel/irq/dummychip.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* Copyright (C) 1992, 1998-2006 Linus Torvalds, Ingo Molnar
* Copyright (C) 2005-2006, Thomas Gleixner, Russell King
diff --git a/kernel/irq/generic-chip.c b/kernel/irq/generic-chip.c
index 508c03dfef25..e2999a070a99 100644
--- a/kernel/irq/generic-chip.c
+++ b/kernel/irq/generic-chip.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* Library implementing the most common irq chip callback functions
*
diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
index 79f987b942b8..38554bc35375 100644
--- a/kernel/irq/handle.c
+++ b/kernel/irq/handle.c
@@ -1,12 +1,10 @@
+// SPDX-License-Identifier: GPL-2.0
/*
- * linux/kernel/irq/handle.c
- *
* Copyright (C) 1992, 1998-2006 Linus Torvalds, Ingo Molnar
* Copyright (C) 2005-2006, Thomas Gleixner, Russell King
*
- * This file contains the core interrupt handling code.
- *
- * Detailed information is available in Documentation/core-api/genericirq.rst
+ * This file contains the core interrupt handling code. Detailed
+ * information is available in Documentation/core-api/genericirq.rst
*
*/
@@ -20,6 +18,10 @@
#include "internals.h"
+#ifdef CONFIG_GENERIC_IRQ_MULTI_HANDLER
+void (*handle_arch_irq)(struct pt_regs *) __ro_after_init;
+#endif
+
/**
* handle_bad_irq - handle spurious and unhandled irqs
* @desc: description of the interrupt
@@ -207,3 +209,14 @@ irqreturn_t handle_irq_event(struct irq_desc *desc)
irqd_clear(&desc->irq_data, IRQD_IRQ_INPROGRESS);
return ret;
}
+
+#ifdef CONFIG_GENERIC_IRQ_MULTI_HANDLER
+int __init set_handle_irq(void (*handle_irq)(struct pt_regs *))
+{
+ if (handle_arch_irq)
+ return -EBUSY;
+
+ handle_arch_irq = handle_irq;
+ return 0;
+}
+#endif
diff --git a/kernel/irq/ipi.c b/kernel/irq/ipi.c
index 259a22aa9934..8b778e37dc6d 100644
--- a/kernel/irq/ipi.c
+++ b/kernel/irq/ipi.c
@@ -1,6 +1,5 @@
+// SPDX-License-Identifier: GPL-2.0
/*
- * linux/kernel/irq/ipi.c
- *
* Copyright (C) 2015 Imagination Technologies Ltd
* Author: Qais Yousef <qais.yousef@imgtec.com>
*
diff --git a/kernel/irq/irq_sim.c b/kernel/irq/irq_sim.c
index 24caabf1a0f7..dd20d0d528d4 100644
--- a/kernel/irq/irq_sim.c
+++ b/kernel/irq/irq_sim.c
@@ -1,12 +1,9 @@
+// SPDX-License-Identifier: GPL-2.0+
/*
- * Copyright (C) 2017 Bartosz Golaszewski <brgl@bgdev.pl>
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms of the GNU General Public License as published by the
- * Free Software Foundation; either version 2 of the License, or (at your
- * option) any later version.
+ * Copyright (C) 2017-2018 Bartosz Golaszewski <brgl@bgdev.pl>
*/
+#include <linux/slab.h>
#include <linux/irq_sim.h>
#include <linux/irq.h>
@@ -49,7 +46,8 @@ static void irq_sim_handle_irq(struct irq_work *work)
* @sim: The interrupt simulator object to initialize.
* @num_irqs: Number of interrupts to allocate
*
- * Returns 0 on success and a negative error number on failure.
+ * On success: return the base of the allocated interrupt range.
+ * On failure: a negative errno.
*/
int irq_sim_init(struct irq_sim *sim, unsigned int num_irqs)
{
@@ -78,7 +76,7 @@ int irq_sim_init(struct irq_sim *sim, unsigned int num_irqs)
init_irq_work(&sim->work_ctx.work, irq_sim_handle_irq);
sim->irq_count = num_irqs;
- return 0;
+ return sim->irq_base;
}
EXPORT_SYMBOL_GPL(irq_sim_init);
@@ -110,7 +108,8 @@ static void devm_irq_sim_release(struct device *dev, void *res)
* @sim: The interrupt simulator object to initialize.
* @num_irqs: Number of interrupts to allocate
*
- * Returns 0 on success and a negative error number on failure.
+ * On success: return the base of the allocated interrupt range.
+ * On failure: a negative errno.
*/
int devm_irq_sim_init(struct device *dev, struct irq_sim *sim,
unsigned int num_irqs)
@@ -123,7 +122,7 @@ int devm_irq_sim_init(struct device *dev, struct irq_sim *sim,
return -ENOMEM;
rv = irq_sim_init(sim, num_irqs);
- if (rv) {
+ if (rv < 0) {
devres_free(dr);
return rv;
}
@@ -131,7 +130,7 @@ int devm_irq_sim_init(struct device *dev, struct irq_sim *sim,
dr->sim = sim;
devres_add(dev, dr);
- return 0;
+ return rv;
}
EXPORT_SYMBOL_GPL(devm_irq_sim_init);
diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c
index 49b54e9979cc..afc7f902d74a 100644
--- a/kernel/irq/irqdesc.c
+++ b/kernel/irq/irqdesc.c
@@ -1,10 +1,10 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* Copyright (C) 1992, 1998-2006 Linus Torvalds, Ingo Molnar
* Copyright (C) 2005-2006, Thomas Gleixner, Russell King
*
- * This file contains the interrupt descriptor management code
- *
- * Detailed information is available in Documentation/core-api/genericirq.rst
+ * This file contains the interrupt descriptor management code. Detailed
+ * information is available in Documentation/core-api/genericirq.rst
*
*/
#include <linux/irq.h>
@@ -210,6 +210,22 @@ static ssize_t type_show(struct kobject *kobj,
}
IRQ_ATTR_RO(type);
+static ssize_t wakeup_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ struct irq_desc *desc = container_of(kobj, struct irq_desc, kobj);
+ ssize_t ret = 0;
+
+ raw_spin_lock_irq(&desc->lock);
+ ret = sprintf(buf, "%s\n",
+ irqd_is_wakeup_set(&desc->irq_data) ? "enabled" : "disabled");
+ raw_spin_unlock_irq(&desc->lock);
+
+ return ret;
+
+}
+IRQ_ATTR_RO(wakeup);
+
static ssize_t name_show(struct kobject *kobj,
struct kobj_attribute *attr, char *buf)
{
@@ -253,6 +269,7 @@ static struct attribute *irq_attrs[] = {
&chip_name_attr.attr,
&hwirq_attr.attr,
&type_attr.attr,
+ &wakeup_attr.attr,
&name_attr.attr,
&actions_attr.attr,
NULL
diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c
index 82b8b18ee1eb..5d9fc01b60a6 100644
--- a/kernel/irq/irqdomain.c
+++ b/kernel/irq/irqdomain.c
@@ -1,3 +1,5 @@
+// SPDX-License-Identifier: GPL-2.0
+
#define pr_fmt(fmt) "irq: " fmt
#include <linux/acpi.h>
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 0f922729bab9..e3336d904f64 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -1,6 +1,5 @@
+// SPDX-License-Identifier: GPL-2.0
/*
- * linux/kernel/irq/manage.c
- *
* Copyright (C) 1992, 1998-2006 Linus Torvalds, Ingo Molnar
* Copyright (C) 2005-2006 Thomas Gleixner
*
@@ -855,10 +854,14 @@ irq_thread_check_affinity(struct irq_desc *desc, struct irqaction *action)
* This code is triggered unconditionally. Check the affinity
* mask pointer. For CPU_MASK_OFFSTACK=n this is optimized out.
*/
- if (cpumask_available(desc->irq_common_data.affinity))
- cpumask_copy(mask, desc->irq_common_data.affinity);
- else
+ if (cpumask_available(desc->irq_common_data.affinity)) {
+ const struct cpumask *m;
+
+ m = irq_data_get_effective_affinity_mask(&desc->irq_data);
+ cpumask_copy(mask, m);
+ } else {
valid = false;
+ }
raw_spin_unlock_irq(&desc->lock);
if (valid)
@@ -1519,9 +1522,9 @@ EXPORT_SYMBOL_GPL(setup_irq);
* Internal function to unregister an irqaction - used to free
* regular and special interrupts that are part of the architecture.
*/
-static struct irqaction *__free_irq(unsigned int irq, void *dev_id)
+static struct irqaction *__free_irq(struct irq_desc *desc, void *dev_id)
{
- struct irq_desc *desc = irq_to_desc(irq);
+ unsigned irq = desc->irq_data.irq;
struct irqaction *action, **action_ptr;
unsigned long flags;
@@ -1651,7 +1654,7 @@ void remove_irq(unsigned int irq, struct irqaction *act)
struct irq_desc *desc = irq_to_desc(irq);
if (desc && !WARN_ON(irq_settings_is_per_cpu_devid(desc)))
- __free_irq(irq, act->dev_id);
+ __free_irq(desc, act->dev_id);
}
EXPORT_SYMBOL_GPL(remove_irq);
@@ -1685,7 +1688,7 @@ const void *free_irq(unsigned int irq, void *dev_id)
desc->affinity_notify = NULL;
#endif
- action = __free_irq(irq, dev_id);
+ action = __free_irq(desc, dev_id);
if (!action)
return NULL;
diff --git a/kernel/irq/matrix.c b/kernel/irq/matrix.c
index 4c5770407031..5092494bf261 100644
--- a/kernel/irq/matrix.c
+++ b/kernel/irq/matrix.c
@@ -1,8 +1,6 @@
-/*
- * Copyright (C) 2017 Thomas Gleixner <tglx@linutronix.de>
- *
- * SPDX-License-Identifier: GPL-2.0
- */
+// SPDX-License-Identifier: GPL-2.0
+// Copyright (C) 2017 Thomas Gleixner <tglx@linutronix.de>
+
#include <linux/spinlock.h>
#include <linux/seq_file.h>
#include <linux/bitmap.h>
diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c
index 2f3c4f5382cc..4ca2fd46645d 100644
--- a/kernel/irq/msi.c
+++ b/kernel/irq/msi.c
@@ -1,6 +1,5 @@
+// SPDX-License-Identifier: GPL-2.0
/*
- * linux/kernel/irq/msi.c
- *
* Copyright (C) 2014 Intel Corp.
* Author: Jiang Liu <jiang.liu@linux.intel.com>
*
@@ -77,6 +76,19 @@ static inline void irq_chip_write_msi_msg(struct irq_data *data,
data->chip->irq_write_msi_msg(data, msg);
}
+static void msi_check_level(struct irq_domain *domain, struct msi_msg *msg)
+{
+ struct msi_domain_info *info = domain->host_data;
+
+ /*
+ * If the MSI provider has messed with the second message and
+ * not advertized that it is level-capable, signal the breakage.
+ */
+ WARN_ON(!((info->flags & MSI_FLAG_LEVEL_CAPABLE) &&
+ (info->chip->flags & IRQCHIP_SUPPORTS_LEVEL_MSI)) &&
+ (msg[1].address_lo || msg[1].address_hi || msg[1].data));
+}
+
/**
* msi_domain_set_affinity - Generic affinity setter function for MSI domains
* @irq_data: The irq data associated to the interrupt
@@ -90,13 +102,14 @@ int msi_domain_set_affinity(struct irq_data *irq_data,
const struct cpumask *mask, bool force)
{
struct irq_data *parent = irq_data->parent_data;
- struct msi_msg msg;
+ struct msi_msg msg[2] = { [1] = { }, };
int ret;
ret = parent->chip->irq_set_affinity(parent, mask, force);
if (ret >= 0 && ret != IRQ_SET_MASK_OK_DONE) {
- BUG_ON(irq_chip_compose_msi_msg(irq_data, &msg));
- irq_chip_write_msi_msg(irq_data, &msg);
+ BUG_ON(irq_chip_compose_msi_msg(irq_data, msg));
+ msi_check_level(irq_data->domain, msg);
+ irq_chip_write_msi_msg(irq_data, msg);
}
return ret;
@@ -105,20 +118,21 @@ int msi_domain_set_affinity(struct irq_data *irq_data,
static int msi_domain_activate(struct irq_domain *domain,
struct irq_data *irq_data, bool early)
{
- struct msi_msg msg;
+ struct msi_msg msg[2] = { [1] = { }, };
- BUG_ON(irq_chip_compose_msi_msg(irq_data, &msg));
- irq_chip_write_msi_msg(irq_data, &msg);
+ BUG_ON(irq_chip_compose_msi_msg(irq_data, msg));
+ msi_check_level(irq_data->domain, msg);
+ irq_chip_write_msi_msg(irq_data, msg);
return 0;
}
static void msi_domain_deactivate(struct irq_domain *domain,
struct irq_data *irq_data)
{
- struct msi_msg msg;
+ struct msi_msg msg[2];
- memset(&msg, 0, sizeof(msg));
- irq_chip_write_msi_msg(irq_data, &msg);
+ memset(msg, 0, sizeof(msg));
+ irq_chip_write_msi_msg(irq_data, msg);
}
static int msi_domain_alloc(struct irq_domain *domain, unsigned int virq,
diff --git a/kernel/irq/pm.c b/kernel/irq/pm.c
index 6bd9b58429cc..d6961d3c6f9e 100644
--- a/kernel/irq/pm.c
+++ b/kernel/irq/pm.c
@@ -1,6 +1,5 @@
+// SPDX-License-Identifier: GPL-2.0
/*
- * linux/kernel/irq/pm.c
- *
* Copyright (C) 2009 Rafael J. Wysocki <rjw@sisk.pl>, Novell Inc.
*
* This file contains power management functions related to interrupts.
diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c
index e8f374971e37..37eda10f5c36 100644
--- a/kernel/irq/proc.c
+++ b/kernel/irq/proc.c
@@ -1,7 +1,5 @@
// SPDX-License-Identifier: GPL-2.0
/*
- * linux/kernel/irq/proc.c
- *
* Copyright (C) 1992, 1998-2004 Linus Torvalds, Ingo Molnar
*
* This file contains the /proc/irq/ handling code.
@@ -187,11 +185,6 @@ static int irq_affinity_list_proc_open(struct inode *inode, struct file *file)
return single_open(file, irq_affinity_list_proc_show, PDE_DATA(inode));
}
-static int irq_affinity_hint_proc_open(struct inode *inode, struct file *file)
-{
- return single_open(file, irq_affinity_hint_proc_show, PDE_DATA(inode));
-}
-
static const struct file_operations irq_affinity_proc_fops = {
.open = irq_affinity_proc_open,
.read = seq_read,
@@ -200,13 +193,6 @@ static const struct file_operations irq_affinity_proc_fops = {
.write = irq_affinity_proc_write,
};
-static const struct file_operations irq_affinity_hint_proc_fops = {
- .open = irq_affinity_hint_proc_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = single_release,
-};
-
static const struct file_operations irq_affinity_list_proc_fops = {
.open = irq_affinity_list_proc_open,
.read = seq_read,
@@ -225,32 +211,6 @@ static int irq_effective_aff_list_proc_show(struct seq_file *m, void *v)
{
return show_irq_affinity(EFFECTIVE_LIST, m);
}
-
-static int irq_effective_aff_proc_open(struct inode *inode, struct file *file)
-{
- return single_open(file, irq_effective_aff_proc_show, PDE_DATA(inode));
-}
-
-static int irq_effective_aff_list_proc_open(struct inode *inode,
- struct file *file)
-{
- return single_open(file, irq_effective_aff_list_proc_show,
- PDE_DATA(inode));
-}
-
-static const struct file_operations irq_effective_aff_proc_fops = {
- .open = irq_effective_aff_proc_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = single_release,
-};
-
-static const struct file_operations irq_effective_aff_list_proc_fops = {
- .open = irq_effective_aff_list_proc_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = single_release,
-};
#endif
static int default_affinity_show(struct seq_file *m, void *v)
@@ -315,18 +275,6 @@ static int irq_node_proc_show(struct seq_file *m, void *v)
seq_printf(m, "%d\n", irq_desc_get_node(desc));
return 0;
}
-
-static int irq_node_proc_open(struct inode *inode, struct file *file)
-{
- return single_open(file, irq_node_proc_show, PDE_DATA(inode));
-}
-
-static const struct file_operations irq_node_proc_fops = {
- .open = irq_node_proc_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = single_release,
-};
#endif
static int irq_spurious_proc_show(struct seq_file *m, void *v)
@@ -339,18 +287,6 @@ static int irq_spurious_proc_show(struct seq_file *m, void *v)
return 0;
}
-static int irq_spurious_proc_open(struct inode *inode, struct file *file)
-{
- return single_open(file, irq_spurious_proc_show, PDE_DATA(inode));
-}
-
-static const struct file_operations irq_spurious_proc_fops = {
- .open = irq_spurious_proc_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = single_release,
-};
-
#define MAX_NAMELEN 128
static int name_unique(unsigned int irq, struct irqaction *new_action)
@@ -423,24 +359,24 @@ void register_irq_proc(unsigned int irq, struct irq_desc *desc)
&irq_affinity_proc_fops, irqp);
/* create /proc/irq/<irq>/affinity_hint */
- proc_create_data("affinity_hint", 0444, desc->dir,
- &irq_affinity_hint_proc_fops, irqp);
+ proc_create_single_data("affinity_hint", 0444, desc->dir,
+ irq_affinity_hint_proc_show, irqp);
/* create /proc/irq/<irq>/smp_affinity_list */
proc_create_data("smp_affinity_list", 0644, desc->dir,
&irq_affinity_list_proc_fops, irqp);
- proc_create_data("node", 0444, desc->dir,
- &irq_node_proc_fops, irqp);
+ proc_create_single_data("node", 0444, desc->dir, irq_node_proc_show,
+ irqp);
# ifdef CONFIG_GENERIC_IRQ_EFFECTIVE_AFF_MASK
- proc_create_data("effective_affinity", 0444, desc->dir,
- &irq_effective_aff_proc_fops, irqp);
- proc_create_data("effective_affinity_list", 0444, desc->dir,
- &irq_effective_aff_list_proc_fops, irqp);
+ proc_create_single_data("effective_affinity", 0444, desc->dir,
+ irq_effective_aff_proc_show, irqp);
+ proc_create_single_data("effective_affinity_list", 0444, desc->dir,
+ irq_effective_aff_list_proc_show, irqp);
# endif
#endif
- proc_create_data("spurious", 0444, desc->dir,
- &irq_spurious_proc_fops, (void *)(long)irq);
+ proc_create_single_data("spurious", 0444, desc->dir,
+ irq_spurious_proc_show, (void *)(long)irq);
out_unlock:
mutex_unlock(&register_lock);
diff --git a/kernel/irq/resend.c b/kernel/irq/resend.c
index 1d08f45135c2..95414ad3506a 100644
--- a/kernel/irq/resend.c
+++ b/kernel/irq/resend.c
@@ -1,7 +1,5 @@
// SPDX-License-Identifier: GPL-2.0
/*
- * linux/kernel/irq/resend.c
- *
* Copyright (C) 1992, 1998-2006 Linus Torvalds, Ingo Molnar
* Copyright (C) 2005-2006, Thomas Gleixner
*
diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c
index 6cdecc6f4c53..d867d6ddafdd 100644
--- a/kernel/irq/spurious.c
+++ b/kernel/irq/spurious.c
@@ -1,7 +1,5 @@
// SPDX-License-Identifier: GPL-2.0
/*
- * linux/kernel/irq/spurious.c
- *
* Copyright (C) 1992, 1998-2004 Linus Torvalds, Ingo Molnar
*
* This file contains spurious interrupt handling.
diff --git a/kernel/irq/timings.c b/kernel/irq/timings.c
index e0923fa4927a..1e4cb63a5c82 100644
--- a/kernel/irq/timings.c
+++ b/kernel/irq/timings.c
@@ -1,13 +1,6 @@
-/*
- * linux/kernel/irq/timings.c
- *
- * Copyright (C) 2016, Linaro Ltd - Daniel Lezcano <daniel.lezcano@linaro.org>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
- */
+// SPDX-License-Identifier: GPL-2.0
+// Copyright (C) 2016, Linaro Ltd - Daniel Lezcano <daniel.lezcano@linaro.org>
+
#include <linux/kernel.h>
#include <linux/percpu.h>
#include <linux/slab.h>
diff --git a/kernel/jump_label.c b/kernel/jump_label.c
index e7214093dcd1..01ebdf1f9f40 100644
--- a/kernel/jump_label.c
+++ b/kernel/jump_label.c
@@ -16,6 +16,7 @@
#include <linux/jump_label_ratelimit.h>
#include <linux/bug.h>
#include <linux/cpu.h>
+#include <asm/sections.h>
#ifdef HAVE_JUMP_LABEL
@@ -421,15 +422,15 @@ void __init jump_label_init(void)
cpus_read_unlock();
}
-/* Disable any jump label entries in __init code */
-void __init jump_label_invalidate_init(void)
+/* Disable any jump label entries in __init/__exit code */
+void __init jump_label_invalidate_initmem(void)
{
struct jump_entry *iter_start = __start___jump_table;
struct jump_entry *iter_stop = __stop___jump_table;
struct jump_entry *iter;
for (iter = iter_start; iter < iter_stop; iter++) {
- if (init_kernel_text(iter->code))
+ if (init_section_contains((void *)(unsigned long)iter->code, 1))
iter->code = 0;
}
}
diff --git a/kernel/kexec.c b/kernel/kexec.c
index e62ec4dc6620..aed8fb2564b3 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -192,11 +192,9 @@ out:
* that to happen you need to do that yourself.
*/
-SYSCALL_DEFINE4(kexec_load, unsigned long, entry, unsigned long, nr_segments,
- struct kexec_segment __user *, segments, unsigned long, flags)
+static inline int kexec_load_check(unsigned long nr_segments,
+ unsigned long flags)
{
- int result;
-
/* We only trust the superuser with rebooting the system. */
if (!capable(CAP_SYS_BOOT) || kexec_load_disabled)
return -EPERM;
@@ -208,17 +206,29 @@ SYSCALL_DEFINE4(kexec_load, unsigned long, entry, unsigned long, nr_segments,
if ((flags & KEXEC_FLAGS) != (flags & ~KEXEC_ARCH_MASK))
return -EINVAL;
- /* Verify we are on the appropriate architecture */
- if (((flags & KEXEC_ARCH_MASK) != KEXEC_ARCH) &&
- ((flags & KEXEC_ARCH_MASK) != KEXEC_ARCH_DEFAULT))
- return -EINVAL;
-
/* Put an artificial cap on the number
* of segments passed to kexec_load.
*/
if (nr_segments > KEXEC_SEGMENT_MAX)
return -EINVAL;
+ return 0;
+}
+
+SYSCALL_DEFINE4(kexec_load, unsigned long, entry, unsigned long, nr_segments,
+ struct kexec_segment __user *, segments, unsigned long, flags)
+{
+ int result;
+
+ result = kexec_load_check(nr_segments, flags);
+ if (result)
+ return result;
+
+ /* Verify we are on the appropriate architecture */
+ if (((flags & KEXEC_ARCH_MASK) != KEXEC_ARCH) &&
+ ((flags & KEXEC_ARCH_MASK) != KEXEC_ARCH_DEFAULT))
+ return -EINVAL;
+
/* Because we write directly to the reserved memory
* region when loading crash kernels we need a mutex here to
* prevent multiple crash kernels from attempting to load
@@ -247,15 +257,16 @@ COMPAT_SYSCALL_DEFINE4(kexec_load, compat_ulong_t, entry,
struct kexec_segment out, __user *ksegments;
unsigned long i, result;
+ result = kexec_load_check(nr_segments, flags);
+ if (result)
+ return result;
+
/* Don't allow clients that don't understand the native
* architecture to do anything.
*/
if ((flags & KEXEC_ARCH_MASK) == KEXEC_ARCH_DEFAULT)
return -EINVAL;
- if (nr_segments > KEXEC_SEGMENT_MAX)
- return -EINVAL;
-
ksegments = compat_alloc_user_space(nr_segments * sizeof(out));
for (i = 0; i < nr_segments; i++) {
result = copy_from_user(&in, &segments[i], sizeof(in));
@@ -272,6 +283,21 @@ COMPAT_SYSCALL_DEFINE4(kexec_load, compat_ulong_t, entry,
return -EFAULT;
}
- return sys_kexec_load(entry, nr_segments, ksegments, flags);
+ /* Because we write directly to the reserved memory
+ * region when loading crash kernels we need a mutex here to
+ * prevent multiple crash kernels from attempting to load
+ * simultaneously, and to prevent a crash kernel from loading
+ * over the top of a in use crash kernel.
+ *
+ * KISS: always take the mutex.
+ */
+ if (!mutex_trylock(&kexec_mutex))
+ return -EBUSY;
+
+ result = do_kexec_load(entry, nr_segments, ksegments, flags);
+
+ mutex_unlock(&kexec_mutex);
+
+ return result;
}
#endif
diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c
index e5bcd94c1efb..75d8e7cf040e 100644
--- a/kernel/kexec_file.c
+++ b/kernel/kexec_file.c
@@ -22,50 +22,123 @@
#include <linux/ima.h>
#include <crypto/hash.h>
#include <crypto/sha.h>
+#include <linux/elf.h>
+#include <linux/elfcore.h>
+#include <linux/kernel.h>
+#include <linux/kexec.h>
+#include <linux/slab.h>
#include <linux/syscalls.h>
#include <linux/vmalloc.h>
#include "kexec_internal.h"
static int kexec_calculate_store_digests(struct kimage *image);
+/*
+ * Currently this is the only default function that is exported as some
+ * architectures need it to do additional handlings.
+ * In the future, other default functions may be exported too if required.
+ */
+int kexec_image_probe_default(struct kimage *image, void *buf,
+ unsigned long buf_len)
+{
+ const struct kexec_file_ops * const *fops;
+ int ret = -ENOEXEC;
+
+ for (fops = &kexec_file_loaders[0]; *fops && (*fops)->probe; ++fops) {
+ ret = (*fops)->probe(buf, buf_len);
+ if (!ret) {
+ image->fops = *fops;
+ return ret;
+ }
+ }
+
+ return ret;
+}
+
/* Architectures can provide this probe function */
int __weak arch_kexec_kernel_image_probe(struct kimage *image, void *buf,
unsigned long buf_len)
{
- return -ENOEXEC;
+ return kexec_image_probe_default(image, buf, buf_len);
+}
+
+static void *kexec_image_load_default(struct kimage *image)
+{
+ if (!image->fops || !image->fops->load)
+ return ERR_PTR(-ENOEXEC);
+
+ return image->fops->load(image, image->kernel_buf,
+ image->kernel_buf_len, image->initrd_buf,
+ image->initrd_buf_len, image->cmdline_buf,
+ image->cmdline_buf_len);
}
void * __weak arch_kexec_kernel_image_load(struct kimage *image)
{
- return ERR_PTR(-ENOEXEC);
+ return kexec_image_load_default(image);
+}
+
+static int kexec_image_post_load_cleanup_default(struct kimage *image)
+{
+ if (!image->fops || !image->fops->cleanup)
+ return 0;
+
+ return image->fops->cleanup(image->image_loader_data);
}
int __weak arch_kimage_file_post_load_cleanup(struct kimage *image)
{
- return -EINVAL;
+ return kexec_image_post_load_cleanup_default(image);
}
#ifdef CONFIG_KEXEC_VERIFY_SIG
+static int kexec_image_verify_sig_default(struct kimage *image, void *buf,
+ unsigned long buf_len)
+{
+ if (!image->fops || !image->fops->verify_sig) {
+ pr_debug("kernel loader does not support signature verification.\n");
+ return -EKEYREJECTED;
+ }
+
+ return image->fops->verify_sig(buf, buf_len);
+}
+
int __weak arch_kexec_kernel_verify_sig(struct kimage *image, void *buf,
unsigned long buf_len)
{
- return -EKEYREJECTED;
+ return kexec_image_verify_sig_default(image, buf, buf_len);
}
#endif
-/* Apply relocations of type RELA */
+/*
+ * arch_kexec_apply_relocations_add - apply relocations of type RELA
+ * @pi: Purgatory to be relocated.
+ * @section: Section relocations applying to.
+ * @relsec: Section containing RELAs.
+ * @symtab: Corresponding symtab.
+ *
+ * Return: 0 on success, negative errno on error.
+ */
int __weak
-arch_kexec_apply_relocations_add(const Elf_Ehdr *ehdr, Elf_Shdr *sechdrs,
- unsigned int relsec)
+arch_kexec_apply_relocations_add(struct purgatory_info *pi, Elf_Shdr *section,
+ const Elf_Shdr *relsec, const Elf_Shdr *symtab)
{
pr_err("RELA relocation unsupported.\n");
return -ENOEXEC;
}
-/* Apply relocations of type REL */
+/*
+ * arch_kexec_apply_relocations - apply relocations of type REL
+ * @pi: Purgatory to be relocated.
+ * @section: Section relocations applying to.
+ * @relsec: Section containing RELs.
+ * @symtab: Corresponding symtab.
+ *
+ * Return: 0 on success, negative errno on error.
+ */
int __weak
-arch_kexec_apply_relocations(const Elf_Ehdr *ehdr, Elf_Shdr *sechdrs,
- unsigned int relsec)
+arch_kexec_apply_relocations(struct purgatory_info *pi, Elf_Shdr *section,
+ const Elf_Shdr *relsec, const Elf_Shdr *symtab)
{
pr_err("REL relocation unsupported.\n");
return -ENOEXEC;
@@ -532,6 +605,9 @@ static int kexec_calculate_store_digests(struct kimage *image)
struct kexec_sha_region *sha_regions;
struct purgatory_info *pi = &image->purgatory_info;
+ if (!IS_ENABLED(CONFIG_ARCH_HAS_KEXEC_PURGATORY))
+ return 0;
+
zero_buf = __va(page_to_pfn(ZERO_PAGE(0)) << PAGE_SHIFT);
zero_buf_sz = PAGE_SIZE;
@@ -633,87 +709,29 @@ out:
return ret;
}
-/* Actually load purgatory. Lot of code taken from kexec-tools */
-static int __kexec_load_purgatory(struct kimage *image, unsigned long min,
- unsigned long max, int top_down)
+#ifdef CONFIG_ARCH_HAS_KEXEC_PURGATORY
+/*
+ * kexec_purgatory_setup_kbuf - prepare buffer to load purgatory.
+ * @pi: Purgatory to be loaded.
+ * @kbuf: Buffer to setup.
+ *
+ * Allocates the memory needed for the buffer. Caller is responsible to free
+ * the memory after use.
+ *
+ * Return: 0 on success, negative errno on error.
+ */
+static int kexec_purgatory_setup_kbuf(struct purgatory_info *pi,
+ struct kexec_buf *kbuf)
{
- struct purgatory_info *pi = &image->purgatory_info;
- unsigned long align, bss_align, bss_sz, bss_pad;
- unsigned long entry, load_addr, curr_load_addr, bss_addr, offset;
- unsigned char *buf_addr, *src;
- int i, ret = 0, entry_sidx = -1;
- const Elf_Shdr *sechdrs_c;
- Elf_Shdr *sechdrs = NULL;
- struct kexec_buf kbuf = { .image = image, .bufsz = 0, .buf_align = 1,
- .buf_min = min, .buf_max = max,
- .top_down = top_down };
-
- /*
- * sechdrs_c points to section headers in purgatory and are read
- * only. No modifications allowed.
- */
- sechdrs_c = (void *)pi->ehdr + pi->ehdr->e_shoff;
-
- /*
- * We can not modify sechdrs_c[] and its fields. It is read only.
- * Copy it over to a local copy where one can store some temporary
- * data and free it at the end. We need to modify ->sh_addr and
- * ->sh_offset fields to keep track of permanent and temporary
- * locations of sections.
- */
- sechdrs = vzalloc(pi->ehdr->e_shnum * sizeof(Elf_Shdr));
- if (!sechdrs)
- return -ENOMEM;
-
- memcpy(sechdrs, sechdrs_c, pi->ehdr->e_shnum * sizeof(Elf_Shdr));
-
- /*
- * We seem to have multiple copies of sections. First copy is which
- * is embedded in kernel in read only section. Some of these sections
- * will be copied to a temporary buffer and relocated. And these
- * sections will finally be copied to their final destination at
- * segment load time.
- *
- * Use ->sh_offset to reflect section address in memory. It will
- * point to original read only copy if section is not allocatable.
- * Otherwise it will point to temporary copy which will be relocated.
- *
- * Use ->sh_addr to contain final address of the section where it
- * will go during execution time.
- */
- for (i = 0; i < pi->ehdr->e_shnum; i++) {
- if (sechdrs[i].sh_type == SHT_NOBITS)
- continue;
-
- sechdrs[i].sh_offset = (unsigned long)pi->ehdr +
- sechdrs[i].sh_offset;
- }
-
- /*
- * Identify entry point section and make entry relative to section
- * start.
- */
- entry = pi->ehdr->e_entry;
- for (i = 0; i < pi->ehdr->e_shnum; i++) {
- if (!(sechdrs[i].sh_flags & SHF_ALLOC))
- continue;
-
- if (!(sechdrs[i].sh_flags & SHF_EXECINSTR))
- continue;
-
- /* Make entry section relative */
- if (sechdrs[i].sh_addr <= pi->ehdr->e_entry &&
- ((sechdrs[i].sh_addr + sechdrs[i].sh_size) >
- pi->ehdr->e_entry)) {
- entry_sidx = i;
- entry -= sechdrs[i].sh_addr;
- break;
- }
- }
+ const Elf_Shdr *sechdrs;
+ unsigned long bss_align;
+ unsigned long bss_sz;
+ unsigned long align;
+ int i, ret;
- /* Determine how much memory is needed to load relocatable object. */
- bss_align = 1;
- bss_sz = 0;
+ sechdrs = (void *)pi->ehdr + pi->ehdr->e_shoff;
+ kbuf->buf_align = bss_align = 1;
+ kbuf->bufsz = bss_sz = 0;
for (i = 0; i < pi->ehdr->e_shnum; i++) {
if (!(sechdrs[i].sh_flags & SHF_ALLOC))
@@ -721,111 +739,124 @@ static int __kexec_load_purgatory(struct kimage *image, unsigned long min,
align = sechdrs[i].sh_addralign;
if (sechdrs[i].sh_type != SHT_NOBITS) {
- if (kbuf.buf_align < align)
- kbuf.buf_align = align;
- kbuf.bufsz = ALIGN(kbuf.bufsz, align);
- kbuf.bufsz += sechdrs[i].sh_size;
+ if (kbuf->buf_align < align)
+ kbuf->buf_align = align;
+ kbuf->bufsz = ALIGN(kbuf->bufsz, align);
+ kbuf->bufsz += sechdrs[i].sh_size;
} else {
- /* bss section */
if (bss_align < align)
bss_align = align;
bss_sz = ALIGN(bss_sz, align);
bss_sz += sechdrs[i].sh_size;
}
}
+ kbuf->bufsz = ALIGN(kbuf->bufsz, bss_align);
+ kbuf->memsz = kbuf->bufsz + bss_sz;
+ if (kbuf->buf_align < bss_align)
+ kbuf->buf_align = bss_align;
- /* Determine the bss padding required to align bss properly */
- bss_pad = 0;
- if (kbuf.bufsz & (bss_align - 1))
- bss_pad = bss_align - (kbuf.bufsz & (bss_align - 1));
-
- kbuf.memsz = kbuf.bufsz + bss_pad + bss_sz;
+ kbuf->buffer = vzalloc(kbuf->bufsz);
+ if (!kbuf->buffer)
+ return -ENOMEM;
+ pi->purgatory_buf = kbuf->buffer;
- /* Allocate buffer for purgatory */
- kbuf.buffer = vzalloc(kbuf.bufsz);
- if (!kbuf.buffer) {
- ret = -ENOMEM;
+ ret = kexec_add_buffer(kbuf);
+ if (ret)
goto out;
- }
- if (kbuf.buf_align < bss_align)
- kbuf.buf_align = bss_align;
+ return 0;
+out:
+ vfree(pi->purgatory_buf);
+ pi->purgatory_buf = NULL;
+ return ret;
+}
- /* Add buffer to segment list */
- ret = kexec_add_buffer(&kbuf);
- if (ret)
- goto out;
- pi->purgatory_load_addr = kbuf.mem;
+/*
+ * kexec_purgatory_setup_sechdrs - prepares the pi->sechdrs buffer.
+ * @pi: Purgatory to be loaded.
+ * @kbuf: Buffer prepared to store purgatory.
+ *
+ * Allocates the memory needed for the buffer. Caller is responsible to free
+ * the memory after use.
+ *
+ * Return: 0 on success, negative errno on error.
+ */
+static int kexec_purgatory_setup_sechdrs(struct purgatory_info *pi,
+ struct kexec_buf *kbuf)
+{
+ unsigned long bss_addr;
+ unsigned long offset;
+ Elf_Shdr *sechdrs;
+ int i;
+
+ /*
+ * The section headers in kexec_purgatory are read-only. In order to
+ * have them modifiable make a temporary copy.
+ */
+ sechdrs = vzalloc(pi->ehdr->e_shnum * sizeof(Elf_Shdr));
+ if (!sechdrs)
+ return -ENOMEM;
+ memcpy(sechdrs, (void *)pi->ehdr + pi->ehdr->e_shoff,
+ pi->ehdr->e_shnum * sizeof(Elf_Shdr));
+ pi->sechdrs = sechdrs;
- /* Load SHF_ALLOC sections */
- buf_addr = kbuf.buffer;
- load_addr = curr_load_addr = pi->purgatory_load_addr;
- bss_addr = load_addr + kbuf.bufsz + bss_pad;
+ offset = 0;
+ bss_addr = kbuf->mem + kbuf->bufsz;
+ kbuf->image->start = pi->ehdr->e_entry;
for (i = 0; i < pi->ehdr->e_shnum; i++) {
+ unsigned long align;
+ void *src, *dst;
+
if (!(sechdrs[i].sh_flags & SHF_ALLOC))
continue;
align = sechdrs[i].sh_addralign;
- if (sechdrs[i].sh_type != SHT_NOBITS) {
- curr_load_addr = ALIGN(curr_load_addr, align);
- offset = curr_load_addr - load_addr;
- /* We already modifed ->sh_offset to keep src addr */
- src = (char *) sechdrs[i].sh_offset;
- memcpy(buf_addr + offset, src, sechdrs[i].sh_size);
-
- /* Store load address and source address of section */
- sechdrs[i].sh_addr = curr_load_addr;
-
- /*
- * This section got copied to temporary buffer. Update
- * ->sh_offset accordingly.
- */
- sechdrs[i].sh_offset = (unsigned long)(buf_addr + offset);
-
- /* Advance to the next address */
- curr_load_addr += sechdrs[i].sh_size;
- } else {
+ if (sechdrs[i].sh_type == SHT_NOBITS) {
bss_addr = ALIGN(bss_addr, align);
sechdrs[i].sh_addr = bss_addr;
bss_addr += sechdrs[i].sh_size;
+ continue;
}
- }
- /* Update entry point based on load address of text section */
- if (entry_sidx >= 0)
- entry += sechdrs[entry_sidx].sh_addr;
+ offset = ALIGN(offset, align);
+ if (sechdrs[i].sh_flags & SHF_EXECINSTR &&
+ pi->ehdr->e_entry >= sechdrs[i].sh_addr &&
+ pi->ehdr->e_entry < (sechdrs[i].sh_addr
+ + sechdrs[i].sh_size)) {
+ kbuf->image->start -= sechdrs[i].sh_addr;
+ kbuf->image->start += kbuf->mem + offset;
+ }
- /* Make kernel jump to purgatory after shutdown */
- image->start = entry;
+ src = (void *)pi->ehdr + sechdrs[i].sh_offset;
+ dst = pi->purgatory_buf + offset;
+ memcpy(dst, src, sechdrs[i].sh_size);
- /* Used later to get/set symbol values */
- pi->sechdrs = sechdrs;
+ sechdrs[i].sh_addr = kbuf->mem + offset;
+ sechdrs[i].sh_offset = offset;
+ offset += sechdrs[i].sh_size;
+ }
- /*
- * Used later to identify which section is purgatory and skip it
- * from checksumming.
- */
- pi->purgatory_buf = kbuf.buffer;
- return ret;
-out:
- vfree(sechdrs);
- vfree(kbuf.buffer);
- return ret;
+ return 0;
}
static int kexec_apply_relocations(struct kimage *image)
{
int i, ret;
struct purgatory_info *pi = &image->purgatory_info;
- Elf_Shdr *sechdrs = pi->sechdrs;
+ const Elf_Shdr *sechdrs;
+
+ sechdrs = (void *)pi->ehdr + pi->ehdr->e_shoff;
- /* Apply relocations */
for (i = 0; i < pi->ehdr->e_shnum; i++) {
- Elf_Shdr *section, *symtab;
+ const Elf_Shdr *relsec;
+ const Elf_Shdr *symtab;
+ Elf_Shdr *section;
+
+ relsec = sechdrs + i;
- if (sechdrs[i].sh_type != SHT_RELA &&
- sechdrs[i].sh_type != SHT_REL)
+ if (relsec->sh_type != SHT_RELA &&
+ relsec->sh_type != SHT_REL)
continue;
/*
@@ -834,12 +865,12 @@ static int kexec_apply_relocations(struct kimage *image)
* symbol table. And ->sh_info contains section header
* index of section to which relocations apply.
*/
- if (sechdrs[i].sh_info >= pi->ehdr->e_shnum ||
- sechdrs[i].sh_link >= pi->ehdr->e_shnum)
+ if (relsec->sh_info >= pi->ehdr->e_shnum ||
+ relsec->sh_link >= pi->ehdr->e_shnum)
return -ENOEXEC;
- section = &sechdrs[sechdrs[i].sh_info];
- symtab = &sechdrs[sechdrs[i].sh_link];
+ section = pi->sechdrs + relsec->sh_info;
+ symtab = sechdrs + relsec->sh_link;
if (!(section->sh_flags & SHF_ALLOC))
continue;
@@ -856,12 +887,12 @@ static int kexec_apply_relocations(struct kimage *image)
* Respective architecture needs to provide support for applying
* relocations of type SHT_RELA/SHT_REL.
*/
- if (sechdrs[i].sh_type == SHT_RELA)
- ret = arch_kexec_apply_relocations_add(pi->ehdr,
- sechdrs, i);
- else if (sechdrs[i].sh_type == SHT_REL)
- ret = arch_kexec_apply_relocations(pi->ehdr,
- sechdrs, i);
+ if (relsec->sh_type == SHT_RELA)
+ ret = arch_kexec_apply_relocations_add(pi, section,
+ relsec, symtab);
+ else if (relsec->sh_type == SHT_REL)
+ ret = arch_kexec_apply_relocations(pi, section,
+ relsec, symtab);
if (ret)
return ret;
}
@@ -869,10 +900,18 @@ static int kexec_apply_relocations(struct kimage *image)
return 0;
}
-/* Load relocatable purgatory object and relocate it appropriately */
-int kexec_load_purgatory(struct kimage *image, unsigned long min,
- unsigned long max, int top_down,
- unsigned long *load_addr)
+/*
+ * kexec_load_purgatory - Load and relocate the purgatory object.
+ * @image: Image to add the purgatory to.
+ * @kbuf: Memory parameters to use.
+ *
+ * Allocates the memory needed for image->purgatory_info.sechdrs and
+ * image->purgatory_info.purgatory_buf/kbuf->buffer. Caller is responsible
+ * to free the memory after use.
+ *
+ * Return: 0 on success, negative errno on error.
+ */
+int kexec_load_purgatory(struct kimage *image, struct kexec_buf *kbuf)
{
struct purgatory_info *pi = &image->purgatory_info;
int ret;
@@ -880,55 +919,51 @@ int kexec_load_purgatory(struct kimage *image, unsigned long min,
if (kexec_purgatory_size <= 0)
return -EINVAL;
- if (kexec_purgatory_size < sizeof(Elf_Ehdr))
- return -ENOEXEC;
-
- pi->ehdr = (Elf_Ehdr *)kexec_purgatory;
-
- if (memcmp(pi->ehdr->e_ident, ELFMAG, SELFMAG) != 0
- || pi->ehdr->e_type != ET_REL
- || !elf_check_arch(pi->ehdr)
- || pi->ehdr->e_shentsize != sizeof(Elf_Shdr))
- return -ENOEXEC;
-
- if (pi->ehdr->e_shoff >= kexec_purgatory_size
- || (pi->ehdr->e_shnum * sizeof(Elf_Shdr) >
- kexec_purgatory_size - pi->ehdr->e_shoff))
- return -ENOEXEC;
+ pi->ehdr = (const Elf_Ehdr *)kexec_purgatory;
- ret = __kexec_load_purgatory(image, min, max, top_down);
+ ret = kexec_purgatory_setup_kbuf(pi, kbuf);
if (ret)
return ret;
+ ret = kexec_purgatory_setup_sechdrs(pi, kbuf);
+ if (ret)
+ goto out_free_kbuf;
+
ret = kexec_apply_relocations(image);
if (ret)
goto out;
- *load_addr = pi->purgatory_load_addr;
return 0;
out:
vfree(pi->sechdrs);
pi->sechdrs = NULL;
-
+out_free_kbuf:
vfree(pi->purgatory_buf);
pi->purgatory_buf = NULL;
return ret;
}
-static Elf_Sym *kexec_purgatory_find_symbol(struct purgatory_info *pi,
- const char *name)
+/*
+ * kexec_purgatory_find_symbol - find a symbol in the purgatory
+ * @pi: Purgatory to search in.
+ * @name: Name of the symbol.
+ *
+ * Return: pointer to symbol in read-only symtab on success, NULL on error.
+ */
+static const Elf_Sym *kexec_purgatory_find_symbol(struct purgatory_info *pi,
+ const char *name)
{
- Elf_Sym *syms;
- Elf_Shdr *sechdrs;
- Elf_Ehdr *ehdr;
- int i, k;
+ const Elf_Shdr *sechdrs;
+ const Elf_Ehdr *ehdr;
+ const Elf_Sym *syms;
const char *strtab;
+ int i, k;
- if (!pi->sechdrs || !pi->ehdr)
+ if (!pi->ehdr)
return NULL;
- sechdrs = pi->sechdrs;
ehdr = pi->ehdr;
+ sechdrs = (void *)ehdr + ehdr->e_shoff;
for (i = 0; i < ehdr->e_shnum; i++) {
if (sechdrs[i].sh_type != SHT_SYMTAB)
@@ -937,8 +972,8 @@ static Elf_Sym *kexec_purgatory_find_symbol(struct purgatory_info *pi,
if (sechdrs[i].sh_link >= ehdr->e_shnum)
/* Invalid strtab section number */
continue;
- strtab = (char *)sechdrs[sechdrs[i].sh_link].sh_offset;
- syms = (Elf_Sym *)sechdrs[i].sh_offset;
+ strtab = (void *)ehdr + sechdrs[sechdrs[i].sh_link].sh_offset;
+ syms = (void *)ehdr + sechdrs[i].sh_offset;
/* Go through symbols for a match */
for (k = 0; k < sechdrs[i].sh_size/sizeof(Elf_Sym); k++) {
@@ -966,7 +1001,7 @@ static Elf_Sym *kexec_purgatory_find_symbol(struct purgatory_info *pi,
void *kexec_purgatory_get_symbol_addr(struct kimage *image, const char *name)
{
struct purgatory_info *pi = &image->purgatory_info;
- Elf_Sym *sym;
+ const Elf_Sym *sym;
Elf_Shdr *sechdr;
sym = kexec_purgatory_find_symbol(pi, name);
@@ -989,9 +1024,9 @@ void *kexec_purgatory_get_symbol_addr(struct kimage *image, const char *name)
int kexec_purgatory_get_set_symbol(struct kimage *image, const char *name,
void *buf, unsigned int size, bool get_value)
{
- Elf_Sym *sym;
- Elf_Shdr *sechdrs;
struct purgatory_info *pi = &image->purgatory_info;
+ const Elf_Sym *sym;
+ Elf_Shdr *sec;
char *sym_buf;
sym = kexec_purgatory_find_symbol(pi, name);
@@ -1004,16 +1039,15 @@ int kexec_purgatory_get_set_symbol(struct kimage *image, const char *name,
return -EINVAL;
}
- sechdrs = pi->sechdrs;
+ sec = pi->sechdrs + sym->st_shndx;
- if (sechdrs[sym->st_shndx].sh_type == SHT_NOBITS) {
+ if (sec->sh_type == SHT_NOBITS) {
pr_err("symbol %s is in a bss section. Cannot %s\n", name,
get_value ? "get" : "set");
return -EINVAL;
}
- sym_buf = (unsigned char *)sechdrs[sym->st_shndx].sh_offset +
- sym->st_value;
+ sym_buf = (char *)pi->purgatory_buf + sec->sh_offset + sym->st_value;
if (get_value)
memcpy((void *)buf, sym_buf, size);
@@ -1022,3 +1056,174 @@ int kexec_purgatory_get_set_symbol(struct kimage *image, const char *name,
return 0;
}
+#endif /* CONFIG_ARCH_HAS_KEXEC_PURGATORY */
+
+int crash_exclude_mem_range(struct crash_mem *mem,
+ unsigned long long mstart, unsigned long long mend)
+{
+ int i, j;
+ unsigned long long start, end;
+ struct crash_mem_range temp_range = {0, 0};
+
+ for (i = 0; i < mem->nr_ranges; i++) {
+ start = mem->ranges[i].start;
+ end = mem->ranges[i].end;
+
+ if (mstart > end || mend < start)
+ continue;
+
+ /* Truncate any area outside of range */
+ if (mstart < start)
+ mstart = start;
+ if (mend > end)
+ mend = end;
+
+ /* Found completely overlapping range */
+ if (mstart == start && mend == end) {
+ mem->ranges[i].start = 0;
+ mem->ranges[i].end = 0;
+ if (i < mem->nr_ranges - 1) {
+ /* Shift rest of the ranges to left */
+ for (j = i; j < mem->nr_ranges - 1; j++) {
+ mem->ranges[j].start =
+ mem->ranges[j+1].start;
+ mem->ranges[j].end =
+ mem->ranges[j+1].end;
+ }
+ }
+ mem->nr_ranges--;
+ return 0;
+ }
+
+ if (mstart > start && mend < end) {
+ /* Split original range */
+ mem->ranges[i].end = mstart - 1;
+ temp_range.start = mend + 1;
+ temp_range.end = end;
+ } else if (mstart != start)
+ mem->ranges[i].end = mstart - 1;
+ else
+ mem->ranges[i].start = mend + 1;
+ break;
+ }
+
+ /* If a split happened, add the split to array */
+ if (!temp_range.end)
+ return 0;
+
+ /* Split happened */
+ if (i == mem->max_nr_ranges - 1)
+ return -ENOMEM;
+
+ /* Location where new range should go */
+ j = i + 1;
+ if (j < mem->nr_ranges) {
+ /* Move over all ranges one slot towards the end */
+ for (i = mem->nr_ranges - 1; i >= j; i--)
+ mem->ranges[i + 1] = mem->ranges[i];
+ }
+
+ mem->ranges[j].start = temp_range.start;
+ mem->ranges[j].end = temp_range.end;
+ mem->nr_ranges++;
+ return 0;
+}
+
+int crash_prepare_elf64_headers(struct crash_mem *mem, int kernel_map,
+ void **addr, unsigned long *sz)
+{
+ Elf64_Ehdr *ehdr;
+ Elf64_Phdr *phdr;
+ unsigned long nr_cpus = num_possible_cpus(), nr_phdr, elf_sz;
+ unsigned char *buf;
+ unsigned int cpu, i;
+ unsigned long long notes_addr;
+ unsigned long mstart, mend;
+
+ /* extra phdr for vmcoreinfo elf note */
+ nr_phdr = nr_cpus + 1;
+ nr_phdr += mem->nr_ranges;
+
+ /*
+ * kexec-tools creates an extra PT_LOAD phdr for kernel text mapping
+ * area (for example, ffffffff80000000 - ffffffffa0000000 on x86_64).
+ * I think this is required by tools like gdb. So same physical
+ * memory will be mapped in two elf headers. One will contain kernel
+ * text virtual addresses and other will have __va(physical) addresses.
+ */
+
+ nr_phdr++;
+ elf_sz = sizeof(Elf64_Ehdr) + nr_phdr * sizeof(Elf64_Phdr);
+ elf_sz = ALIGN(elf_sz, ELF_CORE_HEADER_ALIGN);
+
+ buf = vzalloc(elf_sz);
+ if (!buf)
+ return -ENOMEM;
+
+ ehdr = (Elf64_Ehdr *)buf;
+ phdr = (Elf64_Phdr *)(ehdr + 1);
+ memcpy(ehdr->e_ident, ELFMAG, SELFMAG);
+ ehdr->e_ident[EI_CLASS] = ELFCLASS64;
+ ehdr->e_ident[EI_DATA] = ELFDATA2LSB;
+ ehdr->e_ident[EI_VERSION] = EV_CURRENT;
+ ehdr->e_ident[EI_OSABI] = ELF_OSABI;
+ memset(ehdr->e_ident + EI_PAD, 0, EI_NIDENT - EI_PAD);
+ ehdr->e_type = ET_CORE;
+ ehdr->e_machine = ELF_ARCH;
+ ehdr->e_version = EV_CURRENT;
+ ehdr->e_phoff = sizeof(Elf64_Ehdr);
+ ehdr->e_ehsize = sizeof(Elf64_Ehdr);
+ ehdr->e_phentsize = sizeof(Elf64_Phdr);
+
+ /* Prepare one phdr of type PT_NOTE for each present cpu */
+ for_each_present_cpu(cpu) {
+ phdr->p_type = PT_NOTE;
+ notes_addr = per_cpu_ptr_to_phys(per_cpu_ptr(crash_notes, cpu));
+ phdr->p_offset = phdr->p_paddr = notes_addr;
+ phdr->p_filesz = phdr->p_memsz = sizeof(note_buf_t);
+ (ehdr->e_phnum)++;
+ phdr++;
+ }
+
+ /* Prepare one PT_NOTE header for vmcoreinfo */
+ phdr->p_type = PT_NOTE;
+ phdr->p_offset = phdr->p_paddr = paddr_vmcoreinfo_note();
+ phdr->p_filesz = phdr->p_memsz = VMCOREINFO_NOTE_SIZE;
+ (ehdr->e_phnum)++;
+ phdr++;
+
+ /* Prepare PT_LOAD type program header for kernel text region */
+ if (kernel_map) {
+ phdr->p_type = PT_LOAD;
+ phdr->p_flags = PF_R|PF_W|PF_X;
+ phdr->p_vaddr = (Elf64_Addr)_text;
+ phdr->p_filesz = phdr->p_memsz = _end - _text;
+ phdr->p_offset = phdr->p_paddr = __pa_symbol(_text);
+ ehdr->e_phnum++;
+ phdr++;
+ }
+
+ /* Go through all the ranges in mem->ranges[] and prepare phdr */
+ for (i = 0; i < mem->nr_ranges; i++) {
+ mstart = mem->ranges[i].start;
+ mend = mem->ranges[i].end;
+
+ phdr->p_type = PT_LOAD;
+ phdr->p_flags = PF_R|PF_W|PF_X;
+ phdr->p_offset = mstart;
+
+ phdr->p_paddr = mstart;
+ phdr->p_vaddr = (unsigned long long) __va(mstart);
+ phdr->p_filesz = phdr->p_memsz = mend - mstart + 1;
+ phdr->p_align = 0;
+ ehdr->e_phnum++;
+ phdr++;
+ pr_debug("Crash PT_LOAD elf header. phdr=%p vaddr=0x%llx, paddr=0x%llx, sz=0x%llx e_phnum=%d p_offset=0x%llx\n",
+ phdr, phdr->p_vaddr, phdr->p_paddr, phdr->p_filesz,
+ ehdr->e_phnum, phdr->p_offset);
+ }
+
+ *addr = buf;
+ *sz = elf_sz;
+ return 0;
+}
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index 102160ff5c66..ea619021d901 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -2428,7 +2428,7 @@ static int kprobe_blacklist_seq_show(struct seq_file *m, void *v)
struct kprobe_blacklist_entry *ent =
list_entry(v, struct kprobe_blacklist_entry, list);
- seq_printf(m, "0x%p-0x%p\t%ps\n", (void *)ent->start_addr,
+ seq_printf(m, "0x%px-0x%px\t%ps\n", (void *)ent->start_addr,
(void *)ent->end_addr, (void *)ent->start_addr);
return 0;
}
diff --git a/kernel/kthread.c b/kernel/kthread.c
index cd50e99202b0..481951bf091d 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -55,7 +55,6 @@ enum KTHREAD_BITS {
KTHREAD_IS_PER_CPU = 0,
KTHREAD_SHOULD_STOP,
KTHREAD_SHOULD_PARK,
- KTHREAD_IS_PARKED,
};
static inline void set_kthread_struct(void *kthread)
@@ -177,14 +176,12 @@ void *kthread_probe_data(struct task_struct *task)
static void __kthread_parkme(struct kthread *self)
{
- __set_current_state(TASK_PARKED);
- while (test_bit(KTHREAD_SHOULD_PARK, &self->flags)) {
- if (!test_and_set_bit(KTHREAD_IS_PARKED, &self->flags))
- complete(&self->parked);
+ for (;;) {
+ set_current_state(TASK_PARKED);
+ if (!test_bit(KTHREAD_SHOULD_PARK, &self->flags))
+ break;
schedule();
- __set_current_state(TASK_PARKED);
}
- clear_bit(KTHREAD_IS_PARKED, &self->flags);
__set_current_state(TASK_RUNNING);
}
@@ -194,6 +191,11 @@ void kthread_parkme(void)
}
EXPORT_SYMBOL_GPL(kthread_parkme);
+void kthread_park_complete(struct task_struct *k)
+{
+ complete_all(&to_kthread(k)->parked);
+}
+
static int kthread(void *_create)
{
/* Copy data: it's on kthread's stack */
@@ -450,22 +452,16 @@ void kthread_unpark(struct task_struct *k)
{
struct kthread *kthread = to_kthread(k);
- clear_bit(KTHREAD_SHOULD_PARK, &kthread->flags);
/*
- * We clear the IS_PARKED bit here as we don't wait
- * until the task has left the park code. So if we'd
- * park before that happens we'd see the IS_PARKED bit
- * which might be about to be cleared.
+ * Newly created kthread was parked when the CPU was offline.
+ * The binding was lost and we need to set it again.
*/
- if (test_and_clear_bit(KTHREAD_IS_PARKED, &kthread->flags)) {
- /*
- * Newly created kthread was parked when the CPU was offline.
- * The binding was lost and we need to set it again.
- */
- if (test_bit(KTHREAD_IS_PER_CPU, &kthread->flags))
- __kthread_bind(k, kthread->cpu, TASK_PARKED);
- wake_up_state(k, TASK_PARKED);
- }
+ if (test_bit(KTHREAD_IS_PER_CPU, &kthread->flags))
+ __kthread_bind(k, kthread->cpu, TASK_PARKED);
+
+ reinit_completion(&kthread->parked);
+ clear_bit(KTHREAD_SHOULD_PARK, &kthread->flags);
+ wake_up_state(k, TASK_PARKED);
}
EXPORT_SYMBOL_GPL(kthread_unpark);
@@ -488,12 +484,10 @@ int kthread_park(struct task_struct *k)
if (WARN_ON(k->flags & PF_EXITING))
return -ENOSYS;
- if (!test_bit(KTHREAD_IS_PARKED, &kthread->flags)) {
- set_bit(KTHREAD_SHOULD_PARK, &kthread->flags);
- if (k != current) {
- wake_up_process(k);
- wait_for_completion(&kthread->parked);
- }
+ set_bit(KTHREAD_SHOULD_PARK, &kthread->flags);
+ if (k != current) {
+ wake_up_process(k);
+ wait_for_completion(&kthread->parked);
}
return 0;
diff --git a/kernel/livepatch/shadow.c b/kernel/livepatch/shadow.c
index fdac27588d60..83958c814439 100644
--- a/kernel/livepatch/shadow.c
+++ b/kernel/livepatch/shadow.c
@@ -113,8 +113,10 @@ void *klp_shadow_get(void *obj, unsigned long id)
}
EXPORT_SYMBOL_GPL(klp_shadow_get);
-static void *__klp_shadow_get_or_alloc(void *obj, unsigned long id, void *data,
- size_t size, gfp_t gfp_flags, bool warn_on_exist)
+static void *__klp_shadow_get_or_alloc(void *obj, unsigned long id,
+ size_t size, gfp_t gfp_flags,
+ klp_shadow_ctor_t ctor, void *ctor_data,
+ bool warn_on_exist)
{
struct klp_shadow *new_shadow;
void *shadow_data;
@@ -125,18 +127,15 @@ static void *__klp_shadow_get_or_alloc(void *obj, unsigned long id, void *data,
if (shadow_data)
goto exists;
- /* Allocate a new shadow variable for use inside the lock below */
+ /*
+ * Allocate a new shadow variable. Fill it with zeroes by default.
+ * More complex setting can be done by @ctor function. But it is
+ * called only when the buffer is really used (under klp_shadow_lock).
+ */
new_shadow = kzalloc(size + sizeof(*new_shadow), gfp_flags);
if (!new_shadow)
return NULL;
- new_shadow->obj = obj;
- new_shadow->id = id;
-
- /* Initialize the shadow variable if data provided */
- if (data)
- memcpy(new_shadow->data, data, size);
-
/* Look for <obj, id> again under the lock */
spin_lock_irqsave(&klp_shadow_lock, flags);
shadow_data = klp_shadow_get(obj, id);
@@ -150,6 +149,22 @@ static void *__klp_shadow_get_or_alloc(void *obj, unsigned long id, void *data,
goto exists;
}
+ new_shadow->obj = obj;
+ new_shadow->id = id;
+
+ if (ctor) {
+ int err;
+
+ err = ctor(obj, new_shadow->data, ctor_data);
+ if (err) {
+ spin_unlock_irqrestore(&klp_shadow_lock, flags);
+ kfree(new_shadow);
+ pr_err("Failed to construct shadow variable <%p, %lx> (%d)\n",
+ obj, id, err);
+ return NULL;
+ }
+ }
+
/* No <obj, id> found, so attach the newly allocated one */
hash_add_rcu(klp_shadow_hash, &new_shadow->node,
(unsigned long)new_shadow->obj);
@@ -170,26 +185,32 @@ exists:
* klp_shadow_alloc() - allocate and add a new shadow variable
* @obj: pointer to parent object
* @id: data identifier
- * @data: pointer to data to attach to parent
* @size: size of attached data
* @gfp_flags: GFP mask for allocation
+ * @ctor: custom constructor to initialize the shadow data (optional)
+ * @ctor_data: pointer to any data needed by @ctor (optional)
+ *
+ * Allocates @size bytes for new shadow variable data using @gfp_flags.
+ * The data are zeroed by default. They are further initialized by @ctor
+ * function if it is not NULL. The new shadow variable is then added
+ * to the global hashtable.
*
- * Allocates @size bytes for new shadow variable data using @gfp_flags
- * and copies @size bytes from @data into the new shadow variable's own
- * data space. If @data is NULL, @size bytes are still allocated, but
- * no copy is performed. The new shadow variable is then added to the
- * global hashtable.
+ * If an existing <obj, id> shadow variable can be found, this routine will
+ * issue a WARN, exit early and return NULL.
*
- * If an existing <obj, id> shadow variable can be found, this routine
- * will issue a WARN, exit early and return NULL.
+ * This function guarantees that the constructor function is called only when
+ * the variable did not exist before. The cost is that @ctor is called
+ * in atomic context under a spin lock.
*
* Return: the shadow variable data element, NULL on duplicate or
* failure.
*/
-void *klp_shadow_alloc(void *obj, unsigned long id, void *data,
- size_t size, gfp_t gfp_flags)
+void *klp_shadow_alloc(void *obj, unsigned long id,
+ size_t size, gfp_t gfp_flags,
+ klp_shadow_ctor_t ctor, void *ctor_data)
{
- return __klp_shadow_get_or_alloc(obj, id, data, size, gfp_flags, true);
+ return __klp_shadow_get_or_alloc(obj, id, size, gfp_flags,
+ ctor, ctor_data, true);
}
EXPORT_SYMBOL_GPL(klp_shadow_alloc);
@@ -197,37 +218,51 @@ EXPORT_SYMBOL_GPL(klp_shadow_alloc);
* klp_shadow_get_or_alloc() - get existing or allocate a new shadow variable
* @obj: pointer to parent object
* @id: data identifier
- * @data: pointer to data to attach to parent
* @size: size of attached data
* @gfp_flags: GFP mask for allocation
+ * @ctor: custom constructor to initialize the shadow data (optional)
+ * @ctor_data: pointer to any data needed by @ctor (optional)
*
* Returns a pointer to existing shadow data if an <obj, id> shadow
* variable is already present. Otherwise, it creates a new shadow
* variable like klp_shadow_alloc().
*
- * This function guarantees that only one shadow variable exists with
- * the given @id for the given @obj. It also guarantees that the shadow
- * variable will be initialized by the given @data only when it did not
- * exist before.
+ * This function guarantees that only one shadow variable exists with the given
+ * @id for the given @obj. It also guarantees that the constructor function
+ * will be called only when the variable did not exist before. The cost is
+ * that @ctor is called in atomic context under a spin lock.
*
* Return: the shadow variable data element, NULL on failure.
*/
-void *klp_shadow_get_or_alloc(void *obj, unsigned long id, void *data,
- size_t size, gfp_t gfp_flags)
+void *klp_shadow_get_or_alloc(void *obj, unsigned long id,
+ size_t size, gfp_t gfp_flags,
+ klp_shadow_ctor_t ctor, void *ctor_data)
{
- return __klp_shadow_get_or_alloc(obj, id, data, size, gfp_flags, false);
+ return __klp_shadow_get_or_alloc(obj, id, size, gfp_flags,
+ ctor, ctor_data, false);
}
EXPORT_SYMBOL_GPL(klp_shadow_get_or_alloc);
+static void klp_shadow_free_struct(struct klp_shadow *shadow,
+ klp_shadow_dtor_t dtor)
+{
+ hash_del_rcu(&shadow->node);
+ if (dtor)
+ dtor(shadow->obj, shadow->data);
+ kfree_rcu(shadow, rcu_head);
+}
+
/**
* klp_shadow_free() - detach and free a <obj, id> shadow variable
* @obj: pointer to parent object
* @id: data identifier
+ * @dtor: custom callback that can be used to unregister the variable
+ * and/or free data that the shadow variable points to (optional)
*
* This function releases the memory for this <obj, id> shadow variable
* instance, callers should stop referencing it accordingly.
*/
-void klp_shadow_free(void *obj, unsigned long id)
+void klp_shadow_free(void *obj, unsigned long id, klp_shadow_dtor_t dtor)
{
struct klp_shadow *shadow;
unsigned long flags;
@@ -239,8 +274,7 @@ void klp_shadow_free(void *obj, unsigned long id)
(unsigned long)obj) {
if (klp_shadow_match(shadow, obj, id)) {
- hash_del_rcu(&shadow->node);
- kfree_rcu(shadow, rcu_head);
+ klp_shadow_free_struct(shadow, dtor);
break;
}
}
@@ -252,11 +286,13 @@ EXPORT_SYMBOL_GPL(klp_shadow_free);
/**
* klp_shadow_free_all() - detach and free all <*, id> shadow variables
* @id: data identifier
+ * @dtor: custom callback that can be used to unregister the variable
+ * and/or free data that the shadow variable points to (optional)
*
* This function releases the memory for all <*, id> shadow variable
* instances, callers should stop referencing them accordingly.
*/
-void klp_shadow_free_all(unsigned long id)
+void klp_shadow_free_all(unsigned long id, klp_shadow_dtor_t dtor)
{
struct klp_shadow *shadow;
unsigned long flags;
@@ -266,10 +302,8 @@ void klp_shadow_free_all(unsigned long id)
/* Delete all <*, id> from hash */
hash_for_each(klp_shadow_hash, i, shadow, node) {
- if (klp_shadow_match(shadow, shadow->obj, id)) {
- hash_del_rcu(&shadow->node);
- kfree_rcu(shadow, rcu_head);
- }
+ if (klp_shadow_match(shadow, shadow->obj, id))
+ klp_shadow_free_struct(shadow, dtor);
}
spin_unlock_irqrestore(&klp_shadow_lock, flags);
diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c
index 89b5f83f1969..edcac5de7ebc 100644
--- a/kernel/locking/lockdep.c
+++ b/kernel/locking/lockdep.c
@@ -556,25 +556,29 @@ static void print_lock(struct held_lock *hlock)
return;
}
+ printk(KERN_CONT "%p", hlock->instance);
print_lock_name(lock_classes + class_idx - 1);
- printk(KERN_CONT ", at: [<%p>] %pS\n",
- (void *)hlock->acquire_ip, (void *)hlock->acquire_ip);
+ printk(KERN_CONT ", at: %pS\n", (void *)hlock->acquire_ip);
}
-static void lockdep_print_held_locks(struct task_struct *curr)
+static void lockdep_print_held_locks(struct task_struct *p)
{
- int i, depth = curr->lockdep_depth;
+ int i, depth = READ_ONCE(p->lockdep_depth);
- if (!depth) {
- printk("no locks held by %s/%d.\n", curr->comm, task_pid_nr(curr));
+ if (!depth)
+ printk("no locks held by %s/%d.\n", p->comm, task_pid_nr(p));
+ else
+ printk("%d lock%s held by %s/%d:\n", depth,
+ depth > 1 ? "s" : "", p->comm, task_pid_nr(p));
+ /*
+ * It's not reliable to print a task's held locks if it's not sleeping
+ * and it's not the current task.
+ */
+ if (p->state == TASK_RUNNING && p != current)
return;
- }
- printk("%d lock%s held by %s/%d:\n",
- depth, depth > 1 ? "s" : "", curr->comm, task_pid_nr(curr));
-
for (i = 0; i < depth; i++) {
printk(" #%d: ", i);
- print_lock(curr->held_locks + i);
+ print_lock(p->held_locks + i);
}
}
@@ -808,7 +812,7 @@ register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force)
if (verbose(class)) {
graph_unlock();
- printk("\nnew class %p: %s", class->key, class->name);
+ printk("\nnew class %px: %s", class->key, class->name);
if (class->name_version > 1)
printk(KERN_CONT "#%d", class->name_version);
printk(KERN_CONT "\n");
@@ -1407,7 +1411,7 @@ static void print_lock_class_header(struct lock_class *class, int depth)
}
printk("%*s }\n", depth, "");
- printk("%*s ... key at: [<%p>] %pS\n",
+ printk("%*s ... key at: [<%px>] %pS\n",
depth, "", class->key, class->key);
}
@@ -2340,7 +2344,7 @@ cache_hit:
if (very_verbose(class)) {
printk("\nhash chain already cached, key: "
- "%016Lx tail class: [%p] %s\n",
+ "%016Lx tail class: [%px] %s\n",
(unsigned long long)chain_key,
class->key, class->name);
}
@@ -2349,7 +2353,7 @@ cache_hit:
}
if (very_verbose(class)) {
- printk("\nnew hash chain, key: %016Lx tail class: [%p] %s\n",
+ printk("\nnew hash chain, key: %016Lx tail class: [%px] %s\n",
(unsigned long long)chain_key, class->key, class->name);
}
@@ -2676,16 +2680,16 @@ check_usage_backwards(struct task_struct *curr, struct held_lock *this,
void print_irqtrace_events(struct task_struct *curr)
{
printk("irq event stamp: %u\n", curr->irq_events);
- printk("hardirqs last enabled at (%u): [<%p>] %pS\n",
+ printk("hardirqs last enabled at (%u): [<%px>] %pS\n",
curr->hardirq_enable_event, (void *)curr->hardirq_enable_ip,
(void *)curr->hardirq_enable_ip);
- printk("hardirqs last disabled at (%u): [<%p>] %pS\n",
+ printk("hardirqs last disabled at (%u): [<%px>] %pS\n",
curr->hardirq_disable_event, (void *)curr->hardirq_disable_ip,
(void *)curr->hardirq_disable_ip);
- printk("softirqs last enabled at (%u): [<%p>] %pS\n",
+ printk("softirqs last enabled at (%u): [<%px>] %pS\n",
curr->softirq_enable_event, (void *)curr->softirq_enable_ip,
(void *)curr->softirq_enable_ip);
- printk("softirqs last disabled at (%u): [<%p>] %pS\n",
+ printk("softirqs last disabled at (%u): [<%px>] %pS\n",
curr->softirq_disable_event, (void *)curr->softirq_disable_ip,
(void *)curr->softirq_disable_ip);
}
@@ -3207,7 +3211,7 @@ static void __lockdep_init_map(struct lockdep_map *lock, const char *name,
* Sanity check, the lock-class key must be persistent:
*/
if (!static_obj(key)) {
- printk("BUG: key %p not in .data!\n", key);
+ printk("BUG: key %px not in .data!\n", key);
/*
* What it says above ^^^^^, I suggest you read it.
*/
@@ -3322,7 +3326,7 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
}
atomic_inc((atomic_t *)&class->ops);
if (very_verbose(class)) {
- printk("\nacquire class [%p] %s", class->key, class->name);
+ printk("\nacquire class [%px] %s", class->key, class->name);
if (class->name_version > 1)
printk(KERN_CONT "#%d", class->name_version);
printk(KERN_CONT "\n");
@@ -4376,7 +4380,7 @@ print_freed_lock_bug(struct task_struct *curr, const void *mem_from,
pr_warn("WARNING: held lock freed!\n");
print_kernel_ident();
pr_warn("-------------------------\n");
- pr_warn("%s/%d is freeing memory %p-%p, with a lock still held there!\n",
+ pr_warn("%s/%d is freeing memory %px-%px, with a lock still held there!\n",
curr->comm, task_pid_nr(curr), mem_from, mem_to-1);
print_lock(hlock);
lockdep_print_held_locks(curr);
@@ -4451,8 +4455,6 @@ EXPORT_SYMBOL_GPL(debug_check_no_locks_held);
void debug_show_all_locks(void)
{
struct task_struct *g, *p;
- int count = 10;
- int unlock = 1;
if (unlikely(!debug_locks)) {
pr_warn("INFO: lockdep is turned off.\n");
@@ -4460,50 +4462,18 @@ void debug_show_all_locks(void)
}
pr_warn("\nShowing all locks held in the system:\n");
- /*
- * Here we try to get the tasklist_lock as hard as possible,
- * if not successful after 2 seconds we ignore it (but keep
- * trying). This is to enable a debug printout even if a
- * tasklist_lock-holding task deadlocks or crashes.
- */
-retry:
- if (!read_trylock(&tasklist_lock)) {
- if (count == 10)
- pr_warn("hm, tasklist_lock locked, retrying... ");
- if (count) {
- count--;
- pr_cont(" #%d", 10-count);
- mdelay(200);
- goto retry;
- }
- pr_cont(" ignoring it.\n");
- unlock = 0;
- } else {
- if (count != 10)
- pr_cont(" locked it.\n");
- }
-
- do_each_thread(g, p) {
- /*
- * It's not reliable to print a task's held locks
- * if it's not sleeping (or if it's not the current
- * task):
- */
- if (p->state == TASK_RUNNING && p != current)
+ rcu_read_lock();
+ for_each_process_thread(g, p) {
+ if (!p->lockdep_depth)
continue;
- if (p->lockdep_depth)
- lockdep_print_held_locks(p);
- if (!unlock)
- if (read_trylock(&tasklist_lock))
- unlock = 1;
+ lockdep_print_held_locks(p);
touch_nmi_watchdog();
- } while_each_thread(g, p);
+ touch_all_softlockup_watchdogs();
+ }
+ rcu_read_unlock();
pr_warn("\n");
pr_warn("=============================================\n\n");
-
- if (unlock)
- read_unlock(&tasklist_lock);
}
EXPORT_SYMBOL_GPL(debug_show_all_locks);
#endif
diff --git a/kernel/locking/lockdep_proc.c b/kernel/locking/lockdep_proc.c
index ad69bbc9bd28..3dd980dfba2d 100644
--- a/kernel/locking/lockdep_proc.c
+++ b/kernel/locking/lockdep_proc.c
@@ -101,18 +101,6 @@ static const struct seq_operations lockdep_ops = {
.show = l_show,
};
-static int lockdep_open(struct inode *inode, struct file *file)
-{
- return seq_open(file, &lockdep_ops);
-}
-
-static const struct file_operations proc_lockdep_operations = {
- .open = lockdep_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = seq_release,
-};
-
#ifdef CONFIG_PROVE_LOCKING
static void *lc_start(struct seq_file *m, loff_t *pos)
{
@@ -170,18 +158,6 @@ static const struct seq_operations lockdep_chains_ops = {
.stop = lc_stop,
.show = lc_show,
};
-
-static int lockdep_chains_open(struct inode *inode, struct file *file)
-{
- return seq_open(file, &lockdep_chains_ops);
-}
-
-static const struct file_operations proc_lockdep_chains_operations = {
- .open = lockdep_chains_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = seq_release,
-};
#endif /* CONFIG_PROVE_LOCKING */
static void lockdep_stats_debug_show(struct seq_file *m)
@@ -355,18 +331,6 @@ static int lockdep_stats_show(struct seq_file *m, void *v)
return 0;
}
-static int lockdep_stats_open(struct inode *inode, struct file *file)
-{
- return single_open(file, lockdep_stats_show, NULL);
-}
-
-static const struct file_operations proc_lockdep_stats_operations = {
- .open = lockdep_stats_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = single_release,
-};
-
#ifdef CONFIG_LOCK_STAT
struct lock_stat_data {
@@ -682,14 +646,11 @@ static const struct file_operations proc_lock_stat_operations = {
static int __init lockdep_proc_init(void)
{
- proc_create("lockdep", S_IRUSR, NULL, &proc_lockdep_operations);
+ proc_create_seq("lockdep", S_IRUSR, NULL, &lockdep_ops);
#ifdef CONFIG_PROVE_LOCKING
- proc_create("lockdep_chains", S_IRUSR, NULL,
- &proc_lockdep_chains_operations);
+ proc_create_seq("lockdep_chains", S_IRUSR, NULL, &lockdep_chains_ops);
#endif
- proc_create("lockdep_stats", S_IRUSR, NULL,
- &proc_lockdep_stats_operations);
-
+ proc_create_single("lockdep_stats", S_IRUSR, NULL, lockdep_stats_show);
#ifdef CONFIG_LOCK_STAT
proc_create("lock_stat", S_IRUSR | S_IWUSR, NULL,
&proc_lock_stat_operations);
diff --git a/kernel/locking/mcs_spinlock.h b/kernel/locking/mcs_spinlock.h
index f046b7ce9dd6..5e10153b4d3c 100644
--- a/kernel/locking/mcs_spinlock.h
+++ b/kernel/locking/mcs_spinlock.h
@@ -23,13 +23,15 @@ struct mcs_spinlock {
#ifndef arch_mcs_spin_lock_contended
/*
- * Using smp_load_acquire() provides a memory barrier that ensures
- * subsequent operations happen after the lock is acquired.
+ * Using smp_cond_load_acquire() provides the acquire semantics
+ * required so that subsequent operations happen after the
+ * lock is acquired. Additionally, some architectures such as
+ * ARM64 would like to do spin-waiting instead of purely
+ * spinning, and smp_cond_load_acquire() provides that behavior.
*/
#define arch_mcs_spin_lock_contended(l) \
do { \
- while (!(smp_load_acquire(l))) \
- cpu_relax(); \
+ smp_cond_load_acquire(l, VAL); \
} while (0)
#endif
diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c
index 858a07590e39..f44f658ae629 100644
--- a/kernel/locking/mutex.c
+++ b/kernel/locking/mutex.c
@@ -139,8 +139,9 @@ static inline bool __mutex_trylock(struct mutex *lock)
static __always_inline bool __mutex_trylock_fast(struct mutex *lock)
{
unsigned long curr = (unsigned long)current;
+ unsigned long zero = 0UL;
- if (!atomic_long_cmpxchg_acquire(&lock->owner, 0UL, curr))
+ if (atomic_long_try_cmpxchg_acquire(&lock->owner, &zero, curr))
return true;
return false;
@@ -1082,15 +1083,16 @@ static noinline int __sched
__mutex_lock_interruptible_slowpath(struct mutex *lock);
/**
- * mutex_lock_interruptible - acquire the mutex, interruptible
- * @lock: the mutex to be acquired
+ * mutex_lock_interruptible() - Acquire the mutex, interruptible by signals.
+ * @lock: The mutex to be acquired.
*
- * Lock the mutex like mutex_lock(), and return 0 if the mutex has
- * been acquired or sleep until the mutex becomes available. If a
- * signal arrives while waiting for the lock then this function
- * returns -EINTR.
+ * Lock the mutex like mutex_lock(). If a signal is delivered while the
+ * process is sleeping, this function will return without acquiring the
+ * mutex.
*
- * This function is similar to (but not equivalent to) down_interruptible().
+ * Context: Process context.
+ * Return: 0 if the lock was successfully acquired or %-EINTR if a
+ * signal arrived.
*/
int __sched mutex_lock_interruptible(struct mutex *lock)
{
@@ -1104,6 +1106,18 @@ int __sched mutex_lock_interruptible(struct mutex *lock)
EXPORT_SYMBOL(mutex_lock_interruptible);
+/**
+ * mutex_lock_killable() - Acquire the mutex, interruptible by fatal signals.
+ * @lock: The mutex to be acquired.
+ *
+ * Lock the mutex like mutex_lock(). If a signal which will be fatal to
+ * the current process is delivered while the process is sleeping, this
+ * function will return without acquiring the mutex.
+ *
+ * Context: Process context.
+ * Return: 0 if the lock was successfully acquired or %-EINTR if a
+ * fatal signal arrived.
+ */
int __sched mutex_lock_killable(struct mutex *lock)
{
might_sleep();
@@ -1115,6 +1129,16 @@ int __sched mutex_lock_killable(struct mutex *lock)
}
EXPORT_SYMBOL(mutex_lock_killable);
+/**
+ * mutex_lock_io() - Acquire the mutex and mark the process as waiting for I/O
+ * @lock: The mutex to be acquired.
+ *
+ * Lock the mutex like mutex_lock(). While the task is waiting for this
+ * mutex, it will be accounted as being in the IO wait state by the
+ * scheduler.
+ *
+ * Context: Process context.
+ */
void __sched mutex_lock_io(struct mutex *lock)
{
int token;
diff --git a/kernel/locking/qspinlock.c b/kernel/locking/qspinlock.c
index d880296245c5..bfaeb05123ff 100644
--- a/kernel/locking/qspinlock.c
+++ b/kernel/locking/qspinlock.c
@@ -12,11 +12,11 @@
* GNU General Public License for more details.
*
* (C) Copyright 2013-2015 Hewlett-Packard Development Company, L.P.
- * (C) Copyright 2013-2014 Red Hat, Inc.
+ * (C) Copyright 2013-2014,2018 Red Hat, Inc.
* (C) Copyright 2015 Intel Corp.
* (C) Copyright 2015 Hewlett-Packard Enterprise Development LP
*
- * Authors: Waiman Long <waiman.long@hpe.com>
+ * Authors: Waiman Long <longman@redhat.com>
* Peter Zijlstra <peterz@infradead.org>
*/
@@ -33,6 +33,11 @@
#include <asm/qspinlock.h>
/*
+ * Include queued spinlock statistics code
+ */
+#include "qspinlock_stat.h"
+
+/*
* The basic principle of a queue-based spinlock can best be understood
* by studying a classic queue-based spinlock implementation called the
* MCS lock. The paper below provides a good description for this kind
@@ -77,6 +82,18 @@
#endif
/*
+ * The pending bit spinning loop count.
+ * This heuristic is used to limit the number of lockword accesses
+ * made by atomic_cond_read_relaxed when waiting for the lock to
+ * transition out of the "== _Q_PENDING_VAL" state. We don't spin
+ * indefinitely because there's no guarantee that we'll make forward
+ * progress.
+ */
+#ifndef _Q_PENDING_LOOPS
+#define _Q_PENDING_LOOPS 1
+#endif
+
+/*
* Per-CPU queue node structures; we can never have more than 4 nested
* contexts: task, softirq, hardirq, nmi.
*
@@ -114,41 +131,18 @@ static inline __pure struct mcs_spinlock *decode_tail(u32 tail)
#define _Q_LOCKED_PENDING_MASK (_Q_LOCKED_MASK | _Q_PENDING_MASK)
-/*
- * By using the whole 2nd least significant byte for the pending bit, we
- * can allow better optimization of the lock acquisition for the pending
- * bit holder.
+#if _Q_PENDING_BITS == 8
+/**
+ * clear_pending - clear the pending bit.
+ * @lock: Pointer to queued spinlock structure
*
- * This internal structure is also used by the set_locked function which
- * is not restricted to _Q_PENDING_BITS == 8.
+ * *,1,* -> *,0,*
*/
-struct __qspinlock {
- union {
- atomic_t val;
-#ifdef __LITTLE_ENDIAN
- struct {
- u8 locked;
- u8 pending;
- };
- struct {
- u16 locked_pending;
- u16 tail;
- };
-#else
- struct {
- u16 tail;
- u16 locked_pending;
- };
- struct {
- u8 reserved[2];
- u8 pending;
- u8 locked;
- };
-#endif
- };
-};
+static __always_inline void clear_pending(struct qspinlock *lock)
+{
+ WRITE_ONCE(lock->pending, 0);
+}
-#if _Q_PENDING_BITS == 8
/**
* clear_pending_set_locked - take ownership and clear the pending bit.
* @lock: Pointer to queued spinlock structure
@@ -159,9 +153,7 @@ struct __qspinlock {
*/
static __always_inline void clear_pending_set_locked(struct qspinlock *lock)
{
- struct __qspinlock *l = (void *)lock;
-
- WRITE_ONCE(l->locked_pending, _Q_LOCKED_VAL);
+ WRITE_ONCE(lock->locked_pending, _Q_LOCKED_VAL);
}
/*
@@ -176,19 +168,28 @@ static __always_inline void clear_pending_set_locked(struct qspinlock *lock)
*/
static __always_inline u32 xchg_tail(struct qspinlock *lock, u32 tail)
{
- struct __qspinlock *l = (void *)lock;
-
/*
- * Use release semantics to make sure that the MCS node is properly
- * initialized before changing the tail code.
+ * We can use relaxed semantics since the caller ensures that the
+ * MCS node is properly initialized before updating the tail.
*/
- return (u32)xchg_release(&l->tail,
+ return (u32)xchg_relaxed(&lock->tail,
tail >> _Q_TAIL_OFFSET) << _Q_TAIL_OFFSET;
}
#else /* _Q_PENDING_BITS == 8 */
/**
+ * clear_pending - clear the pending bit.
+ * @lock: Pointer to queued spinlock structure
+ *
+ * *,1,* -> *,0,*
+ */
+static __always_inline void clear_pending(struct qspinlock *lock)
+{
+ atomic_andnot(_Q_PENDING_VAL, &lock->val);
+}
+
+/**
* clear_pending_set_locked - take ownership and clear the pending bit.
* @lock: Pointer to queued spinlock structure
*
@@ -216,10 +217,11 @@ static __always_inline u32 xchg_tail(struct qspinlock *lock, u32 tail)
for (;;) {
new = (val & _Q_LOCKED_PENDING_MASK) | tail;
/*
- * Use release semantics to make sure that the MCS node is
- * properly initialized before changing the tail code.
+ * We can use relaxed semantics since the caller ensures that
+ * the MCS node is properly initialized before updating the
+ * tail.
*/
- old = atomic_cmpxchg_release(&lock->val, val, new);
+ old = atomic_cmpxchg_relaxed(&lock->val, val, new);
if (old == val)
break;
@@ -237,9 +239,7 @@ static __always_inline u32 xchg_tail(struct qspinlock *lock, u32 tail)
*/
static __always_inline void set_locked(struct qspinlock *lock)
{
- struct __qspinlock *l = (void *)lock;
-
- WRITE_ONCE(l->locked, _Q_LOCKED_VAL);
+ WRITE_ONCE(lock->locked, _Q_LOCKED_VAL);
}
@@ -294,86 +294,83 @@ static __always_inline u32 __pv_wait_head_or_lock(struct qspinlock *lock,
void queued_spin_lock_slowpath(struct qspinlock *lock, u32 val)
{
struct mcs_spinlock *prev, *next, *node;
- u32 new, old, tail;
+ u32 old, tail;
int idx;
BUILD_BUG_ON(CONFIG_NR_CPUS >= (1U << _Q_TAIL_CPU_BITS));
if (pv_enabled())
- goto queue;
+ goto pv_queue;
if (virt_spin_lock(lock))
return;
/*
- * wait for in-progress pending->locked hand-overs
+ * Wait for in-progress pending->locked hand-overs with a bounded
+ * number of spins so that we guarantee forward progress.
*
* 0,1,0 -> 0,0,1
*/
if (val == _Q_PENDING_VAL) {
- while ((val = atomic_read(&lock->val)) == _Q_PENDING_VAL)
- cpu_relax();
+ int cnt = _Q_PENDING_LOOPS;
+ val = atomic_cond_read_relaxed(&lock->val,
+ (VAL != _Q_PENDING_VAL) || !cnt--);
}
/*
+ * If we observe any contention; queue.
+ */
+ if (val & ~_Q_LOCKED_MASK)
+ goto queue;
+
+ /*
* trylock || pending
*
* 0,0,0 -> 0,0,1 ; trylock
* 0,0,1 -> 0,1,1 ; pending
*/
- for (;;) {
+ val = atomic_fetch_or_acquire(_Q_PENDING_VAL, &lock->val);
+ if (!(val & ~_Q_LOCKED_MASK)) {
/*
- * If we observe any contention; queue.
+ * We're pending, wait for the owner to go away.
+ *
+ * *,1,1 -> *,1,0
+ *
+ * this wait loop must be a load-acquire such that we match the
+ * store-release that clears the locked bit and create lock
+ * sequentiality; this is because not all
+ * clear_pending_set_locked() implementations imply full
+ * barriers.
*/
- if (val & ~_Q_LOCKED_MASK)
- goto queue;
-
- new = _Q_LOCKED_VAL;
- if (val == new)
- new |= _Q_PENDING_VAL;
+ if (val & _Q_LOCKED_MASK) {
+ atomic_cond_read_acquire(&lock->val,
+ !(VAL & _Q_LOCKED_MASK));
+ }
/*
- * Acquire semantic is required here as the function may
- * return immediately if the lock was free.
+ * take ownership and clear the pending bit.
+ *
+ * *,1,0 -> *,0,1
*/
- old = atomic_cmpxchg_acquire(&lock->val, val, new);
- if (old == val)
- break;
-
- val = old;
- }
-
- /*
- * we won the trylock
- */
- if (new == _Q_LOCKED_VAL)
+ clear_pending_set_locked(lock);
+ qstat_inc(qstat_lock_pending, true);
return;
+ }
/*
- * we're pending, wait for the owner to go away.
- *
- * *,1,1 -> *,1,0
- *
- * this wait loop must be a load-acquire such that we match the
- * store-release that clears the locked bit and create lock
- * sequentiality; this is because not all clear_pending_set_locked()
- * implementations imply full barriers.
- */
- smp_cond_load_acquire(&lock->val.counter, !(VAL & _Q_LOCKED_MASK));
-
- /*
- * take ownership and clear the pending bit.
- *
- * *,1,0 -> *,0,1
+ * If pending was clear but there are waiters in the queue, then
+ * we need to undo our setting of pending before we queue ourselves.
*/
- clear_pending_set_locked(lock);
- return;
+ if (!(val & _Q_PENDING_MASK))
+ clear_pending(lock);
/*
* End of pending bit optimistic spinning and beginning of MCS
* queuing.
*/
queue:
+ qstat_inc(qstat_lock_slowpath, true);
+pv_queue:
node = this_cpu_ptr(&mcs_nodes[0]);
idx = node->count++;
tail = encode_tail(smp_processor_id(), idx);
@@ -400,12 +397,18 @@ queue:
goto release;
/*
+ * Ensure that the initialisation of @node is complete before we
+ * publish the updated tail via xchg_tail() and potentially link
+ * @node into the waitqueue via WRITE_ONCE(prev->next, node) below.
+ */
+ smp_wmb();
+
+ /*
+ * Publish the updated tail.
* We have already touched the queueing cacheline; don't bother with
* pending stuff.
*
* p,*,* -> n,*,*
- *
- * RELEASE, such that the stores to @node must be complete.
*/
old = xchg_tail(lock, tail);
next = NULL;
@@ -417,14 +420,8 @@ queue:
if (old & _Q_TAIL_MASK) {
prev = decode_tail(old);
- /*
- * We must ensure that the stores to @node are observed before
- * the write to prev->next. The address dependency from
- * xchg_tail is not sufficient to ensure this because the read
- * component of xchg_tail is unordered with respect to the
- * initialisation of @node.
- */
- smp_store_release(&prev->next, node);
+ /* Link @node into the waitqueue. */
+ WRITE_ONCE(prev->next, node);
pv_wait_node(node, prev);
arch_mcs_spin_lock_contended(&node->locked);
@@ -453,8 +450,8 @@ queue:
*
* The PV pv_wait_head_or_lock function, if active, will acquire
* the lock and return a non-zero value. So we have to skip the
- * smp_cond_load_acquire() call. As the next PV queue head hasn't been
- * designated yet, there is no way for the locked value to become
+ * atomic_cond_read_acquire() call. As the next PV queue head hasn't
+ * been designated yet, there is no way for the locked value to become
* _Q_SLOW_VAL. So both the set_locked() and the
* atomic_cmpxchg_relaxed() calls will be safe.
*
@@ -464,44 +461,38 @@ queue:
if ((val = pv_wait_head_or_lock(lock, node)))
goto locked;
- val = smp_cond_load_acquire(&lock->val.counter, !(VAL & _Q_LOCKED_PENDING_MASK));
+ val = atomic_cond_read_acquire(&lock->val, !(VAL & _Q_LOCKED_PENDING_MASK));
locked:
/*
* claim the lock:
*
* n,0,0 -> 0,0,1 : lock, uncontended
- * *,0,0 -> *,0,1 : lock, contended
+ * *,*,0 -> *,*,1 : lock, contended
*
- * If the queue head is the only one in the queue (lock value == tail),
- * clear the tail code and grab the lock. Otherwise, we only need
- * to grab the lock.
+ * If the queue head is the only one in the queue (lock value == tail)
+ * and nobody is pending, clear the tail code and grab the lock.
+ * Otherwise, we only need to grab the lock.
*/
- for (;;) {
- /* In the PV case we might already have _Q_LOCKED_VAL set */
- if ((val & _Q_TAIL_MASK) != tail) {
- set_locked(lock);
- break;
- }
- /*
- * The smp_cond_load_acquire() call above has provided the
- * necessary acquire semantics required for locking. At most
- * two iterations of this loop may be ran.
- */
- old = atomic_cmpxchg_relaxed(&lock->val, val, _Q_LOCKED_VAL);
- if (old == val)
- goto release; /* No contention */
- val = old;
- }
+ /*
+ * In the PV case we might already have _Q_LOCKED_VAL set.
+ *
+ * The atomic_cond_read_acquire() call above has provided the
+ * necessary acquire semantics required for locking.
+ */
+ if (((val & _Q_TAIL_MASK) == tail) &&
+ atomic_try_cmpxchg_relaxed(&lock->val, &val, _Q_LOCKED_VAL))
+ goto release; /* No contention */
+
+ /* Either somebody is queued behind us or _Q_PENDING_VAL is set */
+ set_locked(lock);
/*
* contended path; wait for next if not observed yet, release.
*/
- if (!next) {
- while (!(next = READ_ONCE(node->next)))
- cpu_relax();
- }
+ if (!next)
+ next = smp_cond_load_relaxed(&node->next, (VAL));
arch_mcs_spin_unlock_contended(&next->locked);
pv_kick_node(lock, next);
diff --git a/kernel/locking/qspinlock_paravirt.h b/kernel/locking/qspinlock_paravirt.h
index 6ee477765e6c..5a0cf5f9008c 100644
--- a/kernel/locking/qspinlock_paravirt.h
+++ b/kernel/locking/qspinlock_paravirt.h
@@ -56,11 +56,6 @@ struct pv_node {
};
/*
- * Include queued spinlock statistics code
- */
-#include "qspinlock_stat.h"
-
-/*
* Hybrid PV queued/unfair lock
*
* By replacing the regular queued_spin_trylock() with the function below,
@@ -87,8 +82,6 @@ struct pv_node {
#define queued_spin_trylock(l) pv_hybrid_queued_unfair_trylock(l)
static inline bool pv_hybrid_queued_unfair_trylock(struct qspinlock *lock)
{
- struct __qspinlock *l = (void *)lock;
-
/*
* Stay in unfair lock mode as long as queued mode waiters are
* present in the MCS wait queue but the pending bit isn't set.
@@ -97,7 +90,7 @@ static inline bool pv_hybrid_queued_unfair_trylock(struct qspinlock *lock)
int val = atomic_read(&lock->val);
if (!(val & _Q_LOCKED_PENDING_MASK) &&
- (cmpxchg_acquire(&l->locked, 0, _Q_LOCKED_VAL) == 0)) {
+ (cmpxchg_acquire(&lock->locked, 0, _Q_LOCKED_VAL) == 0)) {
qstat_inc(qstat_pv_lock_stealing, true);
return true;
}
@@ -117,16 +110,7 @@ static inline bool pv_hybrid_queued_unfair_trylock(struct qspinlock *lock)
#if _Q_PENDING_BITS == 8
static __always_inline void set_pending(struct qspinlock *lock)
{
- struct __qspinlock *l = (void *)lock;
-
- WRITE_ONCE(l->pending, 1);
-}
-
-static __always_inline void clear_pending(struct qspinlock *lock)
-{
- struct __qspinlock *l = (void *)lock;
-
- WRITE_ONCE(l->pending, 0);
+ WRITE_ONCE(lock->pending, 1);
}
/*
@@ -136,10 +120,8 @@ static __always_inline void clear_pending(struct qspinlock *lock)
*/
static __always_inline int trylock_clear_pending(struct qspinlock *lock)
{
- struct __qspinlock *l = (void *)lock;
-
- return !READ_ONCE(l->locked) &&
- (cmpxchg_acquire(&l->locked_pending, _Q_PENDING_VAL,
+ return !READ_ONCE(lock->locked) &&
+ (cmpxchg_acquire(&lock->locked_pending, _Q_PENDING_VAL,
_Q_LOCKED_VAL) == _Q_PENDING_VAL);
}
#else /* _Q_PENDING_BITS == 8 */
@@ -148,11 +130,6 @@ static __always_inline void set_pending(struct qspinlock *lock)
atomic_or(_Q_PENDING_VAL, &lock->val);
}
-static __always_inline void clear_pending(struct qspinlock *lock)
-{
- atomic_andnot(_Q_PENDING_VAL, &lock->val);
-}
-
static __always_inline int trylock_clear_pending(struct qspinlock *lock)
{
int val = atomic_read(&lock->val);
@@ -384,7 +361,6 @@ static void pv_wait_node(struct mcs_spinlock *node, struct mcs_spinlock *prev)
static void pv_kick_node(struct qspinlock *lock, struct mcs_spinlock *node)
{
struct pv_node *pn = (struct pv_node *)node;
- struct __qspinlock *l = (void *)lock;
/*
* If the vCPU is indeed halted, advance its state to match that of
@@ -413,7 +389,7 @@ static void pv_kick_node(struct qspinlock *lock, struct mcs_spinlock *node)
* the hash table later on at unlock time, no atomic instruction is
* needed.
*/
- WRITE_ONCE(l->locked, _Q_SLOW_VAL);
+ WRITE_ONCE(lock->locked, _Q_SLOW_VAL);
(void)pv_hash(lock, pn);
}
@@ -428,7 +404,6 @@ static u32
pv_wait_head_or_lock(struct qspinlock *lock, struct mcs_spinlock *node)
{
struct pv_node *pn = (struct pv_node *)node;
- struct __qspinlock *l = (void *)lock;
struct qspinlock **lp = NULL;
int waitcnt = 0;
int loop;
@@ -443,7 +418,7 @@ pv_wait_head_or_lock(struct qspinlock *lock, struct mcs_spinlock *node)
/*
* Tracking # of slowpath locking operations
*/
- qstat_inc(qstat_pv_lock_slowpath, true);
+ qstat_inc(qstat_lock_slowpath, true);
for (;; waitcnt++) {
/*
@@ -479,13 +454,13 @@ pv_wait_head_or_lock(struct qspinlock *lock, struct mcs_spinlock *node)
*
* Matches the smp_rmb() in __pv_queued_spin_unlock().
*/
- if (xchg(&l->locked, _Q_SLOW_VAL) == 0) {
+ if (xchg(&lock->locked, _Q_SLOW_VAL) == 0) {
/*
* The lock was free and now we own the lock.
* Change the lock value back to _Q_LOCKED_VAL
* and unhash the table.
*/
- WRITE_ONCE(l->locked, _Q_LOCKED_VAL);
+ WRITE_ONCE(lock->locked, _Q_LOCKED_VAL);
WRITE_ONCE(*lp, NULL);
goto gotlock;
}
@@ -493,7 +468,7 @@ pv_wait_head_or_lock(struct qspinlock *lock, struct mcs_spinlock *node)
WRITE_ONCE(pn->state, vcpu_hashed);
qstat_inc(qstat_pv_wait_head, true);
qstat_inc(qstat_pv_wait_again, waitcnt);
- pv_wait(&l->locked, _Q_SLOW_VAL);
+ pv_wait(&lock->locked, _Q_SLOW_VAL);
/*
* Because of lock stealing, the queue head vCPU may not be
@@ -518,7 +493,6 @@ gotlock:
__visible void
__pv_queued_spin_unlock_slowpath(struct qspinlock *lock, u8 locked)
{
- struct __qspinlock *l = (void *)lock;
struct pv_node *node;
if (unlikely(locked != _Q_SLOW_VAL)) {
@@ -547,7 +521,7 @@ __pv_queued_spin_unlock_slowpath(struct qspinlock *lock, u8 locked)
* Now that we have a reference to the (likely) blocked pv_node,
* release the lock.
*/
- smp_store_release(&l->locked, 0);
+ smp_store_release(&lock->locked, 0);
/*
* At this point the memory pointed at by lock can be freed/reused,
@@ -573,7 +547,6 @@ __pv_queued_spin_unlock_slowpath(struct qspinlock *lock, u8 locked)
#ifndef __pv_queued_spin_unlock
__visible void __pv_queued_spin_unlock(struct qspinlock *lock)
{
- struct __qspinlock *l = (void *)lock;
u8 locked;
/*
@@ -581,7 +554,7 @@ __visible void __pv_queued_spin_unlock(struct qspinlock *lock)
* unhash. Otherwise it would be possible to have multiple @lock
* entries, which would be BAD.
*/
- locked = cmpxchg_release(&l->locked, _Q_LOCKED_VAL, 0);
+ locked = cmpxchg_release(&lock->locked, _Q_LOCKED_VAL, 0);
if (likely(locked == _Q_LOCKED_VAL))
return;
diff --git a/kernel/locking/qspinlock_stat.h b/kernel/locking/qspinlock_stat.h
index 4a30ef63c607..6bd78c0740fc 100644
--- a/kernel/locking/qspinlock_stat.h
+++ b/kernel/locking/qspinlock_stat.h
@@ -22,13 +22,14 @@
* pv_kick_wake - # of vCPU kicks used for computing pv_latency_wake
* pv_latency_kick - average latency (ns) of vCPU kick operation
* pv_latency_wake - average latency (ns) from vCPU kick to wakeup
- * pv_lock_slowpath - # of locking operations via the slowpath
* pv_lock_stealing - # of lock stealing operations
* pv_spurious_wakeup - # of spurious wakeups in non-head vCPUs
* pv_wait_again - # of wait's after a queue head vCPU kick
* pv_wait_early - # of early vCPU wait's
* pv_wait_head - # of vCPU wait's at the queue head
* pv_wait_node - # of vCPU wait's at a non-head queue node
+ * lock_pending - # of locking operations via pending code
+ * lock_slowpath - # of locking operations via MCS lock queue
*
* Writing to the "reset_counters" file will reset all the above counter
* values.
@@ -46,13 +47,14 @@ enum qlock_stats {
qstat_pv_kick_wake,
qstat_pv_latency_kick,
qstat_pv_latency_wake,
- qstat_pv_lock_slowpath,
qstat_pv_lock_stealing,
qstat_pv_spurious_wakeup,
qstat_pv_wait_again,
qstat_pv_wait_early,
qstat_pv_wait_head,
qstat_pv_wait_node,
+ qstat_lock_pending,
+ qstat_lock_slowpath,
qstat_num, /* Total number of statistical counters */
qstat_reset_cnts = qstat_num,
};
@@ -73,12 +75,13 @@ static const char * const qstat_names[qstat_num + 1] = {
[qstat_pv_spurious_wakeup] = "pv_spurious_wakeup",
[qstat_pv_latency_kick] = "pv_latency_kick",
[qstat_pv_latency_wake] = "pv_latency_wake",
- [qstat_pv_lock_slowpath] = "pv_lock_slowpath",
[qstat_pv_lock_stealing] = "pv_lock_stealing",
[qstat_pv_wait_again] = "pv_wait_again",
[qstat_pv_wait_early] = "pv_wait_early",
[qstat_pv_wait_head] = "pv_wait_head",
[qstat_pv_wait_node] = "pv_wait_node",
+ [qstat_lock_pending] = "lock_pending",
+ [qstat_lock_slowpath] = "lock_slowpath",
[qstat_reset_cnts] = "reset_counters",
};
diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c
index 940633c63254..4f014be7a4b8 100644
--- a/kernel/locking/rtmutex.c
+++ b/kernel/locking/rtmutex.c
@@ -1268,8 +1268,7 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state,
if (unlikely(ret)) {
__set_current_state(TASK_RUNNING);
- if (rt_mutex_has_waiters(lock))
- remove_waiter(lock, &waiter);
+ remove_waiter(lock, &waiter);
rt_mutex_handle_deadlock(ret, chwalk, &waiter);
}
diff --git a/kernel/locking/rtmutex_common.h b/kernel/locking/rtmutex_common.h
index 68686b3ec3c1..d1d62f942be2 100644
--- a/kernel/locking/rtmutex_common.h
+++ b/kernel/locking/rtmutex_common.h
@@ -52,12 +52,13 @@ static inline int rt_mutex_has_waiters(struct rt_mutex *lock)
static inline struct rt_mutex_waiter *
rt_mutex_top_waiter(struct rt_mutex *lock)
{
- struct rt_mutex_waiter *w;
-
- w = rb_entry(lock->waiters.rb_leftmost,
- struct rt_mutex_waiter, tree_entry);
- BUG_ON(w->lock != lock);
+ struct rb_node *leftmost = rb_first_cached(&lock->waiters);
+ struct rt_mutex_waiter *w = NULL;
+ if (leftmost) {
+ w = rb_entry(leftmost, struct rt_mutex_waiter, tree_entry);
+ BUG_ON(w->lock != lock);
+ }
return w;
}
diff --git a/kernel/locking/rwsem-xadd.c b/kernel/locking/rwsem-xadd.c
index e795908f3607..3064c50e181e 100644
--- a/kernel/locking/rwsem-xadd.c
+++ b/kernel/locking/rwsem-xadd.c
@@ -347,30 +347,31 @@ static inline bool rwsem_try_write_lock_unqueued(struct rw_semaphore *sem)
}
}
+static inline bool owner_on_cpu(struct task_struct *owner)
+{
+ /*
+ * As lock holder preemption issue, we both skip spinning if
+ * task is not on cpu or its cpu is preempted
+ */
+ return owner->on_cpu && !vcpu_is_preempted(task_cpu(owner));
+}
+
static inline bool rwsem_can_spin_on_owner(struct rw_semaphore *sem)
{
struct task_struct *owner;
bool ret = true;
+ BUILD_BUG_ON(!rwsem_has_anonymous_owner(RWSEM_OWNER_UNKNOWN));
+
if (need_resched())
return false;
rcu_read_lock();
owner = READ_ONCE(sem->owner);
- if (!rwsem_owner_is_writer(owner)) {
- /*
- * Don't spin if the rwsem is readers owned.
- */
- ret = !rwsem_owner_is_reader(owner);
- goto done;
+ if (owner) {
+ ret = is_rwsem_owner_spinnable(owner) &&
+ owner_on_cpu(owner);
}
-
- /*
- * As lock holder preemption issue, we both skip spinning if task is not
- * on cpu or its cpu is preempted
- */
- ret = owner->on_cpu && !vcpu_is_preempted(task_cpu(owner));
-done:
rcu_read_unlock();
return ret;
}
@@ -382,11 +383,11 @@ static noinline bool rwsem_spin_on_owner(struct rw_semaphore *sem)
{
struct task_struct *owner = READ_ONCE(sem->owner);
- if (!rwsem_owner_is_writer(owner))
- goto out;
+ if (!is_rwsem_owner_spinnable(owner))
+ return false;
rcu_read_lock();
- while (sem->owner == owner) {
+ while (owner && (READ_ONCE(sem->owner) == owner)) {
/*
* Ensure we emit the owner->on_cpu, dereference _after_
* checking sem->owner still matches owner, if that fails,
@@ -399,8 +400,7 @@ static noinline bool rwsem_spin_on_owner(struct rw_semaphore *sem)
* abort spinning when need_resched or owner is not running or
* owner's cpu is preempted.
*/
- if (!owner->on_cpu || need_resched() ||
- vcpu_is_preempted(task_cpu(owner))) {
+ if (need_resched() || !owner_on_cpu(owner)) {
rcu_read_unlock();
return false;
}
@@ -408,12 +408,12 @@ static noinline bool rwsem_spin_on_owner(struct rw_semaphore *sem)
cpu_relax();
}
rcu_read_unlock();
-out:
+
/*
* If there is a new owner or the owner is not set, we continue
* spinning.
*/
- return !rwsem_owner_is_reader(READ_ONCE(sem->owner));
+ return is_rwsem_owner_spinnable(READ_ONCE(sem->owner));
}
static bool rwsem_optimistic_spin(struct rw_semaphore *sem)
diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c
index f549c552dbf1..bc1e507be9ff 100644
--- a/kernel/locking/rwsem.c
+++ b/kernel/locking/rwsem.c
@@ -117,6 +117,7 @@ EXPORT_SYMBOL(down_write_trylock);
void up_read(struct rw_semaphore *sem)
{
rwsem_release(&sem->dep_map, 1, _RET_IP_);
+ DEBUG_RWSEMS_WARN_ON(sem->owner != RWSEM_READER_OWNED);
__up_read(sem);
}
@@ -129,6 +130,7 @@ EXPORT_SYMBOL(up_read);
void up_write(struct rw_semaphore *sem)
{
rwsem_release(&sem->dep_map, 1, _RET_IP_);
+ DEBUG_RWSEMS_WARN_ON(sem->owner != current);
rwsem_clear_owner(sem);
__up_write(sem);
@@ -142,6 +144,7 @@ EXPORT_SYMBOL(up_write);
void downgrade_write(struct rw_semaphore *sem)
{
lock_downgrade(&sem->dep_map, _RET_IP_);
+ DEBUG_RWSEMS_WARN_ON(sem->owner != current);
rwsem_set_reader_owned(sem);
__downgrade_write(sem);
@@ -211,11 +214,10 @@ EXPORT_SYMBOL(down_write_killable_nested);
void up_read_non_owner(struct rw_semaphore *sem)
{
+ DEBUG_RWSEMS_WARN_ON(sem->owner != RWSEM_READER_OWNED);
__up_read(sem);
}
EXPORT_SYMBOL(up_read_non_owner);
#endif
-
-
diff --git a/kernel/locking/rwsem.h b/kernel/locking/rwsem.h
index a883b8f1fdc6..b9d0e72aa80f 100644
--- a/kernel/locking/rwsem.h
+++ b/kernel/locking/rwsem.h
@@ -1,20 +1,30 @@
/* SPDX-License-Identifier: GPL-2.0 */
/*
* The owner field of the rw_semaphore structure will be set to
- * RWSEM_READ_OWNED when a reader grabs the lock. A writer will clear
+ * RWSEM_READER_OWNED when a reader grabs the lock. A writer will clear
* the owner field when it unlocks. A reader, on the other hand, will
* not touch the owner field when it unlocks.
*
- * In essence, the owner field now has the following 3 states:
+ * In essence, the owner field now has the following 4 states:
* 1) 0
* - lock is free or the owner hasn't set the field yet
* 2) RWSEM_READER_OWNED
* - lock is currently or previously owned by readers (lock is free
* or not set by owner yet)
- * 3) Other non-zero value
- * - a writer owns the lock
+ * 3) RWSEM_ANONYMOUSLY_OWNED bit set with some other bits set as well
+ * - lock is owned by an anonymous writer, so spinning on the lock
+ * owner should be disabled.
+ * 4) Other non-zero value
+ * - a writer owns the lock and other writers can spin on the lock owner.
*/
-#define RWSEM_READER_OWNED ((struct task_struct *)1UL)
+#define RWSEM_ANONYMOUSLY_OWNED (1UL << 0)
+#define RWSEM_READER_OWNED ((struct task_struct *)RWSEM_ANONYMOUSLY_OWNED)
+
+#ifdef CONFIG_DEBUG_RWSEMS
+# define DEBUG_RWSEMS_WARN_ON(c) DEBUG_LOCKS_WARN_ON(c)
+#else
+# define DEBUG_RWSEMS_WARN_ON(c)
+#endif
#ifdef CONFIG_RWSEM_SPIN_ON_OWNER
/*
@@ -41,18 +51,26 @@ static inline void rwsem_set_reader_owned(struct rw_semaphore *sem)
* do a write to the rwsem cacheline when it is really necessary
* to minimize cacheline contention.
*/
- if (sem->owner != RWSEM_READER_OWNED)
+ if (READ_ONCE(sem->owner) != RWSEM_READER_OWNED)
WRITE_ONCE(sem->owner, RWSEM_READER_OWNED);
}
-static inline bool rwsem_owner_is_writer(struct task_struct *owner)
+/*
+ * Return true if the a rwsem waiter can spin on the rwsem's owner
+ * and steal the lock, i.e. the lock is not anonymously owned.
+ * N.B. !owner is considered spinnable.
+ */
+static inline bool is_rwsem_owner_spinnable(struct task_struct *owner)
{
- return owner && owner != RWSEM_READER_OWNED;
+ return !((unsigned long)owner & RWSEM_ANONYMOUSLY_OWNED);
}
-static inline bool rwsem_owner_is_reader(struct task_struct *owner)
+/*
+ * Return true if rwsem is owned by an anonymous writer or readers.
+ */
+static inline bool rwsem_has_anonymous_owner(struct task_struct *owner)
{
- return owner == RWSEM_READER_OWNED;
+ return (unsigned long)owner & RWSEM_ANONYMOUSLY_OWNED;
}
#else
static inline void rwsem_set_owner(struct rw_semaphore *sem)
diff --git a/kernel/memremap.c b/kernel/memremap.c
index 4dd4274cabe2..895e6b76b25e 100644
--- a/kernel/memremap.c
+++ b/kernel/memremap.c
@@ -427,7 +427,6 @@ void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap)
err_pfn_remap:
err_radix:
pgmap_radix_release(res, pgoff);
- devres_free(pgmap);
return ERR_PTR(error);
}
EXPORT_SYMBOL(devm_memremap_pages);
diff --git a/kernel/module.c b/kernel/module.c
index ad2d420024f6..68469b37d61a 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -1472,7 +1472,8 @@ static ssize_t module_sect_show(struct module_attribute *mattr,
{
struct module_sect_attr *sattr =
container_of(mattr, struct module_sect_attr, mattr);
- return sprintf(buf, "0x%pK\n", (void *)sattr->address);
+ return sprintf(buf, "0x%px\n", kptr_restrict < 2 ?
+ (void *)sattr->address : NULL);
}
static void free_sect_attrs(struct module_sect_attrs *sect_attrs)
@@ -1603,8 +1604,7 @@ static void add_notes_attrs(struct module *mod, const struct load_info *info)
if (notes == 0)
return;
- notes_attrs = kzalloc(sizeof(*notes_attrs)
- + notes * sizeof(notes_attrs->attrs[0]),
+ notes_attrs = kzalloc(struct_size(notes_attrs, attrs, notes),
GFP_KERNEL);
if (notes_attrs == NULL)
return;
@@ -2181,10 +2181,6 @@ static void free_module(struct module *mod)
/* Finally, free the core (containing the module structure) */
disable_ro_nx(&mod->core_layout);
module_memfree(mod->core_layout.base);
-
-#ifdef CONFIG_MPU
- update_protections(current->mm);
-#endif
}
void *__symbol_get(const char *symbol)
@@ -3520,6 +3516,11 @@ static noinline int do_init_module(struct module *mod)
* walking this with preempt disabled. In all the failure paths, we
* call synchronize_sched(), but we don't want to slow down the success
* path, so use actual RCU here.
+ * Note that module_alloc() on most architectures creates W+X page
+ * mappings which won't be cleaned up until do_free_init() runs. Any
+ * code such as mark_rodata_ro() which depends on those mappings to
+ * be cleaned up needs to sync with the queued work - ie
+ * rcu_barrier_sched()
*/
call_rcu_sched(&freeinit->rcu, do_free_init);
mutex_unlock(&module_mutex);
@@ -4228,7 +4229,7 @@ static int modules_open(struct inode *inode, struct file *file)
m->private = kallsyms_show_value() ? NULL : (void *)8ul;
}
- return 0;
+ return err;
}
static const struct file_operations proc_modules_operations = {
diff --git a/kernel/panic.c b/kernel/panic.c
index 4b794f1d8561..42e487488554 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -34,7 +34,8 @@
#define PANIC_BLINK_SPD 18
int panic_on_oops = CONFIG_PANIC_ON_OOPS_VALUE;
-static unsigned long tainted_mask;
+static unsigned long tainted_mask =
+ IS_ENABLED(CONFIG_GCC_PLUGIN_RANDSTRUCT) ? (1 << TAINT_RANDSTRUCT) : 0;
static int pause_on_oops;
static int pause_on_oops_flag;
static DEFINE_SPINLOCK(pause_on_oops_lock);
@@ -289,7 +290,7 @@ void panic(const char *fmt, ...)
disabled_wait(caller);
}
#endif
- pr_emerg("---[ end Kernel panic - not syncing: %s\n", buf);
+ pr_emerg("---[ end Kernel panic - not syncing: %s ]---\n", buf);
local_irq_enable();
for (i = 0; ; i += PANIC_TIMER_STEP) {
touch_softlockup_watchdog();
@@ -308,52 +309,40 @@ EXPORT_SYMBOL(panic);
* is being removed anyway.
*/
const struct taint_flag taint_flags[TAINT_FLAGS_COUNT] = {
- { 'P', 'G', true }, /* TAINT_PROPRIETARY_MODULE */
- { 'F', ' ', true }, /* TAINT_FORCED_MODULE */
- { 'S', ' ', false }, /* TAINT_CPU_OUT_OF_SPEC */
- { 'R', ' ', false }, /* TAINT_FORCED_RMMOD */
- { 'M', ' ', false }, /* TAINT_MACHINE_CHECK */
- { 'B', ' ', false }, /* TAINT_BAD_PAGE */
- { 'U', ' ', false }, /* TAINT_USER */
- { 'D', ' ', false }, /* TAINT_DIE */
- { 'A', ' ', false }, /* TAINT_OVERRIDDEN_ACPI_TABLE */
- { 'W', ' ', false }, /* TAINT_WARN */
- { 'C', ' ', true }, /* TAINT_CRAP */
- { 'I', ' ', false }, /* TAINT_FIRMWARE_WORKAROUND */
- { 'O', ' ', true }, /* TAINT_OOT_MODULE */
- { 'E', ' ', true }, /* TAINT_UNSIGNED_MODULE */
- { 'L', ' ', false }, /* TAINT_SOFTLOCKUP */
- { 'K', ' ', true }, /* TAINT_LIVEPATCH */
- { 'X', ' ', true }, /* TAINT_AUX */
+ [ TAINT_PROPRIETARY_MODULE ] = { 'P', 'G', true },
+ [ TAINT_FORCED_MODULE ] = { 'F', ' ', true },
+ [ TAINT_CPU_OUT_OF_SPEC ] = { 'S', ' ', false },
+ [ TAINT_FORCED_RMMOD ] = { 'R', ' ', false },
+ [ TAINT_MACHINE_CHECK ] = { 'M', ' ', false },
+ [ TAINT_BAD_PAGE ] = { 'B', ' ', false },
+ [ TAINT_USER ] = { 'U', ' ', false },
+ [ TAINT_DIE ] = { 'D', ' ', false },
+ [ TAINT_OVERRIDDEN_ACPI_TABLE ] = { 'A', ' ', false },
+ [ TAINT_WARN ] = { 'W', ' ', false },
+ [ TAINT_CRAP ] = { 'C', ' ', true },
+ [ TAINT_FIRMWARE_WORKAROUND ] = { 'I', ' ', false },
+ [ TAINT_OOT_MODULE ] = { 'O', ' ', true },
+ [ TAINT_UNSIGNED_MODULE ] = { 'E', ' ', true },
+ [ TAINT_SOFTLOCKUP ] = { 'L', ' ', false },
+ [ TAINT_LIVEPATCH ] = { 'K', ' ', true },
+ [ TAINT_AUX ] = { 'X', ' ', true },
+ [ TAINT_RANDSTRUCT ] = { 'T', ' ', true },
};
/**
- * print_tainted - return a string to represent the kernel taint state.
+ * print_tainted - return a string to represent the kernel taint state.
*
- * 'P' - Proprietary module has been loaded.
- * 'F' - Module has been forcibly loaded.
- * 'S' - SMP with CPUs not designed for SMP.
- * 'R' - User forced a module unload.
- * 'M' - System experienced a machine check exception.
- * 'B' - System has hit bad_page.
- * 'U' - Userspace-defined naughtiness.
- * 'D' - Kernel has oopsed before
- * 'A' - ACPI table overridden.
- * 'W' - Taint on warning.
- * 'C' - modules from drivers/staging are loaded.
- * 'I' - Working around severe firmware bug.
- * 'O' - Out-of-tree module has been loaded.
- * 'E' - Unsigned module has been loaded.
- * 'L' - A soft lockup has previously occurred.
- * 'K' - Kernel has been live patched.
- * 'X' - Auxiliary taint, for distros' use.
+ * For individual taint flag meanings, see Documentation/sysctl/kernel.txt
*
- * The string is overwritten by the next call to print_tainted().
+ * The string is overwritten by the next call to print_tainted(),
+ * but is always NULL terminated.
*/
const char *print_tainted(void)
{
static char buf[TAINT_FLAGS_COUNT + sizeof("Tainted: ")];
+ BUILD_BUG_ON(ARRAY_SIZE(taint_flags) != TAINT_FLAGS_COUNT);
+
if (tainted_mask) {
char *s;
int i;
@@ -554,6 +543,8 @@ void __warn(const char *file, int line, void *caller, unsigned taint,
else
dump_stack();
+ print_irqtrace_events(current);
+
print_oops_end_marker();
/* Just a warning, don't kill lockdep. */
diff --git a/kernel/params.c b/kernel/params.c
index cc9108c2a1fd..ce89f757e6da 100644
--- a/kernel/params.c
+++ b/kernel/params.c
@@ -111,8 +111,8 @@ bool parameq(const char *a, const char *b)
static void param_check_unsafe(const struct kernel_param *kp)
{
if (kp->flags & KERNEL_PARAM_FL_UNSAFE) {
- pr_warn("Setting dangerous option %s - tainting kernel\n",
- kp->name);
+ pr_notice("Setting dangerous option %s - tainting kernel\n",
+ kp->name);
add_taint(TAINT_USER, LOCKDEP_STILL_OK);
}
}
diff --git a/kernel/pid.c b/kernel/pid.c
index ed6c343fe50d..157fe4b19971 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -70,7 +70,7 @@ int pid_max_max = PID_MAX_LIMIT;
*/
struct pid_namespace init_pid_ns = {
.kref = KREF_INIT(2),
- .idr = IDR_INIT,
+ .idr = IDR_INIT(init_pid_ns.idr),
.pid_allocated = PIDNS_ADDING,
.level = 0,
.child_reaper = &init_task,
diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c
index 0b53eef7d34b..2a2ac53d8b8b 100644
--- a/kernel/pid_namespace.c
+++ b/kernel/pid_namespace.c
@@ -23,55 +23,39 @@
#include <linux/sched/signal.h>
#include <linux/idr.h>
-struct pid_cache {
- int nr_ids;
- char name[16];
- struct kmem_cache *cachep;
- struct list_head list;
-};
-
-static LIST_HEAD(pid_caches_lh);
static DEFINE_MUTEX(pid_caches_mutex);
static struct kmem_cache *pid_ns_cachep;
+/* MAX_PID_NS_LEVEL is needed for limiting size of 'struct pid' */
+#define MAX_PID_NS_LEVEL 32
+/* Write once array, filled from the beginning. */
+static struct kmem_cache *pid_cache[MAX_PID_NS_LEVEL];
/*
* creates the kmem cache to allocate pids from.
- * @nr_ids: the number of numerical ids this pid will have to carry
+ * @level: pid namespace level
*/
-static struct kmem_cache *create_pid_cachep(int nr_ids)
+static struct kmem_cache *create_pid_cachep(unsigned int level)
{
- struct pid_cache *pcache;
- struct kmem_cache *cachep;
-
+ /* Level 0 is init_pid_ns.pid_cachep */
+ struct kmem_cache **pkc = &pid_cache[level - 1];
+ struct kmem_cache *kc;
+ char name[4 + 10 + 1];
+ unsigned int len;
+
+ kc = READ_ONCE(*pkc);
+ if (kc)
+ return kc;
+
+ snprintf(name, sizeof(name), "pid_%u", level + 1);
+ len = sizeof(struct pid) + level * sizeof(struct upid);
mutex_lock(&pid_caches_mutex);
- list_for_each_entry(pcache, &pid_caches_lh, list)
- if (pcache->nr_ids == nr_ids)
- goto out;
-
- pcache = kmalloc(sizeof(struct pid_cache), GFP_KERNEL);
- if (pcache == NULL)
- goto err_alloc;
-
- snprintf(pcache->name, sizeof(pcache->name), "pid_%d", nr_ids);
- cachep = kmem_cache_create(pcache->name,
- sizeof(struct pid) + (nr_ids - 1) * sizeof(struct upid),
- 0, SLAB_HWCACHE_ALIGN, NULL);
- if (cachep == NULL)
- goto err_cachep;
-
- pcache->nr_ids = nr_ids;
- pcache->cachep = cachep;
- list_add(&pcache->list, &pid_caches_lh);
-out:
+ /* Name collision forces to do allocation under mutex. */
+ if (!*pkc)
+ *pkc = kmem_cache_create(name, len, 0, SLAB_HWCACHE_ALIGN, 0);
mutex_unlock(&pid_caches_mutex);
- return pcache->cachep;
-
-err_cachep:
- kfree(pcache);
-err_alloc:
- mutex_unlock(&pid_caches_mutex);
- return NULL;
+ /* current can fail, but someone else can succeed. */
+ return READ_ONCE(*pkc);
}
static void proc_cleanup_work(struct work_struct *work)
@@ -80,9 +64,6 @@ static void proc_cleanup_work(struct work_struct *work)
pid_ns_release_proc(ns);
}
-/* MAX_PID_NS_LEVEL is needed for limiting size of 'struct pid' */
-#define MAX_PID_NS_LEVEL 32
-
static struct ucounts *inc_pid_namespaces(struct user_namespace *ns)
{
return inc_ucount(ns, current_euid(), UCOUNT_PID_NAMESPACES);
@@ -119,7 +100,7 @@ static struct pid_namespace *create_pid_namespace(struct user_namespace *user_ns
idr_init(&ns->idr);
- ns->pid_cachep = create_pid_cachep(level + 1);
+ ns->pid_cachep = create_pid_cachep(level);
if (ns->pid_cachep == NULL)
goto out_free_idr;
@@ -242,16 +223,16 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns)
/*
* Reap the EXIT_ZOMBIE children we had before we ignored SIGCHLD.
- * sys_wait4() will also block until our children traced from the
+ * kernel_wait4() will also block until our children traced from the
* parent namespace are detached and become EXIT_DEAD.
*/
do {
clear_thread_flag(TIF_SIGPENDING);
- rc = sys_wait4(-1, NULL, __WALL, NULL);
+ rc = kernel_wait4(-1, NULL, __WALL, NULL);
} while (rc != -ECHILD);
/*
- * sys_wait4() above can't reap the EXIT_DEAD children but we do not
+ * kernel_wait4() above can't reap the EXIT_DEAD children but we do not
* really care, we could reparent them to the global init. We could
* exit and reap ->child_reaper even if it is not the last thread in
* this pid_ns, free_pid(pid_allocated == 0) calls proc_cleanup_work(),
diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c
index a5c36e9c56a6..9c85c7822383 100644
--- a/kernel/power/hibernate.c
+++ b/kernel/power/hibernate.c
@@ -287,6 +287,8 @@ static int create_image(int platform_mode)
local_irq_disable();
+ system_state = SYSTEM_SUSPEND;
+
error = syscore_suspend();
if (error) {
pr_err("Some system devices failed to power down, aborting hibernation\n");
@@ -317,6 +319,7 @@ static int create_image(int platform_mode)
syscore_resume();
Enable_irqs:
+ system_state = SYSTEM_RUNNING;
local_irq_enable();
Enable_cpus:
@@ -445,6 +448,7 @@ static int resume_target_kernel(bool platform_mode)
goto Enable_cpus;
local_irq_disable();
+ system_state = SYSTEM_SUSPEND;
error = syscore_suspend();
if (error)
@@ -478,6 +482,7 @@ static int resume_target_kernel(bool platform_mode)
syscore_resume();
Enable_irqs:
+ system_state = SYSTEM_RUNNING;
local_irq_enable();
Enable_cpus:
@@ -563,6 +568,7 @@ int hibernation_platform_enter(void)
goto Enable_cpus;
local_irq_disable();
+ system_state = SYSTEM_SUSPEND;
syscore_suspend();
if (pm_wakeup_pending()) {
error = -EAGAIN;
@@ -575,6 +581,7 @@ int hibernation_platform_enter(void)
Power_up:
syscore_resume();
+ system_state = SYSTEM_RUNNING;
local_irq_enable();
Enable_cpus:
@@ -701,7 +708,7 @@ int hibernate(void)
}
pr_info("Syncing filesystems ... \n");
- sys_sync();
+ ksys_sync();
pr_info("done.\n");
error = freeze_processes();
@@ -1053,7 +1060,7 @@ static ssize_t resume_store(struct kobject *kobj, struct kobj_attribute *attr,
lock_system_sleep();
swsusp_resume_device = res;
unlock_system_sleep();
- pr_info("Starting manual resume from disk\n");
+ pm_pr_dbg("Configured resume from disk to %u\n", swsusp_resume_device);
noresume = 0;
software_resume();
return n;
@@ -1061,6 +1068,29 @@ static ssize_t resume_store(struct kobject *kobj, struct kobj_attribute *attr,
power_attr(resume);
+static ssize_t resume_offset_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ return sprintf(buf, "%llu\n", (unsigned long long)swsusp_resume_block);
+}
+
+static ssize_t resume_offset_store(struct kobject *kobj,
+ struct kobj_attribute *attr, const char *buf,
+ size_t n)
+{
+ unsigned long long offset;
+ int rc;
+
+ rc = kstrtoull(buf, 0, &offset);
+ if (rc)
+ return rc;
+ swsusp_resume_block = offset;
+
+ return n;
+}
+
+power_attr(resume_offset);
+
static ssize_t image_size_show(struct kobject *kobj, struct kobj_attribute *attr,
char *buf)
{
@@ -1106,6 +1136,7 @@ power_attr(reserved_size);
static struct attribute * g[] = {
&disk_attr.attr,
+ &resume_offset_attr.attr,
&resume_attr.attr,
&image_size_attr.attr,
&reserved_size_attr.attr,
diff --git a/kernel/power/qos.c b/kernel/power/qos.c
index 9d7503910ce2..86d72ffb811b 100644
--- a/kernel/power/qos.c
+++ b/kernel/power/qos.c
@@ -184,7 +184,6 @@ static inline void pm_qos_set_value(struct pm_qos_constraints *c, s32 value)
c->target_value = value;
}
-static inline int pm_qos_get_value(struct pm_qos_constraints *c);
static int pm_qos_dbg_show_requests(struct seq_file *s, void *unused)
{
struct pm_qos_object *qos = (struct pm_qos_object *)s->private;
@@ -295,6 +294,7 @@ int pm_qos_update_target(struct pm_qos_constraints *c, struct plist_node *node,
* changed
*/
plist_del(node, &c->list);
+ /* fall through */
case PM_QOS_ADD_REQ:
plist_node_init(node, new_value);
plist_add(node, &c->list);
@@ -367,6 +367,7 @@ bool pm_qos_update_flags(struct pm_qos_flags *pqf,
break;
case PM_QOS_UPDATE_REQ:
pm_qos_flags_remove_req(pqf, req);
+ /* fall through */
case PM_QOS_ADD_REQ:
req->flags = val;
INIT_LIST_HEAD(&req->node);
diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c
index 0685c4499431..87331565e505 100644
--- a/kernel/power/suspend.c
+++ b/kernel/power/suspend.c
@@ -27,6 +27,7 @@
#include <linux/export.h>
#include <linux/suspend.h>
#include <linux/syscore_ops.h>
+#include <linux/swait.h>
#include <linux/ftrace.h>
#include <trace/events/power.h>
#include <linux/compiler.h>
@@ -57,10 +58,10 @@ EXPORT_SYMBOL_GPL(pm_suspend_global_flags);
static const struct platform_suspend_ops *suspend_ops;
static const struct platform_s2idle_ops *s2idle_ops;
-static DECLARE_WAIT_QUEUE_HEAD(s2idle_wait_head);
+static DECLARE_SWAIT_QUEUE_HEAD(s2idle_wait_head);
enum s2idle_states __read_mostly s2idle_state;
-static DEFINE_SPINLOCK(s2idle_lock);
+static DEFINE_RAW_SPINLOCK(s2idle_lock);
void s2idle_set_ops(const struct platform_s2idle_ops *ops)
{
@@ -78,12 +79,12 @@ static void s2idle_enter(void)
{
trace_suspend_resume(TPS("machine_suspend"), PM_SUSPEND_TO_IDLE, true);
- spin_lock_irq(&s2idle_lock);
+ raw_spin_lock_irq(&s2idle_lock);
if (pm_wakeup_pending())
goto out;
s2idle_state = S2IDLE_STATE_ENTER;
- spin_unlock_irq(&s2idle_lock);
+ raw_spin_unlock_irq(&s2idle_lock);
get_online_cpus();
cpuidle_resume();
@@ -91,17 +92,17 @@ static void s2idle_enter(void)
/* Push all the CPUs into the idle loop. */
wake_up_all_idle_cpus();
/* Make the current CPU wait so it can enter the idle loop too. */
- wait_event(s2idle_wait_head,
- s2idle_state == S2IDLE_STATE_WAKE);
+ swait_event(s2idle_wait_head,
+ s2idle_state == S2IDLE_STATE_WAKE);
cpuidle_pause();
put_online_cpus();
- spin_lock_irq(&s2idle_lock);
+ raw_spin_lock_irq(&s2idle_lock);
out:
s2idle_state = S2IDLE_STATE_NONE;
- spin_unlock_irq(&s2idle_lock);
+ raw_spin_unlock_irq(&s2idle_lock);
trace_suspend_resume(TPS("machine_suspend"), PM_SUSPEND_TO_IDLE, false);
}
@@ -156,12 +157,12 @@ void s2idle_wake(void)
{
unsigned long flags;
- spin_lock_irqsave(&s2idle_lock, flags);
+ raw_spin_lock_irqsave(&s2idle_lock, flags);
if (s2idle_state > S2IDLE_STATE_NONE) {
s2idle_state = S2IDLE_STATE_WAKE;
- wake_up(&s2idle_wait_head);
+ swake_up(&s2idle_wait_head);
}
- spin_unlock_irqrestore(&s2idle_lock, flags);
+ raw_spin_unlock_irqrestore(&s2idle_lock, flags);
}
EXPORT_SYMBOL_GPL(s2idle_wake);
@@ -428,6 +429,8 @@ static int suspend_enter(suspend_state_t state, bool *wakeup)
arch_suspend_disable_irqs();
BUG_ON(!irqs_disabled());
+ system_state = SYSTEM_SUSPEND;
+
error = syscore_suspend();
if (!error) {
*wakeup = pm_wakeup_pending();
@@ -443,6 +446,8 @@ static int suspend_enter(suspend_state_t state, bool *wakeup)
syscore_resume();
}
+ system_state = SYSTEM_RUNNING;
+
arch_suspend_enable_irqs();
BUG_ON(irqs_disabled());
@@ -560,7 +565,7 @@ static int enter_state(suspend_state_t state)
#ifndef CONFIG_SUSPEND_SKIP_SYNC
trace_suspend_resume(TPS("sync_filesystems"), 0, true);
pr_info("Syncing filesystems ... ");
- sys_sync();
+ ksys_sync();
pr_cont("done.\n");
trace_suspend_resume(TPS("sync_filesystems"), 0, false);
#endif
diff --git a/kernel/power/swap.c b/kernel/power/swap.c
index 11b4282c2d20..1efcb5b0c3ed 100644
--- a/kernel/power/swap.c
+++ b/kernel/power/swap.c
@@ -269,7 +269,7 @@ static int hib_submit_io(int op, int op_flags, pgoff_t page_off, void *addr,
struct bio *bio;
int error = 0;
- bio = bio_alloc(__GFP_RECLAIM | __GFP_HIGH, 1);
+ bio = bio_alloc(GFP_NOIO | __GFP_HIGH, 1);
bio->bi_iter.bi_sector = page_off * (PAGE_SIZE >> 9);
bio_set_dev(bio, hib_resume_bdev);
bio_set_op_attrs(bio, op, op_flags);
@@ -376,7 +376,7 @@ static int write_page(void *buf, sector_t offset, struct hib_bio_batch *hb)
return -ENOSPC;
if (hb) {
- src = (void *)__get_free_page(__GFP_RECLAIM | __GFP_NOWARN |
+ src = (void *)__get_free_page(GFP_NOIO | __GFP_NOWARN |
__GFP_NORETRY);
if (src) {
copy_page(src, buf);
@@ -384,7 +384,7 @@ static int write_page(void *buf, sector_t offset, struct hib_bio_batch *hb)
ret = hib_wait_io(hb); /* Free pages */
if (ret)
return ret;
- src = (void *)__get_free_page(__GFP_RECLAIM |
+ src = (void *)__get_free_page(GFP_NOIO |
__GFP_NOWARN |
__GFP_NORETRY);
if (src) {
@@ -691,7 +691,7 @@ static int save_image_lzo(struct swap_map_handle *handle,
nr_threads = num_online_cpus() - 1;
nr_threads = clamp_val(nr_threads, 1, LZO_THREADS);
- page = (void *)__get_free_page(__GFP_RECLAIM | __GFP_HIGH);
+ page = (void *)__get_free_page(GFP_NOIO | __GFP_HIGH);
if (!page) {
pr_err("Failed to allocate LZO page\n");
ret = -ENOMEM;
@@ -989,7 +989,7 @@ static int get_swap_reader(struct swap_map_handle *handle,
last = tmp;
tmp->map = (struct swap_map_page *)
- __get_free_page(__GFP_RECLAIM | __GFP_HIGH);
+ __get_free_page(GFP_NOIO | __GFP_HIGH);
if (!tmp->map) {
release_swap_reader(handle);
return -ENOMEM;
@@ -1261,8 +1261,8 @@ static int load_image_lzo(struct swap_map_handle *handle,
for (i = 0; i < read_pages; i++) {
page[i] = (void *)__get_free_page(i < LZO_CMP_PAGES ?
- __GFP_RECLAIM | __GFP_HIGH :
- __GFP_RECLAIM | __GFP_NOWARN |
+ GFP_NOIO | __GFP_HIGH :
+ GFP_NOIO | __GFP_NOWARN |
__GFP_NORETRY);
if (!page[i]) {
diff --git a/kernel/power/user.c b/kernel/power/user.c
index 22df9f7ff672..abd225550271 100644
--- a/kernel/power/user.c
+++ b/kernel/power/user.c
@@ -186,6 +186,11 @@ static ssize_t snapshot_write(struct file *filp, const char __user *buf,
res = PAGE_SIZE - pg_offp;
}
+ if (!data_of(data->handle)) {
+ res = -EINVAL;
+ goto unlock;
+ }
+
res = simple_write_to_buffer(data_of(data->handle), res, &pg_offp,
buf, count);
if (res > 0)
@@ -224,7 +229,7 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd,
break;
printk("Syncing filesystems ... ");
- sys_sync();
+ ksys_sync();
printk("done.\n");
error = freeze_processes();
diff --git a/kernel/power/wakelock.c b/kernel/power/wakelock.c
index dfba59be190b..4210152e56f0 100644
--- a/kernel/power/wakelock.c
+++ b/kernel/power/wakelock.c
@@ -188,6 +188,7 @@ static struct wakelock *wakelock_lookup_add(const char *name, size_t len,
return ERR_PTR(-ENOMEM);
}
wl->ws.name = wl->name;
+ wl->ws.last_time = ktime_get();
wakeup_source_add(&wl->ws);
rb_link_node(&wl->node, parent, node);
rb_insert_color(&wl->node, &wakelocks_tree);
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index f274fbef821d..247808333ba4 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -42,7 +42,6 @@
#include <linux/rculist.h>
#include <linux/poll.h>
#include <linux/irq_work.h>
-#include <linux/utsname.h>
#include <linux/ctype.h>
#include <linux/uio.h>
#include <linux/sched/clock.h>
@@ -52,6 +51,7 @@
#include <linux/uaccess.h>
#include <asm/sections.h>
+#include <trace/events/initcall.h>
#define CREATE_TRACE_POINTS
#include <trace/events/printk.h>
@@ -1908,6 +1908,7 @@ asmlinkage int vprintk_emit(int facility, int level,
preempt_enable();
}
+ wake_up_klogd();
return printed_len;
}
EXPORT_SYMBOL(vprintk_emit);
@@ -2162,7 +2163,7 @@ void suspend_console(void)
{
if (!console_suspend_enabled)
return;
- printk("Suspending console(s) (use no_console_suspend to debug)\n");
+ pr_info("Suspending console(s) (use no_console_suspend to debug)\n");
console_lock();
console_suspended = 1;
up_console_sem();
@@ -2289,9 +2290,7 @@ void console_unlock(void)
{
static char ext_text[CONSOLE_EXT_LOG_MAX];
static char text[LOG_LINE_MAX + PREFIX_MAX];
- static u64 seen_seq;
unsigned long flags;
- bool wake_klogd = false;
bool do_cond_resched, retry;
if (console_suspended) {
@@ -2335,11 +2334,6 @@ again:
printk_safe_enter_irqsave(flags);
raw_spin_lock(&logbuf_lock);
- if (seen_seq != log_next_seq) {
- wake_klogd = true;
- seen_seq = log_next_seq;
- }
-
if (console_seq < log_first_seq) {
len = sprintf(text, "** %u printk messages dropped **\n",
(unsigned)(log_first_seq - console_seq));
@@ -2397,7 +2391,7 @@ skip:
if (console_lock_spinning_disable_and_check()) {
printk_safe_exit_irqrestore(flags);
- goto out;
+ return;
}
printk_safe_exit_irqrestore(flags);
@@ -2429,10 +2423,6 @@ skip:
if (retry && console_trylock())
goto again;
-
-out:
- if (wake_klogd)
- wake_up_klogd();
}
EXPORT_SYMBOL(console_unlock);
@@ -2781,6 +2771,7 @@ EXPORT_SYMBOL(unregister_console);
*/
void __init console_init(void)
{
+ int ret;
initcall_t *call;
/* Setup the default TTY line discipline. */
@@ -2791,8 +2782,11 @@ void __init console_init(void)
* inform about problems etc..
*/
call = __con_initcall_start;
+ trace_initcall_level("console");
while (call < __con_initcall_end) {
- (*call)();
+ trace_initcall_start((*call));
+ ret = (*call)();
+ trace_initcall_finish((*call), ret);
call++;
}
}
@@ -3257,60 +3251,4 @@ void kmsg_dump_rewind(struct kmsg_dumper *dumper)
}
EXPORT_SYMBOL_GPL(kmsg_dump_rewind);
-static char dump_stack_arch_desc_str[128];
-
-/**
- * dump_stack_set_arch_desc - set arch-specific str to show with task dumps
- * @fmt: printf-style format string
- * @...: arguments for the format string
- *
- * The configured string will be printed right after utsname during task
- * dumps. Usually used to add arch-specific system identifiers. If an
- * arch wants to make use of such an ID string, it should initialize this
- * as soon as possible during boot.
- */
-void __init dump_stack_set_arch_desc(const char *fmt, ...)
-{
- va_list args;
-
- va_start(args, fmt);
- vsnprintf(dump_stack_arch_desc_str, sizeof(dump_stack_arch_desc_str),
- fmt, args);
- va_end(args);
-}
-
-/**
- * dump_stack_print_info - print generic debug info for dump_stack()
- * @log_lvl: log level
- *
- * Arch-specific dump_stack() implementations can use this function to
- * print out the same debug information as the generic dump_stack().
- */
-void dump_stack_print_info(const char *log_lvl)
-{
- printk("%sCPU: %d PID: %d Comm: %.20s %s %s %.*s\n",
- log_lvl, raw_smp_processor_id(), current->pid, current->comm,
- print_tainted(), init_utsname()->release,
- (int)strcspn(init_utsname()->version, " "),
- init_utsname()->version);
-
- if (dump_stack_arch_desc_str[0] != '\0')
- printk("%sHardware name: %s\n",
- log_lvl, dump_stack_arch_desc_str);
-
- print_worker_info(log_lvl, current);
-}
-
-/**
- * show_regs_print_info - print generic debug info for show_regs()
- * @log_lvl: log level
- *
- * show_regs() implementations can use this function to print out generic
- * debug information.
- */
-void show_regs_print_info(const char *log_lvl)
-{
- dump_stack_print_info(log_lvl);
-}
-
#endif
diff --git a/kernel/printk/printk_safe.c b/kernel/printk/printk_safe.c
index 3e3c2004bb23..d7d091309054 100644
--- a/kernel/printk/printk_safe.c
+++ b/kernel/printk/printk_safe.c
@@ -82,6 +82,7 @@ static __printf(2, 0) int printk_safe_log_store(struct printk_safe_seq_buf *s,
{
int add;
size_t len;
+ va_list ap;
again:
len = atomic_read(&s->len);
@@ -100,7 +101,9 @@ again:
if (!len)
smp_rmb();
- add = vscnprintf(s->buffer + len, sizeof(s->buffer) - len, fmt, args);
+ va_copy(ap, args);
+ add = vscnprintf(s->buffer + len, sizeof(s->buffer) - len, fmt, ap);
+ va_end(ap);
if (!add)
return 0;
@@ -278,7 +281,7 @@ void printk_safe_flush_on_panic(void)
* Make sure that we could access the main ring buffer.
* Do not risk a double release when more CPUs are up.
*/
- if (in_nmi() && raw_spin_is_locked(&logbuf_lock)) {
+ if (raw_spin_is_locked(&logbuf_lock)) {
if (num_online_cpus() > 1)
return;
diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h
index 6334f2c1abd0..40cea6735c2d 100644
--- a/kernel/rcu/rcu.h
+++ b/kernel/rcu/rcu.h
@@ -77,12 +77,18 @@ static inline void rcu_seq_start(unsigned long *sp)
WARN_ON_ONCE(rcu_seq_state(*sp) != 1);
}
+/* Compute the end-of-grace-period value for the specified sequence number. */
+static inline unsigned long rcu_seq_endval(unsigned long *sp)
+{
+ return (*sp | RCU_SEQ_STATE_MASK) + 1;
+}
+
/* Adjust sequence number for end of update-side operation. */
static inline void rcu_seq_end(unsigned long *sp)
{
smp_mb(); /* Ensure update-side operation before counter increment. */
WARN_ON_ONCE(!rcu_seq_state(*sp));
- WRITE_ONCE(*sp, (*sp | RCU_SEQ_STATE_MASK) + 1);
+ WRITE_ONCE(*sp, rcu_seq_endval(sp));
}
/* Take a snapshot of the update side's sequence number. */
@@ -264,6 +270,12 @@ static inline void rcu_init_levelspread(int *levelspread, const int *levelcnt)
}
}
+/* Returns first leaf rcu_node of the specified RCU flavor. */
+#define rcu_first_leaf_node(rsp) ((rsp)->level[rcu_num_lvls - 1])
+
+/* Is this rcu_node a leaf? */
+#define rcu_is_leaf_node(rnp) ((rnp)->level == rcu_num_lvls - 1)
+
/*
* Do a full breadth-first scan of the rcu_node structures for the
* specified rcu_state structure.
@@ -278,8 +290,7 @@ static inline void rcu_init_levelspread(int *levelspread, const int *levelcnt)
* rcu_node tree with but one rcu_node structure, this loop is a no-op.
*/
#define rcu_for_each_nonleaf_node_breadth_first(rsp, rnp) \
- for ((rnp) = &(rsp)->node[0]; \
- (rnp) < (rsp)->level[rcu_num_lvls - 1]; (rnp)++)
+ for ((rnp) = &(rsp)->node[0]; !rcu_is_leaf_node(rsp, rnp); (rnp)++)
/*
* Scan the leaves of the rcu_node hierarchy for the specified rcu_state
@@ -288,16 +299,26 @@ static inline void rcu_init_levelspread(int *levelspread, const int *levelcnt)
* It is still a leaf node, even if it is also the root node.
*/
#define rcu_for_each_leaf_node(rsp, rnp) \
- for ((rnp) = (rsp)->level[rcu_num_lvls - 1]; \
+ for ((rnp) = rcu_first_leaf_node(rsp); \
(rnp) < &(rsp)->node[rcu_num_nodes]; (rnp)++)
/*
* Iterate over all possible CPUs in a leaf RCU node.
*/
#define for_each_leaf_node_possible_cpu(rnp, cpu) \
- for ((cpu) = cpumask_next(rnp->grplo - 1, cpu_possible_mask); \
- cpu <= rnp->grphi; \
- cpu = cpumask_next((cpu), cpu_possible_mask))
+ for ((cpu) = cpumask_next((rnp)->grplo - 1, cpu_possible_mask); \
+ (cpu) <= rnp->grphi; \
+ (cpu) = cpumask_next((cpu), cpu_possible_mask))
+
+/*
+ * Iterate over all CPUs in a leaf RCU node's specified mask.
+ */
+#define rcu_find_next_bit(rnp, cpu, mask) \
+ ((rnp)->grplo + find_next_bit(&(mask), BITS_PER_LONG, (cpu)))
+#define for_each_leaf_node_cpu_mask(rnp, cpu, mask) \
+ for ((cpu) = rcu_find_next_bit((rnp), 0, (mask)); \
+ (cpu) <= rnp->grphi; \
+ (cpu) = rcu_find_next_bit((rnp), (cpu) + 1 - (rnp->grplo), (mask)))
/*
* Wrappers for the rcu_node::lock acquire and release.
@@ -337,7 +358,7 @@ do { \
} while (0)
#define raw_spin_unlock_irqrestore_rcu_node(p, flags) \
- raw_spin_unlock_irqrestore(&ACCESS_PRIVATE(p, lock), flags) \
+ raw_spin_unlock_irqrestore(&ACCESS_PRIVATE(p, lock), flags)
#define raw_spin_trylock_rcu_node(p) \
({ \
@@ -348,6 +369,9 @@ do { \
___locked; \
})
+#define raw_lockdep_assert_held_rcu_node(p) \
+ lockdep_assert_held(&ACCESS_PRIVATE(p, lock))
+
#endif /* #if defined(SRCU) || !defined(TINY_RCU) */
#ifdef CONFIG_TINY_RCU
@@ -356,24 +380,20 @@ static inline bool rcu_gp_is_normal(void) { return true; }
static inline bool rcu_gp_is_expedited(void) { return false; }
static inline void rcu_expedite_gp(void) { }
static inline void rcu_unexpedite_gp(void) { }
+static inline void rcu_request_urgent_qs_task(struct task_struct *t) { }
#else /* #ifdef CONFIG_TINY_RCU */
bool rcu_gp_is_normal(void); /* Internal RCU use. */
bool rcu_gp_is_expedited(void); /* Internal RCU use. */
void rcu_expedite_gp(void);
void rcu_unexpedite_gp(void);
void rcupdate_announce_bootup_oddness(void);
+void rcu_request_urgent_qs_task(struct task_struct *t);
#endif /* #else #ifdef CONFIG_TINY_RCU */
#define RCU_SCHEDULER_INACTIVE 0
#define RCU_SCHEDULER_INIT 1
#define RCU_SCHEDULER_RUNNING 2
-#ifdef CONFIG_TINY_RCU
-static inline void rcu_request_urgent_qs_task(struct task_struct *t) { }
-#else /* #ifdef CONFIG_TINY_RCU */
-void rcu_request_urgent_qs_task(struct task_struct *t);
-#endif /* #else #ifdef CONFIG_TINY_RCU */
-
enum rcutorture_type {
RCU_FLAVOR,
RCU_BH_FLAVOR,
@@ -470,6 +490,8 @@ void show_rcu_gp_kthreads(void);
void rcu_force_quiescent_state(void);
void rcu_bh_force_quiescent_state(void);
void rcu_sched_force_quiescent_state(void);
+extern struct workqueue_struct *rcu_gp_wq;
+extern struct workqueue_struct *rcu_par_gp_wq;
#endif /* #else #ifdef CONFIG_TINY_RCU */
#ifdef CONFIG_RCU_NOCB_CPU
diff --git a/kernel/rcu/rcu_segcblist.c b/kernel/rcu/rcu_segcblist.c
index 88cba7c2956c..5aff271adf1e 100644
--- a/kernel/rcu/rcu_segcblist.c
+++ b/kernel/rcu/rcu_segcblist.c
@@ -404,24 +404,6 @@ bool rcu_segcblist_accelerate(struct rcu_segcblist *rsclp, unsigned long seq)
}
/*
- * Scan the specified rcu_segcblist structure for callbacks that need
- * a grace period later than the one specified by "seq". We don't look
- * at the RCU_DONE_TAIL or RCU_NEXT_TAIL segments because they don't
- * have a grace-period sequence number.
- */
-bool rcu_segcblist_future_gp_needed(struct rcu_segcblist *rsclp,
- unsigned long seq)
-{
- int i;
-
- for (i = RCU_WAIT_TAIL; i < RCU_NEXT_TAIL; i++)
- if (rsclp->tails[i - 1] != rsclp->tails[i] &&
- ULONG_CMP_LT(seq, rsclp->gp_seq[i]))
- return true;
- return false;
-}
-
-/*
* Merge the source rcu_segcblist structure into the destination
* rcu_segcblist structure, then initialize the source. Any pending
* callbacks from the source get to start over. It is best to
diff --git a/kernel/rcu/rcu_segcblist.h b/kernel/rcu/rcu_segcblist.h
index 581c12b63544..948470cef385 100644
--- a/kernel/rcu/rcu_segcblist.h
+++ b/kernel/rcu/rcu_segcblist.h
@@ -134,7 +134,5 @@ void rcu_segcblist_insert_pend_cbs(struct rcu_segcblist *rsclp,
struct rcu_cblist *rclp);
void rcu_segcblist_advance(struct rcu_segcblist *rsclp, unsigned long seq);
bool rcu_segcblist_accelerate(struct rcu_segcblist *rsclp, unsigned long seq);
-bool rcu_segcblist_future_gp_needed(struct rcu_segcblist *rsclp,
- unsigned long seq);
void rcu_segcblist_merge(struct rcu_segcblist *dst_rsclp,
struct rcu_segcblist *src_rsclp);
diff --git a/kernel/rcu/rcuperf.c b/kernel/rcu/rcuperf.c
index d1ebdf9868bb..e232846516b3 100644
--- a/kernel/rcu/rcuperf.c
+++ b/kernel/rcu/rcuperf.c
@@ -61,11 +61,30 @@ MODULE_AUTHOR("Paul E. McKenney <paulmck@linux.vnet.ibm.com>");
#define VERBOSE_PERFOUT_ERRSTRING(s) \
do { if (verbose) pr_alert("%s" PERF_FLAG "!!! %s\n", perf_type, s); } while (0)
+/*
+ * The intended use cases for the nreaders and nwriters module parameters
+ * are as follows:
+ *
+ * 1. Specify only the nr_cpus kernel boot parameter. This will
+ * set both nreaders and nwriters to the value specified by
+ * nr_cpus for a mixed reader/writer test.
+ *
+ * 2. Specify the nr_cpus kernel boot parameter, but set
+ * rcuperf.nreaders to zero. This will set nwriters to the
+ * value specified by nr_cpus for an update-only test.
+ *
+ * 3. Specify the nr_cpus kernel boot parameter, but set
+ * rcuperf.nwriters to zero. This will set nreaders to the
+ * value specified by nr_cpus for a read-only test.
+ *
+ * Various other use cases may of course be specified.
+ */
+
torture_param(bool, gp_async, false, "Use asynchronous GP wait primitives");
torture_param(int, gp_async_max, 1000, "Max # outstanding waits per reader");
torture_param(bool, gp_exp, false, "Use expedited GP wait primitives");
torture_param(int, holdoff, 10, "Holdoff time before test start (s)");
-torture_param(int, nreaders, 0, "Number of RCU reader threads");
+torture_param(int, nreaders, -1, "Number of RCU reader threads");
torture_param(int, nwriters, -1, "Number of RCU updater threads");
torture_param(bool, shutdown, !IS_ENABLED(MODULE),
"Shutdown at end of performance tests.");
@@ -350,7 +369,7 @@ static bool __maybe_unused torturing_tasks(void)
*/
static void rcu_perf_wait_shutdown(void)
{
- cond_resched_rcu_qs();
+ cond_resched_tasks_rcu_qs();
if (atomic_read(&n_rcu_perf_writer_finished) < nrealwriters)
return;
while (!torture_must_stop())
diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c
index 308e6fdbced8..e628fcfd1bde 100644
--- a/kernel/rcu/rcutorture.c
+++ b/kernel/rcu/rcutorture.c
@@ -593,7 +593,12 @@ static void srcu_torture_init(void)
static void srcu_torture_cleanup(void)
{
- cleanup_srcu_struct(&srcu_ctld);
+ static DEFINE_TORTURE_RANDOM(rand);
+
+ if (torture_random(&rand) & 0x800)
+ cleanup_srcu_struct(&srcu_ctld);
+ else
+ cleanup_srcu_struct_quiesced(&srcu_ctld);
srcu_ctlp = &srcu_ctl; /* In case of a later rcutorture run. */
}
@@ -909,34 +914,38 @@ rcu_torture_writer(void *arg)
int nsynctypes = 0;
VERBOSE_TOROUT_STRING("rcu_torture_writer task started");
- if (!can_expedite) {
+ if (!can_expedite)
pr_alert("%s" TORTURE_FLAG
- " GP expediting controlled from boot/sysfs for %s,\n",
+ " GP expediting controlled from boot/sysfs for %s.\n",
torture_type, cur_ops->name);
- pr_alert("%s" TORTURE_FLAG
- " Disabled dynamic grace-period expediting.\n",
- torture_type);
- }
/* Initialize synctype[] array. If none set, take default. */
if (!gp_cond1 && !gp_exp1 && !gp_normal1 && !gp_sync1)
gp_cond1 = gp_exp1 = gp_normal1 = gp_sync1 = true;
- if (gp_cond1 && cur_ops->get_state && cur_ops->cond_sync)
+ if (gp_cond1 && cur_ops->get_state && cur_ops->cond_sync) {
synctype[nsynctypes++] = RTWS_COND_GET;
- else if (gp_cond && (!cur_ops->get_state || !cur_ops->cond_sync))
- pr_alert("rcu_torture_writer: gp_cond without primitives.\n");
- if (gp_exp1 && cur_ops->exp_sync)
+ pr_info("%s: Testing conditional GPs.\n", __func__);
+ } else if (gp_cond && (!cur_ops->get_state || !cur_ops->cond_sync)) {
+ pr_alert("%s: gp_cond without primitives.\n", __func__);
+ }
+ if (gp_exp1 && cur_ops->exp_sync) {
synctype[nsynctypes++] = RTWS_EXP_SYNC;
- else if (gp_exp && !cur_ops->exp_sync)
- pr_alert("rcu_torture_writer: gp_exp without primitives.\n");
- if (gp_normal1 && cur_ops->deferred_free)
+ pr_info("%s: Testing expedited GPs.\n", __func__);
+ } else if (gp_exp && !cur_ops->exp_sync) {
+ pr_alert("%s: gp_exp without primitives.\n", __func__);
+ }
+ if (gp_normal1 && cur_ops->deferred_free) {
synctype[nsynctypes++] = RTWS_DEF_FREE;
- else if (gp_normal && !cur_ops->deferred_free)
- pr_alert("rcu_torture_writer: gp_normal without primitives.\n");
- if (gp_sync1 && cur_ops->sync)
+ pr_info("%s: Testing asynchronous GPs.\n", __func__);
+ } else if (gp_normal && !cur_ops->deferred_free) {
+ pr_alert("%s: gp_normal without primitives.\n", __func__);
+ }
+ if (gp_sync1 && cur_ops->sync) {
synctype[nsynctypes++] = RTWS_SYNC;
- else if (gp_sync && !cur_ops->sync)
- pr_alert("rcu_torture_writer: gp_sync without primitives.\n");
+ pr_info("%s: Testing normal GPs.\n", __func__);
+ } else if (gp_sync && !cur_ops->sync) {
+ pr_alert("%s: gp_sync without primitives.\n", __func__);
+ }
if (WARN_ONCE(nsynctypes == 0,
"rcu_torture_writer: No update-side primitives.\n")) {
/*
@@ -1011,6 +1020,9 @@ rcu_torture_writer(void *arg)
rcu_unexpedite_gp();
if (++expediting > 3)
expediting = -expediting;
+ } else if (!can_expedite) { /* Disabled during boot, recheck. */
+ can_expedite = !rcu_gp_is_expedited() &&
+ !rcu_gp_is_normal();
}
rcu_torture_writer_state = RTWS_STUTTER;
stutter_wait("rcu_torture_writer");
@@ -1021,6 +1033,10 @@ rcu_torture_writer(void *arg)
while (can_expedite && expediting++ < 0)
rcu_unexpedite_gp();
WARN_ON_ONCE(can_expedite && rcu_gp_is_expedited());
+ if (!can_expedite)
+ pr_alert("%s" TORTURE_FLAG
+ " Dynamic grace-period expediting was disabled.\n",
+ torture_type);
rcu_torture_writer_state = RTWS_STOPPING;
torture_kthread_stopping("rcu_torture_writer");
return 0;
@@ -1045,13 +1061,13 @@ rcu_torture_fakewriter(void *arg)
torture_random(&rand) % (nfakewriters * 8) == 0) {
cur_ops->cb_barrier();
} else if (gp_normal == gp_exp) {
- if (torture_random(&rand) & 0x80)
+ if (cur_ops->sync && torture_random(&rand) & 0x80)
cur_ops->sync();
- else
+ else if (cur_ops->exp_sync)
cur_ops->exp_sync();
- } else if (gp_normal) {
+ } else if (gp_normal && cur_ops->sync) {
cur_ops->sync();
- } else {
+ } else if (cur_ops->exp_sync) {
cur_ops->exp_sync();
}
stutter_wait("rcu_torture_fakewriter");
@@ -1557,11 +1573,10 @@ static int rcu_torture_barrier_init(void)
atomic_set(&barrier_cbs_count, 0);
atomic_set(&barrier_cbs_invoked, 0);
barrier_cbs_tasks =
- kzalloc(n_barrier_cbs * sizeof(barrier_cbs_tasks[0]),
+ kcalloc(n_barrier_cbs, sizeof(barrier_cbs_tasks[0]),
GFP_KERNEL);
barrier_cbs_wq =
- kzalloc(n_barrier_cbs * sizeof(barrier_cbs_wq[0]),
- GFP_KERNEL);
+ kcalloc(n_barrier_cbs, sizeof(barrier_cbs_wq[0]), GFP_KERNEL);
if (barrier_cbs_tasks == NULL || !barrier_cbs_wq)
return -ENOMEM;
for (i = 0; i < n_barrier_cbs; i++) {
@@ -1599,6 +1614,9 @@ static enum cpuhp_state rcutor_hp;
static void
rcu_torture_cleanup(void)
{
+ int flags = 0;
+ unsigned long gpnum = 0;
+ unsigned long completed = 0;
int i;
rcutorture_record_test_transition();
@@ -1629,6 +1647,11 @@ rcu_torture_cleanup(void)
fakewriter_tasks = NULL;
}
+ rcutorture_get_gp_data(cur_ops->ttype, &flags, &gpnum, &completed);
+ srcutorture_get_gp_data(cur_ops->ttype, srcu_ctlp,
+ &flags, &gpnum, &completed);
+ pr_alert("%s: End-test grace-period state: g%lu c%lu f%#x\n",
+ cur_ops->name, gpnum, completed, flags);
torture_stop_kthread(rcu_torture_stats, stats_task);
torture_stop_kthread(rcu_torture_fqs, fqs_task);
for (i = 0; i < ncbflooders; i++)
@@ -1674,7 +1697,7 @@ static void rcu_torture_err_cb(struct rcu_head *rhp)
* next grace period. Unlikely, but can happen. If it
* does happen, the debug-objects subsystem won't have splatted.
*/
- pr_alert("rcutorture: duplicated callback was invoked.\n");
+ pr_alert("%s: duplicated callback was invoked.\n", KBUILD_MODNAME);
}
#endif /* #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD */
@@ -1691,7 +1714,7 @@ static void rcu_test_debug_objects(void)
init_rcu_head_on_stack(&rh1);
init_rcu_head_on_stack(&rh2);
- pr_alert("rcutorture: WARN: Duplicate call_rcu() test starting.\n");
+ pr_alert("%s: WARN: Duplicate call_rcu() test starting.\n", KBUILD_MODNAME);
/* Try to queue the rh2 pair of callbacks for the same grace period. */
preempt_disable(); /* Prevent preemption from interrupting test. */
@@ -1706,11 +1729,11 @@ static void rcu_test_debug_objects(void)
/* Wait for them all to get done so we can safely return. */
rcu_barrier();
- pr_alert("rcutorture: WARN: Duplicate call_rcu() test complete.\n");
+ pr_alert("%s: WARN: Duplicate call_rcu() test complete.\n", KBUILD_MODNAME);
destroy_rcu_head_on_stack(&rh1);
destroy_rcu_head_on_stack(&rh2);
#else /* #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD */
- pr_alert("rcutorture: !CONFIG_DEBUG_OBJECTS_RCU_HEAD, not testing duplicate call_rcu()\n");
+ pr_alert("%s: !CONFIG_DEBUG_OBJECTS_RCU_HEAD, not testing duplicate call_rcu()\n", KBUILD_MODNAME);
#endif /* #else #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD */
}
@@ -1799,7 +1822,7 @@ rcu_torture_init(void)
if (firsterr)
goto unwind;
if (nfakewriters > 0) {
- fakewriter_tasks = kzalloc(nfakewriters *
+ fakewriter_tasks = kcalloc(nfakewriters,
sizeof(fakewriter_tasks[0]),
GFP_KERNEL);
if (fakewriter_tasks == NULL) {
@@ -1814,7 +1837,7 @@ rcu_torture_init(void)
if (firsterr)
goto unwind;
}
- reader_tasks = kzalloc(nrealreaders * sizeof(reader_tasks[0]),
+ reader_tasks = kcalloc(nrealreaders, sizeof(reader_tasks[0]),
GFP_KERNEL);
if (reader_tasks == NULL) {
VERBOSE_TOROUT_ERRSTRING("out of memory");
diff --git a/kernel/rcu/srcutiny.c b/kernel/rcu/srcutiny.c
index 76ac5f50b2c7..622792abe41a 100644
--- a/kernel/rcu/srcutiny.c
+++ b/kernel/rcu/srcutiny.c
@@ -86,16 +86,19 @@ EXPORT_SYMBOL_GPL(init_srcu_struct);
* Must invoke this after you are finished using a given srcu_struct that
* was initialized via init_srcu_struct(), else you leak memory.
*/
-void cleanup_srcu_struct(struct srcu_struct *sp)
+void _cleanup_srcu_struct(struct srcu_struct *sp, bool quiesced)
{
WARN_ON(sp->srcu_lock_nesting[0] || sp->srcu_lock_nesting[1]);
- flush_work(&sp->srcu_work);
+ if (quiesced)
+ WARN_ON(work_pending(&sp->srcu_work));
+ else
+ flush_work(&sp->srcu_work);
WARN_ON(sp->srcu_gp_running);
WARN_ON(sp->srcu_gp_waiting);
WARN_ON(sp->srcu_cb_head);
WARN_ON(&sp->srcu_cb_head != sp->srcu_cb_tail);
}
-EXPORT_SYMBOL_GPL(cleanup_srcu_struct);
+EXPORT_SYMBOL_GPL(_cleanup_srcu_struct);
/*
* Removes the count for the old reader from the appropriate element of
diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c
index d5cea81378cc..b4123d7a2cec 100644
--- a/kernel/rcu/srcutree.c
+++ b/kernel/rcu/srcutree.c
@@ -366,33 +366,37 @@ static unsigned long srcu_get_delay(struct srcu_struct *sp)
return SRCU_INTERVAL;
}
-/**
- * cleanup_srcu_struct - deconstruct a sleep-RCU structure
- * @sp: structure to clean up.
- *
- * Must invoke this after you are finished using a given srcu_struct that
- * was initialized via init_srcu_struct(), else you leak memory.
- */
-void cleanup_srcu_struct(struct srcu_struct *sp)
+/* Helper for cleanup_srcu_struct() and cleanup_srcu_struct_quiesced(). */
+void _cleanup_srcu_struct(struct srcu_struct *sp, bool quiesced)
{
int cpu;
if (WARN_ON(!srcu_get_delay(sp)))
- return; /* Leakage unless caller handles error. */
+ return; /* Just leak it! */
if (WARN_ON(srcu_readers_active(sp)))
- return; /* Leakage unless caller handles error. */
- flush_delayed_work(&sp->work);
+ return; /* Just leak it! */
+ if (quiesced) {
+ if (WARN_ON(delayed_work_pending(&sp->work)))
+ return; /* Just leak it! */
+ } else {
+ flush_delayed_work(&sp->work);
+ }
for_each_possible_cpu(cpu)
- flush_delayed_work(&per_cpu_ptr(sp->sda, cpu)->work);
+ if (quiesced) {
+ if (WARN_ON(delayed_work_pending(&per_cpu_ptr(sp->sda, cpu)->work)))
+ return; /* Just leak it! */
+ } else {
+ flush_delayed_work(&per_cpu_ptr(sp->sda, cpu)->work);
+ }
if (WARN_ON(rcu_seq_state(READ_ONCE(sp->srcu_gp_seq)) != SRCU_STATE_IDLE) ||
WARN_ON(srcu_readers_active(sp))) {
- pr_info("cleanup_srcu_struct: Active srcu_struct %p state: %d\n", sp, rcu_seq_state(READ_ONCE(sp->srcu_gp_seq)));
+ pr_info("%s: Active srcu_struct %p state: %d\n", __func__, sp, rcu_seq_state(READ_ONCE(sp->srcu_gp_seq)));
return; /* Caller forgot to stop doing call_srcu()? */
}
free_percpu(sp->sda);
sp->sda = NULL;
}
-EXPORT_SYMBOL_GPL(cleanup_srcu_struct);
+EXPORT_SYMBOL_GPL(_cleanup_srcu_struct);
/*
* Counts the new reader in the appropriate per-CPU element of the
@@ -439,7 +443,7 @@ static void srcu_gp_start(struct srcu_struct *sp)
struct srcu_data *sdp = this_cpu_ptr(sp->sda);
int state;
- lockdep_assert_held(&sp->lock);
+ lockdep_assert_held(&ACCESS_PRIVATE(sp, lock));
WARN_ON_ONCE(ULONG_CMP_GE(sp->srcu_gp_seq, sp->srcu_gp_seq_needed));
rcu_segcblist_advance(&sdp->srcu_cblist,
rcu_seq_current(&sp->srcu_gp_seq));
@@ -492,8 +496,7 @@ static bool srcu_queue_delayed_work_on(int cpu, struct workqueue_struct *wq,
*/
static void srcu_schedule_cbs_sdp(struct srcu_data *sdp, unsigned long delay)
{
- srcu_queue_delayed_work_on(sdp->cpu, system_power_efficient_wq,
- &sdp->work, delay);
+ srcu_queue_delayed_work_on(sdp->cpu, rcu_gp_wq, &sdp->work, delay);
}
/*
@@ -527,11 +530,11 @@ static void srcu_gp_end(struct srcu_struct *sp)
{
unsigned long cbdelay;
bool cbs;
+ bool last_lvl;
int cpu;
unsigned long flags;
unsigned long gpseq;
int idx;
- int idxnext;
unsigned long mask;
struct srcu_data *sdp;
struct srcu_node *snp;
@@ -555,11 +558,11 @@ static void srcu_gp_end(struct srcu_struct *sp)
/* Initiate callback invocation as needed. */
idx = rcu_seq_ctr(gpseq) % ARRAY_SIZE(snp->srcu_have_cbs);
- idxnext = (idx + 1) % ARRAY_SIZE(snp->srcu_have_cbs);
rcu_for_each_node_breadth_first(sp, snp) {
spin_lock_irq_rcu_node(snp);
cbs = false;
- if (snp >= sp->level[rcu_num_lvls - 1])
+ last_lvl = snp >= sp->level[rcu_num_lvls - 1];
+ if (last_lvl)
cbs = snp->srcu_have_cbs[idx] == gpseq;
snp->srcu_have_cbs[idx] = gpseq;
rcu_seq_set_state(&snp->srcu_have_cbs[idx], 1);
@@ -572,13 +575,16 @@ static void srcu_gp_end(struct srcu_struct *sp)
srcu_schedule_cbs_snp(sp, snp, mask, cbdelay);
/* Occasionally prevent srcu_data counter wrap. */
- if (!(gpseq & counter_wrap_check))
+ if (!(gpseq & counter_wrap_check) && last_lvl)
for (cpu = snp->grplo; cpu <= snp->grphi; cpu++) {
sdp = per_cpu_ptr(sp->sda, cpu);
spin_lock_irqsave_rcu_node(sdp, flags);
if (ULONG_CMP_GE(gpseq,
sdp->srcu_gp_seq_needed + 100))
sdp->srcu_gp_seq_needed = gpseq;
+ if (ULONG_CMP_GE(gpseq,
+ sdp->srcu_gp_seq_needed_exp + 100))
+ sdp->srcu_gp_seq_needed_exp = gpseq;
spin_unlock_irqrestore_rcu_node(sdp, flags);
}
}
@@ -593,9 +599,7 @@ static void srcu_gp_end(struct srcu_struct *sp)
ULONG_CMP_LT(gpseq, sp->srcu_gp_seq_needed)) {
srcu_gp_start(sp);
spin_unlock_irq_rcu_node(sp);
- /* Throttle expedited grace periods: Should be rare! */
- srcu_reschedule(sp, rcu_seq_ctr(gpseq) & 0x3ff
- ? 0 : SRCU_INTERVAL);
+ srcu_reschedule(sp, 0);
} else {
spin_unlock_irq_rcu_node(sp);
}
@@ -626,7 +630,7 @@ static void srcu_funnel_exp_start(struct srcu_struct *sp, struct srcu_node *snp,
spin_unlock_irqrestore_rcu_node(snp, flags);
}
spin_lock_irqsave_rcu_node(sp, flags);
- if (!ULONG_CMP_LT(sp->srcu_gp_seq_needed_exp, s))
+ if (ULONG_CMP_LT(sp->srcu_gp_seq_needed_exp, s))
sp->srcu_gp_seq_needed_exp = s;
spin_unlock_irqrestore_rcu_node(sp, flags);
}
@@ -691,8 +695,7 @@ static void srcu_funnel_gp_start(struct srcu_struct *sp, struct srcu_data *sdp,
rcu_seq_state(sp->srcu_gp_seq) == SRCU_STATE_IDLE) {
WARN_ON_ONCE(ULONG_CMP_GE(sp->srcu_gp_seq, sp->srcu_gp_seq_needed));
srcu_gp_start(sp);
- queue_delayed_work(system_power_efficient_wq, &sp->work,
- srcu_get_delay(sp));
+ queue_delayed_work(rcu_gp_wq, &sp->work, srcu_get_delay(sp));
}
spin_unlock_irqrestore_rcu_node(sp, flags);
}
@@ -1225,7 +1228,7 @@ static void srcu_reschedule(struct srcu_struct *sp, unsigned long delay)
spin_unlock_irq_rcu_node(sp);
if (pushgp)
- queue_delayed_work(system_power_efficient_wq, &sp->work, delay);
+ queue_delayed_work(rcu_gp_wq, &sp->work, delay);
}
/*
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 491bdf39f276..aa7cade1b9f3 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -524,8 +524,6 @@ module_param(rcu_kick_kthreads, bool, 0644);
static ulong jiffies_till_sched_qs = HZ / 10;
module_param(jiffies_till_sched_qs, ulong, 0444);
-static bool rcu_start_gp_advanced(struct rcu_state *rsp, struct rcu_node *rnp,
- struct rcu_data *rdp);
static void force_qs_rnp(struct rcu_state *rsp, int (*f)(struct rcu_data *rsp));
static void force_quiescent_state(struct rcu_state *rsp);
static int rcu_pending(void);
@@ -711,44 +709,6 @@ static struct rcu_node *rcu_get_root(struct rcu_state *rsp)
}
/*
- * Is there any need for future grace periods?
- * Interrupts must be disabled. If the caller does not hold the root
- * rnp_node structure's ->lock, the results are advisory only.
- */
-static int rcu_future_needs_gp(struct rcu_state *rsp)
-{
- struct rcu_node *rnp = rcu_get_root(rsp);
- int idx = (READ_ONCE(rnp->completed) + 1) & 0x1;
- int *fp = &rnp->need_future_gp[idx];
-
- lockdep_assert_irqs_disabled();
- return READ_ONCE(*fp);
-}
-
-/*
- * Does the current CPU require a not-yet-started grace period?
- * The caller must have disabled interrupts to prevent races with
- * normal callback registry.
- */
-static bool
-cpu_needs_another_gp(struct rcu_state *rsp, struct rcu_data *rdp)
-{
- lockdep_assert_irqs_disabled();
- if (rcu_gp_in_progress(rsp))
- return false; /* No, a grace period is already in progress. */
- if (rcu_future_needs_gp(rsp))
- return true; /* Yes, a no-CBs CPU needs one. */
- if (!rcu_segcblist_is_enabled(&rdp->cblist))
- return false; /* No, this is a no-CBs (or offline) CPU. */
- if (!rcu_segcblist_restempty(&rdp->cblist, RCU_NEXT_READY_TAIL))
- return true; /* Yes, CPU has newly registered callbacks. */
- if (rcu_segcblist_future_gp_needed(&rdp->cblist,
- READ_ONCE(rsp->completed)))
- return true; /* Yes, CBs for future grace period. */
- return false; /* No grace period needed. */
-}
-
-/*
* Enter an RCU extended quiescent state, which can be either the
* idle loop or adaptive-tickless usermode execution.
*
@@ -1161,7 +1121,7 @@ static int rcu_is_cpu_rrupt_from_idle(void)
*/
static void rcu_gpnum_ovf(struct rcu_node *rnp, struct rcu_data *rdp)
{
- lockdep_assert_held(&rnp->lock);
+ raw_lockdep_assert_held_rcu_node(rnp);
if (ULONG_CMP_LT(READ_ONCE(rdp->gpnum) + ULONG_MAX / 4, rnp->gpnum))
WRITE_ONCE(rdp->gpwrap, true);
if (ULONG_CMP_LT(rdp->rcu_iw_gpnum + ULONG_MAX / 4, rnp->gpnum))
@@ -1234,10 +1194,10 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp)
}
/*
- * Has this CPU encountered a cond_resched_rcu_qs() since the
- * beginning of the grace period? For this to be the case,
- * the CPU has to have noticed the current grace period. This
- * might not be the case for nohz_full CPUs looping in the kernel.
+ * Has this CPU encountered a cond_resched() since the beginning
+ * of the grace period? For this to be the case, the CPU has to
+ * have noticed the current grace period. This might not be the
+ * case for nohz_full CPUs looping in the kernel.
*/
jtsq = jiffies_till_sched_qs;
ruqp = per_cpu_ptr(&rcu_dynticks.rcu_urgent_qs, rdp->cpu);
@@ -1350,6 +1310,7 @@ static void rcu_check_gp_kthread_starvation(struct rcu_state *rsp)
rsp->gp_kthread ? rsp->gp_kthread->state : ~0,
rsp->gp_kthread ? task_cpu(rsp->gp_kthread) : -1);
if (rsp->gp_kthread) {
+ pr_err("RCU grace-period kthread stack dump:\n");
sched_show_task(rsp->gp_kthread);
wake_up_process(rsp->gp_kthread);
}
@@ -1628,7 +1589,7 @@ void rcu_cpu_stall_reset(void)
static unsigned long rcu_cbs_completed(struct rcu_state *rsp,
struct rcu_node *rnp)
{
- lockdep_assert_held(&rnp->lock);
+ raw_lockdep_assert_held_rcu_node(rnp);
/*
* If RCU is idle, we just wait for the next grace period.
@@ -1641,18 +1602,30 @@ static unsigned long rcu_cbs_completed(struct rcu_state *rsp,
return rnp->completed + 1;
/*
+ * If the current rcu_node structure believes that RCU is
+ * idle, and if the rcu_state structure does not yet reflect
+ * the start of a new grace period, then the next grace period
+ * will suffice. The memory barrier is needed to accurately
+ * sample the rsp->gpnum, and pairs with the second lock
+ * acquisition in rcu_gp_init(), which is augmented with
+ * smp_mb__after_unlock_lock() for this purpose.
+ */
+ if (rnp->gpnum == rnp->completed) {
+ smp_mb(); /* See above block comment. */
+ if (READ_ONCE(rsp->gpnum) == rnp->completed)
+ return rnp->completed + 1;
+ }
+
+ /*
* Otherwise, wait for a possible partial grace period and
* then the subsequent full grace period.
*/
return rnp->completed + 2;
}
-/*
- * Trace-event helper function for rcu_start_future_gp() and
- * rcu_nocb_wait_gp().
- */
-static void trace_rcu_future_gp(struct rcu_node *rnp, struct rcu_data *rdp,
- unsigned long c, const char *s)
+/* Trace-event wrapper function for trace_rcu_future_grace_period. */
+static void trace_rcu_this_gp(struct rcu_node *rnp, struct rcu_data *rdp,
+ unsigned long c, const char *s)
{
trace_rcu_future_grace_period(rdp->rsp->name, rnp->gpnum,
rnp->completed, c, rnp->level,
@@ -1660,96 +1633,67 @@ static void trace_rcu_future_gp(struct rcu_node *rnp, struct rcu_data *rdp,
}
/*
- * Start some future grace period, as needed to handle newly arrived
+ * Start the specified grace period, as needed to handle newly arrived
* callbacks. The required future grace periods are recorded in each
- * rcu_node structure's ->need_future_gp field. Returns true if there
+ * rcu_node structure's ->need_future_gp[] field. Returns true if there
* is reason to awaken the grace-period kthread.
*
- * The caller must hold the specified rcu_node structure's ->lock.
+ * The caller must hold the specified rcu_node structure's ->lock, which
+ * is why the caller is responsible for waking the grace-period kthread.
*/
-static bool __maybe_unused
-rcu_start_future_gp(struct rcu_node *rnp, struct rcu_data *rdp,
- unsigned long *c_out)
+static bool rcu_start_this_gp(struct rcu_node *rnp, struct rcu_data *rdp,
+ unsigned long c)
{
- unsigned long c;
bool ret = false;
- struct rcu_node *rnp_root = rcu_get_root(rdp->rsp);
-
- lockdep_assert_held(&rnp->lock);
-
- /*
- * Pick up grace-period number for new callbacks. If this
- * grace period is already marked as needed, return to the caller.
- */
- c = rcu_cbs_completed(rdp->rsp, rnp);
- trace_rcu_future_gp(rnp, rdp, c, TPS("Startleaf"));
- if (rnp->need_future_gp[c & 0x1]) {
- trace_rcu_future_gp(rnp, rdp, c, TPS("Prestartleaf"));
- goto out;
- }
+ struct rcu_state *rsp = rdp->rsp;
+ struct rcu_node *rnp_root;
/*
- * If either this rcu_node structure or the root rcu_node structure
- * believe that a grace period is in progress, then we must wait
- * for the one following, which is in "c". Because our request
- * will be noticed at the end of the current grace period, we don't
- * need to explicitly start one. We only do the lockless check
- * of rnp_root's fields if the current rcu_node structure thinks
- * there is no grace period in flight, and because we hold rnp->lock,
- * the only possible change is when rnp_root's two fields are
- * equal, in which case rnp_root->gpnum might be concurrently
- * incremented. But that is OK, as it will just result in our
- * doing some extra useless work.
+ * Use funnel locking to either acquire the root rcu_node
+ * structure's lock or bail out if the need for this grace period
+ * has already been recorded -- or has already started. If there
+ * is already a grace period in progress in a non-leaf node, no
+ * recording is needed because the end of the grace period will
+ * scan the leaf rcu_node structures. Note that rnp->lock must
+ * not be released.
*/
- if (rnp->gpnum != rnp->completed ||
- READ_ONCE(rnp_root->gpnum) != READ_ONCE(rnp_root->completed)) {
- rnp->need_future_gp[c & 0x1]++;
- trace_rcu_future_gp(rnp, rdp, c, TPS("Startedleaf"));
- goto out;
+ raw_lockdep_assert_held_rcu_node(rnp);
+ trace_rcu_this_gp(rnp, rdp, c, TPS("Startleaf"));
+ for (rnp_root = rnp; 1; rnp_root = rnp_root->parent) {
+ if (rnp_root != rnp)
+ raw_spin_lock_rcu_node(rnp_root);
+ WARN_ON_ONCE(ULONG_CMP_LT(rnp_root->gpnum +
+ need_future_gp_mask(), c));
+ if (need_future_gp_element(rnp_root, c) ||
+ ULONG_CMP_GE(rnp_root->gpnum, c) ||
+ (rnp != rnp_root &&
+ rnp_root->gpnum != rnp_root->completed)) {
+ trace_rcu_this_gp(rnp_root, rdp, c, TPS("Prestarted"));
+ goto unlock_out;
+ }
+ need_future_gp_element(rnp_root, c) = true;
+ if (rnp_root != rnp && rnp_root->parent != NULL)
+ raw_spin_unlock_rcu_node(rnp_root);
+ if (!rnp_root->parent)
+ break; /* At root, and perhaps also leaf. */
}
- /*
- * There might be no grace period in progress. If we don't already
- * hold it, acquire the root rcu_node structure's lock in order to
- * start one (if needed).
- */
- if (rnp != rnp_root)
- raw_spin_lock_rcu_node(rnp_root);
-
- /*
- * Get a new grace-period number. If there really is no grace
- * period in progress, it will be smaller than the one we obtained
- * earlier. Adjust callbacks as needed.
- */
- c = rcu_cbs_completed(rdp->rsp, rnp_root);
- if (!rcu_is_nocb_cpu(rdp->cpu))
- (void)rcu_segcblist_accelerate(&rdp->cblist, c);
-
- /*
- * If the needed for the required grace period is already
- * recorded, trace and leave.
- */
- if (rnp_root->need_future_gp[c & 0x1]) {
- trace_rcu_future_gp(rnp, rdp, c, TPS("Prestartedroot"));
+ /* If GP already in progress, just leave, otherwise start one. */
+ if (rnp_root->gpnum != rnp_root->completed) {
+ trace_rcu_this_gp(rnp_root, rdp, c, TPS("Startedleafroot"));
goto unlock_out;
}
-
- /* Record the need for the future grace period. */
- rnp_root->need_future_gp[c & 0x1]++;
-
- /* If a grace period is not already in progress, start one. */
- if (rnp_root->gpnum != rnp_root->completed) {
- trace_rcu_future_gp(rnp, rdp, c, TPS("Startedleafroot"));
- } else {
- trace_rcu_future_gp(rnp, rdp, c, TPS("Startedroot"));
- ret = rcu_start_gp_advanced(rdp->rsp, rnp_root, rdp);
+ trace_rcu_this_gp(rnp_root, rdp, c, TPS("Startedroot"));
+ WRITE_ONCE(rsp->gp_flags, rsp->gp_flags | RCU_GP_FLAG_INIT);
+ if (!rsp->gp_kthread) {
+ trace_rcu_this_gp(rnp_root, rdp, c, TPS("NoGPkthread"));
+ goto unlock_out;
}
+ trace_rcu_grace_period(rsp->name, READ_ONCE(rsp->gpnum), TPS("newreq"));
+ ret = true; /* Caller must wake GP kthread. */
unlock_out:
if (rnp != rnp_root)
raw_spin_unlock_rcu_node(rnp_root);
-out:
- if (c_out != NULL)
- *c_out = c;
return ret;
}
@@ -1757,16 +1701,16 @@ out:
* Clean up any old requests for the just-ended grace period. Also return
* whether any additional grace periods have been requested.
*/
-static int rcu_future_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp)
+static bool rcu_future_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp)
{
- int c = rnp->completed;
- int needmore;
+ unsigned long c = rnp->completed;
+ bool needmore;
struct rcu_data *rdp = this_cpu_ptr(rsp->rda);
- rnp->need_future_gp[c & 0x1] = 0;
- needmore = rnp->need_future_gp[(c + 1) & 0x1];
- trace_rcu_future_gp(rnp, rdp, c,
- needmore ? TPS("CleanupMore") : TPS("Cleanup"));
+ need_future_gp_element(rnp, c) = false;
+ needmore = need_any_future_gp(rnp);
+ trace_rcu_this_gp(rnp, rdp, c,
+ needmore ? TPS("CleanupMore") : TPS("Cleanup"));
return needmore;
}
@@ -1801,9 +1745,10 @@ static void rcu_gp_kthread_wake(struct rcu_state *rsp)
static bool rcu_accelerate_cbs(struct rcu_state *rsp, struct rcu_node *rnp,
struct rcu_data *rdp)
{
+ unsigned long c;
bool ret = false;
- lockdep_assert_held(&rnp->lock);
+ raw_lockdep_assert_held_rcu_node(rnp);
/* If no pending (not yet ready to invoke) callbacks, nothing to do. */
if (!rcu_segcblist_pend_cbs(&rdp->cblist))
@@ -1819,8 +1764,9 @@ static bool rcu_accelerate_cbs(struct rcu_state *rsp, struct rcu_node *rnp,
* accelerating callback invocation to an earlier grace-period
* number.
*/
- if (rcu_segcblist_accelerate(&rdp->cblist, rcu_cbs_completed(rsp, rnp)))
- ret = rcu_start_future_gp(rnp, rdp, NULL);
+ c = rcu_cbs_completed(rsp, rnp);
+ if (rcu_segcblist_accelerate(&rdp->cblist, c))
+ ret = rcu_start_this_gp(rnp, rdp, c);
/* Trace depending on how much we were able to accelerate. */
if (rcu_segcblist_restempty(&rdp->cblist, RCU_WAIT_TAIL))
@@ -1843,7 +1789,7 @@ static bool rcu_accelerate_cbs(struct rcu_state *rsp, struct rcu_node *rnp,
static bool rcu_advance_cbs(struct rcu_state *rsp, struct rcu_node *rnp,
struct rcu_data *rdp)
{
- lockdep_assert_held(&rnp->lock);
+ raw_lockdep_assert_held_rcu_node(rnp);
/* If no pending (not yet ready to invoke) callbacks, nothing to do. */
if (!rcu_segcblist_pend_cbs(&rdp->cblist))
@@ -1871,7 +1817,7 @@ static bool __note_gp_changes(struct rcu_state *rsp, struct rcu_node *rnp,
bool ret;
bool need_gp;
- lockdep_assert_held(&rnp->lock);
+ raw_lockdep_assert_held_rcu_node(rnp);
/* Handle the ends of any preceding grace periods first. */
if (rdp->completed == rnp->completed &&
@@ -2048,7 +1994,7 @@ static bool rcu_gp_init(struct rcu_state *rsp)
rnp->level, rnp->grplo,
rnp->grphi, rnp->qsmask);
raw_spin_unlock_irq_rcu_node(rnp);
- cond_resched_rcu_qs();
+ cond_resched_tasks_rcu_qs();
WRITE_ONCE(rsp->gp_activity, jiffies);
}
@@ -2107,7 +2053,6 @@ static void rcu_gp_cleanup(struct rcu_state *rsp)
{
unsigned long gp_duration;
bool needgp = false;
- int nocb = 0;
struct rcu_data *rdp;
struct rcu_node *rnp = rcu_get_root(rsp);
struct swait_queue_head *sq;
@@ -2146,31 +2091,35 @@ static void rcu_gp_cleanup(struct rcu_state *rsp)
if (rnp == rdp->mynode)
needgp = __note_gp_changes(rsp, rnp, rdp) || needgp;
/* smp_mb() provided by prior unlock-lock pair. */
- nocb += rcu_future_gp_cleanup(rsp, rnp);
+ needgp = rcu_future_gp_cleanup(rsp, rnp) || needgp;
sq = rcu_nocb_gp_get(rnp);
raw_spin_unlock_irq_rcu_node(rnp);
rcu_nocb_gp_cleanup(sq);
- cond_resched_rcu_qs();
+ cond_resched_tasks_rcu_qs();
WRITE_ONCE(rsp->gp_activity, jiffies);
rcu_gp_slow(rsp, gp_cleanup_delay);
}
rnp = rcu_get_root(rsp);
raw_spin_lock_irq_rcu_node(rnp); /* Order GP before ->completed update. */
- rcu_nocb_gp_set(rnp, nocb);
/* Declare grace period done. */
WRITE_ONCE(rsp->completed, rsp->gpnum);
trace_rcu_grace_period(rsp->name, rsp->completed, TPS("end"));
rsp->gp_state = RCU_GP_IDLE;
+ /* Check for GP requests since above loop. */
rdp = this_cpu_ptr(rsp->rda);
+ if (need_any_future_gp(rnp)) {
+ trace_rcu_this_gp(rnp, rdp, rsp->completed - 1,
+ TPS("CleanupMore"));
+ needgp = true;
+ }
/* Advance CBs to reduce false positives below. */
- needgp = rcu_advance_cbs(rsp, rnp, rdp) || needgp;
- if (needgp || cpu_needs_another_gp(rsp, rdp)) {
+ if (!rcu_accelerate_cbs(rsp, rnp, rdp) && needgp) {
WRITE_ONCE(rsp->gp_flags, RCU_GP_FLAG_INIT);
- trace_rcu_grace_period(rsp->name,
- READ_ONCE(rsp->gpnum),
+ trace_rcu_grace_period(rsp->name, READ_ONCE(rsp->gpnum),
TPS("newreq"));
}
+ WRITE_ONCE(rsp->gp_flags, rsp->gp_flags & RCU_GP_FLAG_INIT);
raw_spin_unlock_irq_rcu_node(rnp);
}
@@ -2201,7 +2150,7 @@ static int __noreturn rcu_gp_kthread(void *arg)
/* Locking provides needed memory barrier. */
if (rcu_gp_init(rsp))
break;
- cond_resched_rcu_qs();
+ cond_resched_tasks_rcu_qs();
WRITE_ONCE(rsp->gp_activity, jiffies);
WARN_ON(signal_pending(current));
trace_rcu_grace_period(rsp->name,
@@ -2246,7 +2195,7 @@ static int __noreturn rcu_gp_kthread(void *arg)
trace_rcu_grace_period(rsp->name,
READ_ONCE(rsp->gpnum),
TPS("fqsend"));
- cond_resched_rcu_qs();
+ cond_resched_tasks_rcu_qs();
WRITE_ONCE(rsp->gp_activity, jiffies);
ret = 0; /* Force full wait till next FQS. */
j = jiffies_till_next_fqs;
@@ -2259,7 +2208,7 @@ static int __noreturn rcu_gp_kthread(void *arg)
}
} else {
/* Deal with stray signal. */
- cond_resched_rcu_qs();
+ cond_resched_tasks_rcu_qs();
WRITE_ONCE(rsp->gp_activity, jiffies);
WARN_ON(signal_pending(current));
trace_rcu_grace_period(rsp->name,
@@ -2282,71 +2231,6 @@ static int __noreturn rcu_gp_kthread(void *arg)
}
/*
- * Start a new RCU grace period if warranted, re-initializing the hierarchy
- * in preparation for detecting the next grace period. The caller must hold
- * the root node's ->lock and hard irqs must be disabled.
- *
- * Note that it is legal for a dying CPU (which is marked as offline) to
- * invoke this function. This can happen when the dying CPU reports its
- * quiescent state.
- *
- * Returns true if the grace-period kthread must be awakened.
- */
-static bool
-rcu_start_gp_advanced(struct rcu_state *rsp, struct rcu_node *rnp,
- struct rcu_data *rdp)
-{
- lockdep_assert_held(&rnp->lock);
- if (!rsp->gp_kthread || !cpu_needs_another_gp(rsp, rdp)) {
- /*
- * Either we have not yet spawned the grace-period
- * task, this CPU does not need another grace period,
- * or a grace period is already in progress.
- * Either way, don't start a new grace period.
- */
- return false;
- }
- WRITE_ONCE(rsp->gp_flags, RCU_GP_FLAG_INIT);
- trace_rcu_grace_period(rsp->name, READ_ONCE(rsp->gpnum),
- TPS("newreq"));
-
- /*
- * We can't do wakeups while holding the rnp->lock, as that
- * could cause possible deadlocks with the rq->lock. Defer
- * the wakeup to our caller.
- */
- return true;
-}
-
-/*
- * Similar to rcu_start_gp_advanced(), but also advance the calling CPU's
- * callbacks. Note that rcu_start_gp_advanced() cannot do this because it
- * is invoked indirectly from rcu_advance_cbs(), which would result in
- * endless recursion -- or would do so if it wasn't for the self-deadlock
- * that is encountered beforehand.
- *
- * Returns true if the grace-period kthread needs to be awakened.
- */
-static bool rcu_start_gp(struct rcu_state *rsp)
-{
- struct rcu_data *rdp = this_cpu_ptr(rsp->rda);
- struct rcu_node *rnp = rcu_get_root(rsp);
- bool ret = false;
-
- /*
- * If there is no grace period in progress right now, any
- * callbacks we have up to this point will be satisfied by the
- * next grace period. Also, advancing the callbacks reduces the
- * probability of false positives from cpu_needs_another_gp()
- * resulting in pointless grace periods. So, advance callbacks
- * then start the grace period!
- */
- ret = rcu_advance_cbs(rsp, rnp, rdp) || ret;
- ret = rcu_start_gp_advanced(rsp, rnp, rdp) || ret;
- return ret;
-}
-
-/*
* Report a full set of quiescent states to the specified rcu_state data
* structure. Invoke rcu_gp_kthread_wake() to awaken the grace-period
* kthread if another grace period is required. Whether we wake
@@ -2358,7 +2242,7 @@ static bool rcu_start_gp(struct rcu_state *rsp)
static void rcu_report_qs_rsp(struct rcu_state *rsp, unsigned long flags)
__releases(rcu_get_root(rsp)->lock)
{
- lockdep_assert_held(&rcu_get_root(rsp)->lock);
+ raw_lockdep_assert_held_rcu_node(rcu_get_root(rsp));
WARN_ON_ONCE(!rcu_gp_in_progress(rsp));
WRITE_ONCE(rsp->gp_flags, READ_ONCE(rsp->gp_flags) | RCU_GP_FLAG_FQS);
raw_spin_unlock_irqrestore_rcu_node(rcu_get_root(rsp), flags);
@@ -2383,7 +2267,7 @@ rcu_report_qs_rnp(unsigned long mask, struct rcu_state *rsp,
unsigned long oldmask = 0;
struct rcu_node *rnp_c;
- lockdep_assert_held(&rnp->lock);
+ raw_lockdep_assert_held_rcu_node(rnp);
/* Walk up the rcu_node hierarchy. */
for (;;) {
@@ -2397,7 +2281,7 @@ rcu_report_qs_rnp(unsigned long mask, struct rcu_state *rsp,
return;
}
WARN_ON_ONCE(oldmask); /* Any child must be all zeroed! */
- WARN_ON_ONCE(rnp->level != rcu_num_lvls - 1 &&
+ WARN_ON_ONCE(!rcu_is_leaf_node(rnp) &&
rcu_preempt_blocked_readers_cgp(rnp));
rnp->qsmask &= ~mask;
trace_rcu_quiescent_state_report(rsp->name, rnp->gpnum,
@@ -2447,7 +2331,7 @@ static void rcu_report_unblock_qs_rnp(struct rcu_state *rsp,
unsigned long mask;
struct rcu_node *rnp_p;
- lockdep_assert_held(&rnp->lock);
+ raw_lockdep_assert_held_rcu_node(rnp);
if (rcu_state_p == &rcu_sched_state || rsp != rcu_state_p ||
rnp->qsmask != 0 || rcu_preempt_blocked_readers_cgp(rnp)) {
raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
@@ -2592,7 +2476,7 @@ static void rcu_cleanup_dead_rnp(struct rcu_node *rnp_leaf)
long mask;
struct rcu_node *rnp = rnp_leaf;
- lockdep_assert_held(&rnp->lock);
+ raw_lockdep_assert_held_rcu_node(rnp);
if (!IS_ENABLED(CONFIG_HOTPLUG_CPU) ||
rnp->qsmaskinit || rcu_preempt_has_tasks(rnp))
return;
@@ -2691,7 +2575,6 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp)
/* Update counts and requeue any remaining callbacks. */
rcu_segcblist_insert_done_cbs(&rdp->cblist, &rcl);
smp_mb(); /* List handling before counting for rcu_barrier(). */
- rdp->n_cbs_invoked += count;
rcu_segcblist_insert_count(&rdp->cblist, &rcl);
/* Reinstate batch limit if we have worked down the excess. */
@@ -2782,7 +2665,7 @@ static void force_qs_rnp(struct rcu_state *rsp, int (*f)(struct rcu_data *rsp))
struct rcu_node *rnp;
rcu_for_each_leaf_node(rsp, rnp) {
- cond_resched_rcu_qs();
+ cond_resched_tasks_rcu_qs();
mask = 0;
raw_spin_lock_irqsave_rcu_node(rnp, flags);
if (rnp->qsmask == 0) {
@@ -2845,10 +2728,8 @@ static void force_quiescent_state(struct rcu_state *rsp)
!raw_spin_trylock(&rnp->fqslock);
if (rnp_old != NULL)
raw_spin_unlock(&rnp_old->fqslock);
- if (ret) {
- rsp->n_force_qs_lh++;
+ if (ret)
return;
- }
rnp_old = rnp;
}
/* rnp_old == rcu_get_root(rsp), rnp == NULL. */
@@ -2857,7 +2738,6 @@ static void force_quiescent_state(struct rcu_state *rsp)
raw_spin_lock_irqsave_rcu_node(rnp_old, flags);
raw_spin_unlock(&rnp_old->fqslock);
if (READ_ONCE(rsp->gp_flags) & RCU_GP_FLAG_FQS) {
- rsp->n_force_qs_lh++;
raw_spin_unlock_irqrestore_rcu_node(rnp_old, flags);
return; /* Someone beat us to it. */
}
@@ -2877,22 +2757,27 @@ __rcu_process_callbacks(struct rcu_state *rsp)
unsigned long flags;
bool needwake;
struct rcu_data *rdp = raw_cpu_ptr(rsp->rda);
+ struct rcu_node *rnp;
WARN_ON_ONCE(!rdp->beenonline);
/* Update RCU state based on any recent quiescent states. */
rcu_check_quiescent_state(rsp, rdp);
- /* Does this CPU require a not-yet-started grace period? */
- local_irq_save(flags);
- if (cpu_needs_another_gp(rsp, rdp)) {
- raw_spin_lock_rcu_node(rcu_get_root(rsp)); /* irqs disabled. */
- needwake = rcu_start_gp(rsp);
- raw_spin_unlock_irqrestore_rcu_node(rcu_get_root(rsp), flags);
- if (needwake)
- rcu_gp_kthread_wake(rsp);
- } else {
- local_irq_restore(flags);
+ /* No grace period and unregistered callbacks? */
+ if (!rcu_gp_in_progress(rsp) &&
+ rcu_segcblist_is_enabled(&rdp->cblist)) {
+ local_irq_save(flags);
+ if (rcu_segcblist_restempty(&rdp->cblist, RCU_NEXT_READY_TAIL)) {
+ local_irq_restore(flags);
+ } else {
+ rnp = rdp->mynode;
+ raw_spin_lock_rcu_node(rnp); /* irqs disabled. */
+ needwake = rcu_accelerate_cbs(rsp, rnp, rdp);
+ raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
+ if (needwake)
+ rcu_gp_kthread_wake(rsp);
+ }
}
/* If there are callbacks ready, invoke them. */
@@ -2976,11 +2861,11 @@ static void __call_rcu_core(struct rcu_state *rsp, struct rcu_data *rdp,
/* Start a new grace period if one not already started. */
if (!rcu_gp_in_progress(rsp)) {
- struct rcu_node *rnp_root = rcu_get_root(rsp);
+ struct rcu_node *rnp = rdp->mynode;
- raw_spin_lock_rcu_node(rnp_root);
- needwake = rcu_start_gp(rsp);
- raw_spin_unlock_rcu_node(rnp_root);
+ raw_spin_lock_rcu_node(rnp);
+ needwake = rcu_accelerate_cbs(rsp, rnp, rdp);
+ raw_spin_unlock_rcu_node(rnp);
if (needwake)
rcu_gp_kthread_wake(rsp);
} else {
@@ -3355,8 +3240,6 @@ static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp)
{
struct rcu_node *rnp = rdp->mynode;
- rdp->n_rcu_pending++;
-
/* Check for CPU stalls, if enabled. */
check_cpu_stall(rsp, rdp);
@@ -3365,48 +3248,33 @@ static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp)
return 0;
/* Is the RCU core waiting for a quiescent state from this CPU? */
- if (rcu_scheduler_fully_active &&
- rdp->core_needs_qs && rdp->cpu_no_qs.b.norm &&
- rdp->rcu_qs_ctr_snap == __this_cpu_read(rcu_dynticks.rcu_qs_ctr)) {
- rdp->n_rp_core_needs_qs++;
- } else if (rdp->core_needs_qs && !rdp->cpu_no_qs.b.norm) {
- rdp->n_rp_report_qs++;
+ if (rdp->core_needs_qs && !rdp->cpu_no_qs.b.norm)
return 1;
- }
/* Does this CPU have callbacks ready to invoke? */
- if (rcu_segcblist_ready_cbs(&rdp->cblist)) {
- rdp->n_rp_cb_ready++;
+ if (rcu_segcblist_ready_cbs(&rdp->cblist))
return 1;
- }
/* Has RCU gone idle with this CPU needing another grace period? */
- if (cpu_needs_another_gp(rsp, rdp)) {
- rdp->n_rp_cpu_needs_gp++;
+ if (!rcu_gp_in_progress(rsp) &&
+ rcu_segcblist_is_enabled(&rdp->cblist) &&
+ !rcu_segcblist_restempty(&rdp->cblist, RCU_NEXT_READY_TAIL))
return 1;
- }
/* Has another RCU grace period completed? */
- if (READ_ONCE(rnp->completed) != rdp->completed) { /* outside lock */
- rdp->n_rp_gp_completed++;
+ if (READ_ONCE(rnp->completed) != rdp->completed) /* outside lock */
return 1;
- }
/* Has a new RCU grace period started? */
if (READ_ONCE(rnp->gpnum) != rdp->gpnum ||
- unlikely(READ_ONCE(rdp->gpwrap))) { /* outside lock */
- rdp->n_rp_gp_started++;
+ unlikely(READ_ONCE(rdp->gpwrap))) /* outside lock */
return 1;
- }
/* Does this CPU need a deferred NOCB wakeup? */
- if (rcu_nocb_need_deferred_wakeup(rdp)) {
- rdp->n_rp_nocb_defer_wakeup++;
+ if (rcu_nocb_need_deferred_wakeup(rdp))
return 1;
- }
/* nothing to do */
- rdp->n_rp_need_nothing++;
return 0;
}
@@ -3618,7 +3486,7 @@ static void rcu_init_new_rnp(struct rcu_node *rnp_leaf)
long mask;
struct rcu_node *rnp = rnp_leaf;
- lockdep_assert_held(&rnp->lock);
+ raw_lockdep_assert_held_rcu_node(rnp);
for (;;) {
mask = rnp->grpmask;
rnp = rnp->parent;
@@ -3636,12 +3504,9 @@ static void rcu_init_new_rnp(struct rcu_node *rnp_leaf)
static void __init
rcu_boot_init_percpu_data(int cpu, struct rcu_state *rsp)
{
- unsigned long flags;
struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu);
- struct rcu_node *rnp = rcu_get_root(rsp);
/* Set up local state, ensuring consistent view of global state. */
- raw_spin_lock_irqsave_rcu_node(rnp, flags);
rdp->grpmask = leaf_node_cpu_bit(rdp->mynode, cpu);
rdp->dynticks = &per_cpu(rcu_dynticks, cpu);
WARN_ON_ONCE(rdp->dynticks->dynticks_nesting != 1);
@@ -3649,7 +3514,6 @@ rcu_boot_init_percpu_data(int cpu, struct rcu_state *rsp)
rdp->cpu = cpu;
rdp->rsp = rsp;
rcu_boot_init_nocb_percpu_data(rdp);
- raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
}
/*
@@ -3801,6 +3665,8 @@ int rcutree_dead_cpu(unsigned int cpu)
return 0;
}
+static DEFINE_PER_CPU(int, rcu_cpu_started);
+
/*
* Mark the specified CPU as being online so that subsequent grace periods
* (both expedited and normal) will wait on it. Note that this means that
@@ -3822,6 +3688,11 @@ void rcu_cpu_starting(unsigned int cpu)
struct rcu_node *rnp;
struct rcu_state *rsp;
+ if (per_cpu(rcu_cpu_started, cpu))
+ return;
+
+ per_cpu(rcu_cpu_started, cpu) = 1;
+
for_each_rcu_flavor(rsp) {
rdp = per_cpu_ptr(rsp->rda, cpu);
rnp = rdp->mynode;
@@ -3878,6 +3749,8 @@ void rcu_report_dead(unsigned int cpu)
preempt_enable();
for_each_rcu_flavor(rsp)
rcu_cleanup_dying_idle_cpu(cpu, rsp);
+
+ per_cpu(rcu_cpu_started, cpu) = 0;
}
/* Migrate the dead CPU's callbacks to the current CPU. */
@@ -3887,6 +3760,7 @@ static void rcu_migrate_callbacks(int cpu, struct rcu_state *rsp)
struct rcu_data *my_rdp;
struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu);
struct rcu_node *rnp_root = rcu_get_root(rdp->rsp);
+ bool needwake;
if (rcu_is_nocb_cpu(cpu) || rcu_segcblist_empty(&rdp->cblist))
return; /* No callbacks to migrate. */
@@ -3898,12 +3772,15 @@ static void rcu_migrate_callbacks(int cpu, struct rcu_state *rsp)
return;
}
raw_spin_lock_rcu_node(rnp_root); /* irqs already disabled. */
- rcu_advance_cbs(rsp, rnp_root, rdp); /* Leverage recent GPs. */
- rcu_advance_cbs(rsp, rnp_root, my_rdp); /* Assign GP to pending CBs. */
+ /* Leverage recent GPs and set GP for new callbacks. */
+ needwake = rcu_advance_cbs(rsp, rnp_root, rdp) ||
+ rcu_advance_cbs(rsp, rnp_root, my_rdp);
rcu_segcblist_merge(&my_rdp->cblist, &rdp->cblist);
WARN_ON_ONCE(rcu_segcblist_empty(&my_rdp->cblist) !=
!rcu_segcblist_n_cbs(&my_rdp->cblist));
raw_spin_unlock_irqrestore_rcu_node(rnp_root, flags);
+ if (needwake)
+ rcu_gp_kthread_wake(rsp);
WARN_ONCE(rcu_segcblist_n_cbs(&rdp->cblist) != 0 ||
!rcu_segcblist_empty(&rdp->cblist),
"rcu_cleanup_dead_cpu: Callbacks on offline CPU %d: qlen=%lu, 1stCB=%p\n",
@@ -4082,7 +3959,7 @@ static void __init rcu_init_one(struct rcu_state *rsp)
init_swait_queue_head(&rsp->gp_wq);
init_swait_queue_head(&rsp->expedited_wq);
- rnp = rsp->level[rcu_num_lvls - 1];
+ rnp = rcu_first_leaf_node(rsp);
for_each_possible_cpu(i) {
while (i > rnp->grphi)
rnp++;
@@ -4193,6 +4070,9 @@ static void __init rcu_dump_rcu_node_tree(struct rcu_state *rsp)
pr_cont("\n");
}
+struct workqueue_struct *rcu_gp_wq;
+struct workqueue_struct *rcu_par_gp_wq;
+
void __init rcu_init(void)
{
int cpu;
@@ -4219,6 +4099,12 @@ void __init rcu_init(void)
rcu_cpu_starting(cpu);
rcutree_online_cpu(cpu);
}
+
+ /* Create workqueue for expedited GPs and for Tree SRCU. */
+ rcu_gp_wq = alloc_workqueue("rcu_gp", WQ_MEM_RECLAIM, 0);
+ WARN_ON(!rcu_gp_wq);
+ rcu_par_gp_wq = alloc_workqueue("rcu_par_gp", WQ_MEM_RECLAIM, 0);
+ WARN_ON(!rcu_par_gp_wq);
}
#include "tree_exp.h"
diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h
index 6488a3b0e729..78e051dffc5b 100644
--- a/kernel/rcu/tree.h
+++ b/kernel/rcu/tree.h
@@ -58,6 +58,14 @@ struct rcu_dynticks {
#endif /* #ifdef CONFIG_RCU_FAST_NO_HZ */
};
+/* Communicate arguments to a workqueue handler. */
+struct rcu_exp_work {
+ smp_call_func_t rew_func;
+ struct rcu_state *rew_rsp;
+ unsigned long rew_s;
+ struct work_struct rew_work;
+};
+
/* RCU's kthread states for tracing. */
#define RCU_KTHREAD_STOPPED 0
#define RCU_KTHREAD_RUNNING 1
@@ -146,25 +154,36 @@ struct rcu_node {
/* boosting for this rcu_node structure. */
unsigned int boost_kthread_status;
/* State of boost_kthread_task for tracing. */
- unsigned long n_tasks_boosted;
- /* Total number of tasks boosted. */
- unsigned long n_exp_boosts;
- /* Number of tasks boosted for expedited GP. */
- unsigned long n_normal_boosts;
- /* Number of tasks boosted for normal GP. */
#ifdef CONFIG_RCU_NOCB_CPU
struct swait_queue_head nocb_gp_wq[2];
/* Place for rcu_nocb_kthread() to wait GP. */
#endif /* #ifdef CONFIG_RCU_NOCB_CPU */
- int need_future_gp[2];
- /* Counts of upcoming no-CB GP requests. */
+ u8 need_future_gp[4]; /* Counts of upcoming GP requests. */
raw_spinlock_t fqslock ____cacheline_internodealigned_in_smp;
spinlock_t exp_lock ____cacheline_internodealigned_in_smp;
unsigned long exp_seq_rq;
wait_queue_head_t exp_wq[4];
+ struct rcu_exp_work rew;
+ bool exp_need_flush; /* Need to flush workitem? */
} ____cacheline_internodealigned_in_smp;
+/* Accessors for ->need_future_gp[] array. */
+#define need_future_gp_mask() \
+ (ARRAY_SIZE(((struct rcu_node *)NULL)->need_future_gp) - 1)
+#define need_future_gp_element(rnp, c) \
+ ((rnp)->need_future_gp[(c) & need_future_gp_mask()])
+#define need_any_future_gp(rnp) \
+({ \
+ int __i; \
+ bool __nonzero = false; \
+ \
+ for (__i = 0; __i < ARRAY_SIZE((rnp)->need_future_gp); __i++) \
+ __nonzero = __nonzero || \
+ READ_ONCE((rnp)->need_future_gp[__i]); \
+ __nonzero; \
+})
+
/*
* Bitmasks in an rcu_node cover the interval [grplo, grphi] of CPU IDs, and
* are indexed relative to this interval rather than the global CPU ID space.
@@ -184,13 +203,6 @@ union rcu_noqs {
u16 s; /* Set of bits, aggregate OR here. */
};
-/* Index values for nxttail array in struct rcu_data. */
-#define RCU_DONE_TAIL 0 /* Also RCU_WAIT head. */
-#define RCU_WAIT_TAIL 1 /* Also RCU_NEXT_READY head. */
-#define RCU_NEXT_READY_TAIL 2 /* Also RCU_NEXT head. */
-#define RCU_NEXT_TAIL 3
-#define RCU_NEXT_SIZE 4
-
/* Per-CPU data for read-copy update. */
struct rcu_data {
/* 1) quiescent-state and grace-period handling : */
@@ -217,8 +229,6 @@ struct rcu_data {
/* different grace periods. */
long qlen_last_fqs_check;
/* qlen at last check for QS forcing */
- unsigned long n_cbs_invoked; /* count of RCU cbs invoked. */
- unsigned long n_nocbs_invoked; /* count of no-CBs RCU cbs invoked. */
unsigned long n_force_qs_snap;
/* did other CPU force QS recently? */
long blimit; /* Upper limit on a processed batch */
@@ -234,29 +244,14 @@ struct rcu_data {
/* Grace period that needs help */
/* from cond_resched(). */
- /* 5) __rcu_pending() statistics. */
- unsigned long n_rcu_pending; /* rcu_pending() calls since boot. */
- unsigned long n_rp_core_needs_qs;
- unsigned long n_rp_report_qs;
- unsigned long n_rp_cb_ready;
- unsigned long n_rp_cpu_needs_gp;
- unsigned long n_rp_gp_completed;
- unsigned long n_rp_gp_started;
- unsigned long n_rp_nocb_defer_wakeup;
- unsigned long n_rp_need_nothing;
-
- /* 6) _rcu_barrier(), OOM callbacks, and expediting. */
+ /* 5) _rcu_barrier(), OOM callbacks, and expediting. */
struct rcu_head barrier_head;
#ifdef CONFIG_RCU_FAST_NO_HZ
struct rcu_head oom_head;
#endif /* #ifdef CONFIG_RCU_FAST_NO_HZ */
- atomic_long_t exp_workdone0; /* # done by workqueue. */
- atomic_long_t exp_workdone1; /* # done by others #1. */
- atomic_long_t exp_workdone2; /* # done by others #2. */
- atomic_long_t exp_workdone3; /* # done by others #3. */
int exp_dynticks_snap; /* Double-check need for IPI. */
- /* 7) Callback offloading. */
+ /* 6) Callback offloading. */
#ifdef CONFIG_RCU_NOCB_CPU
struct rcu_head *nocb_head; /* CBs waiting for kthread. */
struct rcu_head **nocb_tail;
@@ -283,7 +278,7 @@ struct rcu_data {
/* Leader CPU takes GP-end wakeups. */
#endif /* #ifdef CONFIG_RCU_NOCB_CPU */
- /* 8) RCU CPU stall data. */
+ /* 7) RCU CPU stall data. */
unsigned int softirq_snap; /* Snapshot of softirq activity. */
/* ->rcu_iw* fields protected by leaf rcu_node ->lock. */
struct irq_work rcu_iw; /* Check for non-irq activity. */
@@ -374,10 +369,6 @@ struct rcu_state {
/* kthreads, if configured. */
unsigned long n_force_qs; /* Number of calls to */
/* force_quiescent_state(). */
- unsigned long n_force_qs_lh; /* ~Number of calls leaving */
- /* due to lock unavailable. */
- unsigned long n_force_qs_ngp; /* Number of calls leaving */
- /* due to no GP active. */
unsigned long gp_start; /* Time at which GP started, */
/* but in jiffies. */
unsigned long gp_activity; /* Time of last GP kthread */
@@ -438,7 +429,6 @@ extern struct rcu_state rcu_preempt_state;
#endif /* #ifdef CONFIG_PREEMPT_RCU */
int rcu_dynticks_snap(struct rcu_dynticks *rdtp);
-bool rcu_eqs_special_set(int cpu);
#ifdef CONFIG_RCU_BOOST
DECLARE_PER_CPU(unsigned int, rcu_cpu_kthread_status);
@@ -468,7 +458,6 @@ static void rcu_preempt_boost_start_gp(struct rcu_node *rnp);
static void invoke_rcu_callbacks_kthread(void);
static bool rcu_is_callbacks_kthread(void);
#ifdef CONFIG_RCU_BOOST
-static void rcu_preempt_do_callbacks(void);
static int rcu_spawn_one_boost_kthread(struct rcu_state *rsp,
struct rcu_node *rnp);
#endif /* #ifdef CONFIG_RCU_BOOST */
@@ -484,7 +473,6 @@ static void print_cpu_stall_info_end(void);
static void zero_cpu_stall_ticks(struct rcu_data *rdp);
static void increment_cpu_stall_ticks(void);
static bool rcu_nocb_cpu_needs_barrier(struct rcu_state *rsp, int cpu);
-static void rcu_nocb_gp_set(struct rcu_node *rnp, int nrq);
static struct swait_queue_head *rcu_nocb_gp_get(struct rcu_node *rnp);
static void rcu_nocb_gp_cleanup(struct swait_queue_head *sq);
static void rcu_init_one_nocb(struct rcu_node *rnp);
diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h
index 46d61b597731..d40708e8c5d6 100644
--- a/kernel/rcu/tree_exp.h
+++ b/kernel/rcu/tree_exp.h
@@ -20,6 +20,8 @@
* Authors: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
*/
+#include <linux/lockdep.h>
+
/*
* Record the start of an expedited grace period.
*/
@@ -29,6 +31,15 @@ static void rcu_exp_gp_seq_start(struct rcu_state *rsp)
}
/*
+ * Return then value that expedited-grace-period counter will have
+ * at the end of the current grace period.
+ */
+static __maybe_unused unsigned long rcu_exp_gp_seq_endval(struct rcu_state *rsp)
+{
+ return rcu_seq_endval(&rsp->expedited_sequence);
+}
+
+/*
* Record the end of an expedited grace period.
*/
static void rcu_exp_gp_seq_end(struct rcu_state *rsp)
@@ -145,15 +156,35 @@ static void __maybe_unused sync_exp_reset_tree(struct rcu_state *rsp)
* for the current expedited grace period. Works only for preemptible
* RCU -- other RCU implementation use other means.
*
- * Caller must hold the rcu_state's exp_mutex.
+ * Caller must hold the specificed rcu_node structure's ->lock
*/
static bool sync_rcu_preempt_exp_done(struct rcu_node *rnp)
{
+ raw_lockdep_assert_held_rcu_node(rnp);
+
return rnp->exp_tasks == NULL &&
READ_ONCE(rnp->expmask) == 0;
}
/*
+ * Like sync_rcu_preempt_exp_done(), but this function assumes the caller
+ * doesn't hold the rcu_node's ->lock, and will acquire and release the lock
+ * itself
+ */
+static bool sync_rcu_preempt_exp_done_unlocked(struct rcu_node *rnp)
+{
+ unsigned long flags;
+ bool ret;
+
+ raw_spin_lock_irqsave_rcu_node(rnp, flags);
+ ret = sync_rcu_preempt_exp_done(rnp);
+ raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
+
+ return ret;
+}
+
+
+/*
* Report the exit from RCU read-side critical section for the last task
* that queued itself during or before the current expedited preemptible-RCU
* grace period. This event is reported either to the rcu_node structure on
@@ -161,8 +192,7 @@ static bool sync_rcu_preempt_exp_done(struct rcu_node *rnp)
* recursively up the tree. (Calm down, calm down, we do the recursion
* iteratively!)
*
- * Caller must hold the rcu_state's exp_mutex and the specified rcu_node
- * structure's ->lock.
+ * Caller must hold the specified rcu_node structure's ->lock.
*/
static void __rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp,
bool wake, unsigned long flags)
@@ -198,8 +228,6 @@ static void __rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp,
/*
* Report expedited quiescent state for specified node. This is a
* lock-acquisition wrapper function for __rcu_report_exp_rnp().
- *
- * Caller must hold the rcu_state's exp_mutex.
*/
static void __maybe_unused rcu_report_exp_rnp(struct rcu_state *rsp,
struct rcu_node *rnp, bool wake)
@@ -212,8 +240,7 @@ static void __maybe_unused rcu_report_exp_rnp(struct rcu_state *rsp,
/*
* Report expedited quiescent state for multiple CPUs, all covered by the
- * specified leaf rcu_node structure. Caller must hold the rcu_state's
- * exp_mutex.
+ * specified leaf rcu_node structure.
*/
static void rcu_report_exp_cpu_mult(struct rcu_state *rsp, struct rcu_node *rnp,
unsigned long mask, bool wake)
@@ -239,14 +266,12 @@ static void rcu_report_exp_rdp(struct rcu_state *rsp, struct rcu_data *rdp,
}
/* Common code for synchronize_{rcu,sched}_expedited() work-done checking. */
-static bool sync_exp_work_done(struct rcu_state *rsp, atomic_long_t *stat,
- unsigned long s)
+static bool sync_exp_work_done(struct rcu_state *rsp, unsigned long s)
{
if (rcu_exp_gp_seq_done(rsp, s)) {
trace_rcu_exp_grace_period(rsp->name, s, TPS("done"));
/* Ensure test happens before caller kfree(). */
smp_mb__before_atomic(); /* ^^^ */
- atomic_long_inc(stat);
return true;
}
return false;
@@ -280,7 +305,7 @@ static bool exp_funnel_lock(struct rcu_state *rsp, unsigned long s)
* promoting locality and is not strictly needed for correctness.
*/
for (; rnp != NULL; rnp = rnp->parent) {
- if (sync_exp_work_done(rsp, &rdp->exp_workdone1, s))
+ if (sync_exp_work_done(rsp, s))
return true;
/* Work not done, either wait here or go up. */
@@ -293,8 +318,7 @@ static bool exp_funnel_lock(struct rcu_state *rsp, unsigned long s)
rnp->grplo, rnp->grphi,
TPS("wait"));
wait_event(rnp->exp_wq[rcu_seq_ctr(s) & 0x3],
- sync_exp_work_done(rsp,
- &rdp->exp_workdone2, s));
+ sync_exp_work_done(rsp, s));
return true;
}
rnp->exp_seq_rq = s; /* Followers can wait on us. */
@@ -304,7 +328,7 @@ static bool exp_funnel_lock(struct rcu_state *rsp, unsigned long s)
}
mutex_lock(&rsp->exp_mutex);
fastpath:
- if (sync_exp_work_done(rsp, &rdp->exp_workdone3, s)) {
+ if (sync_exp_work_done(rsp, s)) {
mutex_unlock(&rsp->exp_mutex);
return true;
}
@@ -353,83 +377,129 @@ static void sync_sched_exp_online_cleanup(int cpu)
}
/*
- * Select the nodes that the upcoming expedited grace period needs
- * to wait for.
+ * Select the CPUs within the specified rcu_node that the upcoming
+ * expedited grace period needs to wait for.
*/
-static void sync_rcu_exp_select_cpus(struct rcu_state *rsp,
- smp_call_func_t func)
+static void sync_rcu_exp_select_node_cpus(struct work_struct *wp)
{
int cpu;
unsigned long flags;
+ smp_call_func_t func;
unsigned long mask_ofl_test;
unsigned long mask_ofl_ipi;
int ret;
- struct rcu_node *rnp;
+ struct rcu_exp_work *rewp =
+ container_of(wp, struct rcu_exp_work, rew_work);
+ struct rcu_node *rnp = container_of(rewp, struct rcu_node, rew);
+ struct rcu_state *rsp = rewp->rew_rsp;
- sync_exp_reset_tree(rsp);
- rcu_for_each_leaf_node(rsp, rnp) {
- raw_spin_lock_irqsave_rcu_node(rnp, flags);
+ func = rewp->rew_func;
+ raw_spin_lock_irqsave_rcu_node(rnp, flags);
- /* Each pass checks a CPU for identity, offline, and idle. */
- mask_ofl_test = 0;
- for_each_leaf_node_possible_cpu(rnp, cpu) {
- struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu);
-
- rdp->exp_dynticks_snap =
- rcu_dynticks_snap(rdp->dynticks);
- if (raw_smp_processor_id() == cpu ||
- rcu_dynticks_in_eqs(rdp->exp_dynticks_snap) ||
- !(rnp->qsmaskinitnext & rdp->grpmask))
- mask_ofl_test |= rdp->grpmask;
+ /* Each pass checks a CPU for identity, offline, and idle. */
+ mask_ofl_test = 0;
+ for_each_leaf_node_cpu_mask(rnp, cpu, rnp->expmask) {
+ unsigned long mask = leaf_node_cpu_bit(rnp, cpu);
+ struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu);
+ struct rcu_dynticks *rdtp = per_cpu_ptr(&rcu_dynticks, cpu);
+ int snap;
+
+ if (raw_smp_processor_id() == cpu ||
+ !(rnp->qsmaskinitnext & mask)) {
+ mask_ofl_test |= mask;
+ } else {
+ snap = rcu_dynticks_snap(rdtp);
+ if (rcu_dynticks_in_eqs(snap))
+ mask_ofl_test |= mask;
+ else
+ rdp->exp_dynticks_snap = snap;
}
- mask_ofl_ipi = rnp->expmask & ~mask_ofl_test;
-
- /*
- * Need to wait for any blocked tasks as well. Note that
- * additional blocking tasks will also block the expedited
- * GP until such time as the ->expmask bits are cleared.
- */
- if (rcu_preempt_has_tasks(rnp))
- rnp->exp_tasks = rnp->blkd_tasks.next;
- raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
+ }
+ mask_ofl_ipi = rnp->expmask & ~mask_ofl_test;
- /* IPI the remaining CPUs for expedited quiescent state. */
- for_each_leaf_node_possible_cpu(rnp, cpu) {
- unsigned long mask = leaf_node_cpu_bit(rnp, cpu);
- struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu);
+ /*
+ * Need to wait for any blocked tasks as well. Note that
+ * additional blocking tasks will also block the expedited GP
+ * until such time as the ->expmask bits are cleared.
+ */
+ if (rcu_preempt_has_tasks(rnp))
+ rnp->exp_tasks = rnp->blkd_tasks.next;
+ raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
- if (!(mask_ofl_ipi & mask))
- continue;
+ /* IPI the remaining CPUs for expedited quiescent state. */
+ for_each_leaf_node_cpu_mask(rnp, cpu, rnp->expmask) {
+ unsigned long mask = leaf_node_cpu_bit(rnp, cpu);
+ struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu);
+
+ if (!(mask_ofl_ipi & mask))
+ continue;
retry_ipi:
- if (rcu_dynticks_in_eqs_since(rdp->dynticks,
- rdp->exp_dynticks_snap)) {
- mask_ofl_test |= mask;
- continue;
- }
- ret = smp_call_function_single(cpu, func, rsp, 0);
- if (!ret) {
- mask_ofl_ipi &= ~mask;
- continue;
- }
- /* Failed, raced with CPU hotplug operation. */
- raw_spin_lock_irqsave_rcu_node(rnp, flags);
- if ((rnp->qsmaskinitnext & mask) &&
- (rnp->expmask & mask)) {
- /* Online, so delay for a bit and try again. */
- raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
- schedule_timeout_uninterruptible(1);
- goto retry_ipi;
- }
- /* CPU really is offline, so we can ignore it. */
- if (!(rnp->expmask & mask))
- mask_ofl_ipi &= ~mask;
+ if (rcu_dynticks_in_eqs_since(rdp->dynticks,
+ rdp->exp_dynticks_snap)) {
+ mask_ofl_test |= mask;
+ continue;
+ }
+ ret = smp_call_function_single(cpu, func, rsp, 0);
+ if (!ret) {
+ mask_ofl_ipi &= ~mask;
+ continue;
+ }
+ /* Failed, raced with CPU hotplug operation. */
+ raw_spin_lock_irqsave_rcu_node(rnp, flags);
+ if ((rnp->qsmaskinitnext & mask) &&
+ (rnp->expmask & mask)) {
+ /* Online, so delay for a bit and try again. */
raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
+ trace_rcu_exp_grace_period(rsp->name, rcu_exp_gp_seq_endval(rsp), TPS("selectofl"));
+ schedule_timeout_uninterruptible(1);
+ goto retry_ipi;
+ }
+ /* CPU really is offline, so we can ignore it. */
+ if (!(rnp->expmask & mask))
+ mask_ofl_ipi &= ~mask;
+ raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
+ }
+ /* Report quiescent states for those that went offline. */
+ mask_ofl_test |= mask_ofl_ipi;
+ if (mask_ofl_test)
+ rcu_report_exp_cpu_mult(rsp, rnp, mask_ofl_test, false);
+}
+
+/*
+ * Select the nodes that the upcoming expedited grace period needs
+ * to wait for.
+ */
+static void sync_rcu_exp_select_cpus(struct rcu_state *rsp,
+ smp_call_func_t func)
+{
+ struct rcu_node *rnp;
+
+ trace_rcu_exp_grace_period(rsp->name, rcu_exp_gp_seq_endval(rsp), TPS("reset"));
+ sync_exp_reset_tree(rsp);
+ trace_rcu_exp_grace_period(rsp->name, rcu_exp_gp_seq_endval(rsp), TPS("select"));
+
+ /* Schedule work for each leaf rcu_node structure. */
+ rcu_for_each_leaf_node(rsp, rnp) {
+ rnp->exp_need_flush = false;
+ if (!READ_ONCE(rnp->expmask))
+ continue; /* Avoid early boot non-existent wq. */
+ rnp->rew.rew_func = func;
+ rnp->rew.rew_rsp = rsp;
+ if (!READ_ONCE(rcu_par_gp_wq) ||
+ rcu_scheduler_active != RCU_SCHEDULER_RUNNING) {
+ /* No workqueues yet. */
+ sync_rcu_exp_select_node_cpus(&rnp->rew.rew_work);
+ continue;
}
- /* Report quiescent states for those that went offline. */
- mask_ofl_test |= mask_ofl_ipi;
- if (mask_ofl_test)
- rcu_report_exp_cpu_mult(rsp, rnp, mask_ofl_test, false);
+ INIT_WORK(&rnp->rew.rew_work, sync_rcu_exp_select_node_cpus);
+ queue_work_on(rnp->grplo, rcu_par_gp_wq, &rnp->rew.rew_work);
+ rnp->exp_need_flush = true;
}
+
+ /* Wait for workqueue jobs (if any) to complete. */
+ rcu_for_each_leaf_node(rsp, rnp)
+ if (rnp->exp_need_flush)
+ flush_work(&rnp->rew.rew_work);
}
static void synchronize_sched_expedited_wait(struct rcu_state *rsp)
@@ -443,15 +513,16 @@ static void synchronize_sched_expedited_wait(struct rcu_state *rsp)
struct rcu_node *rnp_root = rcu_get_root(rsp);
int ret;
+ trace_rcu_exp_grace_period(rsp->name, rcu_exp_gp_seq_endval(rsp), TPS("startwait"));
jiffies_stall = rcu_jiffies_till_stall_check();
jiffies_start = jiffies;
for (;;) {
ret = swait_event_timeout(
rsp->expedited_wq,
- sync_rcu_preempt_exp_done(rnp_root),
+ sync_rcu_preempt_exp_done_unlocked(rnp_root),
jiffies_stall);
- if (ret > 0 || sync_rcu_preempt_exp_done(rnp_root))
+ if (ret > 0 || sync_rcu_preempt_exp_done_unlocked(rnp_root))
return;
WARN_ON(ret < 0); /* workqueues should not be signaled. */
if (rcu_cpu_stall_suppress)
@@ -484,7 +555,7 @@ static void synchronize_sched_expedited_wait(struct rcu_state *rsp)
rcu_for_each_node_breadth_first(rsp, rnp) {
if (rnp == rnp_root)
continue; /* printed unconditionally */
- if (sync_rcu_preempt_exp_done(rnp))
+ if (sync_rcu_preempt_exp_done_unlocked(rnp))
continue;
pr_cont(" l=%u:%d-%d:%#lx/%c",
rnp->level, rnp->grplo, rnp->grphi,
@@ -540,14 +611,6 @@ static void rcu_exp_wait_wake(struct rcu_state *rsp, unsigned long s)
mutex_unlock(&rsp->exp_wake_mutex);
}
-/* Let the workqueue handler know what it is supposed to do. */
-struct rcu_exp_work {
- smp_call_func_t rew_func;
- struct rcu_state *rew_rsp;
- unsigned long rew_s;
- struct work_struct rew_work;
-};
-
/*
* Common code to drive an expedited grace period forward, used by
* workqueues and mid-boot-time tasks.
@@ -606,14 +669,14 @@ static void _synchronize_rcu_expedited(struct rcu_state *rsp,
rew.rew_rsp = rsp;
rew.rew_s = s;
INIT_WORK_ONSTACK(&rew.rew_work, wait_rcu_exp_gp);
- schedule_work(&rew.rew_work);
+ queue_work(rcu_gp_wq, &rew.rew_work);
}
/* Wait for expedited grace period to complete. */
rdp = per_cpu_ptr(rsp->rda, raw_smp_processor_id());
rnp = rcu_get_root(rsp);
wait_event(rnp->exp_wq[rcu_seq_ctr(s) & 0x3],
- sync_exp_work_done(rsp, &rdp->exp_workdone0, s));
+ sync_exp_work_done(rsp, s));
smp_mb(); /* Workqueue actions happen before return. */
/* Let the next expedited grace period start. */
diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index fb88a028deec..7fd12039e512 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -180,9 +180,9 @@ static void rcu_preempt_ctxt_queue(struct rcu_node *rnp, struct rcu_data *rdp)
(rnp->expmask & rdp->grpmask ? RCU_EXP_BLKD : 0);
struct task_struct *t = current;
- lockdep_assert_held(&rnp->lock);
+ raw_lockdep_assert_held_rcu_node(rnp);
WARN_ON_ONCE(rdp->mynode != rnp);
- WARN_ON_ONCE(rnp->level != rcu_num_lvls - 1);
+ WARN_ON_ONCE(!rcu_is_leaf_node(rnp));
/*
* Decide where to queue the newly blocked task. In theory,
@@ -384,6 +384,50 @@ static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp)
}
/*
+ * Preemptible RCU implementation for rcu_read_lock().
+ * Just increment ->rcu_read_lock_nesting, shared state will be updated
+ * if we block.
+ */
+void __rcu_read_lock(void)
+{
+ current->rcu_read_lock_nesting++;
+ barrier(); /* critical section after entry code. */
+}
+EXPORT_SYMBOL_GPL(__rcu_read_lock);
+
+/*
+ * Preemptible RCU implementation for rcu_read_unlock().
+ * Decrement ->rcu_read_lock_nesting. If the result is zero (outermost
+ * rcu_read_unlock()) and ->rcu_read_unlock_special is non-zero, then
+ * invoke rcu_read_unlock_special() to clean up after a context switch
+ * in an RCU read-side critical section and other special cases.
+ */
+void __rcu_read_unlock(void)
+{
+ struct task_struct *t = current;
+
+ if (t->rcu_read_lock_nesting != 1) {
+ --t->rcu_read_lock_nesting;
+ } else {
+ barrier(); /* critical section before exit code. */
+ t->rcu_read_lock_nesting = INT_MIN;
+ barrier(); /* assign before ->rcu_read_unlock_special load */
+ if (unlikely(READ_ONCE(t->rcu_read_unlock_special.s)))
+ rcu_read_unlock_special(t);
+ barrier(); /* ->rcu_read_unlock_special load before assign */
+ t->rcu_read_lock_nesting = 0;
+ }
+#ifdef CONFIG_PROVE_LOCKING
+ {
+ int rrln = READ_ONCE(t->rcu_read_lock_nesting);
+
+ WARN_ON_ONCE(rrln < 0 && rrln > INT_MIN / 2);
+ }
+#endif /* #ifdef CONFIG_PROVE_LOCKING */
+}
+EXPORT_SYMBOL_GPL(__rcu_read_unlock);
+
+/*
* Advance a ->blkd_tasks-list pointer to the next entry, instead
* returning NULL if at the end of the list.
*/
@@ -489,7 +533,7 @@ void rcu_read_unlock_special(struct task_struct *t)
rnp = t->rcu_blocked_node;
raw_spin_lock_rcu_node(rnp); /* irqs already disabled. */
WARN_ON_ONCE(rnp != t->rcu_blocked_node);
- WARN_ON_ONCE(rnp->level != rcu_num_lvls - 1);
+ WARN_ON_ONCE(!rcu_is_leaf_node(rnp));
empty_norm = !rcu_preempt_blocked_readers_cgp(rnp);
empty_exp = sync_rcu_preempt_exp_done(rnp);
smp_mb(); /* ensure expedited fastpath sees end of RCU c-s. */
@@ -560,8 +604,14 @@ static void rcu_print_detail_task_stall_rnp(struct rcu_node *rnp)
}
t = list_entry(rnp->gp_tasks->prev,
struct task_struct, rcu_node_entry);
- list_for_each_entry_continue(t, &rnp->blkd_tasks, rcu_node_entry)
+ list_for_each_entry_continue(t, &rnp->blkd_tasks, rcu_node_entry) {
+ /*
+ * We could be printing a lot while holding a spinlock.
+ * Avoid triggering hard lockup.
+ */
+ touch_nmi_watchdog();
sched_show_task(t);
+ }
raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
}
@@ -679,15 +729,6 @@ static void rcu_preempt_check_callbacks(void)
t->rcu_read_unlock_special.b.need_qs = true;
}
-#ifdef CONFIG_RCU_BOOST
-
-static void rcu_preempt_do_callbacks(void)
-{
- rcu_do_batch(rcu_state_p, this_cpu_ptr(rcu_data_p));
-}
-
-#endif /* #ifdef CONFIG_RCU_BOOST */
-
/**
* call_rcu() - Queue an RCU callback for invocation after a grace period.
* @head: structure to be used for queueing the RCU updates.
@@ -957,14 +998,10 @@ static int rcu_boost(struct rcu_node *rnp)
* expedited grace period must boost all blocked tasks, including
* those blocking the pre-existing normal grace period.
*/
- if (rnp->exp_tasks != NULL) {
+ if (rnp->exp_tasks != NULL)
tb = rnp->exp_tasks;
- rnp->n_exp_boosts++;
- } else {
+ else
tb = rnp->boost_tasks;
- rnp->n_normal_boosts++;
- }
- rnp->n_tasks_boosted++;
/*
* We boost task t by manufacturing an rt_mutex that appears to
@@ -1042,7 +1079,7 @@ static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags)
{
struct task_struct *t;
- lockdep_assert_held(&rnp->lock);
+ raw_lockdep_assert_held_rcu_node(rnp);
if (!rcu_preempt_blocked_readers_cgp(rnp) && rnp->exp_tasks == NULL) {
raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
return;
@@ -1138,7 +1175,7 @@ static void rcu_kthread_do_work(void)
{
rcu_do_batch(&rcu_sched_state, this_cpu_ptr(&rcu_sched_data));
rcu_do_batch(&rcu_bh_state, this_cpu_ptr(&rcu_bh_data));
- rcu_preempt_do_callbacks();
+ rcu_do_batch(&rcu_preempt_state, this_cpu_ptr(&rcu_preempt_data));
}
static void rcu_cpu_kthread_setup(unsigned int cpu)
@@ -1605,7 +1642,7 @@ static int rcu_oom_notify(struct notifier_block *self,
for_each_online_cpu(cpu) {
smp_call_function_single(cpu, rcu_oom_notify_cpu, NULL, 1);
- cond_resched_rcu_qs();
+ cond_resched_tasks_rcu_qs();
}
/* Unconditionally decrement: no need to wake ourselves up. */
@@ -1677,6 +1714,12 @@ static void print_cpu_stall_info(struct rcu_state *rsp, int cpu)
char *ticks_title;
unsigned long ticks_value;
+ /*
+ * We could be printing a lot while holding a spinlock. Avoid
+ * triggering hard lockup.
+ */
+ touch_nmi_watchdog();
+
if (rsp->gpnum == rdp->gpnum) {
ticks_title = "ticks this GP";
ticks_value = rdp->ticks_this_gp;
@@ -1772,19 +1815,6 @@ static void rcu_nocb_gp_cleanup(struct swait_queue_head *sq)
swake_up_all(sq);
}
-/*
- * Set the root rcu_node structure's ->need_future_gp field
- * based on the sum of those of all rcu_node structures. This does
- * double-count the root rcu_node structure's requests, but this
- * is necessary to handle the possibility of a rcu_nocb_kthread()
- * having awakened during the time that the rcu_node structures
- * were being updated for the end of the previous grace period.
- */
-static void rcu_nocb_gp_set(struct rcu_node *rnp, int nrq)
-{
- rnp->need_future_gp[(rnp->completed + 1) & 0x1] += nrq;
-}
-
static struct swait_queue_head *rcu_nocb_gp_get(struct rcu_node *rnp)
{
return &rnp->nocb_gp_wq[rnp->completed & 0x1];
@@ -1958,7 +1988,7 @@ static void __call_rcu_nocb_enqueue(struct rcu_data *rdp,
trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu,
TPS("WakeOvf"));
} else {
- wake_nocb_leader_defer(rdp, RCU_NOCB_WAKE,
+ wake_nocb_leader_defer(rdp, RCU_NOCB_WAKE_FORCE,
TPS("WakeOvfIsDeferred"));
}
rdp->qlen_last_fqs_check = LONG_MAX / 2;
@@ -2040,7 +2070,8 @@ static void rcu_nocb_wait_gp(struct rcu_data *rdp)
struct rcu_node *rnp = rdp->mynode;
raw_spin_lock_irqsave_rcu_node(rnp, flags);
- needwake = rcu_start_future_gp(rnp, rdp, &c);
+ c = rcu_cbs_completed(rdp->rsp, rnp);
+ needwake = rcu_start_this_gp(rnp, rdp, c);
raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
if (needwake)
rcu_gp_kthread_wake(rdp->rsp);
@@ -2049,7 +2080,7 @@ static void rcu_nocb_wait_gp(struct rcu_data *rdp)
* Wait for the grace period. Do so interruptibly to avoid messing
* up the load average.
*/
- trace_rcu_future_gp(rnp, rdp, c, TPS("StartWait"));
+ trace_rcu_this_gp(rnp, rdp, c, TPS("StartWait"));
for (;;) {
swait_event_interruptible(
rnp->nocb_gp_wq[c & 0x1],
@@ -2057,9 +2088,9 @@ static void rcu_nocb_wait_gp(struct rcu_data *rdp)
if (likely(d))
break;
WARN_ON(signal_pending(current));
- trace_rcu_future_gp(rnp, rdp, c, TPS("ResumeWait"));
+ trace_rcu_this_gp(rnp, rdp, c, TPS("ResumeWait"));
}
- trace_rcu_future_gp(rnp, rdp, c, TPS("EndWait"));
+ trace_rcu_this_gp(rnp, rdp, c, TPS("EndWait"));
smp_mb(); /* Ensure that CB invocation happens after GP end. */
}
@@ -2228,14 +2259,13 @@ static int rcu_nocb_kthread(void *arg)
cl++;
c++;
local_bh_enable();
- cond_resched_rcu_qs();
+ cond_resched_tasks_rcu_qs();
list = next;
}
trace_rcu_batch_end(rdp->rsp->name, c, !!list, 0, 0, 1);
smp_mb__before_atomic(); /* _add after CB invocation. */
atomic_long_add(-c, &rdp->nocb_q_count);
atomic_long_add(-cl, &rdp->nocb_q_count_lazy);
- rdp->n_nocbs_invoked += c;
}
return 0;
}
@@ -2285,7 +2315,7 @@ static void do_nocb_deferred_wakeup(struct rcu_data *rdp)
void __init rcu_init_nohz(void)
{
int cpu;
- bool need_rcu_nocb_mask = true;
+ bool need_rcu_nocb_mask = false;
struct rcu_state *rsp;
#if defined(CONFIG_NO_HZ_FULL)
@@ -2308,12 +2338,15 @@ void __init rcu_init_nohz(void)
#endif /* #if defined(CONFIG_NO_HZ_FULL) */
if (!cpumask_subset(rcu_nocb_mask, cpu_possible_mask)) {
- pr_info("\tNote: kernel parameter 'rcu_nocbs=' contains nonexistent CPUs.\n");
+ pr_info("\tNote: kernel parameter 'rcu_nocbs=', 'nohz_full', or 'isolcpus=' contains nonexistent CPUs.\n");
cpumask_and(rcu_nocb_mask, cpu_possible_mask,
rcu_nocb_mask);
}
- pr_info("\tOffload RCU callbacks from CPUs: %*pbl.\n",
- cpumask_pr_args(rcu_nocb_mask));
+ if (cpumask_empty(rcu_nocb_mask))
+ pr_info("\tOffload RCU callbacks from CPUs: (none).\n");
+ else
+ pr_info("\tOffload RCU callbacks from CPUs: %*pbl.\n",
+ cpumask_pr_args(rcu_nocb_mask));
if (rcu_nocb_poll)
pr_info("\tPoll for callbacks from no-CBs CPUs.\n");
@@ -2485,10 +2518,6 @@ static void rcu_nocb_gp_cleanup(struct swait_queue_head *sq)
{
}
-static void rcu_nocb_gp_set(struct rcu_node *rnp, int nrq)
-{
-}
-
static struct swait_queue_head *rcu_nocb_gp_get(struct rcu_node *rnp)
{
return NULL;
@@ -2577,8 +2606,7 @@ static bool rcu_nohz_full_cpu(struct rcu_state *rsp)
}
/*
- * Bind the grace-period kthread for the sysidle flavor of RCU to the
- * timekeeping CPU.
+ * Bind the RCU grace-period kthreads to the housekeeping CPU.
*/
static void rcu_bind_gp_kthread(void)
{
diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c
index 68fa19a5e7bd..4c230a60ece4 100644
--- a/kernel/rcu/update.c
+++ b/kernel/rcu/update.c
@@ -226,54 +226,6 @@ core_initcall(rcu_set_runtime_mode);
#endif /* #if !defined(CONFIG_TINY_RCU) || defined(CONFIG_SRCU) */
-#ifdef CONFIG_PREEMPT_RCU
-
-/*
- * Preemptible RCU implementation for rcu_read_lock().
- * Just increment ->rcu_read_lock_nesting, shared state will be updated
- * if we block.
- */
-void __rcu_read_lock(void)
-{
- current->rcu_read_lock_nesting++;
- barrier(); /* critical section after entry code. */
-}
-EXPORT_SYMBOL_GPL(__rcu_read_lock);
-
-/*
- * Preemptible RCU implementation for rcu_read_unlock().
- * Decrement ->rcu_read_lock_nesting. If the result is zero (outermost
- * rcu_read_unlock()) and ->rcu_read_unlock_special is non-zero, then
- * invoke rcu_read_unlock_special() to clean up after a context switch
- * in an RCU read-side critical section and other special cases.
- */
-void __rcu_read_unlock(void)
-{
- struct task_struct *t = current;
-
- if (t->rcu_read_lock_nesting != 1) {
- --t->rcu_read_lock_nesting;
- } else {
- barrier(); /* critical section before exit code. */
- t->rcu_read_lock_nesting = INT_MIN;
- barrier(); /* assign before ->rcu_read_unlock_special load */
- if (unlikely(READ_ONCE(t->rcu_read_unlock_special.s)))
- rcu_read_unlock_special(t);
- barrier(); /* ->rcu_read_unlock_special load before assign */
- t->rcu_read_lock_nesting = 0;
- }
-#ifdef CONFIG_PROVE_LOCKING
- {
- int rrln = READ_ONCE(t->rcu_read_lock_nesting);
-
- WARN_ON_ONCE(rrln < 0 && rrln > INT_MIN / 2);
- }
-#endif /* #ifdef CONFIG_PROVE_LOCKING */
-}
-EXPORT_SYMBOL_GPL(__rcu_read_unlock);
-
-#endif /* #ifdef CONFIG_PREEMPT_RCU */
-
#ifdef CONFIG_DEBUG_LOCK_ALLOC
static struct lock_class_key rcu_lock_key;
struct lockdep_map rcu_lock_map =
@@ -624,7 +576,7 @@ EXPORT_SYMBOL_GPL(call_rcu_tasks);
* grace period has elapsed, in other words after all currently
* executing rcu-tasks read-side critical sections have elapsed. These
* read-side critical sections are delimited by calls to schedule(),
- * cond_resched_rcu_qs(), idle execution, userspace execution, calls
+ * cond_resched_tasks_rcu_qs(), idle execution, userspace execution, calls
* to synchronize_rcu_tasks(), and (in theory, anyway) cond_resched().
*
* This is a very specialized primitive, intended only for a few uses in
diff --git a/kernel/resource.c b/kernel/resource.c
index e270b5048988..b589dda910b3 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -87,7 +87,7 @@ enum { MAX_IORES_LEVEL = 5 };
static void *r_start(struct seq_file *m, loff_t *pos)
__acquires(resource_lock)
{
- struct resource *p = m->private;
+ struct resource *p = PDE_DATA(file_inode(m->file));
loff_t l = 0;
read_lock(&resource_lock);
for (p = p->child; p && l < *pos; p = r_next(m, p, &l))
@@ -103,7 +103,7 @@ static void r_stop(struct seq_file *m, void *v)
static int r_show(struct seq_file *m, void *v)
{
- struct resource *root = m->private;
+ struct resource *root = PDE_DATA(file_inode(m->file));
struct resource *r = v, *p;
unsigned long long start, end;
int width = root->end < 0x10000 ? 4 : 8;
@@ -135,44 +135,11 @@ static const struct seq_operations resource_op = {
.show = r_show,
};
-static int ioports_open(struct inode *inode, struct file *file)
-{
- int res = seq_open(file, &resource_op);
- if (!res) {
- struct seq_file *m = file->private_data;
- m->private = &ioport_resource;
- }
- return res;
-}
-
-static int iomem_open(struct inode *inode, struct file *file)
-{
- int res = seq_open(file, &resource_op);
- if (!res) {
- struct seq_file *m = file->private_data;
- m->private = &iomem_resource;
- }
- return res;
-}
-
-static const struct file_operations proc_ioports_operations = {
- .open = ioports_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = seq_release,
-};
-
-static const struct file_operations proc_iomem_operations = {
- .open = iomem_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = seq_release,
-};
-
static int __init ioresources_init(void)
{
- proc_create("ioports", 0, NULL, &proc_ioports_operations);
- proc_create("iomem", 0, NULL, &proc_iomem_operations);
+ proc_create_seq_data("ioports", 0, NULL, &resource_op,
+ &ioport_resource);
+ proc_create_seq_data("iomem", 0, NULL, &resource_op, &iomem_resource);
return 0;
}
__initcall(ioresources_init);
@@ -651,7 +618,8 @@ static int __find_resource(struct resource *root, struct resource *old,
alloc.start = constraint->alignf(constraint->alignf_data, &avail,
size, constraint->align);
alloc.end = alloc.start + size - 1;
- if (resource_contains(&avail, &alloc)) {
+ if (alloc.start <= alloc.end &&
+ resource_contains(&avail, &alloc)) {
new->start = alloc.start;
new->end = alloc.end;
return 0;
diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile
index e2f9d4feff40..d9a02b318108 100644
--- a/kernel/sched/Makefile
+++ b/kernel/sched/Makefile
@@ -17,8 +17,9 @@ CFLAGS_core.o := $(PROFILING) -fno-omit-frame-pointer
endif
obj-y += core.o loadavg.o clock.o cputime.o
-obj-y += idle_task.o fair.o rt.o deadline.o
-obj-y += wait.o wait_bit.o swait.o completion.o idle.o
+obj-y += idle.o fair.o rt.o deadline.o
+obj-y += wait.o wait_bit.o swait.o completion.o
+
obj-$(CONFIG_SMP) += cpupri.o cpudeadline.o topology.o stop_task.o
obj-$(CONFIG_SCHED_AUTOGROUP) += autogroup.o
obj-$(CONFIG_SCHEDSTATS) += stats.o
diff --git a/kernel/sched/autogroup.c b/kernel/sched/autogroup.c
index bb4b9fe026a1..2d4ff5353ded 100644
--- a/kernel/sched/autogroup.c
+++ b/kernel/sched/autogroup.c
@@ -1,10 +1,8 @@
// SPDX-License-Identifier: GPL-2.0
-#include <linux/proc_fs.h>
-#include <linux/seq_file.h>
-#include <linux/utsname.h>
-#include <linux/security.h>
-#include <linux/export.h>
-
+/*
+ * Auto-group scheduling implementation:
+ */
+#include <linux/nospec.h>
#include "sched.h"
unsigned int __read_mostly sysctl_sched_autogroup_enabled = 1;
@@ -168,18 +166,19 @@ autogroup_move_group(struct task_struct *p, struct autogroup *ag)
autogroup_kref_put(prev);
}
-/* Allocates GFP_KERNEL, cannot be called under any spinlock */
+/* Allocates GFP_KERNEL, cannot be called under any spinlock: */
void sched_autogroup_create_attach(struct task_struct *p)
{
struct autogroup *ag = autogroup_create();
autogroup_move_group(p, ag);
- /* drop extra reference added by autogroup_create() */
+
+ /* Drop extra reference added by autogroup_create(): */
autogroup_kref_put(ag);
}
EXPORT_SYMBOL(sched_autogroup_create_attach);
-/* Cannot be called under siglock. Currently has no users */
+/* Cannot be called under siglock. Currently has no users: */
void sched_autogroup_detach(struct task_struct *p)
{
autogroup_move_group(p, &autogroup_default);
@@ -202,7 +201,6 @@ static int __init setup_autogroup(char *str)
return 1;
}
-
__setup("noautogroup", setup_autogroup);
#ifdef CONFIG_PROC_FS
@@ -212,7 +210,7 @@ int proc_sched_autogroup_set_nice(struct task_struct *p, int nice)
static unsigned long next = INITIAL_JIFFIES;
struct autogroup *ag;
unsigned long shares;
- int err;
+ int err, idx;
if (nice < MIN_NICE || nice > MAX_NICE)
return -EINVAL;
@@ -224,13 +222,15 @@ int proc_sched_autogroup_set_nice(struct task_struct *p, int nice)
if (nice < 0 && !can_nice(current, nice))
return -EPERM;
- /* this is a heavy operation taking global locks.. */
+ /* This is a heavy operation, taking global locks.. */
if (!capable(CAP_SYS_ADMIN) && time_before(jiffies, next))
return -EAGAIN;
next = HZ / 10 + jiffies;
ag = autogroup_task_get(p);
- shares = scale_load(sched_prio_to_weight[nice + 20]);
+
+ idx = array_index_nospec(nice + 20, 40);
+ shares = scale_load(sched_prio_to_weight[idx]);
down_write(&ag->lock);
err = sched_group_set_shares(ag->tg, shares);
@@ -267,4 +267,4 @@ int autogroup_path(struct task_group *tg, char *buf, int buflen)
return snprintf(buf, buflen, "%s-%ld", "/autogroup", tg->autogroup->id);
}
-#endif /* CONFIG_SCHED_DEBUG */
+#endif
diff --git a/kernel/sched/autogroup.h b/kernel/sched/autogroup.h
index 27cd22b89824..b96419974a1f 100644
--- a/kernel/sched/autogroup.h
+++ b/kernel/sched/autogroup.h
@@ -1,15 +1,11 @@
/* SPDX-License-Identifier: GPL-2.0 */
#ifdef CONFIG_SCHED_AUTOGROUP
-#include <linux/kref.h>
-#include <linux/rwsem.h>
-#include <linux/sched/autogroup.h>
-
struct autogroup {
/*
- * reference doesn't mean how many thread attach to this
- * autogroup now. It just stands for the number of task
- * could use this autogroup.
+ * Reference doesn't mean how many threads attach to this
+ * autogroup now. It just stands for the number of tasks
+ * which could use this autogroup.
*/
struct kref kref;
struct task_group *tg;
@@ -56,11 +52,9 @@ autogroup_task_group(struct task_struct *p, struct task_group *tg)
return tg;
}
-#ifdef CONFIG_SCHED_DEBUG
static inline int autogroup_path(struct task_group *tg, char *buf, int buflen)
{
return 0;
}
-#endif
#endif /* CONFIG_SCHED_AUTOGROUP */
diff --git a/kernel/sched/clock.c b/kernel/sched/clock.c
index e086babe6c61..10c83e73837a 100644
--- a/kernel/sched/clock.c
+++ b/kernel/sched/clock.c
@@ -1,5 +1,5 @@
/*
- * sched_clock for unstable cpu clocks
+ * sched_clock() for unstable CPU clocks
*
* Copyright (C) 2008 Red Hat, Inc., Peter Zijlstra
*
@@ -11,7 +11,7 @@
* Guillaume Chazarain <guichaz@gmail.com>
*
*
- * What:
+ * What this file implements:
*
* cpu_clock(i) provides a fast (execution time) high resolution
* clock with bounded drift between CPUs. The value of cpu_clock(i)
@@ -26,11 +26,11 @@
* at 0 on boot (but people really shouldn't rely on that).
*
* cpu_clock(i) -- can be used from any context, including NMI.
- * local_clock() -- is cpu_clock() on the current cpu.
+ * local_clock() -- is cpu_clock() on the current CPU.
*
* sched_clock_cpu(i)
*
- * How:
+ * How it is implemented:
*
* The implementation either uses sched_clock() when
* !CONFIG_HAVE_UNSTABLE_SCHED_CLOCK, which means in that case the
@@ -52,19 +52,7 @@
* that is otherwise invisible (TSC gets stopped).
*
*/
-#include <linux/spinlock.h>
-#include <linux/hardirq.h>
-#include <linux/export.h>
-#include <linux/percpu.h>
-#include <linux/ktime.h>
-#include <linux/sched.h>
-#include <linux/nmi.h>
-#include <linux/sched/clock.h>
-#include <linux/static_key.h>
-#include <linux/workqueue.h>
-#include <linux/compiler.h>
-#include <linux/tick.h>
-#include <linux/init.h>
+#include "sched.h"
/*
* Scheduler clock - returns current time in nanosec units.
@@ -302,21 +290,21 @@ again:
* cmpxchg64 below only protects one readout.
*
* We must reread via sched_clock_local() in the retry case on
- * 32bit as an NMI could use sched_clock_local() via the
+ * 32-bit kernels as an NMI could use sched_clock_local() via the
* tracer and hit between the readout of
- * the low32bit and the high 32bit portion.
+ * the low 32-bit and the high 32-bit portion.
*/
this_clock = sched_clock_local(my_scd);
/*
- * We must enforce atomic readout on 32bit, otherwise the
- * update on the remote cpu can hit inbetween the readout of
- * the low32bit and the high 32bit portion.
+ * We must enforce atomic readout on 32-bit, otherwise the
+ * update on the remote CPU can hit inbetween the readout of
+ * the low 32-bit and the high 32-bit portion.
*/
remote_clock = cmpxchg64(&scd->clock, 0, 0);
#else
/*
- * On 64bit the read of [my]scd->clock is atomic versus the
- * update, so we can avoid the above 32bit dance.
+ * On 64-bit kernels the read of [my]scd->clock is atomic versus the
+ * update, so we can avoid the above 32-bit dance.
*/
sched_clock_local(my_scd);
again:
diff --git a/kernel/sched/completion.c b/kernel/sched/completion.c
index 0926aef10dad..e426b0cb9ac6 100644
--- a/kernel/sched/completion.c
+++ b/kernel/sched/completion.c
@@ -11,10 +11,7 @@
* typically be used for exclusion which gives rise to priority inversion.
* Waiting for completion is a typically sync point, but not an exclusion point.
*/
-
-#include <linux/sched/signal.h>
-#include <linux/sched/debug.h>
-#include <linux/completion.h>
+#include "sched.h"
/**
* complete: - signals a single thread waiting on this completion
@@ -283,7 +280,7 @@ EXPORT_SYMBOL(wait_for_completion_killable_timeout);
bool try_wait_for_completion(struct completion *x)
{
unsigned long flags;
- int ret = 1;
+ bool ret = true;
/*
* Since x->done will need to be locked only
@@ -292,11 +289,11 @@ bool try_wait_for_completion(struct completion *x)
* return early in the blocking case.
*/
if (!READ_ONCE(x->done))
- return 0;
+ return false;
spin_lock_irqsave(&x->wait.lock, flags);
if (!x->done)
- ret = 0;
+ ret = false;
else if (x->done != UINT_MAX)
x->done--;
spin_unlock_irqrestore(&x->wait.lock, flags);
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index e7c535eee0a6..e9866f86f304 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -5,37 +5,14 @@
*
* Copyright (C) 1991-2002 Linus Torvalds
*/
-#include <linux/sched.h>
-#include <linux/sched/clock.h>
-#include <uapi/linux/sched/types.h>
-#include <linux/sched/loadavg.h>
-#include <linux/sched/hotplug.h>
-#include <linux/wait_bit.h>
-#include <linux/cpuset.h>
-#include <linux/delayacct.h>
-#include <linux/init_task.h>
-#include <linux/context_tracking.h>
-#include <linux/rcupdate_wait.h>
-#include <linux/compat.h>
-
-#include <linux/blkdev.h>
-#include <linux/kprobes.h>
-#include <linux/mmu_context.h>
-#include <linux/module.h>
-#include <linux/nmi.h>
-#include <linux/prefetch.h>
-#include <linux/profile.h>
-#include <linux/security.h>
-#include <linux/syscalls.h>
-#include <linux/sched/isolation.h>
+#include "sched.h"
+
+#include <linux/kthread.h>
+#include <linux/nospec.h>
#include <asm/switch_to.h>
#include <asm/tlb.h>
-#ifdef CONFIG_PARAVIRT
-#include <asm/paravirt.h>
-#endif
-#include "sched.h"
#include "../workqueue_internal.h"
#include "../smpboot.h"
@@ -135,7 +112,7 @@ struct rq *task_rq_lock(struct task_struct *p, struct rq_flags *rf)
* [L] ->on_rq
* RELEASE (rq->lock)
*
- * If we observe the old cpu in task_rq_lock, the acquire of
+ * If we observe the old CPU in task_rq_lock, the acquire of
* the old rq->lock will fully serialize against the stores.
*
* If we observe the new CPU in task_rq_lock, the acquire will
@@ -333,7 +310,7 @@ void hrtick_start(struct rq *rq, u64 delay)
}
#endif /* CONFIG_SMP */
-static void init_rq_hrtick(struct rq *rq)
+static void hrtick_rq_init(struct rq *rq)
{
#ifdef CONFIG_SMP
rq->hrtick_csd_pending = 0;
@@ -351,7 +328,7 @@ static inline void hrtick_clear(struct rq *rq)
{
}
-static inline void init_rq_hrtick(struct rq *rq)
+static inline void hrtick_rq_init(struct rq *rq)
{
}
#endif /* CONFIG_SCHED_HRTICK */
@@ -609,7 +586,7 @@ static inline bool got_nohz_idle_kick(void)
{
int cpu = smp_processor_id();
- if (!test_bit(NOHZ_BALANCE_KICK, nohz_flags(cpu)))
+ if (!(atomic_read(nohz_flags(cpu)) & NOHZ_KICK_MASK))
return false;
if (idle_cpu(cpu) && !need_resched())
@@ -619,7 +596,7 @@ static inline bool got_nohz_idle_kick(void)
* We can't run Idle Load Balance on this CPU for this time so we
* cancel it and clear NOHZ_BALANCE_KICK
*/
- clear_bit(NOHZ_BALANCE_KICK, nohz_flags(cpu));
+ atomic_andnot(NOHZ_KICK_MASK, nohz_flags(cpu));
return false;
}
@@ -900,10 +877,37 @@ void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
* this case, we can save a useless back to back clock update.
*/
if (task_on_rq_queued(rq->curr) && test_tsk_need_resched(rq->curr))
- rq_clock_skip_update(rq, true);
+ rq_clock_skip_update(rq);
}
#ifdef CONFIG_SMP
+
+static inline bool is_per_cpu_kthread(struct task_struct *p)
+{
+ if (!(p->flags & PF_KTHREAD))
+ return false;
+
+ if (p->nr_cpus_allowed != 1)
+ return false;
+
+ return true;
+}
+
+/*
+ * Per-CPU kthreads are allowed to run on !actie && online CPUs, see
+ * __set_cpus_allowed_ptr() and select_fallback_rq().
+ */
+static inline bool is_cpu_allowed(struct task_struct *p, int cpu)
+{
+ if (!cpumask_test_cpu(cpu, &p->cpus_allowed))
+ return false;
+
+ if (is_per_cpu_kthread(p))
+ return cpu_online(cpu);
+
+ return cpu_active(cpu);
+}
+
/*
* This is how migration works:
*
@@ -961,16 +965,8 @@ struct migration_arg {
static struct rq *__migrate_task(struct rq *rq, struct rq_flags *rf,
struct task_struct *p, int dest_cpu)
{
- if (p->flags & PF_KTHREAD) {
- if (unlikely(!cpu_online(dest_cpu)))
- return rq;
- } else {
- if (unlikely(!cpu_active(dest_cpu)))
- return rq;
- }
-
/* Affinity changed (again). */
- if (!cpumask_test_cpu(dest_cpu, &p->cpus_allowed))
+ if (!is_cpu_allowed(p, dest_cpu))
return rq;
update_rq_clock(rq);
@@ -1457,7 +1453,7 @@ EXPORT_SYMBOL_GPL(kick_process);
*
* - cpu_active must be a subset of cpu_online
*
- * - on cpu-up we allow per-cpu kthreads on the online && !active cpu,
+ * - on CPU-up we allow per-CPU kthreads on the online && !active CPU,
* see __set_cpus_allowed_ptr(). At this point the newly online
* CPU isn't yet part of the sched domains, and balancing will not
* see it.
@@ -1499,10 +1495,9 @@ static int select_fallback_rq(int cpu, struct task_struct *p)
for (;;) {
/* Any allowed, online CPU? */
for_each_cpu(dest_cpu, &p->cpus_allowed) {
- if (!(p->flags & PF_KTHREAD) && !cpu_active(dest_cpu))
- continue;
- if (!cpu_online(dest_cpu))
+ if (!is_cpu_allowed(p, dest_cpu))
continue;
+
goto out;
}
@@ -1565,8 +1560,7 @@ int select_task_rq(struct task_struct *p, int cpu, int sd_flags, int wake_flags)
* [ this allows ->select_task() to simply return task_cpu(p) and
* not worry about this generic constraint ]
*/
- if (unlikely(!cpumask_test_cpu(cpu, &p->cpus_allowed) ||
- !cpu_online(cpu)))
+ if (unlikely(!is_cpu_allowed(p, cpu)))
cpu = select_fallback_rq(task_cpu(p), p);
return cpu;
@@ -2200,27 +2194,7 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
INIT_HLIST_HEAD(&p->preempt_notifiers);
#endif
-#ifdef CONFIG_NUMA_BALANCING
- if (p->mm && atomic_read(&p->mm->mm_users) == 1) {
- p->mm->numa_next_scan = jiffies + msecs_to_jiffies(sysctl_numa_balancing_scan_delay);
- p->mm->numa_scan_seq = 0;
- }
-
- if (clone_flags & CLONE_VM)
- p->numa_preferred_nid = current->numa_preferred_nid;
- else
- p->numa_preferred_nid = -1;
-
- p->node_stamp = 0ULL;
- p->numa_scan_seq = p->mm ? p->mm->numa_scan_seq : 0;
- p->numa_scan_period = sysctl_numa_balancing_scan_delay;
- p->numa_work.next = &p->numa_work;
- p->numa_faults = NULL;
- p->last_task_numa_placement = 0;
- p->last_sum_exec_runtime = 0;
-
- p->numa_group = NULL;
-#endif /* CONFIG_NUMA_BALANCING */
+ init_numa_balancing(clone_flags, p);
}
DEFINE_STATIC_KEY_FALSE(sched_numa_balancing);
@@ -2488,17 +2462,17 @@ void wake_up_new_task(struct task_struct *p)
#ifdef CONFIG_PREEMPT_NOTIFIERS
-static struct static_key preempt_notifier_key = STATIC_KEY_INIT_FALSE;
+static DEFINE_STATIC_KEY_FALSE(preempt_notifier_key);
void preempt_notifier_inc(void)
{
- static_key_slow_inc(&preempt_notifier_key);
+ static_branch_inc(&preempt_notifier_key);
}
EXPORT_SYMBOL_GPL(preempt_notifier_inc);
void preempt_notifier_dec(void)
{
- static_key_slow_dec(&preempt_notifier_key);
+ static_branch_dec(&preempt_notifier_key);
}
EXPORT_SYMBOL_GPL(preempt_notifier_dec);
@@ -2508,7 +2482,7 @@ EXPORT_SYMBOL_GPL(preempt_notifier_dec);
*/
void preempt_notifier_register(struct preempt_notifier *notifier)
{
- if (!static_key_false(&preempt_notifier_key))
+ if (!static_branch_unlikely(&preempt_notifier_key))
WARN(1, "registering preempt_notifier while notifiers disabled\n");
hlist_add_head(&notifier->link, &current->preempt_notifiers);
@@ -2537,7 +2511,7 @@ static void __fire_sched_in_preempt_notifiers(struct task_struct *curr)
static __always_inline void fire_sched_in_preempt_notifiers(struct task_struct *curr)
{
- if (static_key_false(&preempt_notifier_key))
+ if (static_branch_unlikely(&preempt_notifier_key))
__fire_sched_in_preempt_notifiers(curr);
}
@@ -2555,7 +2529,7 @@ static __always_inline void
fire_sched_out_preempt_notifiers(struct task_struct *curr,
struct task_struct *next)
{
- if (static_key_false(&preempt_notifier_key))
+ if (static_branch_unlikely(&preempt_notifier_key))
__fire_sched_out_preempt_notifiers(curr, next);
}
@@ -2629,6 +2603,18 @@ static inline void finish_lock_switch(struct rq *rq)
raw_spin_unlock_irq(&rq->lock);
}
+/*
+ * NOP if the arch has not defined these:
+ */
+
+#ifndef prepare_arch_switch
+# define prepare_arch_switch(next) do { } while (0)
+#endif
+
+#ifndef finish_arch_post_lock_switch
+# define finish_arch_post_lock_switch() do { } while (0)
+#endif
+
/**
* prepare_task_switch - prepare to switch tasks
* @rq: the runqueue preparing to switch
@@ -2732,20 +2718,28 @@ static struct rq *finish_task_switch(struct task_struct *prev)
membarrier_mm_sync_core_before_usermode(mm);
mmdrop(mm);
}
- if (unlikely(prev_state == TASK_DEAD)) {
- if (prev->sched_class->task_dead)
- prev->sched_class->task_dead(prev);
+ if (unlikely(prev_state & (TASK_DEAD|TASK_PARKED))) {
+ switch (prev_state) {
+ case TASK_DEAD:
+ if (prev->sched_class->task_dead)
+ prev->sched_class->task_dead(prev);
- /*
- * Remove function-return probe instances associated with this
- * task and put them back on the free list.
- */
- kprobe_flush_task(prev);
+ /*
+ * Remove function-return probe instances associated with this
+ * task and put them back on the free list.
+ */
+ kprobe_flush_task(prev);
+
+ /* Task is done with its stack. */
+ put_task_stack(prev);
- /* Task is done with its stack. */
- put_task_stack(prev);
+ put_task_struct(prev);
+ break;
- put_task_struct(prev);
+ case TASK_PARKED:
+ kthread_park_complete(prev);
+ break;
+ }
}
tick_nohz_task_switch();
@@ -3037,7 +3031,7 @@ unsigned long long task_sched_runtime(struct task_struct *p)
#if defined(CONFIG_64BIT) && defined(CONFIG_SMP)
/*
- * 64-bit doesn't need locks to atomically read a 64bit value.
+ * 64-bit doesn't need locks to atomically read a 64-bit value.
* So we have a optimization chance when the task's delta_exec is 0.
* Reading ->on_cpu is racy, but this is ok.
*
@@ -3096,35 +3090,99 @@ void scheduler_tick(void)
rq->idle_balance = idle_cpu(cpu);
trigger_load_balance(rq);
#endif
- rq_last_tick_reset(rq);
}
#ifdef CONFIG_NO_HZ_FULL
-/**
- * scheduler_tick_max_deferment
- *
- * Keep at least one tick per second when a single
- * active task is running because the scheduler doesn't
- * yet completely support full dynticks environment.
- *
- * This makes sure that uptime, CFS vruntime, load
- * balancing, etc... continue to move forward, even
- * with a very low granularity.
- *
- * Return: Maximum deferment in nanoseconds.
- */
-u64 scheduler_tick_max_deferment(void)
+
+struct tick_work {
+ int cpu;
+ struct delayed_work work;
+};
+
+static struct tick_work __percpu *tick_work_cpu;
+
+static void sched_tick_remote(struct work_struct *work)
{
- struct rq *rq = this_rq();
- unsigned long next, now = READ_ONCE(jiffies);
+ struct delayed_work *dwork = to_delayed_work(work);
+ struct tick_work *twork = container_of(dwork, struct tick_work, work);
+ int cpu = twork->cpu;
+ struct rq *rq = cpu_rq(cpu);
+ struct rq_flags rf;
- next = rq->last_sched_tick + HZ;
+ /*
+ * Handle the tick only if it appears the remote CPU is running in full
+ * dynticks mode. The check is racy by nature, but missing a tick or
+ * having one too much is no big deal because the scheduler tick updates
+ * statistics and checks timeslices in a time-independent way, regardless
+ * of when exactly it is running.
+ */
+ if (!idle_cpu(cpu) && tick_nohz_tick_stopped_cpu(cpu)) {
+ struct task_struct *curr;
+ u64 delta;
- if (time_before_eq(next, now))
- return 0;
+ rq_lock_irq(rq, &rf);
+ update_rq_clock(rq);
+ curr = rq->curr;
+ delta = rq_clock_task(rq) - curr->se.exec_start;
- return jiffies_to_nsecs(next - now);
+ /*
+ * Make sure the next tick runs within a reasonable
+ * amount of time.
+ */
+ WARN_ON_ONCE(delta > (u64)NSEC_PER_SEC * 3);
+ curr->sched_class->task_tick(rq, curr, 0);
+ rq_unlock_irq(rq, &rf);
+ }
+
+ /*
+ * Run the remote tick once per second (1Hz). This arbitrary
+ * frequency is large enough to avoid overload but short enough
+ * to keep scheduler internal stats reasonably up to date.
+ */
+ queue_delayed_work(system_unbound_wq, dwork, HZ);
}
+
+static void sched_tick_start(int cpu)
+{
+ struct tick_work *twork;
+
+ if (housekeeping_cpu(cpu, HK_FLAG_TICK))
+ return;
+
+ WARN_ON_ONCE(!tick_work_cpu);
+
+ twork = per_cpu_ptr(tick_work_cpu, cpu);
+ twork->cpu = cpu;
+ INIT_DELAYED_WORK(&twork->work, sched_tick_remote);
+ queue_delayed_work(system_unbound_wq, &twork->work, HZ);
+}
+
+#ifdef CONFIG_HOTPLUG_CPU
+static void sched_tick_stop(int cpu)
+{
+ struct tick_work *twork;
+
+ if (housekeeping_cpu(cpu, HK_FLAG_TICK))
+ return;
+
+ WARN_ON_ONCE(!tick_work_cpu);
+
+ twork = per_cpu_ptr(tick_work_cpu, cpu);
+ cancel_delayed_work_sync(&twork->work);
+}
+#endif /* CONFIG_HOTPLUG_CPU */
+
+int __init sched_tick_offload_init(void)
+{
+ tick_work_cpu = alloc_percpu(struct tick_work);
+ BUG_ON(!tick_work_cpu);
+
+ return 0;
+}
+
+#else /* !CONFIG_NO_HZ_FULL */
+static inline void sched_tick_start(int cpu) { }
+static inline void sched_tick_stop(int cpu) { }
#endif
#if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \
@@ -3448,23 +3506,8 @@ static void __sched notrace __schedule(bool preempt)
void __noreturn do_task_dead(void)
{
- /*
- * The setting of TASK_RUNNING by try_to_wake_up() may be delayed
- * when the following two conditions become true.
- * - There is race condition of mmap_sem (It is acquired by
- * exit_mm()), and
- * - SMI occurs before setting TASK_RUNINNG.
- * (or hypervisor of virtual machine switches to other guest)
- * As a result, we may become TASK_RUNNING after becoming TASK_DEAD
- *
- * To avoid it, we have to wait for releasing tsk->pi_lock which
- * is held by try_to_wake_up()
- */
- raw_spin_lock_irq(&current->pi_lock);
- raw_spin_unlock_irq(&current->pi_lock);
-
/* Causes final put_task_struct in finish_task_switch(): */
- __set_current_state(TASK_DEAD);
+ set_special_state(TASK_DEAD);
/* Tell freezer to ignore us: */
current->flags |= PF_NOFREEZE;
@@ -3987,6 +4030,23 @@ int idle_cpu(int cpu)
}
/**
+ * available_idle_cpu - is a given CPU idle for enqueuing work.
+ * @cpu: the CPU in question.
+ *
+ * Return: 1 if the CPU is currently idle. 0 otherwise.
+ */
+int available_idle_cpu(int cpu)
+{
+ if (!idle_cpu(cpu))
+ return 0;
+
+ if (vcpu_is_preempted(cpu))
+ return 0;
+
+ return 1;
+}
+
+/**
* idle_task - return the idle task for a given CPU.
* @cpu: the processor in question.
*
@@ -4892,7 +4952,7 @@ SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len,
*
* Return: 0.
*/
-SYSCALL_DEFINE0(sched_yield)
+static void do_sched_yield(void)
{
struct rq_flags rf;
struct rq *rq;
@@ -4913,7 +4973,11 @@ SYSCALL_DEFINE0(sched_yield)
sched_preempt_enable_no_resched();
schedule();
+}
+SYSCALL_DEFINE0(sched_yield)
+{
+ do_sched_yield();
return 0;
}
@@ -4958,20 +5022,6 @@ int __cond_resched_lock(spinlock_t *lock)
}
EXPORT_SYMBOL(__cond_resched_lock);
-int __sched __cond_resched_softirq(void)
-{
- BUG_ON(!in_softirq());
-
- if (should_resched(SOFTIRQ_DISABLE_OFFSET)) {
- local_bh_enable();
- preempt_schedule_common();
- local_bh_disable();
- return 1;
- }
- return 0;
-}
-EXPORT_SYMBOL(__cond_resched_softirq);
-
/**
* yield - yield the current processor to other threads.
*
@@ -4997,7 +5047,7 @@ EXPORT_SYMBOL(__cond_resched_softirq);
void __sched yield(void)
{
set_current_state(TASK_RUNNING);
- sys_sched_yield();
+ do_sched_yield();
}
EXPORT_SYMBOL(yield);
@@ -5506,6 +5556,7 @@ void idle_task_exit(void)
if (mm != &init_mm) {
switch_mm(mm, &init_mm, current);
+ current->active_mm = &init_mm;
finish_arch_post_lock_switch();
}
mmdrop(mm);
@@ -5786,6 +5837,7 @@ int sched_cpu_starting(unsigned int cpu)
{
set_cpu_rq_start_time(cpu);
sched_rq_cpu_starting(cpu);
+ sched_tick_start(cpu);
return 0;
}
@@ -5797,6 +5849,7 @@ int sched_cpu_dying(unsigned int cpu)
/* Handle pending wakeups and then migrate everything off */
sched_ttwu_pending();
+ sched_tick_stop(cpu);
rq_lock_irqsave(rq, &rf);
if (rq->rd) {
@@ -5809,7 +5862,7 @@ int sched_cpu_dying(unsigned int cpu)
calc_load_migrate(rq);
update_max_interval();
- nohz_balance_exit_idle(cpu);
+ nohz_balance_exit_idle(rq);
hrtick_clear(rq);
return 0;
}
@@ -6022,13 +6075,11 @@ void __init sched_init(void)
rq_attach_root(rq, &def_root_domain);
#ifdef CONFIG_NO_HZ_COMMON
rq->last_load_update_tick = jiffies;
- rq->nohz_flags = 0;
-#endif
-#ifdef CONFIG_NO_HZ_FULL
- rq->last_sched_tick = 0;
+ rq->last_blocked_load_update_tick = jiffies;
+ atomic_set(&rq->nohz_flags, 0);
#endif
#endif /* CONFIG_SMP */
- init_rq_hrtick(rq);
+ hrtick_rq_init(rq);
atomic_set(&rq->nr_iowait, 0);
}
@@ -6683,13 +6734,18 @@ static int tg_cfs_schedulable_down(struct task_group *tg, void *data)
parent_quota = parent_b->hierarchical_quota;
/*
- * Ensure max(child_quota) <= parent_quota, inherit when no
+ * Ensure max(child_quota) <= parent_quota. On cgroup2,
+ * always take the min. On cgroup1, only inherit when no
* limit is set:
*/
- if (quota == RUNTIME_INF)
- quota = parent_quota;
- else if (parent_quota != RUNTIME_INF && quota > parent_quota)
- return -EINVAL;
+ if (cgroup_subsys_on_dfl(cpu_cgrp_subsys)) {
+ quota = min(quota, parent_quota);
+ } else {
+ if (quota == RUNTIME_INF)
+ quota = parent_quota;
+ else if (parent_quota != RUNTIME_INF && quota > parent_quota)
+ return -EINVAL;
+ }
}
cfs_b->hierarchical_quota = quota;
@@ -6868,11 +6924,15 @@ static int cpu_weight_nice_write_s64(struct cgroup_subsys_state *css,
struct cftype *cft, s64 nice)
{
unsigned long weight;
+ int idx;
if (nice < MIN_NICE || nice > MAX_NICE)
return -ERANGE;
- weight = sched_prio_to_weight[NICE_TO_PRIO(nice) - MAX_RT_PRIO];
+ idx = NICE_TO_PRIO(nice) - MAX_RT_PRIO;
+ idx = array_index_nospec(idx, 40);
+ weight = sched_prio_to_weight[idx];
+
return sched_group_set_shares(css_tg(css), scale_load(weight));
}
#endif
@@ -7022,3 +7082,5 @@ const u32 sched_prio_to_wmult[40] = {
/* 10 */ 39045157, 49367440, 61356676, 76695844, 95443717,
/* 15 */ 119304647, 148102320, 186737708, 238609294, 286331153,
};
+
+#undef CREATE_TRACE_POINTS
diff --git a/kernel/sched/cpuacct.c b/kernel/sched/cpuacct.c
index 44ab32a4fab6..9fbb10383434 100644
--- a/kernel/sched/cpuacct.c
+++ b/kernel/sched/cpuacct.c
@@ -1,24 +1,13 @@
// SPDX-License-Identifier: GPL-2.0
-#include <linux/cgroup.h>
-#include <linux/slab.h>
-#include <linux/percpu.h>
-#include <linux/spinlock.h>
-#include <linux/cpumask.h>
-#include <linux/seq_file.h>
-#include <linux/rcupdate.h>
-#include <linux/kernel_stat.h>
-#include <linux/err.h>
-
-#include "sched.h"
-
/*
* CPU accounting code for task groups.
*
* Based on the work by Paul Menage (menage@google.com) and Balbir Singh
* (balbir@in.ibm.com).
*/
+#include "sched.h"
-/* Time spent by the tasks of the cpu accounting group executing in ... */
+/* Time spent by the tasks of the CPU accounting group executing in ... */
enum cpuacct_stat_index {
CPUACCT_STAT_USER, /* ... user mode */
CPUACCT_STAT_SYSTEM, /* ... kernel mode */
@@ -35,12 +24,12 @@ struct cpuacct_usage {
u64 usages[CPUACCT_STAT_NSTATS];
};
-/* track cpu usage of a group of tasks and its child groups */
+/* track CPU usage of a group of tasks and its child groups */
struct cpuacct {
- struct cgroup_subsys_state css;
- /* cpuusage holds pointer to a u64-type object on every cpu */
- struct cpuacct_usage __percpu *cpuusage;
- struct kernel_cpustat __percpu *cpustat;
+ struct cgroup_subsys_state css;
+ /* cpuusage holds pointer to a u64-type object on every CPU */
+ struct cpuacct_usage __percpu *cpuusage;
+ struct kernel_cpustat __percpu *cpustat;
};
static inline struct cpuacct *css_ca(struct cgroup_subsys_state *css)
@@ -48,7 +37,7 @@ static inline struct cpuacct *css_ca(struct cgroup_subsys_state *css)
return css ? container_of(css, struct cpuacct, css) : NULL;
}
-/* return cpu accounting group to which this task belongs */
+/* Return CPU accounting group to which this task belongs */
static inline struct cpuacct *task_ca(struct task_struct *tsk)
{
return css_ca(task_css(tsk, cpuacct_cgrp_id));
@@ -65,7 +54,7 @@ static struct cpuacct root_cpuacct = {
.cpuusage = &root_cpuacct_cpuusage,
};
-/* create a new cpu accounting group */
+/* Create a new CPU accounting group */
static struct cgroup_subsys_state *
cpuacct_css_alloc(struct cgroup_subsys_state *parent_css)
{
@@ -96,7 +85,7 @@ out:
return ERR_PTR(-ENOMEM);
}
-/* destroy an existing cpu accounting group */
+/* Destroy an existing CPU accounting group */
static void cpuacct_css_free(struct cgroup_subsys_state *css)
{
struct cpuacct *ca = css_ca(css);
@@ -162,7 +151,7 @@ static void cpuacct_cpuusage_write(struct cpuacct *ca, int cpu, u64 val)
#endif
}
-/* return total cpu usage (in nanoseconds) of a group */
+/* Return total CPU usage (in nanoseconds) of a group */
static u64 __cpuusage_read(struct cgroup_subsys_state *css,
enum cpuacct_stat_index index)
{
diff --git a/kernel/sched/cpudeadline.c b/kernel/sched/cpudeadline.c
index 8d9562d890d3..50316455ea66 100644
--- a/kernel/sched/cpudeadline.c
+++ b/kernel/sched/cpudeadline.c
@@ -10,11 +10,7 @@
* as published by the Free Software Foundation; version 2
* of the License.
*/
-
-#include <linux/gfp.h>
-#include <linux/kernel.h>
-#include <linux/slab.h>
-#include "cpudeadline.h"
+#include "sched.h"
static inline int parent(int i)
{
@@ -42,8 +38,9 @@ static void cpudl_heapify_down(struct cpudl *cp, int idx)
return;
/* adapted from lib/prio_heap.c */
- while(1) {
+ while (1) {
u64 largest_dl;
+
l = left_child(idx);
r = right_child(idx);
largest = idx;
@@ -131,6 +128,7 @@ int cpudl_find(struct cpudl *cp, struct task_struct *p,
return 1;
} else {
int best_cpu = cpudl_maximum(cp);
+
WARN_ON(best_cpu != -1 && !cpu_present(best_cpu));
if (cpumask_test_cpu(best_cpu, &p->cpus_allowed) &&
@@ -145,9 +143,9 @@ int cpudl_find(struct cpudl *cp, struct task_struct *p,
}
/*
- * cpudl_clear - remove a cpu from the cpudl max-heap
+ * cpudl_clear - remove a CPU from the cpudl max-heap
* @cp: the cpudl max-heap context
- * @cpu: the target cpu
+ * @cpu: the target CPU
*
* Notes: assumes cpu_rq(cpu)->lock is locked
*
@@ -186,8 +184,8 @@ void cpudl_clear(struct cpudl *cp, int cpu)
/*
* cpudl_set - update the cpudl max-heap
* @cp: the cpudl max-heap context
- * @cpu: the target cpu
- * @dl: the new earliest deadline for this cpu
+ * @cpu: the target CPU
+ * @dl: the new earliest deadline for this CPU
*
* Notes: assumes cpu_rq(cpu)->lock is locked
*
@@ -205,6 +203,7 @@ void cpudl_set(struct cpudl *cp, int cpu, u64 dl)
old_idx = cp->elements[cpu].idx;
if (old_idx == IDX_INVALID) {
int new_idx = cp->size++;
+
cp->elements[new_idx].dl = dl;
cp->elements[new_idx].cpu = cpu;
cp->elements[cpu].idx = new_idx;
@@ -221,7 +220,7 @@ void cpudl_set(struct cpudl *cp, int cpu, u64 dl)
/*
* cpudl_set_freecpu - Set the cpudl.free_cpus
* @cp: the cpudl max-heap context
- * @cpu: rd attached cpu
+ * @cpu: rd attached CPU
*/
void cpudl_set_freecpu(struct cpudl *cp, int cpu)
{
@@ -231,7 +230,7 @@ void cpudl_set_freecpu(struct cpudl *cp, int cpu)
/*
* cpudl_clear_freecpu - Clear the cpudl.free_cpus
* @cp: the cpudl max-heap context
- * @cpu: rd attached cpu
+ * @cpu: rd attached CPU
*/
void cpudl_clear_freecpu(struct cpudl *cp, int cpu)
{
diff --git a/kernel/sched/cpudeadline.h b/kernel/sched/cpudeadline.h
index b010d26e108e..0adeda93b5fb 100644
--- a/kernel/sched/cpudeadline.h
+++ b/kernel/sched/cpudeadline.h
@@ -1,35 +1,26 @@
/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _LINUX_CPUDL_H
-#define _LINUX_CPUDL_H
-#include <linux/sched.h>
-#include <linux/sched/deadline.h>
-
-#define IDX_INVALID -1
+#define IDX_INVALID -1
struct cpudl_item {
- u64 dl;
- int cpu;
- int idx;
+ u64 dl;
+ int cpu;
+ int idx;
};
struct cpudl {
- raw_spinlock_t lock;
- int size;
- cpumask_var_t free_cpus;
- struct cpudl_item *elements;
+ raw_spinlock_t lock;
+ int size;
+ cpumask_var_t free_cpus;
+ struct cpudl_item *elements;
};
-
#ifdef CONFIG_SMP
-int cpudl_find(struct cpudl *cp, struct task_struct *p,
- struct cpumask *later_mask);
+int cpudl_find(struct cpudl *cp, struct task_struct *p, struct cpumask *later_mask);
void cpudl_set(struct cpudl *cp, int cpu, u64 dl);
void cpudl_clear(struct cpudl *cp, int cpu);
-int cpudl_init(struct cpudl *cp);
+int cpudl_init(struct cpudl *cp);
void cpudl_set_freecpu(struct cpudl *cp, int cpu);
void cpudl_clear_freecpu(struct cpudl *cp, int cpu);
void cpudl_cleanup(struct cpudl *cp);
#endif /* CONFIG_SMP */
-
-#endif /* _LINUX_CPUDL_H */
diff --git a/kernel/sched/cpufreq.c b/kernel/sched/cpufreq.c
index dbc51442ecbc..5e54cbcae673 100644
--- a/kernel/sched/cpufreq.c
+++ b/kernel/sched/cpufreq.c
@@ -8,7 +8,6 @@
* it under the terms of the GNU General Public License version 2 as
* published by the Free Software Foundation.
*/
-
#include "sched.h"
DEFINE_PER_CPU(struct update_util_data *, cpufreq_update_util_data);
diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c
index 7936f548e071..3cde46483f0a 100644
--- a/kernel/sched/cpufreq_schedutil.c
+++ b/kernel/sched/cpufreq_schedutil.c
@@ -11,61 +11,56 @@
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
-#include <linux/cpufreq.h>
-#include <linux/kthread.h>
-#include <uapi/linux/sched/types.h>
-#include <linux/slab.h>
-#include <trace/events/power.h>
-
#include "sched.h"
+#include <trace/events/power.h>
+
struct sugov_tunables {
- struct gov_attr_set attr_set;
- unsigned int rate_limit_us;
+ struct gov_attr_set attr_set;
+ unsigned int rate_limit_us;
};
struct sugov_policy {
- struct cpufreq_policy *policy;
-
- struct sugov_tunables *tunables;
- struct list_head tunables_hook;
-
- raw_spinlock_t update_lock; /* For shared policies */
- u64 last_freq_update_time;
- s64 freq_update_delay_ns;
- unsigned int next_freq;
- unsigned int cached_raw_freq;
-
- /* The next fields are only needed if fast switch cannot be used. */
- struct irq_work irq_work;
- struct kthread_work work;
- struct mutex work_lock;
- struct kthread_worker worker;
- struct task_struct *thread;
- bool work_in_progress;
-
- bool need_freq_update;
+ struct cpufreq_policy *policy;
+
+ struct sugov_tunables *tunables;
+ struct list_head tunables_hook;
+
+ raw_spinlock_t update_lock; /* For shared policies */
+ u64 last_freq_update_time;
+ s64 freq_update_delay_ns;
+ unsigned int next_freq;
+ unsigned int cached_raw_freq;
+
+ /* The next fields are only needed if fast switch cannot be used: */
+ struct irq_work irq_work;
+ struct kthread_work work;
+ struct mutex work_lock;
+ struct kthread_worker worker;
+ struct task_struct *thread;
+ bool work_in_progress;
+
+ bool need_freq_update;
};
struct sugov_cpu {
- struct update_util_data update_util;
- struct sugov_policy *sg_policy;
- unsigned int cpu;
+ struct update_util_data update_util;
+ struct sugov_policy *sg_policy;
+ unsigned int cpu;
- bool iowait_boost_pending;
- unsigned int iowait_boost;
- unsigned int iowait_boost_max;
- u64 last_update;
+ bool iowait_boost_pending;
+ unsigned int iowait_boost;
+ unsigned int iowait_boost_max;
+ u64 last_update;
- /* The fields below are only needed when sharing a policy. */
- unsigned long util_cfs;
- unsigned long util_dl;
- unsigned long max;
- unsigned int flags;
+ /* The fields below are only needed when sharing a policy: */
+ unsigned long util_cfs;
+ unsigned long util_dl;
+ unsigned long max;
- /* The field below is for single-CPU policies only. */
+ /* The field below is for single-CPU policies only: */
#ifdef CONFIG_NO_HZ_COMMON
- unsigned long saved_idle_calls;
+ unsigned long saved_idle_calls;
#endif
};
@@ -79,9 +74,9 @@ static bool sugov_should_update_freq(struct sugov_policy *sg_policy, u64 time)
/*
* Since cpufreq_update_util() is called with rq->lock held for
- * the @target_cpu, our per-cpu data is fully serialized.
+ * the @target_cpu, our per-CPU data is fully serialized.
*
- * However, drivers cannot in general deal with cross-cpu
+ * However, drivers cannot in general deal with cross-CPU
* requests, so while get_next_freq() will work, our
* sugov_update_commit() call may not for the fast switching platforms.
*
@@ -94,45 +89,52 @@ static bool sugov_should_update_freq(struct sugov_policy *sg_policy, u64 time)
* schedule the kthread.
*/
if (sg_policy->policy->fast_switch_enabled &&
- !cpufreq_can_do_remote_dvfs(sg_policy->policy))
- return false;
-
- if (sg_policy->work_in_progress)
+ !cpufreq_this_cpu_can_update(sg_policy->policy))
return false;
- if (unlikely(sg_policy->need_freq_update)) {
- sg_policy->need_freq_update = false;
- /*
- * This happens when limits change, so forget the previous
- * next_freq value and force an update.
- */
- sg_policy->next_freq = UINT_MAX;
+ if (unlikely(sg_policy->need_freq_update))
return true;
- }
delta_ns = time - sg_policy->last_freq_update_time;
+
return delta_ns >= sg_policy->freq_update_delay_ns;
}
-static void sugov_update_commit(struct sugov_policy *sg_policy, u64 time,
- unsigned int next_freq)
+static bool sugov_update_next_freq(struct sugov_policy *sg_policy, u64 time,
+ unsigned int next_freq)
{
- struct cpufreq_policy *policy = sg_policy->policy;
-
if (sg_policy->next_freq == next_freq)
- return;
+ return false;
sg_policy->next_freq = next_freq;
sg_policy->last_freq_update_time = time;
- if (policy->fast_switch_enabled) {
- next_freq = cpufreq_driver_fast_switch(policy, next_freq);
- if (!next_freq)
- return;
+ return true;
+}
- policy->cur = next_freq;
- trace_cpu_frequency(next_freq, smp_processor_id());
- } else {
+static void sugov_fast_switch(struct sugov_policy *sg_policy, u64 time,
+ unsigned int next_freq)
+{
+ struct cpufreq_policy *policy = sg_policy->policy;
+
+ if (!sugov_update_next_freq(sg_policy, time, next_freq))
+ return;
+
+ next_freq = cpufreq_driver_fast_switch(policy, next_freq);
+ if (!next_freq)
+ return;
+
+ policy->cur = next_freq;
+ trace_cpu_frequency(next_freq, smp_processor_id());
+}
+
+static void sugov_deferred_update(struct sugov_policy *sg_policy, u64 time,
+ unsigned int next_freq)
+{
+ if (!sugov_update_next_freq(sg_policy, time, next_freq))
+ return;
+
+ if (!sg_policy->work_in_progress) {
sg_policy->work_in_progress = true;
irq_work_queue(&sg_policy->irq_work);
}
@@ -169,8 +171,10 @@ static unsigned int get_next_freq(struct sugov_policy *sg_policy,
freq = (freq + (freq >> 2)) * util / max;
- if (freq == sg_policy->cached_raw_freq && sg_policy->next_freq != UINT_MAX)
+ if (freq == sg_policy->cached_raw_freq && !sg_policy->need_freq_update)
return sg_policy->next_freq;
+
+ sg_policy->need_freq_update = false;
sg_policy->cached_raw_freq = freq;
return cpufreq_driver_resolve_freq(policy, freq);
}
@@ -186,51 +190,138 @@ static void sugov_get_util(struct sugov_cpu *sg_cpu)
static unsigned long sugov_aggregate_util(struct sugov_cpu *sg_cpu)
{
+ struct rq *rq = cpu_rq(sg_cpu->cpu);
+
+ if (rq->rt.rt_nr_running)
+ return sg_cpu->max;
+
/*
+ * Utilization required by DEADLINE must always be granted while, for
+ * FAIR, we use blocked utilization of IDLE CPUs as a mechanism to
+ * gracefully reduce the frequency when no tasks show up for longer
+ * periods of time.
+ *
* Ideally we would like to set util_dl as min/guaranteed freq and
* util_cfs + util_dl as requested freq. However, cpufreq is not yet
* ready for such an interface. So, we only do the latter for now.
*/
- return min(sg_cpu->util_cfs + sg_cpu->util_dl, sg_cpu->max);
+ return min(sg_cpu->max, (sg_cpu->util_dl + sg_cpu->util_cfs));
}
-static void sugov_set_iowait_boost(struct sugov_cpu *sg_cpu, u64 time)
+/**
+ * sugov_iowait_reset() - Reset the IO boost status of a CPU.
+ * @sg_cpu: the sugov data for the CPU to boost
+ * @time: the update time from the caller
+ * @set_iowait_boost: true if an IO boost has been requested
+ *
+ * The IO wait boost of a task is disabled after a tick since the last update
+ * of a CPU. If a new IO wait boost is requested after more then a tick, then
+ * we enable the boost starting from the minimum frequency, which improves
+ * energy efficiency by ignoring sporadic wakeups from IO.
+ */
+static bool sugov_iowait_reset(struct sugov_cpu *sg_cpu, u64 time,
+ bool set_iowait_boost)
{
- if (sg_cpu->flags & SCHED_CPUFREQ_IOWAIT) {
- if (sg_cpu->iowait_boost_pending)
- return;
+ s64 delta_ns = time - sg_cpu->last_update;
- sg_cpu->iowait_boost_pending = true;
+ /* Reset boost only if a tick has elapsed since last request */
+ if (delta_ns <= TICK_NSEC)
+ return false;
- if (sg_cpu->iowait_boost) {
- sg_cpu->iowait_boost <<= 1;
- if (sg_cpu->iowait_boost > sg_cpu->iowait_boost_max)
- sg_cpu->iowait_boost = sg_cpu->iowait_boost_max;
- } else {
- sg_cpu->iowait_boost = sg_cpu->sg_policy->policy->min;
- }
- } else if (sg_cpu->iowait_boost) {
- s64 delta_ns = time - sg_cpu->last_update;
+ sg_cpu->iowait_boost = set_iowait_boost
+ ? sg_cpu->sg_policy->policy->min : 0;
+ sg_cpu->iowait_boost_pending = set_iowait_boost;
- /* Clear iowait_boost if the CPU apprears to have been idle. */
- if (delta_ns > TICK_NSEC) {
- sg_cpu->iowait_boost = 0;
- sg_cpu->iowait_boost_pending = false;
- }
+ return true;
+}
+
+/**
+ * sugov_iowait_boost() - Updates the IO boost status of a CPU.
+ * @sg_cpu: the sugov data for the CPU to boost
+ * @time: the update time from the caller
+ * @flags: SCHED_CPUFREQ_IOWAIT if the task is waking up after an IO wait
+ *
+ * Each time a task wakes up after an IO operation, the CPU utilization can be
+ * boosted to a certain utilization which doubles at each "frequent and
+ * successive" wakeup from IO, ranging from the utilization of the minimum
+ * OPP to the utilization of the maximum OPP.
+ * To keep doubling, an IO boost has to be requested at least once per tick,
+ * otherwise we restart from the utilization of the minimum OPP.
+ */
+static void sugov_iowait_boost(struct sugov_cpu *sg_cpu, u64 time,
+ unsigned int flags)
+{
+ bool set_iowait_boost = flags & SCHED_CPUFREQ_IOWAIT;
+
+ /* Reset boost if the CPU appears to have been idle enough */
+ if (sg_cpu->iowait_boost &&
+ sugov_iowait_reset(sg_cpu, time, set_iowait_boost))
+ return;
+
+ /* Boost only tasks waking up after IO */
+ if (!set_iowait_boost)
+ return;
+
+ /* Ensure boost doubles only one time at each request */
+ if (sg_cpu->iowait_boost_pending)
+ return;
+ sg_cpu->iowait_boost_pending = true;
+
+ /* Double the boost at each request */
+ if (sg_cpu->iowait_boost) {
+ sg_cpu->iowait_boost <<= 1;
+ if (sg_cpu->iowait_boost > sg_cpu->iowait_boost_max)
+ sg_cpu->iowait_boost = sg_cpu->iowait_boost_max;
+ return;
}
+
+ /* First wakeup after IO: start with minimum boost */
+ sg_cpu->iowait_boost = sg_cpu->sg_policy->policy->min;
}
-static void sugov_iowait_boost(struct sugov_cpu *sg_cpu, unsigned long *util,
- unsigned long *max)
+/**
+ * sugov_iowait_apply() - Apply the IO boost to a CPU.
+ * @sg_cpu: the sugov data for the cpu to boost
+ * @time: the update time from the caller
+ * @util: the utilization to (eventually) boost
+ * @max: the maximum value the utilization can be boosted to
+ *
+ * A CPU running a task which woken up after an IO operation can have its
+ * utilization boosted to speed up the completion of those IO operations.
+ * The IO boost value is increased each time a task wakes up from IO, in
+ * sugov_iowait_apply(), and it's instead decreased by this function,
+ * each time an increase has not been requested (!iowait_boost_pending).
+ *
+ * A CPU which also appears to have been idle for at least one tick has also
+ * its IO boost utilization reset.
+ *
+ * This mechanism is designed to boost high frequently IO waiting tasks, while
+ * being more conservative on tasks which does sporadic IO operations.
+ */
+static void sugov_iowait_apply(struct sugov_cpu *sg_cpu, u64 time,
+ unsigned long *util, unsigned long *max)
{
unsigned int boost_util, boost_max;
+ /* No boost currently required */
if (!sg_cpu->iowait_boost)
return;
+ /* Reset boost if the CPU appears to have been idle enough */
+ if (sugov_iowait_reset(sg_cpu, time, false))
+ return;
+
+ /*
+ * An IO waiting task has just woken up:
+ * allow to further double the boost value
+ */
if (sg_cpu->iowait_boost_pending) {
sg_cpu->iowait_boost_pending = false;
} else {
+ /*
+ * Otherwise: reduce the boost value and disable it when we
+ * reach the minimum.
+ */
sg_cpu->iowait_boost >>= 1;
if (sg_cpu->iowait_boost < sg_cpu->sg_policy->policy->min) {
sg_cpu->iowait_boost = 0;
@@ -238,9 +329,12 @@ static void sugov_iowait_boost(struct sugov_cpu *sg_cpu, unsigned long *util,
}
}
+ /*
+ * Apply the current boost value: a CPU is boosted only if its current
+ * utilization is smaller then the current IO boost level.
+ */
boost_util = sg_cpu->iowait_boost;
boost_max = sg_cpu->iowait_boost_max;
-
if (*util * boost_max < *max * boost_util) {
*util = boost_util;
*max = boost_max;
@@ -260,44 +354,63 @@ static bool sugov_cpu_is_busy(struct sugov_cpu *sg_cpu)
static inline bool sugov_cpu_is_busy(struct sugov_cpu *sg_cpu) { return false; }
#endif /* CONFIG_NO_HZ_COMMON */
+/*
+ * Make sugov_should_update_freq() ignore the rate limit when DL
+ * has increased the utilization.
+ */
+static inline void ignore_dl_rate_limit(struct sugov_cpu *sg_cpu, struct sugov_policy *sg_policy)
+{
+ if (cpu_util_dl(cpu_rq(sg_cpu->cpu)) > sg_cpu->util_dl)
+ sg_policy->need_freq_update = true;
+}
+
static void sugov_update_single(struct update_util_data *hook, u64 time,
unsigned int flags)
{
struct sugov_cpu *sg_cpu = container_of(hook, struct sugov_cpu, update_util);
struct sugov_policy *sg_policy = sg_cpu->sg_policy;
- struct cpufreq_policy *policy = sg_policy->policy;
unsigned long util, max;
unsigned int next_f;
bool busy;
- sugov_set_iowait_boost(sg_cpu, time);
+ sugov_iowait_boost(sg_cpu, time, flags);
sg_cpu->last_update = time;
+ ignore_dl_rate_limit(sg_cpu, sg_policy);
+
if (!sugov_should_update_freq(sg_policy, time))
return;
busy = sugov_cpu_is_busy(sg_cpu);
- if (flags & SCHED_CPUFREQ_RT) {
- next_f = policy->cpuinfo.max_freq;
- } else {
- sugov_get_util(sg_cpu);
- max = sg_cpu->max;
- util = sugov_aggregate_util(sg_cpu);
- sugov_iowait_boost(sg_cpu, &util, &max);
- next_f = get_next_freq(sg_policy, util, max);
- /*
- * Do not reduce the frequency if the CPU has not been idle
- * recently, as the reduction is likely to be premature then.
- */
- if (busy && next_f < sg_policy->next_freq) {
- next_f = sg_policy->next_freq;
+ sugov_get_util(sg_cpu);
+ max = sg_cpu->max;
+ util = sugov_aggregate_util(sg_cpu);
+ sugov_iowait_apply(sg_cpu, time, &util, &max);
+ next_f = get_next_freq(sg_policy, util, max);
+ /*
+ * Do not reduce the frequency if the CPU has not been idle
+ * recently, as the reduction is likely to be premature then.
+ */
+ if (busy && next_f < sg_policy->next_freq) {
+ next_f = sg_policy->next_freq;
- /* Reset cached freq as next_freq has changed */
- sg_policy->cached_raw_freq = 0;
- }
+ /* Reset cached freq as next_freq has changed */
+ sg_policy->cached_raw_freq = 0;
+ }
+
+ /*
+ * This code runs under rq->lock for the target CPU, so it won't run
+ * concurrently on two different CPUs for the same target and it is not
+ * necessary to acquire the lock in the fast switch case.
+ */
+ if (sg_policy->policy->fast_switch_enabled) {
+ sugov_fast_switch(sg_policy, time, next_f);
+ } else {
+ raw_spin_lock(&sg_policy->update_lock);
+ sugov_deferred_update(sg_policy, time, next_f);
+ raw_spin_unlock(&sg_policy->update_lock);
}
- sugov_update_commit(sg_policy, time, next_f);
}
static unsigned int sugov_next_freq_shared(struct sugov_cpu *sg_cpu, u64 time)
@@ -310,43 +423,23 @@ static unsigned int sugov_next_freq_shared(struct sugov_cpu *sg_cpu, u64 time)
for_each_cpu(j, policy->cpus) {
struct sugov_cpu *j_sg_cpu = &per_cpu(sugov_cpu, j);
unsigned long j_util, j_max;
- s64 delta_ns;
-
- /*
- * If the CFS CPU utilization was last updated before the
- * previous frequency update and the time elapsed between the
- * last update of the CPU utilization and the last frequency
- * update is long enough, reset iowait_boost and util_cfs, as
- * they are now probably stale. However, still consider the
- * CPU contribution if it has some DEADLINE utilization
- * (util_dl).
- */
- delta_ns = time - j_sg_cpu->last_update;
- if (delta_ns > TICK_NSEC) {
- j_sg_cpu->iowait_boost = 0;
- j_sg_cpu->iowait_boost_pending = false;
- j_sg_cpu->util_cfs = 0;
- if (j_sg_cpu->util_dl == 0)
- continue;
- }
- if (j_sg_cpu->flags & SCHED_CPUFREQ_RT)
- return policy->cpuinfo.max_freq;
+ sugov_get_util(j_sg_cpu);
j_max = j_sg_cpu->max;
j_util = sugov_aggregate_util(j_sg_cpu);
+ sugov_iowait_apply(j_sg_cpu, time, &j_util, &j_max);
+
if (j_util * max > j_max * util) {
util = j_util;
max = j_max;
}
-
- sugov_iowait_boost(j_sg_cpu, &util, &max);
}
return get_next_freq(sg_policy, util, max);
}
-static void sugov_update_shared(struct update_util_data *hook, u64 time,
- unsigned int flags)
+static void
+sugov_update_shared(struct update_util_data *hook, u64 time, unsigned int flags)
{
struct sugov_cpu *sg_cpu = container_of(hook, struct sugov_cpu, update_util);
struct sugov_policy *sg_policy = sg_cpu->sg_policy;
@@ -354,19 +447,18 @@ static void sugov_update_shared(struct update_util_data *hook, u64 time,
raw_spin_lock(&sg_policy->update_lock);
- sugov_get_util(sg_cpu);
- sg_cpu->flags = flags;
-
- sugov_set_iowait_boost(sg_cpu, time);
+ sugov_iowait_boost(sg_cpu, time, flags);
sg_cpu->last_update = time;
+ ignore_dl_rate_limit(sg_cpu, sg_policy);
+
if (sugov_should_update_freq(sg_policy, time)) {
- if (flags & SCHED_CPUFREQ_RT)
- next_f = sg_policy->policy->cpuinfo.max_freq;
- else
- next_f = sugov_next_freq_shared(sg_cpu, time);
+ next_f = sugov_next_freq_shared(sg_cpu, time);
- sugov_update_commit(sg_policy, time, next_f);
+ if (sg_policy->policy->fast_switch_enabled)
+ sugov_fast_switch(sg_policy, time, next_f);
+ else
+ sugov_deferred_update(sg_policy, time, next_f);
}
raw_spin_unlock(&sg_policy->update_lock);
@@ -375,13 +467,27 @@ static void sugov_update_shared(struct update_util_data *hook, u64 time,
static void sugov_work(struct kthread_work *work)
{
struct sugov_policy *sg_policy = container_of(work, struct sugov_policy, work);
+ unsigned int freq;
+ unsigned long flags;
+
+ /*
+ * Hold sg_policy->update_lock shortly to handle the case where:
+ * incase sg_policy->next_freq is read here, and then updated by
+ * sugov_deferred_update() just before work_in_progress is set to false
+ * here, we may miss queueing the new update.
+ *
+ * Note: If a work was queued after the update_lock is released,
+ * sugov_work() will just be called again by kthread_work code; and the
+ * request will be proceed before the sugov thread sleeps.
+ */
+ raw_spin_lock_irqsave(&sg_policy->update_lock, flags);
+ freq = sg_policy->next_freq;
+ sg_policy->work_in_progress = false;
+ raw_spin_unlock_irqrestore(&sg_policy->update_lock, flags);
mutex_lock(&sg_policy->work_lock);
- __cpufreq_driver_target(sg_policy->policy, sg_policy->next_freq,
- CPUFREQ_RELATION_L);
+ __cpufreq_driver_target(sg_policy->policy, freq, CPUFREQ_RELATION_L);
mutex_unlock(&sg_policy->work_lock);
-
- sg_policy->work_in_progress = false;
}
static void sugov_irq_work(struct irq_work *irq_work)
@@ -390,19 +496,6 @@ static void sugov_irq_work(struct irq_work *irq_work)
sg_policy = container_of(irq_work, struct sugov_policy, irq_work);
- /*
- * For RT tasks, the schedutil governor shoots the frequency to maximum.
- * Special care must be taken to ensure that this kthread doesn't result
- * in the same behavior.
- *
- * This is (mostly) guaranteed by the work_in_progress flag. The flag is
- * updated only at the end of the sugov_work() function and before that
- * the schedutil governor rejects all other frequency scaling requests.
- *
- * There is a very rare case though, where the RT thread yields right
- * after the work_in_progress flag is cleared. The effects of that are
- * neglected for now.
- */
kthread_queue_work(&sg_policy->worker, &sg_policy->work);
}
@@ -423,8 +516,8 @@ static ssize_t rate_limit_us_show(struct gov_attr_set *attr_set, char *buf)
return sprintf(buf, "%u\n", tunables->rate_limit_us);
}
-static ssize_t rate_limit_us_store(struct gov_attr_set *attr_set, const char *buf,
- size_t count)
+static ssize_t
+rate_limit_us_store(struct gov_attr_set *attr_set, const char *buf, size_t count)
{
struct sugov_tunables *tunables = to_sugov_tunables(attr_set);
struct sugov_policy *sg_policy;
@@ -479,11 +572,11 @@ static int sugov_kthread_create(struct sugov_policy *sg_policy)
{
struct task_struct *thread;
struct sched_attr attr = {
- .size = sizeof(struct sched_attr),
- .sched_policy = SCHED_DEADLINE,
- .sched_flags = SCHED_FLAG_SUGOV,
- .sched_nice = 0,
- .sched_priority = 0,
+ .size = sizeof(struct sched_attr),
+ .sched_policy = SCHED_DEADLINE,
+ .sched_flags = SCHED_FLAG_SUGOV,
+ .sched_nice = 0,
+ .sched_priority = 0,
/*
* Fake (unused) bandwidth; workaround to "fix"
* priority inheritance.
@@ -517,11 +610,7 @@ static int sugov_kthread_create(struct sugov_policy *sg_policy)
}
sg_policy->thread = thread;
-
- /* Kthread is bound to all CPUs by default */
- if (!policy->dvfs_possible_from_any_cpu)
- kthread_bind_mask(thread, policy->related_cpus);
-
+ kthread_bind_mask(thread, policy->related_cpus);
init_irq_work(&sg_policy->irq_work, sugov_irq_work);
mutex_init(&sg_policy->work_lock);
@@ -625,10 +714,9 @@ fail:
stop_kthread:
sugov_kthread_stop(sg_policy);
-
-free_sg_policy:
mutex_unlock(&global_tunables_lock);
+free_sg_policy:
sugov_policy_free(sg_policy);
disable_fast_switch:
@@ -663,21 +751,20 @@ static int sugov_start(struct cpufreq_policy *policy)
struct sugov_policy *sg_policy = policy->governor_data;
unsigned int cpu;
- sg_policy->freq_update_delay_ns = sg_policy->tunables->rate_limit_us * NSEC_PER_USEC;
- sg_policy->last_freq_update_time = 0;
- sg_policy->next_freq = UINT_MAX;
- sg_policy->work_in_progress = false;
- sg_policy->need_freq_update = false;
- sg_policy->cached_raw_freq = 0;
+ sg_policy->freq_update_delay_ns = sg_policy->tunables->rate_limit_us * NSEC_PER_USEC;
+ sg_policy->last_freq_update_time = 0;
+ sg_policy->next_freq = 0;
+ sg_policy->work_in_progress = false;
+ sg_policy->need_freq_update = false;
+ sg_policy->cached_raw_freq = 0;
for_each_cpu(cpu, policy->cpus) {
struct sugov_cpu *sg_cpu = &per_cpu(sugov_cpu, cpu);
memset(sg_cpu, 0, sizeof(*sg_cpu));
- sg_cpu->cpu = cpu;
- sg_cpu->sg_policy = sg_policy;
- sg_cpu->flags = 0;
- sg_cpu->iowait_boost_max = policy->cpuinfo.max_freq;
+ sg_cpu->cpu = cpu;
+ sg_cpu->sg_policy = sg_policy;
+ sg_cpu->iowait_boost_max = policy->cpuinfo.max_freq;
}
for_each_cpu(cpu, policy->cpus) {
@@ -721,14 +808,14 @@ static void sugov_limits(struct cpufreq_policy *policy)
}
static struct cpufreq_governor schedutil_gov = {
- .name = "schedutil",
- .owner = THIS_MODULE,
- .dynamic_switching = true,
- .init = sugov_init,
- .exit = sugov_exit,
- .start = sugov_start,
- .stop = sugov_stop,
- .limits = sugov_limits,
+ .name = "schedutil",
+ .owner = THIS_MODULE,
+ .dynamic_switching = true,
+ .init = sugov_init,
+ .exit = sugov_exit,
+ .start = sugov_start,
+ .stop = sugov_stop,
+ .limits = sugov_limits,
};
#ifdef CONFIG_CPU_FREQ_DEFAULT_GOV_SCHEDUTIL
diff --git a/kernel/sched/cpupri.c b/kernel/sched/cpupri.c
index 2511aba36b89..daaadf939ccb 100644
--- a/kernel/sched/cpupri.c
+++ b/kernel/sched/cpupri.c
@@ -14,7 +14,7 @@
*
* going from the lowest priority to the highest. CPUs in the INVALID state
* are not eligible for routing. The system maintains this state with
- * a 2 dimensional bitmap (the first for priority class, the second for cpus
+ * a 2 dimensional bitmap (the first for priority class, the second for CPUs
* in that class). Therefore a typical application without affinity
* restrictions can find a suitable CPU with O(1) complexity (e.g. two bit
* searches). For tasks with affinity restrictions, the algorithm has a
@@ -26,12 +26,7 @@
* as published by the Free Software Foundation; version 2
* of the License.
*/
-
-#include <linux/gfp.h>
-#include <linux/sched.h>
-#include <linux/sched/rt.h>
-#include <linux/slab.h>
-#include "cpupri.h"
+#include "sched.h"
/* Convert between a 140 based task->prio, and our 102 based cpupri */
static int convert_prio(int prio)
@@ -128,9 +123,9 @@ int cpupri_find(struct cpupri *cp, struct task_struct *p,
}
/**
- * cpupri_set - update the cpu priority setting
+ * cpupri_set - update the CPU priority setting
* @cp: The cpupri context
- * @cpu: The target cpu
+ * @cpu: The target CPU
* @newpri: The priority (INVALID-RT99) to assign to this CPU
*
* Note: Assumes cpu_rq(cpu)->lock is locked
@@ -151,7 +146,7 @@ void cpupri_set(struct cpupri *cp, int cpu, int newpri)
return;
/*
- * If the cpu was currently mapped to a different value, we
+ * If the CPU was currently mapped to a different value, we
* need to map it to the new value then remove the old value.
* Note, we must add the new value first, otherwise we risk the
* cpu being missed by the priority loop in cpupri_find.
diff --git a/kernel/sched/cpupri.h b/kernel/sched/cpupri.h
index bab050019071..7dc20a3232e7 100644
--- a/kernel/sched/cpupri.h
+++ b/kernel/sched/cpupri.h
@@ -1,32 +1,25 @@
/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _LINUX_CPUPRI_H
-#define _LINUX_CPUPRI_H
-
-#include <linux/sched.h>
#define CPUPRI_NR_PRIORITIES (MAX_RT_PRIO + 2)
-#define CPUPRI_INVALID -1
-#define CPUPRI_IDLE 0
-#define CPUPRI_NORMAL 1
+#define CPUPRI_INVALID -1
+#define CPUPRI_IDLE 0
+#define CPUPRI_NORMAL 1
/* values 2-101 are RT priorities 0-99 */
struct cpupri_vec {
- atomic_t count;
- cpumask_var_t mask;
+ atomic_t count;
+ cpumask_var_t mask;
};
struct cpupri {
- struct cpupri_vec pri_to_cpu[CPUPRI_NR_PRIORITIES];
- int *cpu_to_pri;
+ struct cpupri_vec pri_to_cpu[CPUPRI_NR_PRIORITIES];
+ int *cpu_to_pri;
};
#ifdef CONFIG_SMP
-int cpupri_find(struct cpupri *cp,
- struct task_struct *p, struct cpumask *lowest_mask);
+int cpupri_find(struct cpupri *cp, struct task_struct *p, struct cpumask *lowest_mask);
void cpupri_set(struct cpupri *cp, int cpu, int pri);
-int cpupri_init(struct cpupri *cp);
+int cpupri_init(struct cpupri *cp);
void cpupri_cleanup(struct cpupri *cp);
#endif
-
-#endif /* _LINUX_CPUPRI_H */
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index bac6ac9a4ec7..0796f938c4f0 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -1,10 +1,6 @@
-#include <linux/export.h>
-#include <linux/sched.h>
-#include <linux/tsacct_kern.h>
-#include <linux/kernel_stat.h>
-#include <linux/static_key.h>
-#include <linux/context_tracking.h>
-#include <linux/sched/cputime.h>
+/*
+ * Simple CPU accounting cgroup controller
+ */
#include "sched.h"
#ifdef CONFIG_IRQ_TIME_ACCOUNTING
@@ -113,9 +109,9 @@ static inline void task_group_account_field(struct task_struct *p, int index,
}
/*
- * Account user cpu time to a process.
- * @p: the process that the cpu time gets accounted to
- * @cputime: the cpu time spent in user space since the last update
+ * Account user CPU time to a process.
+ * @p: the process that the CPU time gets accounted to
+ * @cputime: the CPU time spent in user space since the last update
*/
void account_user_time(struct task_struct *p, u64 cputime)
{
@@ -135,9 +131,9 @@ void account_user_time(struct task_struct *p, u64 cputime)
}
/*
- * Account guest cpu time to a process.
- * @p: the process that the cpu time gets accounted to
- * @cputime: the cpu time spent in virtual machine since the last update
+ * Account guest CPU time to a process.
+ * @p: the process that the CPU time gets accounted to
+ * @cputime: the CPU time spent in virtual machine since the last update
*/
void account_guest_time(struct task_struct *p, u64 cputime)
{
@@ -159,9 +155,9 @@ void account_guest_time(struct task_struct *p, u64 cputime)
}
/*
- * Account system cpu time to a process and desired cpustat field
- * @p: the process that the cpu time gets accounted to
- * @cputime: the cpu time spent in kernel space since the last update
+ * Account system CPU time to a process and desired cpustat field
+ * @p: the process that the CPU time gets accounted to
+ * @cputime: the CPU time spent in kernel space since the last update
* @index: pointer to cpustat field that has to be updated
*/
void account_system_index_time(struct task_struct *p,
@@ -179,10 +175,10 @@ void account_system_index_time(struct task_struct *p,
}
/*
- * Account system cpu time to a process.
- * @p: the process that the cpu time gets accounted to
+ * Account system CPU time to a process.
+ * @p: the process that the CPU time gets accounted to
* @hardirq_offset: the offset to subtract from hardirq_count()
- * @cputime: the cpu time spent in kernel space since the last update
+ * @cputime: the CPU time spent in kernel space since the last update
*/
void account_system_time(struct task_struct *p, int hardirq_offset, u64 cputime)
{
@@ -205,7 +201,7 @@ void account_system_time(struct task_struct *p, int hardirq_offset, u64 cputime)
/*
* Account for involuntary wait time.
- * @cputime: the cpu time spent in involuntary wait
+ * @cputime: the CPU time spent in involuntary wait
*/
void account_steal_time(u64 cputime)
{
@@ -216,7 +212,7 @@ void account_steal_time(u64 cputime)
/*
* Account for idle time.
- * @cputime: the cpu time spent in idle wait
+ * @cputime: the CPU time spent in idle wait
*/
void account_idle_time(u64 cputime)
{
@@ -338,7 +334,7 @@ void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times)
#ifdef CONFIG_IRQ_TIME_ACCOUNTING
/*
* Account a tick to a process and cpustat
- * @p: the process that the cpu time gets accounted to
+ * @p: the process that the CPU time gets accounted to
* @user_tick: is the tick from userspace
* @rq: the pointer to rq
*
@@ -400,17 +396,16 @@ static void irqtime_account_idle_ticks(int ticks)
irqtime_account_process_tick(current, 0, rq, ticks);
}
#else /* CONFIG_IRQ_TIME_ACCOUNTING */
-static inline void irqtime_account_idle_ticks(int ticks) {}
+static inline void irqtime_account_idle_ticks(int ticks) { }
static inline void irqtime_account_process_tick(struct task_struct *p, int user_tick,
- struct rq *rq, int nr_ticks) {}
+ struct rq *rq, int nr_ticks) { }
#endif /* CONFIG_IRQ_TIME_ACCOUNTING */
/*
* Use precise platform statistics if available:
*/
#ifdef CONFIG_VIRT_CPU_ACCOUNTING
-
-#ifndef __ARCH_HAS_VTIME_TASK_SWITCH
+# ifndef __ARCH_HAS_VTIME_TASK_SWITCH
void vtime_common_task_switch(struct task_struct *prev)
{
if (is_idle_task(prev))
@@ -421,8 +416,7 @@ void vtime_common_task_switch(struct task_struct *prev)
vtime_flush(prev);
arch_vtime_task_switch(prev);
}
-#endif
-
+# endif
#endif /* CONFIG_VIRT_CPU_ACCOUNTING */
@@ -469,10 +463,12 @@ void thread_group_cputime_adjusted(struct task_struct *p, u64 *ut, u64 *st)
*ut = cputime.utime;
*st = cputime.stime;
}
-#else /* !CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */
+
+#else /* !CONFIG_VIRT_CPU_ACCOUNTING_NATIVE: */
+
/*
- * Account a single tick of cpu time.
- * @p: the process that the cpu time gets accounted to
+ * Account a single tick of CPU time.
+ * @p: the process that the CPU time gets accounted to
* @user_tick: indicates if the tick is a user or a system tick
*/
void account_process_tick(struct task_struct *p, int user_tick)
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index 9df09782025c..fbfc3f1d368a 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -17,9 +17,6 @@
*/
#include "sched.h"
-#include <linux/slab.h>
-#include <uapi/linux/sched/types.h>
-
struct dl_bandwidth def_dl_bandwidth;
static inline struct task_struct *dl_task_of(struct sched_dl_entity *dl_se)
@@ -87,7 +84,7 @@ void __add_running_bw(u64 dl_bw, struct dl_rq *dl_rq)
SCHED_WARN_ON(dl_rq->running_bw < old); /* overflow */
SCHED_WARN_ON(dl_rq->running_bw > dl_rq->this_bw);
/* kick cpufreq (see the comment in kernel/sched/sched.h). */
- cpufreq_update_util(rq_of_dl_rq(dl_rq), SCHED_CPUFREQ_DL);
+ cpufreq_update_util(rq_of_dl_rq(dl_rq), 0);
}
static inline
@@ -101,7 +98,7 @@ void __sub_running_bw(u64 dl_bw, struct dl_rq *dl_rq)
if (dl_rq->running_bw > old)
dl_rq->running_bw = 0;
/* kick cpufreq (see the comment in kernel/sched/sched.h). */
- cpufreq_update_util(rq_of_dl_rq(dl_rq), SCHED_CPUFREQ_DL);
+ cpufreq_update_util(rq_of_dl_rq(dl_rq), 0);
}
static inline
@@ -514,7 +511,7 @@ static DEFINE_PER_CPU(struct callback_head, dl_pull_head);
static void push_dl_tasks(struct rq *);
static void pull_dl_task(struct rq *);
-static inline void queue_push_tasks(struct rq *rq)
+static inline void deadline_queue_push_tasks(struct rq *rq)
{
if (!has_pushable_dl_tasks(rq))
return;
@@ -522,7 +519,7 @@ static inline void queue_push_tasks(struct rq *rq)
queue_balance_callback(rq, &per_cpu(dl_push_head, rq->cpu), push_dl_tasks);
}
-static inline void queue_pull_task(struct rq *rq)
+static inline void deadline_queue_pull_task(struct rq *rq)
{
queue_balance_callback(rq, &per_cpu(dl_pull_head, rq->cpu), pull_dl_task);
}
@@ -539,12 +536,12 @@ static struct rq *dl_task_offline_migration(struct rq *rq, struct task_struct *p
/*
* If we cannot preempt any rq, fall back to pick any
- * online cpu.
+ * online CPU:
*/
cpu = cpumask_any_and(cpu_active_mask, &p->cpus_allowed);
if (cpu >= nr_cpu_ids) {
/*
- * Fail to find any suitable cpu.
+ * Failed to find any suitable CPU.
* The task will never come back!
*/
BUG_ON(dl_bandwidth_enabled());
@@ -597,19 +594,18 @@ static inline void pull_dl_task(struct rq *rq)
{
}
-static inline void queue_push_tasks(struct rq *rq)
+static inline void deadline_queue_push_tasks(struct rq *rq)
{
}
-static inline void queue_pull_task(struct rq *rq)
+static inline void deadline_queue_pull_task(struct rq *rq)
{
}
#endif /* CONFIG_SMP */
static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags);
static void __dequeue_task_dl(struct rq *rq, struct task_struct *p, int flags);
-static void check_preempt_curr_dl(struct rq *rq, struct task_struct *p,
- int flags);
+static void check_preempt_curr_dl(struct rq *rq, struct task_struct *p, int flags);
/*
* We are being explicitly informed that a new instance is starting,
@@ -1121,7 +1117,7 @@ extern bool sched_rt_bandwidth_account(struct rt_rq *rt_rq);
* should be larger than 2^(64 - 20 - 8), which is more than 64 seconds.
* So, overflow is not an issue here.
*/
-u64 grub_reclaim(u64 delta, struct rq *rq, struct sched_dl_entity *dl_se)
+static u64 grub_reclaim(u64 delta, struct rq *rq, struct sched_dl_entity *dl_se)
{
u64 u_inact = rq->dl.this_bw - rq->dl.running_bw; /* Utot - Uact */
u64 u_act;
@@ -1263,6 +1259,9 @@ static enum hrtimer_restart inactive_task_timer(struct hrtimer *timer)
rq = task_rq_lock(p, &rf);
+ sched_clock_tick();
+ update_rq_clock(rq);
+
if (!dl_task(p) || p->state == TASK_DEAD) {
struct dl_bw *dl_b = dl_bw_of(task_cpu(p));
@@ -1282,9 +1281,6 @@ static enum hrtimer_restart inactive_task_timer(struct hrtimer *timer)
if (dl_se->dl_non_contending == 0)
goto unlock;
- sched_clock_tick();
- update_rq_clock(rq);
-
sub_running_bw(dl_se, &rq->dl);
dl_se->dl_non_contending = 0;
unlock:
@@ -1564,7 +1560,7 @@ static void yield_task_dl(struct rq *rq)
* so we don't do microscopic update in schedule()
* and double the fastpath cost.
*/
- rq_clock_skip_update(rq, true);
+ rq_clock_skip_update(rq);
}
#ifdef CONFIG_SMP
@@ -1763,7 +1759,7 @@ pick_next_task_dl(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
if (hrtick_enabled(rq))
start_hrtick_dl(rq, p);
- queue_push_tasks(rq);
+ deadline_queue_push_tasks(rq);
return p;
}
@@ -1776,6 +1772,14 @@ static void put_prev_task_dl(struct rq *rq, struct task_struct *p)
enqueue_pushable_dl_task(rq, p);
}
+/*
+ * scheduler tick hitting a task of our scheduling class.
+ *
+ * NOTE: This function can be called remotely by the tick offload that
+ * goes along full dynticks. Therefore no local assumption can be made
+ * and everything must be accessed through the @rq and @curr passed in
+ * parameters.
+ */
static void task_tick_dl(struct rq *rq, struct task_struct *p, int queued)
{
update_curr_dl(rq);
@@ -1865,7 +1869,7 @@ static int find_later_rq(struct task_struct *task)
/*
* We have to consider system topology and task affinity
- * first, then we can look for a suitable cpu.
+ * first, then we can look for a suitable CPU.
*/
if (!cpudl_find(&task_rq(task)->rd->cpudl, task, later_mask))
return -1;
@@ -1879,7 +1883,7 @@ static int find_later_rq(struct task_struct *task)
* Now we check how well this matches with task's
* affinity and system topology.
*
- * The last cpu where the task run is our first
+ * The last CPU where the task run is our first
* guess, since it is most likely cache-hot there.
*/
if (cpumask_test_cpu(cpu, later_mask))
@@ -1909,9 +1913,9 @@ static int find_later_rq(struct task_struct *task)
best_cpu = cpumask_first_and(later_mask,
sched_domain_span(sd));
/*
- * Last chance: if a cpu being in both later_mask
+ * Last chance: if a CPU being in both later_mask
* and current sd span is valid, that becomes our
- * choice. Of course, the latest possible cpu is
+ * choice. Of course, the latest possible CPU is
* already under consideration through later_mask.
*/
if (best_cpu < nr_cpu_ids) {
@@ -2067,7 +2071,7 @@ retry:
if (task == next_task) {
/*
* The task is still there. We don't try
- * again, some other cpu will pull it when ready.
+ * again, some other CPU will pull it when ready.
*/
goto out;
}
@@ -2300,12 +2304,12 @@ static void switched_from_dl(struct rq *rq, struct task_struct *p)
/*
* Since this might be the only -deadline task on the rq,
* this is the right place to try to pull some other one
- * from an overloaded cpu, if any.
+ * from an overloaded CPU, if any.
*/
if (!task_on_rq_queued(p) || rq->dl.dl_nr_running)
return;
- queue_pull_task(rq);
+ deadline_queue_pull_task(rq);
}
/*
@@ -2327,7 +2331,7 @@ static void switched_to_dl(struct rq *rq, struct task_struct *p)
if (rq->curr != p) {
#ifdef CONFIG_SMP
if (p->nr_cpus_allowed > 1 && rq->dl.overloaded)
- queue_push_tasks(rq);
+ deadline_queue_push_tasks(rq);
#endif
if (dl_task(rq->curr))
check_preempt_curr_dl(rq, p, 0);
@@ -2352,7 +2356,7 @@ static void prio_changed_dl(struct rq *rq, struct task_struct *p,
* or lowering its prio, so...
*/
if (!rq->dl.overloaded)
- queue_pull_task(rq);
+ deadline_queue_pull_task(rq);
/*
* If we now have a earlier deadline task than p,
@@ -2626,17 +2630,17 @@ void __dl_clear_params(struct task_struct *p)
{
struct sched_dl_entity *dl_se = &p->dl;
- dl_se->dl_runtime = 0;
- dl_se->dl_deadline = 0;
- dl_se->dl_period = 0;
- dl_se->flags = 0;
- dl_se->dl_bw = 0;
- dl_se->dl_density = 0;
+ dl_se->dl_runtime = 0;
+ dl_se->dl_deadline = 0;
+ dl_se->dl_period = 0;
+ dl_se->flags = 0;
+ dl_se->dl_bw = 0;
+ dl_se->dl_density = 0;
- dl_se->dl_throttled = 0;
- dl_se->dl_yielded = 0;
- dl_se->dl_non_contending = 0;
- dl_se->dl_overrun = 0;
+ dl_se->dl_throttled = 0;
+ dl_se->dl_yielded = 0;
+ dl_se->dl_non_contending = 0;
+ dl_se->dl_overrun = 0;
}
bool dl_param_changed(struct task_struct *p, const struct sched_attr *attr)
@@ -2655,21 +2659,22 @@ bool dl_param_changed(struct task_struct *p, const struct sched_attr *attr)
#ifdef CONFIG_SMP
int dl_task_can_attach(struct task_struct *p, const struct cpumask *cs_cpus_allowed)
{
- unsigned int dest_cpu = cpumask_any_and(cpu_active_mask,
- cs_cpus_allowed);
+ unsigned int dest_cpu;
struct dl_bw *dl_b;
bool overflow;
int cpus, ret;
unsigned long flags;
+ dest_cpu = cpumask_any_and(cpu_active_mask, cs_cpus_allowed);
+
rcu_read_lock_sched();
dl_b = dl_bw_of(dest_cpu);
raw_spin_lock_irqsave(&dl_b->lock, flags);
cpus = dl_bw_cpus(dest_cpu);
overflow = __dl_overflow(dl_b, cpus, 0, p->dl.dl_bw);
- if (overflow)
+ if (overflow) {
ret = -EBUSY;
- else {
+ } else {
/*
* We reserve space for this task in the destination
* root_domain, as we can't fail after this point.
@@ -2681,6 +2686,7 @@ int dl_task_can_attach(struct task_struct *p, const struct cpumask *cs_cpus_allo
}
raw_spin_unlock_irqrestore(&dl_b->lock, flags);
rcu_read_unlock_sched();
+
return ret;
}
@@ -2701,6 +2707,7 @@ int dl_cpuset_cpumask_can_shrink(const struct cpumask *cur,
ret = 0;
raw_spin_unlock_irqrestore(&cur_dl_b->lock, flags);
rcu_read_unlock_sched();
+
return ret;
}
@@ -2718,13 +2725,12 @@ bool dl_cpu_busy(unsigned int cpu)
overflow = __dl_overflow(dl_b, cpus, 0, 0);
raw_spin_unlock_irqrestore(&dl_b->lock, flags);
rcu_read_unlock_sched();
+
return overflow;
}
#endif
#ifdef CONFIG_SCHED_DEBUG
-extern void print_dl_rq(struct seq_file *m, int cpu, struct dl_rq *dl_rq);
-
void print_dl_stats(struct seq_file *m, int cpu)
{
print_dl_rq(m, cpu, &cpu_rq(cpu)->dl);
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 1ca0130ed4f9..e593b4118578 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -1,7 +1,7 @@
/*
* kernel/sched/debug.c
*
- * Print the CFS rbtree
+ * Print the CFS rbtree and other debugging details
*
* Copyright(C) 2007, Red Hat, Inc., Ingo Molnar
*
@@ -9,16 +9,6 @@
* it under the terms of the GNU General Public License version 2 as
* published by the Free Software Foundation.
*/
-
-#include <linux/proc_fs.h>
-#include <linux/sched/mm.h>
-#include <linux/sched/task.h>
-#include <linux/seq_file.h>
-#include <linux/kallsyms.h>
-#include <linux/utsname.h>
-#include <linux/mempolicy.h>
-#include <linux/debugfs.h>
-
#include "sched.h"
static DEFINE_SPINLOCK(sched_debug_lock);
@@ -32,7 +22,7 @@ static DEFINE_SPINLOCK(sched_debug_lock);
if (m) \
seq_printf(m, x); \
else \
- printk(x); \
+ pr_cont(x); \
} while (0)
/*
@@ -274,34 +264,19 @@ sd_alloc_ctl_domain_table(struct sched_domain *sd)
if (table == NULL)
return NULL;
- set_table_entry(&table[0], "min_interval", &sd->min_interval,
- sizeof(long), 0644, proc_doulongvec_minmax, false);
- set_table_entry(&table[1], "max_interval", &sd->max_interval,
- sizeof(long), 0644, proc_doulongvec_minmax, false);
- set_table_entry(&table[2], "busy_idx", &sd->busy_idx,
- sizeof(int), 0644, proc_dointvec_minmax, true);
- set_table_entry(&table[3], "idle_idx", &sd->idle_idx,
- sizeof(int), 0644, proc_dointvec_minmax, true);
- set_table_entry(&table[4], "newidle_idx", &sd->newidle_idx,
- sizeof(int), 0644, proc_dointvec_minmax, true);
- set_table_entry(&table[5], "wake_idx", &sd->wake_idx,
- sizeof(int), 0644, proc_dointvec_minmax, true);
- set_table_entry(&table[6], "forkexec_idx", &sd->forkexec_idx,
- sizeof(int), 0644, proc_dointvec_minmax, true);
- set_table_entry(&table[7], "busy_factor", &sd->busy_factor,
- sizeof(int), 0644, proc_dointvec_minmax, false);
- set_table_entry(&table[8], "imbalance_pct", &sd->imbalance_pct,
- sizeof(int), 0644, proc_dointvec_minmax, false);
- set_table_entry(&table[9], "cache_nice_tries",
- &sd->cache_nice_tries,
- sizeof(int), 0644, proc_dointvec_minmax, false);
- set_table_entry(&table[10], "flags", &sd->flags,
- sizeof(int), 0644, proc_dointvec_minmax, false);
- set_table_entry(&table[11], "max_newidle_lb_cost",
- &sd->max_newidle_lb_cost,
- sizeof(long), 0644, proc_doulongvec_minmax, false);
- set_table_entry(&table[12], "name", sd->name,
- CORENAME_MAX_SIZE, 0444, proc_dostring, false);
+ set_table_entry(&table[0] , "min_interval", &sd->min_interval, sizeof(long), 0644, proc_doulongvec_minmax, false);
+ set_table_entry(&table[1] , "max_interval", &sd->max_interval, sizeof(long), 0644, proc_doulongvec_minmax, false);
+ set_table_entry(&table[2] , "busy_idx", &sd->busy_idx, sizeof(int) , 0644, proc_dointvec_minmax, true );
+ set_table_entry(&table[3] , "idle_idx", &sd->idle_idx, sizeof(int) , 0644, proc_dointvec_minmax, true );
+ set_table_entry(&table[4] , "newidle_idx", &sd->newidle_idx, sizeof(int) , 0644, proc_dointvec_minmax, true );
+ set_table_entry(&table[5] , "wake_idx", &sd->wake_idx, sizeof(int) , 0644, proc_dointvec_minmax, true );
+ set_table_entry(&table[6] , "forkexec_idx", &sd->forkexec_idx, sizeof(int) , 0644, proc_dointvec_minmax, true );
+ set_table_entry(&table[7] , "busy_factor", &sd->busy_factor, sizeof(int) , 0644, proc_dointvec_minmax, false);
+ set_table_entry(&table[8] , "imbalance_pct", &sd->imbalance_pct, sizeof(int) , 0644, proc_dointvec_minmax, false);
+ set_table_entry(&table[9] , "cache_nice_tries", &sd->cache_nice_tries, sizeof(int) , 0644, proc_dointvec_minmax, false);
+ set_table_entry(&table[10], "flags", &sd->flags, sizeof(int) , 0644, proc_dointvec_minmax, false);
+ set_table_entry(&table[11], "max_newidle_lb_cost", &sd->max_newidle_lb_cost, sizeof(long), 0644, proc_doulongvec_minmax, false);
+ set_table_entry(&table[12], "name", sd->name, CORENAME_MAX_SIZE, 0444, proc_dostring, false);
/* &table[13] is terminator */
return table;
@@ -332,8 +307,8 @@ static struct ctl_table *sd_alloc_ctl_cpu_table(int cpu)
return table;
}
-static cpumask_var_t sd_sysctl_cpus;
-static struct ctl_table_header *sd_sysctl_header;
+static cpumask_var_t sd_sysctl_cpus;
+static struct ctl_table_header *sd_sysctl_header;
void register_sched_domain_sysctl(void)
{
@@ -413,14 +388,10 @@ static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group
{
struct sched_entity *se = tg->se[cpu];
-#define P(F) \
- SEQ_printf(m, " .%-30s: %lld\n", #F, (long long)F)
-#define P_SCHEDSTAT(F) \
- SEQ_printf(m, " .%-30s: %lld\n", #F, (long long)schedstat_val(F))
-#define PN(F) \
- SEQ_printf(m, " .%-30s: %lld.%06ld\n", #F, SPLIT_NS((long long)F))
-#define PN_SCHEDSTAT(F) \
- SEQ_printf(m, " .%-30s: %lld.%06ld\n", #F, SPLIT_NS((long long)schedstat_val(F)))
+#define P(F) SEQ_printf(m, " .%-30s: %lld\n", #F, (long long)F)
+#define P_SCHEDSTAT(F) SEQ_printf(m, " .%-30s: %lld\n", #F, (long long)schedstat_val(F))
+#define PN(F) SEQ_printf(m, " .%-30s: %lld.%06ld\n", #F, SPLIT_NS((long long)F))
+#define PN_SCHEDSTAT(F) SEQ_printf(m, " .%-30s: %lld.%06ld\n", #F, SPLIT_NS((long long)schedstat_val(F)))
if (!se)
return;
@@ -428,6 +399,7 @@ static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group
PN(se->exec_start);
PN(se->vruntime);
PN(se->sum_exec_runtime);
+
if (schedstat_enabled()) {
PN_SCHEDSTAT(se->statistics.wait_start);
PN_SCHEDSTAT(se->statistics.sleep_start);
@@ -440,6 +412,7 @@ static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group
PN_SCHEDSTAT(se->statistics.wait_sum);
P_SCHEDSTAT(se->statistics.wait_count);
}
+
P(se->load.weight);
P(se->runnable_weight);
#ifdef CONFIG_SMP
@@ -464,6 +437,7 @@ static char *task_group_path(struct task_group *tg)
return group_path;
cgroup_path(tg->css.cgroup, group_path, PATH_MAX);
+
return group_path;
}
#endif
@@ -501,12 +475,12 @@ static void print_rq(struct seq_file *m, struct rq *rq, int rq_cpu)
{
struct task_struct *g, *p;
- SEQ_printf(m,
- "\nrunnable tasks:\n"
- " S task PID tree-key switches prio"
- " wait-time sum-exec sum-sleep\n"
- "-------------------------------------------------------"
- "----------------------------------------------------\n");
+ SEQ_printf(m, "\n");
+ SEQ_printf(m, "runnable tasks:\n");
+ SEQ_printf(m, " S task PID tree-key switches prio"
+ " wait-time sum-exec sum-sleep\n");
+ SEQ_printf(m, "-------------------------------------------------------"
+ "----------------------------------------------------\n");
rcu_read_lock();
for_each_process_thread(g, p) {
@@ -527,9 +501,11 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
unsigned long flags;
#ifdef CONFIG_FAIR_GROUP_SCHED
- SEQ_printf(m, "\ncfs_rq[%d]:%s\n", cpu, task_group_path(cfs_rq->tg));
+ SEQ_printf(m, "\n");
+ SEQ_printf(m, "cfs_rq[%d]:%s\n", cpu, task_group_path(cfs_rq->tg));
#else
- SEQ_printf(m, "\ncfs_rq[%d]:\n", cpu);
+ SEQ_printf(m, "\n");
+ SEQ_printf(m, "cfs_rq[%d]:\n", cpu);
#endif
SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "exec_clock",
SPLIT_NS(cfs_rq->exec_clock));
@@ -567,6 +543,8 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
cfs_rq->avg.runnable_load_avg);
SEQ_printf(m, " .%-30s: %lu\n", "util_avg",
cfs_rq->avg.util_avg);
+ SEQ_printf(m, " .%-30s: %u\n", "util_est_enqueued",
+ cfs_rq->avg.util_est.enqueued);
SEQ_printf(m, " .%-30s: %ld\n", "removed.load_avg",
cfs_rq->removed.load_avg);
SEQ_printf(m, " .%-30s: %ld\n", "removed.util_avg",
@@ -595,9 +573,11 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq)
{
#ifdef CONFIG_RT_GROUP_SCHED
- SEQ_printf(m, "\nrt_rq[%d]:%s\n", cpu, task_group_path(rt_rq->tg));
+ SEQ_printf(m, "\n");
+ SEQ_printf(m, "rt_rq[%d]:%s\n", cpu, task_group_path(rt_rq->tg));
#else
- SEQ_printf(m, "\nrt_rq[%d]:\n", cpu);
+ SEQ_printf(m, "\n");
+ SEQ_printf(m, "rt_rq[%d]:\n", cpu);
#endif
#define P(x) \
@@ -624,7 +604,8 @@ void print_dl_rq(struct seq_file *m, int cpu, struct dl_rq *dl_rq)
{
struct dl_bw *dl_bw;
- SEQ_printf(m, "\ndl_rq[%d]:\n", cpu);
+ SEQ_printf(m, "\n");
+ SEQ_printf(m, "dl_rq[%d]:\n", cpu);
#define PU(x) \
SEQ_printf(m, " .%-30s: %lu\n", #x, (unsigned long)(dl_rq->x))
@@ -799,9 +780,9 @@ void sysrq_sched_debug_show(void)
/*
* This itererator needs some explanation.
* It returns 1 for the header position.
- * This means 2 is cpu 0.
- * In a hotplugged system some cpus, including cpu 0, may be missing so we have
- * to use cpumask_* to iterate over the cpus.
+ * This means 2 is CPU 0.
+ * In a hotplugged system some CPUs, including CPU 0, may be missing so we have
+ * to use cpumask_* to iterate over the CPUs.
*/
static void *sched_debug_start(struct seq_file *file, loff_t *offset)
{
@@ -821,6 +802,7 @@ static void *sched_debug_start(struct seq_file *file, loff_t *offset)
if (n < nr_cpu_ids)
return (void *)(unsigned long)(n + 2);
+
return NULL;
}
@@ -835,55 +817,25 @@ static void sched_debug_stop(struct seq_file *file, void *data)
}
static const struct seq_operations sched_debug_sops = {
- .start = sched_debug_start,
- .next = sched_debug_next,
- .stop = sched_debug_stop,
- .show = sched_debug_show,
-};
-
-static int sched_debug_release(struct inode *inode, struct file *file)
-{
- seq_release(inode, file);
-
- return 0;
-}
-
-static int sched_debug_open(struct inode *inode, struct file *filp)
-{
- int ret = 0;
-
- ret = seq_open(filp, &sched_debug_sops);
-
- return ret;
-}
-
-static const struct file_operations sched_debug_fops = {
- .open = sched_debug_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = sched_debug_release,
+ .start = sched_debug_start,
+ .next = sched_debug_next,
+ .stop = sched_debug_stop,
+ .show = sched_debug_show,
};
static int __init init_sched_debug_procfs(void)
{
- struct proc_dir_entry *pe;
-
- pe = proc_create("sched_debug", 0444, NULL, &sched_debug_fops);
- if (!pe)
+ if (!proc_create_seq("sched_debug", 0444, NULL, &sched_debug_sops))
return -ENOMEM;
return 0;
}
__initcall(init_sched_debug_procfs);
-#define __P(F) \
- SEQ_printf(m, "%-45s:%21Ld\n", #F, (long long)F)
-#define P(F) \
- SEQ_printf(m, "%-45s:%21Ld\n", #F, (long long)p->F)
-#define __PN(F) \
- SEQ_printf(m, "%-45s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)F))
-#define PN(F) \
- SEQ_printf(m, "%-45s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)p->F))
+#define __P(F) SEQ_printf(m, "%-45s:%21Ld\n", #F, (long long)F)
+#define P(F) SEQ_printf(m, "%-45s:%21Ld\n", #F, (long long)p->F)
+#define __PN(F) SEQ_printf(m, "%-45s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)F))
+#define PN(F) SEQ_printf(m, "%-45s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)p->F))
#ifdef CONFIG_NUMA_BALANCING
@@ -1018,6 +970,8 @@ void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns,
P(se.avg.runnable_load_avg);
P(se.avg.util_avg);
P(se.avg.last_update_time);
+ P(se.avg.util_est.ewma);
+ P(se.avg.util_est.enqueued);
#endif
P(policy);
P(prio);
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 5eb3ffc9be84..e497c05aab7f 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -20,25 +20,10 @@
* Adaptive scheduling granularity, math enhancements by Peter Zijlstra
* Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra
*/
-
-#include <linux/sched/mm.h>
-#include <linux/sched/topology.h>
-
-#include <linux/latencytop.h>
-#include <linux/cpumask.h>
-#include <linux/cpuidle.h>
-#include <linux/slab.h>
-#include <linux/profile.h>
-#include <linux/interrupt.h>
-#include <linux/mempolicy.h>
-#include <linux/migrate.h>
-#include <linux/task_work.h>
-#include <linux/sched/isolation.h>
+#include "sched.h"
#include <trace/events/sched.h>
-#include "sched.h"
-
/*
* Targeted preemption latency for CPU-bound tasks:
*
@@ -103,7 +88,7 @@ const_debug unsigned int sysctl_sched_migration_cost = 500000UL;
#ifdef CONFIG_SMP
/*
- * For asym packing, by default the lower numbered cpu has higher priority.
+ * For asym packing, by default the lower numbered CPU has higher priority.
*/
int __weak arch_asym_cpu_priority(int cpu)
{
@@ -787,7 +772,7 @@ void post_init_entity_util_avg(struct sched_entity *se)
* For !fair tasks do:
*
update_cfs_rq_load_avg(now, cfs_rq);
- attach_entity_load_avg(cfs_rq, se);
+ attach_entity_load_avg(cfs_rq, se, 0);
switched_from_fair(rq, p);
*
* such that the next switched_to_fair() has the
@@ -1154,6 +1139,47 @@ static unsigned int task_scan_max(struct task_struct *p)
return max(smin, smax);
}
+void init_numa_balancing(unsigned long clone_flags, struct task_struct *p)
+{
+ int mm_users = 0;
+ struct mm_struct *mm = p->mm;
+
+ if (mm) {
+ mm_users = atomic_read(&mm->mm_users);
+ if (mm_users == 1) {
+ mm->numa_next_scan = jiffies + msecs_to_jiffies(sysctl_numa_balancing_scan_delay);
+ mm->numa_scan_seq = 0;
+ }
+ }
+ p->node_stamp = 0;
+ p->numa_scan_seq = mm ? mm->numa_scan_seq : 0;
+ p->numa_scan_period = sysctl_numa_balancing_scan_delay;
+ p->numa_work.next = &p->numa_work;
+ p->numa_faults = NULL;
+ p->numa_group = NULL;
+ p->last_task_numa_placement = 0;
+ p->last_sum_exec_runtime = 0;
+
+ /* New address space, reset the preferred nid */
+ if (!(clone_flags & CLONE_VM)) {
+ p->numa_preferred_nid = -1;
+ return;
+ }
+
+ /*
+ * New thread, keep existing numa_preferred_nid which should be copied
+ * already by arch_dup_task_struct but stagger when scans start.
+ */
+ if (mm) {
+ unsigned int delay;
+
+ delay = min_t(unsigned int, task_scan_max(current),
+ current->numa_scan_period * mm_users * NSEC_PER_MSEC);
+ delay += 2 * TICK_NSEC;
+ p->node_stamp = delay;
+ }
+}
+
static void account_numa_enqueue(struct rq *rq, struct task_struct *p)
{
rq->nr_numa_running += (p->numa_preferred_nid != -1);
@@ -1181,7 +1207,7 @@ pid_t task_numa_group_id(struct task_struct *p)
}
/*
- * The averaged statistics, shared & private, memory & cpu,
+ * The averaged statistics, shared & private, memory & CPU,
* occupy the first half of the array. The second half of the
* array is for current counters, which are averaged into the
* first set by task_numa_placement.
@@ -1587,7 +1613,7 @@ static void task_numa_compare(struct task_numa_env *env,
* be incurred if the tasks were swapped.
*/
if (cur) {
- /* Skip this swap candidate if cannot move to the source cpu */
+ /* Skip this swap candidate if cannot move to the source CPU: */
if (!cpumask_test_cpu(env->src_cpu, &cur->cpus_allowed))
goto unlock;
@@ -1631,7 +1657,7 @@ static void task_numa_compare(struct task_numa_env *env,
goto balance;
}
- /* Balance doesn't matter much if we're running a task per cpu */
+ /* Balance doesn't matter much if we're running a task per CPU: */
if (imp > env->best_imp && src_rq->nr_running == 1 &&
dst_rq->nr_running == 1)
goto assign;
@@ -1676,7 +1702,7 @@ balance:
*/
if (!cur) {
/*
- * select_idle_siblings() uses an per-cpu cpumask that
+ * select_idle_siblings() uses an per-CPU cpumask that
* can be used from IRQ context.
*/
local_irq_disable();
@@ -2823,7 +2849,7 @@ void reweight_task(struct task_struct *p, int prio)
}
#ifdef CONFIG_FAIR_GROUP_SCHED
-# ifdef CONFIG_SMP
+#ifdef CONFIG_SMP
/*
* All this does is approximate the hierarchical proportion which includes that
* global sum we all love to hate.
@@ -2974,7 +3000,7 @@ static long calc_group_runnable(struct cfs_rq *cfs_rq, long shares)
return clamp_t(long, runnable, MIN_SHARES, shares);
}
-# endif /* CONFIG_SMP */
+#endif /* CONFIG_SMP */
static inline int throttled_hierarchy(struct cfs_rq *cfs_rq);
@@ -3012,11 +3038,11 @@ static inline void update_cfs_group(struct sched_entity *se)
}
#endif /* CONFIG_FAIR_GROUP_SCHED */
-static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq)
+static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq, int flags)
{
struct rq *rq = rq_of(cfs_rq);
- if (&rq->cfs == cfs_rq) {
+ if (&rq->cfs == cfs_rq || (flags & SCHED_CPUFREQ_MIGRATION)) {
/*
* There are a few boundary cases this might miss but it should
* get called often enough that that should (hopefully) not be
@@ -3031,7 +3057,7 @@ static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq)
*
* See cpu_util().
*/
- cpufreq_update_util(rq, 0);
+ cpufreq_update_util(rq, flags);
}
}
@@ -3246,6 +3272,32 @@ ___update_load_avg(struct sched_avg *sa, unsigned long load, unsigned long runna
}
/*
+ * When a task is dequeued, its estimated utilization should not be update if
+ * its util_avg has not been updated at least once.
+ * This flag is used to synchronize util_avg updates with util_est updates.
+ * We map this information into the LSB bit of the utilization saved at
+ * dequeue time (i.e. util_est.dequeued).
+ */
+#define UTIL_AVG_UNCHANGED 0x1
+
+static inline void cfs_se_util_change(struct sched_avg *avg)
+{
+ unsigned int enqueued;
+
+ if (!sched_feat(UTIL_EST))
+ return;
+
+ /* Avoid store if the flag has been already set */
+ enqueued = avg->util_est.enqueued;
+ if (!(enqueued & UTIL_AVG_UNCHANGED))
+ return;
+
+ /* Reset flag to report util_avg has been updated */
+ enqueued &= ~UTIL_AVG_UNCHANGED;
+ WRITE_ONCE(avg->util_est.enqueued, enqueued);
+}
+
+/*
* sched_entity:
*
* task:
@@ -3296,6 +3348,7 @@ __update_load_avg_se(u64 now, int cpu, struct cfs_rq *cfs_rq, struct sched_entit
cfs_rq->curr == se)) {
___update_load_avg(&se->avg, se_weight(se), se_runnable(se));
+ cfs_se_util_change(&se->avg);
return 1;
}
@@ -3350,7 +3403,7 @@ static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force)
}
/*
- * Called within set_task_rq() right before setting a task's cpu. The
+ * Called within set_task_rq() right before setting a task's CPU. The
* caller only guarantees p->pi_lock is held; no other assumptions,
* including the state of rq->lock, should be made.
*/
@@ -3529,7 +3582,7 @@ update_tg_cfs_runnable(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cf
/*
* runnable_sum can't be lower than running_sum
- * As running sum is scale with cpu capacity wehreas the runnable sum
+ * As running sum is scale with CPU capacity wehreas the runnable sum
* is not we rescale running_sum 1st
*/
running_sum = se->avg.util_sum /
@@ -3689,7 +3742,7 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
#endif
if (decayed)
- cfs_rq_util_change(cfs_rq);
+ cfs_rq_util_change(cfs_rq, 0);
return decayed;
}
@@ -3702,7 +3755,7 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
* Must call update_cfs_rq_load_avg() before this, since we rely on
* cfs_rq->avg.last_update_time being current.
*/
-static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
+static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
{
u32 divider = LOAD_AVG_MAX - 1024 + cfs_rq->avg.period_contrib;
@@ -3738,7 +3791,7 @@ static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s
add_tg_cfs_propagate(cfs_rq, se->avg.load_sum);
- cfs_rq_util_change(cfs_rq);
+ cfs_rq_util_change(cfs_rq, flags);
}
/**
@@ -3757,7 +3810,7 @@ static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s
add_tg_cfs_propagate(cfs_rq, -se->avg.load_sum);
- cfs_rq_util_change(cfs_rq);
+ cfs_rq_util_change(cfs_rq, 0);
}
/*
@@ -3787,7 +3840,14 @@ static inline void update_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s
if (!se->avg.last_update_time && (flags & DO_ATTACH)) {
- attach_entity_load_avg(cfs_rq, se);
+ /*
+ * DO_ATTACH means we're here from enqueue_entity().
+ * !last_update_time means we've passed through
+ * migrate_task_rq_fair() indicating we migrated.
+ *
+ * IOW we're enqueueing a task on a new CPU.
+ */
+ attach_entity_load_avg(cfs_rq, se, SCHED_CPUFREQ_MIGRATION);
update_tg_load_avg(cfs_rq, 0);
} else if (decayed && (flags & UPDATE_TG))
@@ -3869,6 +3929,120 @@ static inline unsigned long cfs_rq_load_avg(struct cfs_rq *cfs_rq)
static int idle_balance(struct rq *this_rq, struct rq_flags *rf);
+static inline unsigned long task_util(struct task_struct *p)
+{
+ return READ_ONCE(p->se.avg.util_avg);
+}
+
+static inline unsigned long _task_util_est(struct task_struct *p)
+{
+ struct util_est ue = READ_ONCE(p->se.avg.util_est);
+
+ return max(ue.ewma, ue.enqueued);
+}
+
+static inline unsigned long task_util_est(struct task_struct *p)
+{
+ return max(task_util(p), _task_util_est(p));
+}
+
+static inline void util_est_enqueue(struct cfs_rq *cfs_rq,
+ struct task_struct *p)
+{
+ unsigned int enqueued;
+
+ if (!sched_feat(UTIL_EST))
+ return;
+
+ /* Update root cfs_rq's estimated utilization */
+ enqueued = cfs_rq->avg.util_est.enqueued;
+ enqueued += (_task_util_est(p) | UTIL_AVG_UNCHANGED);
+ WRITE_ONCE(cfs_rq->avg.util_est.enqueued, enqueued);
+}
+
+/*
+ * Check if a (signed) value is within a specified (unsigned) margin,
+ * based on the observation that:
+ *
+ * abs(x) < y := (unsigned)(x + y - 1) < (2 * y - 1)
+ *
+ * NOTE: this only works when value + maring < INT_MAX.
+ */
+static inline bool within_margin(int value, int margin)
+{
+ return ((unsigned int)(value + margin - 1) < (2 * margin - 1));
+}
+
+static void
+util_est_dequeue(struct cfs_rq *cfs_rq, struct task_struct *p, bool task_sleep)
+{
+ long last_ewma_diff;
+ struct util_est ue;
+
+ if (!sched_feat(UTIL_EST))
+ return;
+
+ /*
+ * Update root cfs_rq's estimated utilization
+ *
+ * If *p is the last task then the root cfs_rq's estimated utilization
+ * of a CPU is 0 by definition.
+ */
+ ue.enqueued = 0;
+ if (cfs_rq->nr_running) {
+ ue.enqueued = cfs_rq->avg.util_est.enqueued;
+ ue.enqueued -= min_t(unsigned int, ue.enqueued,
+ (_task_util_est(p) | UTIL_AVG_UNCHANGED));
+ }
+ WRITE_ONCE(cfs_rq->avg.util_est.enqueued, ue.enqueued);
+
+ /*
+ * Skip update of task's estimated utilization when the task has not
+ * yet completed an activation, e.g. being migrated.
+ */
+ if (!task_sleep)
+ return;
+
+ /*
+ * If the PELT values haven't changed since enqueue time,
+ * skip the util_est update.
+ */
+ ue = p->se.avg.util_est;
+ if (ue.enqueued & UTIL_AVG_UNCHANGED)
+ return;
+
+ /*
+ * Skip update of task's estimated utilization when its EWMA is
+ * already ~1% close to its last activation value.
+ */
+ ue.enqueued = (task_util(p) | UTIL_AVG_UNCHANGED);
+ last_ewma_diff = ue.enqueued - ue.ewma;
+ if (within_margin(last_ewma_diff, (SCHED_CAPACITY_SCALE / 100)))
+ return;
+
+ /*
+ * Update Task's estimated utilization
+ *
+ * When *p completes an activation we can consolidate another sample
+ * of the task size. This is done by storing the current PELT value
+ * as ue.enqueued and by using this value to update the Exponential
+ * Weighted Moving Average (EWMA):
+ *
+ * ewma(t) = w * task_util(p) + (1-w) * ewma(t-1)
+ * = w * task_util(p) + ewma(t-1) - w * ewma(t-1)
+ * = w * (task_util(p) - ewma(t-1)) + ewma(t-1)
+ * = w * ( last_ewma_diff ) + ewma(t-1)
+ * = w * (last_ewma_diff + ewma(t-1) / w)
+ *
+ * Where 'w' is the weight of new samples, which is configured to be
+ * 0.25, thus making w=1/4 ( >>= UTIL_EST_WEIGHT_SHIFT)
+ */
+ ue.ewma <<= UTIL_EST_WEIGHT_SHIFT;
+ ue.ewma += last_ewma_diff;
+ ue.ewma >>= UTIL_EST_WEIGHT_SHIFT;
+ WRITE_ONCE(p->se.avg.util_est, ue);
+}
+
#else /* CONFIG_SMP */
static inline int
@@ -3883,13 +4057,13 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
static inline void update_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se, int not_used1)
{
- cfs_rq_util_change(cfs_rq);
+ cfs_rq_util_change(cfs_rq, 0);
}
static inline void remove_entity_load_avg(struct sched_entity *se) {}
static inline void
-attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {}
+attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) {}
static inline void
detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {}
@@ -3898,6 +4072,13 @@ static inline int idle_balance(struct rq *rq, struct rq_flags *rf)
return 0;
}
+static inline void
+util_est_enqueue(struct cfs_rq *cfs_rq, struct task_struct *p) {}
+
+static inline void
+util_est_dequeue(struct cfs_rq *cfs_rq, struct task_struct *p,
+ bool task_sleep) {}
+
#endif /* CONFIG_SMP */
static void check_spread(struct cfs_rq *cfs_rq, struct sched_entity *se)
@@ -4676,7 +4857,7 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
if (!se)
add_nr_running(rq, task_delta);
- /* determine whether we need to wake up potentially idle cpu */
+ /* Determine whether we need to wake up potentially idle CPU: */
if (rq->curr == rq->idle && rq->cfs.nr_running)
resched_curr(rq);
}
@@ -5041,7 +5222,7 @@ static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
}
/*
- * Both these cpu hotplug callbacks race against unregister_fair_sched_group()
+ * Both these CPU hotplug callbacks race against unregister_fair_sched_group()
*
* The race is harmless, since modifying bandwidth settings of unhooked group
* bits doesn't do much.
@@ -5086,7 +5267,7 @@ static void __maybe_unused unthrottle_offline_cfs_rqs(struct rq *rq)
*/
cfs_rq->runtime_remaining = 1;
/*
- * Offline rq is schedulable till cpu is completely disabled
+ * Offline rq is schedulable till CPU is completely disabled
* in take_cpu_down(), so we prevent new cfs throttling here.
*/
cfs_rq->runtime_enabled = 0;
@@ -5205,6 +5386,14 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
struct sched_entity *se = &p->se;
/*
+ * The code below (indirectly) updates schedutil which looks at
+ * the cfs_rq utilization to select a frequency.
+ * Let's add the task's estimated utilization to the cfs_rq's
+ * estimated utilization, before we update schedutil.
+ */
+ util_est_enqueue(&rq->cfs, p);
+
+ /*
* If in_iowait is set, the code below may not trigger any cpufreq
* utilization updates, so do it here explicitly with the IOWAIT flag
* passed.
@@ -5304,6 +5493,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
if (!se)
sub_nr_running(rq, 1);
+ util_est_dequeue(&rq->cfs, p, task_sleep);
hrtick_update(rq);
}
@@ -5323,8 +5513,8 @@ DEFINE_PER_CPU(cpumask_var_t, select_idle_mask);
*
* load' = (1 - 1/2^i) * load + (1/2^i) * cur_load
*
- * If a cpu misses updates for n ticks (as it was idle) and update gets
- * called on the n+1-th tick when cpu may be busy, then we have:
+ * If a CPU misses updates for n ticks (as it was idle) and update gets
+ * called on the n+1-th tick when CPU may be busy, then we have:
*
* load_n = (1 - 1/2^i)^n * load_0
* load_n+1 = (1 - 1/2^i) * load_n + (1/2^i) * cur_load
@@ -5379,6 +5569,15 @@ decay_load_missed(unsigned long load, unsigned long missed_updates, int idx)
}
return load;
}
+
+static struct {
+ cpumask_var_t idle_cpus_mask;
+ atomic_t nr_cpus;
+ int has_blocked; /* Idle CPUS has blocked load */
+ unsigned long next_balance; /* in jiffy units */
+ unsigned long next_blocked; /* Next update of blocked load in jiffies */
+} nohz ____cacheline_aligned;
+
#endif /* CONFIG_NO_HZ_COMMON */
/**
@@ -5468,7 +5667,7 @@ static unsigned long weighted_cpuload(struct rq *rq)
#ifdef CONFIG_NO_HZ_COMMON
/*
* There is no sane way to deal with nohz on smp when using jiffies because the
- * cpu doing the jiffies update might drift wrt the cpu doing the jiffy reading
+ * CPU doing the jiffies update might drift wrt the CPU doing the jiffy reading
* causing off-by-one errors in observed deltas; {0,2} instead of {1,1}.
*
* Therefore we need to avoid the delta approach from the regular tick when
@@ -5579,7 +5778,7 @@ void cpu_load_update_active(struct rq *this_rq)
}
/*
- * Return a low guess at the load of a migration-source cpu weighted
+ * Return a low guess at the load of a migration-source CPU weighted
* according to the scheduling class and "nice" value.
*
* We want to under-estimate the load of migration sources, to
@@ -5597,7 +5796,7 @@ static unsigned long source_load(int cpu, int type)
}
/*
- * Return a high guess at the load of a migration-target cpu weighted
+ * Return a high guess at the load of a migration-target CPU weighted
* according to the scheduling class and "nice" value.
*/
static unsigned long target_load(int cpu, int type)
@@ -5707,8 +5906,8 @@ wake_affine_idle(int this_cpu, int prev_cpu, int sync)
* a cpufreq perspective, it's better to have higher utilisation
* on one CPU.
*/
- if (idle_cpu(this_cpu) && cpus_share_cache(this_cpu, prev_cpu))
- return idle_cpu(prev_cpu) ? prev_cpu : this_cpu;
+ if (available_idle_cpu(this_cpu) && cpus_share_cache(this_cpu, prev_cpu))
+ return available_idle_cpu(prev_cpu) ? prev_cpu : this_cpu;
if (sync && cpu_rq(this_cpu)->nr_running == 1)
return this_cpu;
@@ -5724,7 +5923,6 @@ wake_affine_weight(struct sched_domain *sd, struct task_struct *p,
unsigned long task_load;
this_eff_load = target_load(this_cpu, sd->wake_idx);
- prev_eff_load = source_load(prev_cpu, sd->wake_idx);
if (sync) {
unsigned long current_load = task_h_load(current);
@@ -5742,18 +5940,27 @@ wake_affine_weight(struct sched_domain *sd, struct task_struct *p,
this_eff_load *= 100;
this_eff_load *= capacity_of(prev_cpu);
+ prev_eff_load = source_load(prev_cpu, sd->wake_idx);
prev_eff_load -= task_load;
if (sched_feat(WA_BIAS))
prev_eff_load *= 100 + (sd->imbalance_pct - 100) / 2;
prev_eff_load *= capacity_of(this_cpu);
- return this_eff_load <= prev_eff_load ? this_cpu : nr_cpumask_bits;
+ /*
+ * If sync, adjust the weight of prev_eff_load such that if
+ * prev_eff == this_eff that select_idle_sibling() will consider
+ * stacking the wakee on top of the waker if no other CPU is
+ * idle.
+ */
+ if (sync)
+ prev_eff_load += 1;
+
+ return this_eff_load < prev_eff_load ? this_cpu : nr_cpumask_bits;
}
static int wake_affine(struct sched_domain *sd, struct task_struct *p,
- int prev_cpu, int sync)
+ int this_cpu, int prev_cpu, int sync)
{
- int this_cpu = smp_processor_id();
int target = nr_cpumask_bits;
if (sched_feat(WA_IDLE))
@@ -5771,7 +5978,6 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p,
return target;
}
-static inline unsigned long task_util(struct task_struct *p);
static unsigned long cpu_util_wake(int cpu, struct task_struct *p);
static unsigned long capacity_spare_wake(int cpu, struct task_struct *p)
@@ -5826,7 +6032,7 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p,
max_spare_cap = 0;
for_each_cpu(i, sched_group_span(group)) {
- /* Bias balancing toward cpus of our domain */
+ /* Bias balancing toward CPUs of our domain */
if (local_group)
load = source_load(i, load_idx);
else
@@ -5856,7 +6062,7 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p,
if (min_runnable_load > (runnable_load + imbalance)) {
/*
* The runnable load is significantly smaller
- * so we can pick this new cpu
+ * so we can pick this new CPU:
*/
min_runnable_load = runnable_load;
min_avg_load = avg_load;
@@ -5865,7 +6071,7 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p,
(100*min_avg_load > imbalance_scale*avg_load)) {
/*
* The runnable loads are close so take the
- * blocked load into account through avg_load.
+ * blocked load into account through avg_load:
*/
min_avg_load = avg_load;
idlest = group;
@@ -5903,6 +6109,18 @@ skip_spare:
if (!idlest)
return NULL;
+ /*
+ * When comparing groups across NUMA domains, it's possible for the
+ * local domain to be very lightly loaded relative to the remote
+ * domains but "imbalance" skews the comparison making remote CPUs
+ * look much more favourable. When considering cross-domain, add
+ * imbalance to the runnable load on the remote node and consider
+ * staying local.
+ */
+ if ((sd->flags & SD_NUMA) &&
+ min_runnable_load + imbalance >= this_runnable_load)
+ return NULL;
+
if (min_runnable_load > (this_runnable_load + imbalance))
return NULL;
@@ -5914,7 +6132,7 @@ skip_spare:
}
/*
- * find_idlest_group_cpu - find the idlest cpu among the cpus in group.
+ * find_idlest_group_cpu - find the idlest CPU among the CPUs in the group.
*/
static int
find_idlest_group_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
@@ -5932,7 +6150,7 @@ find_idlest_group_cpu(struct sched_group *group, struct task_struct *p, int this
/* Traverse only the allowed CPUs */
for_each_cpu_and(i, sched_group_span(group), &p->cpus_allowed) {
- if (idle_cpu(i)) {
+ if (available_idle_cpu(i)) {
struct rq *rq = cpu_rq(i);
struct cpuidle_state *idle = idle_get_state(rq);
if (idle && idle->exit_latency < min_exit_latency) {
@@ -5974,6 +6192,13 @@ static inline int find_idlest_cpu(struct sched_domain *sd, struct task_struct *p
if (!cpumask_intersects(sched_domain_span(sd), &p->cpus_allowed))
return prev_cpu;
+ /*
+ * We need task's util for capacity_spare_wake, sync it up to prev_cpu's
+ * last_update_time.
+ */
+ if (!(sd_flag & SD_BALANCE_FORK))
+ sync_entity_load_avg(&p->se);
+
while (sd) {
struct sched_group *group;
struct sched_domain *tmp;
@@ -5992,12 +6217,12 @@ static inline int find_idlest_cpu(struct sched_domain *sd, struct task_struct *p
new_cpu = find_idlest_group_cpu(group, p, cpu);
if (new_cpu == cpu) {
- /* Now try balancing at a lower domain level of cpu */
+ /* Now try balancing at a lower domain level of 'cpu': */
sd = sd->child;
continue;
}
- /* Now try balancing at a lower domain level of new_cpu */
+ /* Now try balancing at a lower domain level of 'new_cpu': */
cpu = new_cpu;
weight = sd->span_weight;
sd = NULL;
@@ -6007,7 +6232,6 @@ static inline int find_idlest_cpu(struct sched_domain *sd, struct task_struct *p
if (tmp->flags & sd_flag)
sd = tmp;
}
- /* while loop will break here if sd == NULL */
}
return new_cpu;
@@ -6055,7 +6279,7 @@ void __update_idle_core(struct rq *rq)
if (cpu == core)
continue;
- if (!idle_cpu(cpu))
+ if (!available_idle_cpu(cpu))
goto unlock;
}
@@ -6087,7 +6311,7 @@ static int select_idle_core(struct task_struct *p, struct sched_domain *sd, int
for_each_cpu(cpu, cpu_smt_mask(core)) {
cpumask_clear_cpu(cpu, cpus);
- if (!idle_cpu(cpu))
+ if (!available_idle_cpu(cpu))
idle = false;
}
@@ -6116,7 +6340,7 @@ static int select_idle_smt(struct task_struct *p, struct sched_domain *sd, int t
for_each_cpu(cpu, cpu_smt_mask(target)) {
if (!cpumask_test_cpu(cpu, &p->cpus_allowed))
continue;
- if (idle_cpu(cpu))
+ if (available_idle_cpu(cpu))
return cpu;
}
@@ -6179,7 +6403,7 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int t
return -1;
if (!cpumask_test_cpu(cpu, &p->cpus_allowed))
continue;
- if (idle_cpu(cpu))
+ if (available_idle_cpu(cpu))
break;
}
@@ -6199,25 +6423,25 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
struct sched_domain *sd;
int i, recent_used_cpu;
- if (idle_cpu(target))
+ if (available_idle_cpu(target))
return target;
/*
- * If the previous cpu is cache affine and idle, don't be stupid.
+ * If the previous CPU is cache affine and idle, don't be stupid:
*/
- if (prev != target && cpus_share_cache(prev, target) && idle_cpu(prev))
+ if (prev != target && cpus_share_cache(prev, target) && available_idle_cpu(prev))
return prev;
- /* Check a recently used CPU as a potential idle candidate */
+ /* Check a recently used CPU as a potential idle candidate: */
recent_used_cpu = p->recent_used_cpu;
if (recent_used_cpu != prev &&
recent_used_cpu != target &&
cpus_share_cache(recent_used_cpu, target) &&
- idle_cpu(recent_used_cpu) &&
+ available_idle_cpu(recent_used_cpu) &&
cpumask_test_cpu(p->recent_used_cpu, &p->cpus_allowed)) {
/*
* Replace recent_used_cpu with prev as it is a potential
- * candidate for the next wake.
+ * candidate for the next wake:
*/
p->recent_used_cpu = prev;
return recent_used_cpu;
@@ -6242,11 +6466,13 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
return target;
}
-/*
- * cpu_util returns the amount of capacity of a CPU that is used by CFS
- * tasks. The unit of the return value must be the one of capacity so we can
- * compare the utilization with the capacity of the CPU that is available for
- * CFS task (ie cpu_capacity).
+/**
+ * Amount of capacity of a CPU that is (estimated to be) used by CFS tasks
+ * @cpu: the CPU to get the utilization of
+ *
+ * The unit of the return value must be the one of capacity so we can compare
+ * the utilization with the capacity of the CPU that is available for CFS task
+ * (ie cpu_capacity).
*
* cfs_rq.avg.util_avg is the sum of running time of runnable tasks plus the
* recent utilization of currently non-runnable tasks on a CPU. It represents
@@ -6257,6 +6483,14 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
* current capacity (capacity_curr <= capacity_orig) of the CPU because it is
* the running time on this CPU scaled by capacity_curr.
*
+ * The estimated utilization of a CPU is defined to be the maximum between its
+ * cfs_rq.avg.util_avg and the sum of the estimated utilization of the tasks
+ * currently RUNNABLE on that CPU.
+ * This allows to properly represent the expected utilization of a CPU which
+ * has just got a big task running since a long sleep period. At the same time
+ * however it preserves the benefits of the "blocked utilization" in
+ * describing the potential for other tasks waking up on the same CPU.
+ *
* Nevertheless, cfs_rq.avg.util_avg can be higher than capacity_curr or even
* higher than capacity_orig because of unfortunate rounding in
* cfs.avg.util_avg or just after migrating tasks and new task wakeups until
@@ -6267,36 +6501,77 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
* available capacity. We allow utilization to overshoot capacity_curr (but not
* capacity_orig) as it useful for predicting the capacity required after task
* migrations (scheduler-driven DVFS).
+ *
+ * Return: the (estimated) utilization for the specified CPU
*/
-static unsigned long cpu_util(int cpu)
+static inline unsigned long cpu_util(int cpu)
{
- unsigned long util = cpu_rq(cpu)->cfs.avg.util_avg;
- unsigned long capacity = capacity_orig_of(cpu);
+ struct cfs_rq *cfs_rq;
+ unsigned int util;
- return (util >= capacity) ? capacity : util;
-}
+ cfs_rq = &cpu_rq(cpu)->cfs;
+ util = READ_ONCE(cfs_rq->avg.util_avg);
-static inline unsigned long task_util(struct task_struct *p)
-{
- return p->se.avg.util_avg;
+ if (sched_feat(UTIL_EST))
+ util = max(util, READ_ONCE(cfs_rq->avg.util_est.enqueued));
+
+ return min_t(unsigned long, util, capacity_orig_of(cpu));
}
/*
- * cpu_util_wake: Compute cpu utilization with any contributions from
+ * cpu_util_wake: Compute CPU utilization with any contributions from
* the waking task p removed.
*/
static unsigned long cpu_util_wake(int cpu, struct task_struct *p)
{
- unsigned long util, capacity;
+ struct cfs_rq *cfs_rq;
+ unsigned int util;
/* Task has no contribution or is new */
- if (cpu != task_cpu(p) || !p->se.avg.last_update_time)
+ if (cpu != task_cpu(p) || !READ_ONCE(p->se.avg.last_update_time))
return cpu_util(cpu);
- capacity = capacity_orig_of(cpu);
- util = max_t(long, cpu_rq(cpu)->cfs.avg.util_avg - task_util(p), 0);
+ cfs_rq = &cpu_rq(cpu)->cfs;
+ util = READ_ONCE(cfs_rq->avg.util_avg);
+
+ /* Discount task's blocked util from CPU's util */
+ util -= min_t(unsigned int, util, task_util(p));
- return (util >= capacity) ? capacity : util;
+ /*
+ * Covered cases:
+ *
+ * a) if *p is the only task sleeping on this CPU, then:
+ * cpu_util (== task_util) > util_est (== 0)
+ * and thus we return:
+ * cpu_util_wake = (cpu_util - task_util) = 0
+ *
+ * b) if other tasks are SLEEPING on this CPU, which is now exiting
+ * IDLE, then:
+ * cpu_util >= task_util
+ * cpu_util > util_est (== 0)
+ * and thus we discount *p's blocked utilization to return:
+ * cpu_util_wake = (cpu_util - task_util) >= 0
+ *
+ * c) if other tasks are RUNNABLE on that CPU and
+ * util_est > cpu_util
+ * then we use util_est since it returns a more restrictive
+ * estimation of the spare capacity on that CPU, by just
+ * considering the expected utilization of tasks already
+ * runnable on that CPU.
+ *
+ * Cases a) and b) are covered by the above code, while case c) is
+ * covered by the following code when estimated utilization is
+ * enabled.
+ */
+ if (sched_feat(UTIL_EST))
+ util = max(util, READ_ONCE(cfs_rq->avg.util_est.enqueued));
+
+ /*
+ * Utilization (estimated) can exceed the CPU capacity, thus let's
+ * clamp to the maximum CPU capacity to ensure consistency with
+ * the cpu_util call.
+ */
+ return min_t(unsigned long, util, capacity_orig_of(cpu));
}
/*
@@ -6328,21 +6603,21 @@ static int wake_cap(struct task_struct *p, int cpu, int prev_cpu)
* that have the 'sd_flag' flag set. In practice, this is SD_BALANCE_WAKE,
* SD_BALANCE_FORK, or SD_BALANCE_EXEC.
*
- * Balances load by selecting the idlest cpu in the idlest group, or under
- * certain conditions an idle sibling cpu if the domain has SD_WAKE_AFFINE set.
+ * Balances load by selecting the idlest CPU in the idlest group, or under
+ * certain conditions an idle sibling CPU if the domain has SD_WAKE_AFFINE set.
*
- * Returns the target cpu number.
+ * Returns the target CPU number.
*
* preempt must be disabled.
*/
static int
select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_flags)
{
- struct sched_domain *tmp, *affine_sd = NULL, *sd = NULL;
+ struct sched_domain *tmp, *sd = NULL;
int cpu = smp_processor_id();
int new_cpu = prev_cpu;
int want_affine = 0;
- int sync = wake_flags & WF_SYNC;
+ int sync = (wake_flags & WF_SYNC) && !(current->flags & PF_EXITING);
if (sd_flag & SD_BALANCE_WAKE) {
record_wakee(p);
@@ -6356,12 +6631,15 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f
break;
/*
- * If both cpu and prev_cpu are part of this domain,
+ * If both 'cpu' and 'prev_cpu' are part of this domain,
* cpu is a valid SD_WAKE_AFFINE target.
*/
if (want_affine && (tmp->flags & SD_WAKE_AFFINE) &&
cpumask_test_cpu(prev_cpu, sched_domain_span(tmp))) {
- affine_sd = tmp;
+ if (cpu != prev_cpu)
+ new_cpu = wake_affine(tmp, p, cpu, prev_cpu, sync);
+
+ sd = NULL; /* Prefer wake_affine over balance flags */
break;
}
@@ -6371,33 +6649,16 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f
break;
}
- if (affine_sd) {
- sd = NULL; /* Prefer wake_affine over balance flags */
- if (cpu == prev_cpu)
- goto pick_cpu;
-
- new_cpu = wake_affine(affine_sd, p, prev_cpu, sync);
- }
-
- if (sd && !(sd_flag & SD_BALANCE_FORK)) {
- /*
- * We're going to need the task's util for capacity_spare_wake
- * in find_idlest_group. Sync it up to prev_cpu's
- * last_update_time.
- */
- sync_entity_load_avg(&p->se);
- }
+ if (unlikely(sd)) {
+ /* Slow path */
+ new_cpu = find_idlest_cpu(sd, p, cpu, prev_cpu, sd_flag);
+ } else if (sd_flag & SD_BALANCE_WAKE) { /* XXX always ? */
+ /* Fast path */
- if (!sd) {
-pick_cpu:
- if (sd_flag & SD_BALANCE_WAKE) { /* XXX always ? */
- new_cpu = select_idle_sibling(p, prev_cpu, new_cpu);
+ new_cpu = select_idle_sibling(p, prev_cpu, new_cpu);
- if (want_affine)
- current->recent_used_cpu = cpu;
- }
- } else {
- new_cpu = find_idlest_cpu(sd, p, cpu, prev_cpu, sd_flag);
+ if (want_affine)
+ current->recent_used_cpu = cpu;
}
rcu_read_unlock();
@@ -6407,9 +6668,9 @@ pick_cpu:
static void detach_entity_cfs_rq(struct sched_entity *se);
/*
- * Called immediately before a task is migrated to a new cpu; task_cpu(p) and
+ * Called immediately before a task is migrated to a new CPU; task_cpu(p) and
* cfs_rq_of(p) references at time of call are still valid and identify the
- * previous cpu. The caller guarantees p->pi_lock or task_rq(p)->lock is held.
+ * previous CPU. The caller guarantees p->pi_lock or task_rq(p)->lock is held.
*/
static void migrate_task_rq_fair(struct task_struct *p)
{
@@ -6738,7 +6999,7 @@ simple:
p = task_of(se);
-done: __maybe_unused
+done: __maybe_unused;
#ifdef CONFIG_SMP
/*
* Move the next running task to the front of
@@ -6814,7 +7075,7 @@ static void yield_task_fair(struct rq *rq)
* so we don't do microscopic update in schedule()
* and double the fastpath cost.
*/
- rq_clock_skip_update(rq, true);
+ rq_clock_skip_update(rq);
}
set_skip_buddy(se);
@@ -6843,17 +7104,17 @@ static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preemp
* BASICS
*
* The purpose of load-balancing is to achieve the same basic fairness the
- * per-cpu scheduler provides, namely provide a proportional amount of compute
+ * per-CPU scheduler provides, namely provide a proportional amount of compute
* time to each task. This is expressed in the following equation:
*
* W_i,n/P_i == W_j,n/P_j for all i,j (1)
*
- * Where W_i,n is the n-th weight average for cpu i. The instantaneous weight
+ * Where W_i,n is the n-th weight average for CPU i. The instantaneous weight
* W_i,0 is defined as:
*
* W_i,0 = \Sum_j w_i,j (2)
*
- * Where w_i,j is the weight of the j-th runnable task on cpu i. This weight
+ * Where w_i,j is the weight of the j-th runnable task on CPU i. This weight
* is derived from the nice value as per sched_prio_to_weight[].
*
* The weight average is an exponential decay average of the instantaneous
@@ -6861,7 +7122,7 @@ static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preemp
*
* W'_i,n = (2^n - 1) / 2^n * W_i,n + 1 / 2^n * W_i,0 (3)
*
- * C_i is the compute capacity of cpu i, typically it is the
+ * C_i is the compute capacity of CPU i, typically it is the
* fraction of 'recent' time available for SCHED_OTHER task execution. But it
* can also include other factors [XXX].
*
@@ -6882,11 +7143,11 @@ static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preemp
* SCHED DOMAINS
*
* In order to solve the imbalance equation (4), and avoid the obvious O(n^2)
- * for all i,j solution, we create a tree of cpus that follows the hardware
+ * for all i,j solution, we create a tree of CPUs that follows the hardware
* topology where each level pairs two lower groups (or better). This results
- * in O(log n) layers. Furthermore we reduce the number of cpus going up the
+ * in O(log n) layers. Furthermore we reduce the number of CPUs going up the
* tree to only the first of the previous level and we decrease the frequency
- * of load-balance at each level inv. proportional to the number of cpus in
+ * of load-balance at each level inv. proportional to the number of CPUs in
* the groups.
*
* This yields:
@@ -6895,7 +7156,7 @@ static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preemp
* \Sum { --- * --- * 2^i } = O(n) (5)
* i = 0 2^i 2^i
* `- size of each group
- * | | `- number of cpus doing load-balance
+ * | | `- number of CPUs doing load-balance
* | `- freq
* `- sum over all levels
*
@@ -6903,7 +7164,7 @@ static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preemp
* this makes (5) the runtime complexity of the balancer.
*
* An important property here is that each CPU is still (indirectly) connected
- * to every other cpu in at most O(log n) steps:
+ * to every other CPU in at most O(log n) steps:
*
* The adjacency matrix of the resulting graph is given by:
*
@@ -6915,7 +7176,7 @@ static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preemp
*
* A^(log_2 n)_i,j != 0 for all i,j (7)
*
- * Showing there's indeed a path between every cpu in at most O(log n) steps.
+ * Showing there's indeed a path between every CPU in at most O(log n) steps.
* The task movement gives a factor of O(m), giving a convergence complexity
* of:
*
@@ -6925,7 +7186,7 @@ static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preemp
* WORK CONSERVING
*
* In order to avoid CPUs going idle while there's still work to do, new idle
- * balancing is more aggressive and has the newly idle cpu iterate up the domain
+ * balancing is more aggressive and has the newly idle CPU iterate up the domain
* tree itself instead of relying on other CPUs to bring it work.
*
* This adds some complexity to both (5) and (8) but it reduces the total idle
@@ -6946,7 +7207,7 @@ static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preemp
*
* s_k,i = \Sum_j w_i,j,k and S_k = \Sum_i s_k,i (10)
*
- * w_i,j,k is the weight of the j-th runnable task in the k-th cgroup on cpu i.
+ * w_i,j,k is the weight of the j-th runnable task in the k-th cgroup on CPU i.
*
* The big problem is S_k, its a global sum needed to compute a local (W_i)
* property.
@@ -6963,6 +7224,8 @@ enum fbq_type { regular, remote, all };
#define LBF_NEED_BREAK 0x02
#define LBF_DST_PINNED 0x04
#define LBF_SOME_PINNED 0x08
+#define LBF_NOHZ_STATS 0x10
+#define LBF_NOHZ_AGAIN 0x20
struct lb_env {
struct sched_domain *sd;
@@ -7110,7 +7373,7 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
env->flags |= LBF_SOME_PINNED;
/*
- * Remember if this task can be migrated to any other cpu in
+ * Remember if this task can be migrated to any other CPU in
* our sched_group. We may want to revisit it if we couldn't
* meet load balance goals by pulling other tasks on src_cpu.
*
@@ -7120,7 +7383,7 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
if (env->idle == CPU_NEWLY_IDLE || (env->flags & LBF_DST_PINNED))
return 0;
- /* Prevent to re-select dst_cpu via env's cpus */
+ /* Prevent to re-select dst_cpu via env's CPUs: */
for_each_cpu_and(cpu, env->dst_grpmask, env->cpus) {
if (cpumask_test_cpu(cpu, &p->cpus_allowed)) {
env->flags |= LBF_DST_PINNED;
@@ -7347,6 +7610,17 @@ static void attach_tasks(struct lb_env *env)
rq_unlock(env->dst_rq, &rf);
}
+static inline bool cfs_rq_has_blocked(struct cfs_rq *cfs_rq)
+{
+ if (cfs_rq->avg.load_avg)
+ return true;
+
+ if (cfs_rq->avg.util_avg)
+ return true;
+
+ return false;
+}
+
#ifdef CONFIG_FAIR_GROUP_SCHED
static inline bool cfs_rq_is_decayed(struct cfs_rq *cfs_rq)
@@ -7371,6 +7645,7 @@ static void update_blocked_averages(int cpu)
struct rq *rq = cpu_rq(cpu);
struct cfs_rq *cfs_rq, *pos;
struct rq_flags rf;
+ bool done = true;
rq_lock_irqsave(rq, &rf);
update_rq_clock(rq);
@@ -7400,7 +7675,17 @@ static void update_blocked_averages(int cpu)
*/
if (cfs_rq_is_decayed(cfs_rq))
list_del_leaf_cfs_rq(cfs_rq);
+
+ /* Don't need periodic decay once load/util_avg are null */
+ if (cfs_rq_has_blocked(cfs_rq))
+ done = false;
}
+
+#ifdef CONFIG_NO_HZ_COMMON
+ rq->last_blocked_load_update_tick = jiffies;
+ if (done)
+ rq->has_blocked_load = 0;
+#endif
rq_unlock_irqrestore(rq, &rf);
}
@@ -7460,6 +7745,11 @@ static inline void update_blocked_averages(int cpu)
rq_lock_irqsave(rq, &rf);
update_rq_clock(rq);
update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq);
+#ifdef CONFIG_NO_HZ_COMMON
+ rq->last_blocked_load_update_tick = jiffies;
+ if (!cfs_rq_has_blocked(cfs_rq))
+ rq->has_blocked_load = 0;
+#endif
rq_unlock_irqrestore(rq, &rf);
}
@@ -7694,8 +7984,8 @@ check_cpu_capacity(struct rq *rq, struct sched_domain *sd)
* Group imbalance indicates (and tries to solve) the problem where balancing
* groups is inadequate due to ->cpus_allowed constraints.
*
- * Imagine a situation of two groups of 4 cpus each and 4 tasks each with a
- * cpumask covering 1 cpu of the first group and 3 cpus of the second group.
+ * Imagine a situation of two groups of 4 CPUs each and 4 tasks each with a
+ * cpumask covering 1 CPU of the first group and 3 CPUs of the second group.
* Something like:
*
* { 0 1 2 3 } { 4 5 6 7 }
@@ -7703,7 +7993,7 @@ check_cpu_capacity(struct rq *rq, struct sched_domain *sd)
*
* If we were to balance group-wise we'd place two tasks in the first group and
* two tasks in the second group. Clearly this is undesired as it will overload
- * cpu 3 and leave one of the cpus in the second group unused.
+ * cpu 3 and leave one of the CPUs in the second group unused.
*
* The current solution to this issue is detecting the skew in the first group
* by noticing the lower domain failed to reach balance and had difficulty
@@ -7794,6 +8084,28 @@ group_type group_classify(struct sched_group *group,
return group_other;
}
+static bool update_nohz_stats(struct rq *rq, bool force)
+{
+#ifdef CONFIG_NO_HZ_COMMON
+ unsigned int cpu = rq->cpu;
+
+ if (!rq->has_blocked_load)
+ return false;
+
+ if (!cpumask_test_cpu(cpu, nohz.idle_cpus_mask))
+ return false;
+
+ if (!force && !time_after(jiffies, rq->last_blocked_load_update_tick))
+ return true;
+
+ update_blocked_averages(cpu);
+
+ return rq->has_blocked_load;
+#else
+ return false;
+#endif
+}
+
/**
* update_sg_lb_stats - Update sched_group's statistics for load balancing.
* @env: The load balancing environment.
@@ -7816,7 +8128,10 @@ static inline void update_sg_lb_stats(struct lb_env *env,
for_each_cpu_and(i, sched_group_span(group), env->cpus) {
struct rq *rq = cpu_rq(i);
- /* Bias balancing toward cpus of our domain */
+ if ((env->flags & LBF_NOHZ_STATS) && update_nohz_stats(rq, false))
+ env->flags |= LBF_NOHZ_AGAIN;
+
+ /* Bias balancing toward CPUs of our domain: */
if (local_group)
load = target_load(i, load_idx);
else
@@ -7902,7 +8217,7 @@ asym_packing:
if (!(env->sd->flags & SD_ASYM_PACKING))
return true;
- /* No ASYM_PACKING if target cpu is already busy */
+ /* No ASYM_PACKING if target CPU is already busy */
if (env->idle == CPU_NOT_IDLE)
return true;
/*
@@ -7915,7 +8230,7 @@ asym_packing:
if (!sds->busiest)
return true;
- /* Prefer to move from lowest priority cpu's work */
+ /* Prefer to move from lowest priority CPU's work */
if (sched_asym_prefer(sds->busiest->asym_prefer_cpu,
sg->asym_prefer_cpu))
return true;
@@ -7971,6 +8286,11 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd
if (child && child->flags & SD_PREFER_SIBLING)
prefer_sibling = 1;
+#ifdef CONFIG_NO_HZ_COMMON
+ if (env->idle == CPU_NEWLY_IDLE && READ_ONCE(nohz.has_blocked))
+ env->flags |= LBF_NOHZ_STATS;
+#endif
+
load_idx = get_sd_load_idx(env->sd, env->idle);
do {
@@ -8024,6 +8344,15 @@ next_group:
sg = sg->next;
} while (sg != env->sd->groups);
+#ifdef CONFIG_NO_HZ_COMMON
+ if ((env->flags & LBF_NOHZ_AGAIN) &&
+ cpumask_subset(nohz.idle_cpus_mask, sched_domain_span(env->sd))) {
+
+ WRITE_ONCE(nohz.next_blocked,
+ jiffies + msecs_to_jiffies(LOAD_AVG_PERIOD));
+ }
+#endif
+
if (env->sd->flags & SD_NUMA)
env->fbq_type = fbq_classify_group(&sds->busiest_stat);
@@ -8168,7 +8497,7 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
if (busiest->group_type == group_imbalanced) {
/*
* In the group_imb case we cannot rely on group-wide averages
- * to ensure cpu-load equilibrium, look at wider averages. XXX
+ * to ensure CPU-load equilibrium, look at wider averages. XXX
*/
busiest->load_per_task =
min(busiest->load_per_task, sds->avg_load);
@@ -8187,7 +8516,7 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
}
/*
- * If there aren't any idle cpus, avoid creating some.
+ * If there aren't any idle CPUs, avoid creating some.
*/
if (busiest->group_type == group_overloaded &&
local->group_type == group_overloaded) {
@@ -8201,9 +8530,9 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
}
/*
- * We're trying to get all the cpus to the average_load, so we don't
+ * We're trying to get all the CPUs to the average_load, so we don't
* want to push ourselves above the average load, nor do we wish to
- * reduce the max loaded cpu below the average load. At the same time,
+ * reduce the max loaded CPU below the average load. At the same time,
* we also don't want to reduce the group load below the group
* capacity. Thus we look for the minimum possible imbalance.
*/
@@ -8297,9 +8626,9 @@ static struct sched_group *find_busiest_group(struct lb_env *env)
if (env->idle == CPU_IDLE) {
/*
- * This cpu is idle. If the busiest group is not overloaded
+ * This CPU is idle. If the busiest group is not overloaded
* and there is no imbalance between this and busiest group
- * wrt idle cpus, it is balanced. The imbalance becomes
+ * wrt idle CPUs, it is balanced. The imbalance becomes
* significant if the diff is greater than 1 otherwise we
* might end up to just move the imbalance on another group
*/
@@ -8327,7 +8656,7 @@ out_balanced:
}
/*
- * find_busiest_queue - find the busiest runqueue among the cpus in group.
+ * find_busiest_queue - find the busiest runqueue among the CPUs in the group.
*/
static struct rq *find_busiest_queue(struct lb_env *env,
struct sched_group *group)
@@ -8371,7 +8700,7 @@ static struct rq *find_busiest_queue(struct lb_env *env,
/*
* When comparing with imbalance, use weighted_cpuload()
- * which is not scaled with the cpu capacity.
+ * which is not scaled with the CPU capacity.
*/
if (rq->nr_running == 1 && wl > env->imbalance &&
@@ -8379,9 +8708,9 @@ static struct rq *find_busiest_queue(struct lb_env *env,
continue;
/*
- * For the load comparisons with the other cpu's, consider
- * the weighted_cpuload() scaled with the cpu capacity, so
- * that the load can be moved away from the cpu that is
+ * For the load comparisons with the other CPU's, consider
+ * the weighted_cpuload() scaled with the CPU capacity, so
+ * that the load can be moved away from the CPU that is
* potentially running at a lower capacity.
*
* Thus we're looking for max(wl_i / capacity_i), crosswise
@@ -8452,13 +8781,13 @@ static int should_we_balance(struct lb_env *env)
return 0;
/*
- * In the newly idle case, we will allow all the cpu's
+ * In the newly idle case, we will allow all the CPUs
* to do the newly idle load balance.
*/
if (env->idle == CPU_NEWLY_IDLE)
return 1;
- /* Try to find first idle cpu */
+ /* Try to find first idle CPU */
for_each_cpu_and(cpu, group_balance_mask(sg), env->cpus) {
if (!idle_cpu(cpu))
continue;
@@ -8471,7 +8800,7 @@ static int should_we_balance(struct lb_env *env)
balance_cpu = group_balance_cpu(sg);
/*
- * First idle cpu or the first cpu(busiest) in this sched group
+ * First idle CPU or the first CPU(busiest) in this sched group
* is eligible for doing load balancing at this and above domains.
*/
return balance_cpu == env->dst_cpu;
@@ -8580,7 +8909,7 @@ more_balance:
* Revisit (affine) tasks on src_cpu that couldn't be moved to
* us and move them to an alternate dst_cpu in our sched_group
* where they can run. The upper limit on how many times we
- * iterate on same src_cpu is dependent on number of cpus in our
+ * iterate on same src_cpu is dependent on number of CPUs in our
* sched_group.
*
* This changes load balance semantics a bit on who can move
@@ -8597,7 +8926,7 @@ more_balance:
*/
if ((env.flags & LBF_DST_PINNED) && env.imbalance > 0) {
- /* Prevent to re-select dst_cpu via env's cpus */
+ /* Prevent to re-select dst_cpu via env's CPUs */
cpumask_clear_cpu(env.dst_cpu, env.cpus);
env.dst_rq = cpu_rq(env.new_dst_cpu);
@@ -8659,9 +8988,10 @@ more_balance:
raw_spin_lock_irqsave(&busiest->lock, flags);
- /* don't kick the active_load_balance_cpu_stop,
- * if the curr task on busiest cpu can't be
- * moved to this_cpu
+ /*
+ * Don't kick the active_load_balance_cpu_stop,
+ * if the curr task on busiest CPU can't be
+ * moved to this_cpu:
*/
if (!cpumask_test_cpu(this_cpu, &busiest->curr->cpus_allowed)) {
raw_spin_unlock_irqrestore(&busiest->lock,
@@ -8773,121 +9103,7 @@ update_next_balance(struct sched_domain *sd, unsigned long *next_balance)
}
/*
- * idle_balance is called by schedule() if this_cpu is about to become
- * idle. Attempts to pull tasks from other CPUs.
- */
-static int idle_balance(struct rq *this_rq, struct rq_flags *rf)
-{
- unsigned long next_balance = jiffies + HZ;
- int this_cpu = this_rq->cpu;
- struct sched_domain *sd;
- int pulled_task = 0;
- u64 curr_cost = 0;
-
- /*
- * We must set idle_stamp _before_ calling idle_balance(), such that we
- * measure the duration of idle_balance() as idle time.
- */
- this_rq->idle_stamp = rq_clock(this_rq);
-
- /*
- * Do not pull tasks towards !active CPUs...
- */
- if (!cpu_active(this_cpu))
- return 0;
-
- /*
- * This is OK, because current is on_cpu, which avoids it being picked
- * for load-balance and preemption/IRQs are still disabled avoiding
- * further scheduler activity on it and we're being very careful to
- * re-start the picking loop.
- */
- rq_unpin_lock(this_rq, rf);
-
- if (this_rq->avg_idle < sysctl_sched_migration_cost ||
- !this_rq->rd->overload) {
- rcu_read_lock();
- sd = rcu_dereference_check_sched_domain(this_rq->sd);
- if (sd)
- update_next_balance(sd, &next_balance);
- rcu_read_unlock();
-
- goto out;
- }
-
- raw_spin_unlock(&this_rq->lock);
-
- update_blocked_averages(this_cpu);
- rcu_read_lock();
- for_each_domain(this_cpu, sd) {
- int continue_balancing = 1;
- u64 t0, domain_cost;
-
- if (!(sd->flags & SD_LOAD_BALANCE))
- continue;
-
- if (this_rq->avg_idle < curr_cost + sd->max_newidle_lb_cost) {
- update_next_balance(sd, &next_balance);
- break;
- }
-
- if (sd->flags & SD_BALANCE_NEWIDLE) {
- t0 = sched_clock_cpu(this_cpu);
-
- pulled_task = load_balance(this_cpu, this_rq,
- sd, CPU_NEWLY_IDLE,
- &continue_balancing);
-
- domain_cost = sched_clock_cpu(this_cpu) - t0;
- if (domain_cost > sd->max_newidle_lb_cost)
- sd->max_newidle_lb_cost = domain_cost;
-
- curr_cost += domain_cost;
- }
-
- update_next_balance(sd, &next_balance);
-
- /*
- * Stop searching for tasks to pull if there are
- * now runnable tasks on this rq.
- */
- if (pulled_task || this_rq->nr_running > 0)
- break;
- }
- rcu_read_unlock();
-
- raw_spin_lock(&this_rq->lock);
-
- if (curr_cost > this_rq->max_idle_balance_cost)
- this_rq->max_idle_balance_cost = curr_cost;
-
- /*
- * While browsing the domains, we released the rq lock, a task could
- * have been enqueued in the meantime. Since we're not going idle,
- * pretend we pulled a task.
- */
- if (this_rq->cfs.h_nr_running && !pulled_task)
- pulled_task = 1;
-
-out:
- /* Move the next balance forward */
- if (time_after(this_rq->next_balance, next_balance))
- this_rq->next_balance = next_balance;
-
- /* Is there a task of a high priority class? */
- if (this_rq->nr_running != this_rq->cfs.h_nr_running)
- pulled_task = -1;
-
- if (pulled_task)
- this_rq->idle_stamp = 0;
-
- rq_repin_lock(this_rq, rf);
-
- return pulled_task;
-}
-
-/*
- * active_load_balance_cpu_stop is run by cpu stopper. It pushes
+ * active_load_balance_cpu_stop is run by the CPU stopper. It pushes
* running tasks off the busiest CPU onto idle CPUs. It requires at
* least 1 task to be running on each physical CPU where possible, and
* avoids physical / logical imbalances.
@@ -8911,7 +9127,7 @@ static int active_load_balance_cpu_stop(void *data)
if (!cpu_active(busiest_cpu) || !cpu_active(target_cpu))
goto out_unlock;
- /* make sure the requested cpu hasn't gone down in the meantime */
+ /* Make sure the requested CPU hasn't gone down in the meantime: */
if (unlikely(busiest_cpu != smp_processor_id() ||
!busiest_rq->active_balance))
goto out_unlock;
@@ -8923,7 +9139,7 @@ static int active_load_balance_cpu_stop(void *data)
/*
* This condition is "impossible", if it occurs
* we need to fix it. Originally reported by
- * Bjorn Helgaas on a 128-cpu setup.
+ * Bjorn Helgaas on a 128-CPU setup.
*/
BUG_ON(busiest_rq == target_rq);
@@ -8977,141 +9193,6 @@ out_unlock:
return 0;
}
-static inline int on_null_domain(struct rq *rq)
-{
- return unlikely(!rcu_dereference_sched(rq->sd));
-}
-
-#ifdef CONFIG_NO_HZ_COMMON
-/*
- * idle load balancing details
- * - When one of the busy CPUs notice that there may be an idle rebalancing
- * needed, they will kick the idle load balancer, which then does idle
- * load balancing for all the idle CPUs.
- */
-static struct {
- cpumask_var_t idle_cpus_mask;
- atomic_t nr_cpus;
- unsigned long next_balance; /* in jiffy units */
-} nohz ____cacheline_aligned;
-
-static inline int find_new_ilb(void)
-{
- int ilb = cpumask_first(nohz.idle_cpus_mask);
-
- if (ilb < nr_cpu_ids && idle_cpu(ilb))
- return ilb;
-
- return nr_cpu_ids;
-}
-
-/*
- * Kick a CPU to do the nohz balancing, if it is time for it. We pick the
- * nohz_load_balancer CPU (if there is one) otherwise fallback to any idle
- * CPU (if there is one).
- */
-static void nohz_balancer_kick(void)
-{
- int ilb_cpu;
-
- nohz.next_balance++;
-
- ilb_cpu = find_new_ilb();
-
- if (ilb_cpu >= nr_cpu_ids)
- return;
-
- if (test_and_set_bit(NOHZ_BALANCE_KICK, nohz_flags(ilb_cpu)))
- return;
- /*
- * Use smp_send_reschedule() instead of resched_cpu().
- * This way we generate a sched IPI on the target cpu which
- * is idle. And the softirq performing nohz idle load balance
- * will be run before returning from the IPI.
- */
- smp_send_reschedule(ilb_cpu);
- return;
-}
-
-void nohz_balance_exit_idle(unsigned int cpu)
-{
- if (unlikely(test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)))) {
- /*
- * Completely isolated CPUs don't ever set, so we must test.
- */
- if (likely(cpumask_test_cpu(cpu, nohz.idle_cpus_mask))) {
- cpumask_clear_cpu(cpu, nohz.idle_cpus_mask);
- atomic_dec(&nohz.nr_cpus);
- }
- clear_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu));
- }
-}
-
-static inline void set_cpu_sd_state_busy(void)
-{
- struct sched_domain *sd;
- int cpu = smp_processor_id();
-
- rcu_read_lock();
- sd = rcu_dereference(per_cpu(sd_llc, cpu));
-
- if (!sd || !sd->nohz_idle)
- goto unlock;
- sd->nohz_idle = 0;
-
- atomic_inc(&sd->shared->nr_busy_cpus);
-unlock:
- rcu_read_unlock();
-}
-
-void set_cpu_sd_state_idle(void)
-{
- struct sched_domain *sd;
- int cpu = smp_processor_id();
-
- rcu_read_lock();
- sd = rcu_dereference(per_cpu(sd_llc, cpu));
-
- if (!sd || sd->nohz_idle)
- goto unlock;
- sd->nohz_idle = 1;
-
- atomic_dec(&sd->shared->nr_busy_cpus);
-unlock:
- rcu_read_unlock();
-}
-
-/*
- * This routine will record that the cpu is going idle with tick stopped.
- * This info will be used in performing idle load balancing in the future.
- */
-void nohz_balance_enter_idle(int cpu)
-{
- /*
- * If this cpu is going down, then nothing needs to be done.
- */
- if (!cpu_active(cpu))
- return;
-
- /* Spare idle load balancing on CPUs that don't want to be disturbed: */
- if (!housekeeping_cpu(cpu, HK_FLAG_SCHED))
- return;
-
- if (test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)))
- return;
-
- /*
- * If we're a completely isolated CPU, we don't play.
- */
- if (on_null_domain(cpu_rq(cpu)))
- return;
-
- cpumask_set_cpu(cpu, nohz.idle_cpus_mask);
- atomic_inc(&nohz.nr_cpus);
- set_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu));
-}
-#endif
-
static DEFINE_SPINLOCK(balancing);
/*
@@ -9141,8 +9222,6 @@ static void rebalance_domains(struct rq *rq, enum cpu_idle_type idle)
int need_serialize, need_decay = 0;
u64 max_cost = 0;
- update_blocked_averages(cpu);
-
rcu_read_lock();
for_each_domain(cpu, sd) {
/*
@@ -9232,68 +9311,56 @@ out:
}
}
+static inline int on_null_domain(struct rq *rq)
+{
+ return unlikely(!rcu_dereference_sched(rq->sd));
+}
+
#ifdef CONFIG_NO_HZ_COMMON
/*
- * In CONFIG_NO_HZ_COMMON case, the idle balance kickee will do the
- * rebalancing for all the cpus for whom scheduler ticks are stopped.
+ * idle load balancing details
+ * - When one of the busy CPUs notice that there may be an idle rebalancing
+ * needed, they will kick the idle load balancer, which then does idle
+ * load balancing for all the idle CPUs.
*/
-static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
-{
- int this_cpu = this_rq->cpu;
- struct rq *rq;
- int balance_cpu;
- /* Earliest time when we have to do rebalance again */
- unsigned long next_balance = jiffies + 60*HZ;
- int update_next_balance = 0;
- if (idle != CPU_IDLE ||
- !test_bit(NOHZ_BALANCE_KICK, nohz_flags(this_cpu)))
- goto end;
+static inline int find_new_ilb(void)
+{
+ int ilb = cpumask_first(nohz.idle_cpus_mask);
- for_each_cpu(balance_cpu, nohz.idle_cpus_mask) {
- if (balance_cpu == this_cpu || !idle_cpu(balance_cpu))
- continue;
+ if (ilb < nr_cpu_ids && idle_cpu(ilb))
+ return ilb;
- /*
- * If this cpu gets work to do, stop the load balancing
- * work being done for other cpus. Next load
- * balancing owner will pick it up.
- */
- if (need_resched())
- break;
+ return nr_cpu_ids;
+}
- rq = cpu_rq(balance_cpu);
+/*
+ * Kick a CPU to do the nohz balancing, if it is time for it. We pick the
+ * nohz_load_balancer CPU (if there is one) otherwise fallback to any idle
+ * CPU (if there is one).
+ */
+static void kick_ilb(unsigned int flags)
+{
+ int ilb_cpu;
- /*
- * If time for next balance is due,
- * do the balance.
- */
- if (time_after_eq(jiffies, rq->next_balance)) {
- struct rq_flags rf;
+ nohz.next_balance++;
- rq_lock_irq(rq, &rf);
- update_rq_clock(rq);
- cpu_load_update_idle(rq);
- rq_unlock_irq(rq, &rf);
+ ilb_cpu = find_new_ilb();
- rebalance_domains(rq, CPU_IDLE);
- }
+ if (ilb_cpu >= nr_cpu_ids)
+ return;
- if (time_after(next_balance, rq->next_balance)) {
- next_balance = rq->next_balance;
- update_next_balance = 1;
- }
- }
+ flags = atomic_fetch_or(flags, nohz_flags(ilb_cpu));
+ if (flags & NOHZ_KICK_MASK)
+ return;
/*
- * next_balance will be updated only when there is a need.
- * When the CPU is attached to null domain for ex, it will not be
- * updated.
+ * Use smp_send_reschedule() instead of resched_cpu().
+ * This way we generate a sched IPI on the target CPU which
+ * is idle. And the softirq performing nohz idle load balance
+ * will be run before returning from the IPI.
*/
- if (likely(update_next_balance))
- nohz.next_balance = next_balance;
-end:
- clear_bit(NOHZ_BALANCE_KICK, nohz_flags(this_cpu));
+ smp_send_reschedule(ilb_cpu);
}
/*
@@ -9307,36 +9374,41 @@ end:
* - For SD_ASYM_PACKING, if the lower numbered cpu's in the scheduler
* domain span are idle.
*/
-static inline bool nohz_kick_needed(struct rq *rq)
+static void nohz_balancer_kick(struct rq *rq)
{
unsigned long now = jiffies;
struct sched_domain_shared *sds;
struct sched_domain *sd;
int nr_busy, i, cpu = rq->cpu;
- bool kick = false;
+ unsigned int flags = 0;
if (unlikely(rq->idle_balance))
- return false;
+ return;
- /*
- * We may be recently in ticked or tickless idle mode. At the first
- * busy tick after returning from idle, we will update the busy stats.
- */
- set_cpu_sd_state_busy();
- nohz_balance_exit_idle(cpu);
+ /*
+ * We may be recently in ticked or tickless idle mode. At the first
+ * busy tick after returning from idle, we will update the busy stats.
+ */
+ nohz_balance_exit_idle(rq);
/*
* None are in tickless mode and hence no need for NOHZ idle load
* balancing.
*/
if (likely(!atomic_read(&nohz.nr_cpus)))
- return false;
+ return;
+
+ if (READ_ONCE(nohz.has_blocked) &&
+ time_after(now, READ_ONCE(nohz.next_blocked)))
+ flags = NOHZ_STATS_KICK;
if (time_before(now, nohz.next_balance))
- return false;
+ goto out;
- if (rq->nr_running >= 2)
- return true;
+ if (rq->nr_running >= 2) {
+ flags = NOHZ_KICK_MASK;
+ goto out;
+ }
rcu_read_lock();
sds = rcu_dereference(per_cpu(sd_llc_shared, cpu));
@@ -9347,7 +9419,7 @@ static inline bool nohz_kick_needed(struct rq *rq)
*/
nr_busy = atomic_read(&sds->nr_busy_cpus);
if (nr_busy > 1) {
- kick = true;
+ flags = NOHZ_KICK_MASK;
goto unlock;
}
@@ -9357,7 +9429,7 @@ static inline bool nohz_kick_needed(struct rq *rq)
if (sd) {
if ((rq->cfs.h_nr_running >= 1) &&
check_cpu_capacity(rq, sd)) {
- kick = true;
+ flags = NOHZ_KICK_MASK;
goto unlock;
}
}
@@ -9370,18 +9442,421 @@ static inline bool nohz_kick_needed(struct rq *rq)
continue;
if (sched_asym_prefer(i, cpu)) {
- kick = true;
+ flags = NOHZ_KICK_MASK;
goto unlock;
}
}
}
unlock:
rcu_read_unlock();
- return kick;
+out:
+ if (flags)
+ kick_ilb(flags);
+}
+
+static void set_cpu_sd_state_busy(int cpu)
+{
+ struct sched_domain *sd;
+
+ rcu_read_lock();
+ sd = rcu_dereference(per_cpu(sd_llc, cpu));
+
+ if (!sd || !sd->nohz_idle)
+ goto unlock;
+ sd->nohz_idle = 0;
+
+ atomic_inc(&sd->shared->nr_busy_cpus);
+unlock:
+ rcu_read_unlock();
+}
+
+void nohz_balance_exit_idle(struct rq *rq)
+{
+ SCHED_WARN_ON(rq != this_rq());
+
+ if (likely(!rq->nohz_tick_stopped))
+ return;
+
+ rq->nohz_tick_stopped = 0;
+ cpumask_clear_cpu(rq->cpu, nohz.idle_cpus_mask);
+ atomic_dec(&nohz.nr_cpus);
+
+ set_cpu_sd_state_busy(rq->cpu);
+}
+
+static void set_cpu_sd_state_idle(int cpu)
+{
+ struct sched_domain *sd;
+
+ rcu_read_lock();
+ sd = rcu_dereference(per_cpu(sd_llc, cpu));
+
+ if (!sd || sd->nohz_idle)
+ goto unlock;
+ sd->nohz_idle = 1;
+
+ atomic_dec(&sd->shared->nr_busy_cpus);
+unlock:
+ rcu_read_unlock();
+}
+
+/*
+ * This routine will record that the CPU is going idle with tick stopped.
+ * This info will be used in performing idle load balancing in the future.
+ */
+void nohz_balance_enter_idle(int cpu)
+{
+ struct rq *rq = cpu_rq(cpu);
+
+ SCHED_WARN_ON(cpu != smp_processor_id());
+
+ /* If this CPU is going down, then nothing needs to be done: */
+ if (!cpu_active(cpu))
+ return;
+
+ /* Spare idle load balancing on CPUs that don't want to be disturbed: */
+ if (!housekeeping_cpu(cpu, HK_FLAG_SCHED))
+ return;
+
+ /*
+ * Can be set safely without rq->lock held
+ * If a clear happens, it will have evaluated last additions because
+ * rq->lock is held during the check and the clear
+ */
+ rq->has_blocked_load = 1;
+
+ /*
+ * The tick is still stopped but load could have been added in the
+ * meantime. We set the nohz.has_blocked flag to trig a check of the
+ * *_avg. The CPU is already part of nohz.idle_cpus_mask so the clear
+ * of nohz.has_blocked can only happen after checking the new load
+ */
+ if (rq->nohz_tick_stopped)
+ goto out;
+
+ /* If we're a completely isolated CPU, we don't play: */
+ if (on_null_domain(rq))
+ return;
+
+ rq->nohz_tick_stopped = 1;
+
+ cpumask_set_cpu(cpu, nohz.idle_cpus_mask);
+ atomic_inc(&nohz.nr_cpus);
+
+ /*
+ * Ensures that if nohz_idle_balance() fails to observe our
+ * @idle_cpus_mask store, it must observe the @has_blocked
+ * store.
+ */
+ smp_mb__after_atomic();
+
+ set_cpu_sd_state_idle(cpu);
+
+out:
+ /*
+ * Each time a cpu enter idle, we assume that it has blocked load and
+ * enable the periodic update of the load of idle cpus
+ */
+ WRITE_ONCE(nohz.has_blocked, 1);
+}
+
+/*
+ * Internal function that runs load balance for all idle cpus. The load balance
+ * can be a simple update of blocked load or a complete load balance with
+ * tasks movement depending of flags.
+ * The function returns false if the loop has stopped before running
+ * through all idle CPUs.
+ */
+static bool _nohz_idle_balance(struct rq *this_rq, unsigned int flags,
+ enum cpu_idle_type idle)
+{
+ /* Earliest time when we have to do rebalance again */
+ unsigned long now = jiffies;
+ unsigned long next_balance = now + 60*HZ;
+ bool has_blocked_load = false;
+ int update_next_balance = 0;
+ int this_cpu = this_rq->cpu;
+ int balance_cpu;
+ int ret = false;
+ struct rq *rq;
+
+ SCHED_WARN_ON((flags & NOHZ_KICK_MASK) == NOHZ_BALANCE_KICK);
+
+ /*
+ * We assume there will be no idle load after this update and clear
+ * the has_blocked flag. If a cpu enters idle in the mean time, it will
+ * set the has_blocked flag and trig another update of idle load.
+ * Because a cpu that becomes idle, is added to idle_cpus_mask before
+ * setting the flag, we are sure to not clear the state and not
+ * check the load of an idle cpu.
+ */
+ WRITE_ONCE(nohz.has_blocked, 0);
+
+ /*
+ * Ensures that if we miss the CPU, we must see the has_blocked
+ * store from nohz_balance_enter_idle().
+ */
+ smp_mb();
+
+ for_each_cpu(balance_cpu, nohz.idle_cpus_mask) {
+ if (balance_cpu == this_cpu || !idle_cpu(balance_cpu))
+ continue;
+
+ /*
+ * If this CPU gets work to do, stop the load balancing
+ * work being done for other CPUs. Next load
+ * balancing owner will pick it up.
+ */
+ if (need_resched()) {
+ has_blocked_load = true;
+ goto abort;
+ }
+
+ rq = cpu_rq(balance_cpu);
+
+ has_blocked_load |= update_nohz_stats(rq, true);
+
+ /*
+ * If time for next balance is due,
+ * do the balance.
+ */
+ if (time_after_eq(jiffies, rq->next_balance)) {
+ struct rq_flags rf;
+
+ rq_lock_irqsave(rq, &rf);
+ update_rq_clock(rq);
+ cpu_load_update_idle(rq);
+ rq_unlock_irqrestore(rq, &rf);
+
+ if (flags & NOHZ_BALANCE_KICK)
+ rebalance_domains(rq, CPU_IDLE);
+ }
+
+ if (time_after(next_balance, rq->next_balance)) {
+ next_balance = rq->next_balance;
+ update_next_balance = 1;
+ }
+ }
+
+ /* Newly idle CPU doesn't need an update */
+ if (idle != CPU_NEWLY_IDLE) {
+ update_blocked_averages(this_cpu);
+ has_blocked_load |= this_rq->has_blocked_load;
+ }
+
+ if (flags & NOHZ_BALANCE_KICK)
+ rebalance_domains(this_rq, CPU_IDLE);
+
+ WRITE_ONCE(nohz.next_blocked,
+ now + msecs_to_jiffies(LOAD_AVG_PERIOD));
+
+ /* The full idle balance loop has been done */
+ ret = true;
+
+abort:
+ /* There is still blocked load, enable periodic update */
+ if (has_blocked_load)
+ WRITE_ONCE(nohz.has_blocked, 1);
+
+ /*
+ * next_balance will be updated only when there is a need.
+ * When the CPU is attached to null domain for ex, it will not be
+ * updated.
+ */
+ if (likely(update_next_balance))
+ nohz.next_balance = next_balance;
+
+ return ret;
+}
+
+/*
+ * In CONFIG_NO_HZ_COMMON case, the idle balance kickee will do the
+ * rebalancing for all the cpus for whom scheduler ticks are stopped.
+ */
+static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
+{
+ int this_cpu = this_rq->cpu;
+ unsigned int flags;
+
+ if (!(atomic_read(nohz_flags(this_cpu)) & NOHZ_KICK_MASK))
+ return false;
+
+ if (idle != CPU_IDLE) {
+ atomic_andnot(NOHZ_KICK_MASK, nohz_flags(this_cpu));
+ return false;
+ }
+
+ /*
+ * barrier, pairs with nohz_balance_enter_idle(), ensures ...
+ */
+ flags = atomic_fetch_andnot(NOHZ_KICK_MASK, nohz_flags(this_cpu));
+ if (!(flags & NOHZ_KICK_MASK))
+ return false;
+
+ _nohz_idle_balance(this_rq, flags, idle);
+
+ return true;
+}
+
+static void nohz_newidle_balance(struct rq *this_rq)
+{
+ int this_cpu = this_rq->cpu;
+
+ /*
+ * This CPU doesn't want to be disturbed by scheduler
+ * housekeeping
+ */
+ if (!housekeeping_cpu(this_cpu, HK_FLAG_SCHED))
+ return;
+
+ /* Will wake up very soon. No time for doing anything else*/
+ if (this_rq->avg_idle < sysctl_sched_migration_cost)
+ return;
+
+ /* Don't need to update blocked load of idle CPUs*/
+ if (!READ_ONCE(nohz.has_blocked) ||
+ time_before(jiffies, READ_ONCE(nohz.next_blocked)))
+ return;
+
+ raw_spin_unlock(&this_rq->lock);
+ /*
+ * This CPU is going to be idle and blocked load of idle CPUs
+ * need to be updated. Run the ilb locally as it is a good
+ * candidate for ilb instead of waking up another idle CPU.
+ * Kick an normal ilb if we failed to do the update.
+ */
+ if (!_nohz_idle_balance(this_rq, NOHZ_STATS_KICK, CPU_NEWLY_IDLE))
+ kick_ilb(NOHZ_STATS_KICK);
+ raw_spin_lock(&this_rq->lock);
+}
+
+#else /* !CONFIG_NO_HZ_COMMON */
+static inline void nohz_balancer_kick(struct rq *rq) { }
+
+static inline bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
+{
+ return false;
+}
+
+static inline void nohz_newidle_balance(struct rq *this_rq) { }
+#endif /* CONFIG_NO_HZ_COMMON */
+
+/*
+ * idle_balance is called by schedule() if this_cpu is about to become
+ * idle. Attempts to pull tasks from other CPUs.
+ */
+static int idle_balance(struct rq *this_rq, struct rq_flags *rf)
+{
+ unsigned long next_balance = jiffies + HZ;
+ int this_cpu = this_rq->cpu;
+ struct sched_domain *sd;
+ int pulled_task = 0;
+ u64 curr_cost = 0;
+
+ /*
+ * We must set idle_stamp _before_ calling idle_balance(), such that we
+ * measure the duration of idle_balance() as idle time.
+ */
+ this_rq->idle_stamp = rq_clock(this_rq);
+
+ /*
+ * Do not pull tasks towards !active CPUs...
+ */
+ if (!cpu_active(this_cpu))
+ return 0;
+
+ /*
+ * This is OK, because current is on_cpu, which avoids it being picked
+ * for load-balance and preemption/IRQs are still disabled avoiding
+ * further scheduler activity on it and we're being very careful to
+ * re-start the picking loop.
+ */
+ rq_unpin_lock(this_rq, rf);
+
+ if (this_rq->avg_idle < sysctl_sched_migration_cost ||
+ !this_rq->rd->overload) {
+
+ rcu_read_lock();
+ sd = rcu_dereference_check_sched_domain(this_rq->sd);
+ if (sd)
+ update_next_balance(sd, &next_balance);
+ rcu_read_unlock();
+
+ nohz_newidle_balance(this_rq);
+
+ goto out;
+ }
+
+ raw_spin_unlock(&this_rq->lock);
+
+ update_blocked_averages(this_cpu);
+ rcu_read_lock();
+ for_each_domain(this_cpu, sd) {
+ int continue_balancing = 1;
+ u64 t0, domain_cost;
+
+ if (!(sd->flags & SD_LOAD_BALANCE))
+ continue;
+
+ if (this_rq->avg_idle < curr_cost + sd->max_newidle_lb_cost) {
+ update_next_balance(sd, &next_balance);
+ break;
+ }
+
+ if (sd->flags & SD_BALANCE_NEWIDLE) {
+ t0 = sched_clock_cpu(this_cpu);
+
+ pulled_task = load_balance(this_cpu, this_rq,
+ sd, CPU_NEWLY_IDLE,
+ &continue_balancing);
+
+ domain_cost = sched_clock_cpu(this_cpu) - t0;
+ if (domain_cost > sd->max_newidle_lb_cost)
+ sd->max_newidle_lb_cost = domain_cost;
+
+ curr_cost += domain_cost;
+ }
+
+ update_next_balance(sd, &next_balance);
+
+ /*
+ * Stop searching for tasks to pull if there are
+ * now runnable tasks on this rq.
+ */
+ if (pulled_task || this_rq->nr_running > 0)
+ break;
+ }
+ rcu_read_unlock();
+
+ raw_spin_lock(&this_rq->lock);
+
+ if (curr_cost > this_rq->max_idle_balance_cost)
+ this_rq->max_idle_balance_cost = curr_cost;
+
+out:
+ /*
+ * While browsing the domains, we released the rq lock, a task could
+ * have been enqueued in the meantime. Since we're not going idle,
+ * pretend we pulled a task.
+ */
+ if (this_rq->cfs.h_nr_running && !pulled_task)
+ pulled_task = 1;
+
+ /* Move the next balance forward */
+ if (time_after(this_rq->next_balance, next_balance))
+ this_rq->next_balance = next_balance;
+
+ /* Is there a task of a high priority class? */
+ if (this_rq->nr_running != this_rq->cfs.h_nr_running)
+ pulled_task = -1;
+
+ if (pulled_task)
+ this_rq->idle_stamp = 0;
+
+ rq_repin_lock(this_rq, rf);
+
+ return pulled_task;
}
-#else
-static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle) { }
-#endif
/*
* run_rebalance_domains is triggered when needed from the scheduler tick.
@@ -9394,14 +9869,18 @@ static __latent_entropy void run_rebalance_domains(struct softirq_action *h)
CPU_IDLE : CPU_NOT_IDLE;
/*
- * If this cpu has a pending nohz_balance_kick, then do the
- * balancing on behalf of the other idle cpus whose ticks are
+ * If this CPU has a pending nohz_balance_kick, then do the
+ * balancing on behalf of the other idle CPUs whose ticks are
* stopped. Do nohz_idle_balance *before* rebalance_domains to
- * give the idle cpus a chance to load balance. Else we may
+ * give the idle CPUs a chance to load balance. Else we may
* load balance only within the local sched_domain hierarchy
* and abort nohz_idle_balance altogether if we pull some load.
*/
- nohz_idle_balance(this_rq, idle);
+ if (nohz_idle_balance(this_rq, idle))
+ return;
+
+ /* normal load balance */
+ update_blocked_averages(this_rq->cpu);
rebalance_domains(this_rq, idle);
}
@@ -9416,10 +9895,8 @@ void trigger_load_balance(struct rq *rq)
if (time_after_eq(jiffies, rq->next_balance))
raise_softirq(SCHED_SOFTIRQ);
-#ifdef CONFIG_NO_HZ_COMMON
- if (nohz_kick_needed(rq))
- nohz_balancer_kick();
-#endif
+
+ nohz_balancer_kick(rq);
}
static void rq_online_fair(struct rq *rq)
@@ -9440,7 +9917,12 @@ static void rq_offline_fair(struct rq *rq)
#endif /* CONFIG_SMP */
/*
- * scheduler tick hitting a task of our scheduling class:
+ * scheduler tick hitting a task of our scheduling class.
+ *
+ * NOTE: This function can be called remotely by the tick offload that
+ * goes along full dynticks. Therefore no local assumption can be made
+ * and everything must be accessed through the @rq and @curr passed in
+ * parameters.
*/
static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
{
@@ -9591,7 +10073,7 @@ static void attach_entity_cfs_rq(struct sched_entity *se)
/* Synchronize entity with its cfs_rq */
update_load_avg(cfs_rq, se, sched_feat(ATTACH_AGE_LOAD) ? 0 : SKIP_AGE_LOAD);
- attach_entity_load_avg(cfs_rq, se);
+ attach_entity_load_avg(cfs_rq, se, 0);
update_tg_load_avg(cfs_rq, false);
propagate_entity_cfs_rq(se);
}
@@ -9993,6 +10475,7 @@ __init void init_sched_fair_class(void)
#ifdef CONFIG_NO_HZ_COMMON
nohz.next_balance = jiffies;
+ nohz.next_blocked = jiffies;
zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT);
#endif
#endif /* SMP */
diff --git a/kernel/sched/features.h b/kernel/sched/features.h
index 9552fd5854bf..85ae8488039c 100644
--- a/kernel/sched/features.h
+++ b/kernel/sched/features.h
@@ -85,3 +85,8 @@ SCHED_FEAT(ATTACH_AGE_LOAD, true)
SCHED_FEAT(WA_IDLE, true)
SCHED_FEAT(WA_WEIGHT, true)
SCHED_FEAT(WA_BIAS, true)
+
+/*
+ * UtilEstimation. Use estimated CPU utilization.
+ */
+SCHED_FEAT(UTIL_EST, true)
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
index 7dae9eb8c042..1a3e9bddd17b 100644
--- a/kernel/sched/idle.c
+++ b/kernel/sched/idle.c
@@ -1,23 +1,14 @@
/*
- * Generic entry point for the idle threads
+ * Generic entry points for the idle threads and
+ * implementation of the idle task scheduling class.
+ *
+ * (NOTE: these are not related to SCHED_IDLE batch scheduled
+ * tasks which are handled in sched/fair.c )
*/
-#include <linux/sched.h>
-#include <linux/sched/idle.h>
-#include <linux/cpu.h>
-#include <linux/cpuidle.h>
-#include <linux/cpuhotplug.h>
-#include <linux/tick.h>
-#include <linux/mm.h>
-#include <linux/stackprotector.h>
-#include <linux/suspend.h>
-#include <linux/livepatch.h>
-
-#include <asm/tlb.h>
+#include "sched.h"
#include <trace/events/power.h>
-#include "sched.h"
-
/* Linker adds these: start and end of __cpuidle functions */
extern char __cpuidle_text_start[], __cpuidle_text_end[];
@@ -46,6 +37,7 @@ void cpu_idle_poll_ctrl(bool enable)
static int __init cpu_idle_poll_setup(char *__unused)
{
cpu_idle_force_poll = 1;
+
return 1;
}
__setup("nohlt", cpu_idle_poll_setup);
@@ -53,6 +45,7 @@ __setup("nohlt", cpu_idle_poll_setup);
static int __init cpu_idle_nopoll_setup(char *__unused)
{
cpu_idle_force_poll = 0;
+
return 1;
}
__setup("hlt", cpu_idle_nopoll_setup);
@@ -64,12 +57,14 @@ static noinline int __cpuidle cpu_idle_poll(void)
trace_cpu_idle_rcuidle(0, smp_processor_id());
local_irq_enable();
stop_critical_timings();
+
while (!tif_need_resched() &&
(cpu_idle_force_poll || tick_check_broadcast_expired()))
cpu_relax();
start_critical_timings();
trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, smp_processor_id());
rcu_idle_exit();
+
return 1;
}
@@ -146,13 +141,15 @@ static void cpuidle_idle_call(void)
}
/*
- * Tell the RCU framework we are entering an idle section,
- * so no more rcu read side critical sections and one more
+ * The RCU framework needs to be told that we are entering an idle
+ * section, so no more rcu read side critical sections and one more
* step to the grace period
*/
- rcu_idle_enter();
if (cpuidle_not_available(drv, dev)) {
+ tick_nohz_idle_stop_tick();
+ rcu_idle_enter();
+
default_idle_call();
goto exit_idle;
}
@@ -169,20 +166,37 @@ static void cpuidle_idle_call(void)
if (idle_should_enter_s2idle() || dev->use_deepest_state) {
if (idle_should_enter_s2idle()) {
+ rcu_idle_enter();
+
entered_state = cpuidle_enter_s2idle(drv, dev);
if (entered_state > 0) {
local_irq_enable();
goto exit_idle;
}
+
+ rcu_idle_exit();
}
+ tick_nohz_idle_stop_tick();
+ rcu_idle_enter();
+
next_state = cpuidle_find_deepest_state(drv, dev);
call_cpuidle(drv, dev, next_state);
} else {
+ bool stop_tick = true;
+
/*
* Ask the cpuidle framework to choose a convenient idle state.
*/
- next_state = cpuidle_select(drv, dev);
+ next_state = cpuidle_select(drv, dev, &stop_tick);
+
+ if (stop_tick)
+ tick_nohz_idle_stop_tick();
+ else
+ tick_nohz_idle_retain_tick();
+
+ rcu_idle_enter();
+
entered_state = call_cpuidle(drv, dev, next_state);
/*
* Give the governor an opportunity to reflect on the outcome
@@ -227,6 +241,7 @@ static void do_idle(void)
rmb();
if (cpu_is_offline(cpu)) {
+ tick_nohz_idle_stop_tick_protected();
cpuhp_report_idle_dead();
arch_cpu_idle_dead();
}
@@ -240,10 +255,12 @@ static void do_idle(void)
* broadcast device expired for us, we don't want to go deep
* idle as we know that the IPI is going to arrive right away.
*/
- if (cpu_idle_force_poll || tick_check_broadcast_expired())
+ if (cpu_idle_force_poll || tick_check_broadcast_expired()) {
+ tick_nohz_idle_restart_tick();
cpu_idle_poll();
- else
+ } else {
cpuidle_idle_call();
+ }
arch_cpu_idle_exit();
}
@@ -332,8 +349,8 @@ void cpu_startup_entry(enum cpuhp_state state)
{
/*
* This #ifdef needs to die, but it's too late in the cycle to
- * make this generic (arm and sh have never invoked the canary
- * init for the non boot cpus!). Will be fixed in 3.11
+ * make this generic (ARM and SH have never invoked the canary
+ * init for the non boot CPUs!). Will be fixed in 3.11
*/
#ifdef CONFIG_X86
/*
@@ -350,3 +367,116 @@ void cpu_startup_entry(enum cpuhp_state state)
while (1)
do_idle();
}
+
+/*
+ * idle-task scheduling class.
+ */
+
+#ifdef CONFIG_SMP
+static int
+select_task_rq_idle(struct task_struct *p, int cpu, int sd_flag, int flags)
+{
+ return task_cpu(p); /* IDLE tasks as never migrated */
+}
+#endif
+
+/*
+ * Idle tasks are unconditionally rescheduled:
+ */
+static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p, int flags)
+{
+ resched_curr(rq);
+}
+
+static struct task_struct *
+pick_next_task_idle(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
+{
+ put_prev_task(rq, prev);
+ update_idle_core(rq);
+ schedstat_inc(rq->sched_goidle);
+
+ return rq->idle;
+}
+
+/*
+ * It is not legal to sleep in the idle task - print a warning
+ * message if some code attempts to do it:
+ */
+static void
+dequeue_task_idle(struct rq *rq, struct task_struct *p, int flags)
+{
+ raw_spin_unlock_irq(&rq->lock);
+ printk(KERN_ERR "bad: scheduling from the idle thread!\n");
+ dump_stack();
+ raw_spin_lock_irq(&rq->lock);
+}
+
+static void put_prev_task_idle(struct rq *rq, struct task_struct *prev)
+{
+}
+
+/*
+ * scheduler tick hitting a task of our scheduling class.
+ *
+ * NOTE: This function can be called remotely by the tick offload that
+ * goes along full dynticks. Therefore no local assumption can be made
+ * and everything must be accessed through the @rq and @curr passed in
+ * parameters.
+ */
+static void task_tick_idle(struct rq *rq, struct task_struct *curr, int queued)
+{
+}
+
+static void set_curr_task_idle(struct rq *rq)
+{
+}
+
+static void switched_to_idle(struct rq *rq, struct task_struct *p)
+{
+ BUG();
+}
+
+static void
+prio_changed_idle(struct rq *rq, struct task_struct *p, int oldprio)
+{
+ BUG();
+}
+
+static unsigned int get_rr_interval_idle(struct rq *rq, struct task_struct *task)
+{
+ return 0;
+}
+
+static void update_curr_idle(struct rq *rq)
+{
+}
+
+/*
+ * Simple, special scheduling class for the per-CPU idle tasks:
+ */
+const struct sched_class idle_sched_class = {
+ /* .next is NULL */
+ /* no enqueue/yield_task for idle tasks */
+
+ /* dequeue is not valid, we print a debug message there: */
+ .dequeue_task = dequeue_task_idle,
+
+ .check_preempt_curr = check_preempt_curr_idle,
+
+ .pick_next_task = pick_next_task_idle,
+ .put_prev_task = put_prev_task_idle,
+
+#ifdef CONFIG_SMP
+ .select_task_rq = select_task_rq_idle,
+ .set_cpus_allowed = set_cpus_allowed_common,
+#endif
+
+ .set_curr_task = set_curr_task_idle,
+ .task_tick = task_tick_idle,
+
+ .get_rr_interval = get_rr_interval_idle,
+
+ .prio_changed = prio_changed_idle,
+ .switched_to = switched_to_idle,
+ .update_curr = update_curr_idle,
+};
diff --git a/kernel/sched/idle_task.c b/kernel/sched/idle_task.c
deleted file mode 100644
index d518664cce4f..000000000000
--- a/kernel/sched/idle_task.c
+++ /dev/null
@@ -1,110 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-#include "sched.h"
-
-/*
- * idle-task scheduling class.
- *
- * (NOTE: these are not related to SCHED_IDLE tasks which are
- * handled in sched/fair.c)
- */
-
-#ifdef CONFIG_SMP
-static int
-select_task_rq_idle(struct task_struct *p, int cpu, int sd_flag, int flags)
-{
- return task_cpu(p); /* IDLE tasks as never migrated */
-}
-#endif /* CONFIG_SMP */
-
-/*
- * Idle tasks are unconditionally rescheduled:
- */
-static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p, int flags)
-{
- resched_curr(rq);
-}
-
-static struct task_struct *
-pick_next_task_idle(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
-{
- put_prev_task(rq, prev);
- update_idle_core(rq);
- schedstat_inc(rq->sched_goidle);
- return rq->idle;
-}
-
-/*
- * It is not legal to sleep in the idle task - print a warning
- * message if some code attempts to do it:
- */
-static void
-dequeue_task_idle(struct rq *rq, struct task_struct *p, int flags)
-{
- raw_spin_unlock_irq(&rq->lock);
- printk(KERN_ERR "bad: scheduling from the idle thread!\n");
- dump_stack();
- raw_spin_lock_irq(&rq->lock);
-}
-
-static void put_prev_task_idle(struct rq *rq, struct task_struct *prev)
-{
- rq_last_tick_reset(rq);
-}
-
-static void task_tick_idle(struct rq *rq, struct task_struct *curr, int queued)
-{
-}
-
-static void set_curr_task_idle(struct rq *rq)
-{
-}
-
-static void switched_to_idle(struct rq *rq, struct task_struct *p)
-{
- BUG();
-}
-
-static void
-prio_changed_idle(struct rq *rq, struct task_struct *p, int oldprio)
-{
- BUG();
-}
-
-static unsigned int get_rr_interval_idle(struct rq *rq, struct task_struct *task)
-{
- return 0;
-}
-
-static void update_curr_idle(struct rq *rq)
-{
-}
-
-/*
- * Simple, special scheduling class for the per-CPU idle tasks:
- */
-const struct sched_class idle_sched_class = {
- /* .next is NULL */
- /* no enqueue/yield_task for idle tasks */
-
- /* dequeue is not valid, we print a debug message there: */
- .dequeue_task = dequeue_task_idle,
-
- .check_preempt_curr = check_preempt_curr_idle,
-
- .pick_next_task = pick_next_task_idle,
- .put_prev_task = put_prev_task_idle,
-
-#ifdef CONFIG_SMP
- .select_task_rq = select_task_rq_idle,
- .set_cpus_allowed = set_cpus_allowed_common,
-#endif
-
- .set_curr_task = set_curr_task_idle,
- .task_tick = task_tick_idle,
-
- .get_rr_interval = get_rr_interval_idle,
-
- .prio_changed = prio_changed_idle,
- .switched_to = switched_to_idle,
- .update_curr = update_curr_idle,
-};
diff --git a/kernel/sched/isolation.c b/kernel/sched/isolation.c
index b71b436f59f2..e6802181900f 100644
--- a/kernel/sched/isolation.c
+++ b/kernel/sched/isolation.c
@@ -3,15 +3,10 @@
* any CPU: unbound workqueues, timers, kthreads and any offloadable work.
*
* Copyright (C) 2017 Red Hat, Inc., Frederic Weisbecker
+ * Copyright (C) 2017-2018 SUSE, Frederic Weisbecker
*
*/
-
-#include <linux/sched/isolation.h>
-#include <linux/tick.h>
-#include <linux/init.h>
-#include <linux/kernel.h>
-#include <linux/static_key.h>
-#include <linux/ctype.h>
+#include "sched.h"
DEFINE_STATIC_KEY_FALSE(housekeeping_overriden);
EXPORT_SYMBOL_GPL(housekeeping_overriden);
@@ -60,6 +55,9 @@ void __init housekeeping_init(void)
static_branch_enable(&housekeeping_overriden);
+ if (housekeeping_flags & HK_FLAG_TICK)
+ sched_tick_offload_init();
+
/* We need at least one CPU to handle housekeeping work */
WARN_ON_ONCE(cpumask_empty(housekeeping_mask));
}
@@ -119,7 +117,7 @@ static int __init housekeeping_nohz_full_setup(char *str)
{
unsigned int flags;
- flags = HK_FLAG_TICK | HK_FLAG_TIMER | HK_FLAG_RCU | HK_FLAG_MISC;
+ flags = HK_FLAG_TICK | HK_FLAG_WQ | HK_FLAG_TIMER | HK_FLAG_RCU | HK_FLAG_MISC;
return housekeeping_setup(str, flags);
}
diff --git a/kernel/sched/loadavg.c b/kernel/sched/loadavg.c
index 89a989e4d758..a171c1258109 100644
--- a/kernel/sched/loadavg.c
+++ b/kernel/sched/loadavg.c
@@ -6,10 +6,6 @@
* figure. Its a silly number but people think its important. We go through
* great pains to make it work on big machines and tickless kernels.
*/
-
-#include <linux/export.h>
-#include <linux/sched/loadavg.h>
-
#include "sched.h"
/*
@@ -32,29 +28,29 @@
* Due to a number of reasons the above turns in the mess below:
*
* - for_each_possible_cpu() is prohibitively expensive on machines with
- * serious number of cpus, therefore we need to take a distributed approach
+ * serious number of CPUs, therefore we need to take a distributed approach
* to calculating nr_active.
*
* \Sum_i x_i(t) = \Sum_i x_i(t) - x_i(t_0) | x_i(t_0) := 0
* = \Sum_i { \Sum_j=1 x_i(t_j) - x_i(t_j-1) }
*
* So assuming nr_active := 0 when we start out -- true per definition, we
- * can simply take per-cpu deltas and fold those into a global accumulate
+ * can simply take per-CPU deltas and fold those into a global accumulate
* to obtain the same result. See calc_load_fold_active().
*
- * Furthermore, in order to avoid synchronizing all per-cpu delta folding
+ * Furthermore, in order to avoid synchronizing all per-CPU delta folding
* across the machine, we assume 10 ticks is sufficient time for every
- * cpu to have completed this task.
+ * CPU to have completed this task.
*
* This places an upper-bound on the IRQ-off latency of the machine. Then
* again, being late doesn't loose the delta, just wrecks the sample.
*
- * - cpu_rq()->nr_uninterruptible isn't accurately tracked per-cpu because
- * this would add another cross-cpu cacheline miss and atomic operation
- * to the wakeup path. Instead we increment on whatever cpu the task ran
- * when it went into uninterruptible state and decrement on whatever cpu
+ * - cpu_rq()->nr_uninterruptible isn't accurately tracked per-CPU because
+ * this would add another cross-CPU cacheline miss and atomic operation
+ * to the wakeup path. Instead we increment on whatever CPU the task ran
+ * when it went into uninterruptible state and decrement on whatever CPU
* did the wakeup. This means that only the sum of nr_uninterruptible over
- * all cpus yields the correct result.
+ * all CPUs yields the correct result.
*
* This covers the NO_HZ=n code, for extra head-aches, see the comment below.
*/
@@ -115,11 +111,11 @@ calc_load(unsigned long load, unsigned long exp, unsigned long active)
* Handle NO_HZ for the global load-average.
*
* Since the above described distributed algorithm to compute the global
- * load-average relies on per-cpu sampling from the tick, it is affected by
+ * load-average relies on per-CPU sampling from the tick, it is affected by
* NO_HZ.
*
* The basic idea is to fold the nr_active delta into a global NO_HZ-delta upon
- * entering NO_HZ state such that we can include this as an 'extra' cpu delta
+ * entering NO_HZ state such that we can include this as an 'extra' CPU delta
* when we read the global state.
*
* Obviously reality has to ruin such a delightfully simple scheme:
@@ -146,9 +142,9 @@ calc_load(unsigned long load, unsigned long exp, unsigned long active)
* busy state.
*
* This is solved by pushing the window forward, and thus skipping the
- * sample, for this cpu (effectively using the NO_HZ-delta for this cpu which
+ * sample, for this CPU (effectively using the NO_HZ-delta for this CPU which
* was in effect at the time the window opened). This also solves the issue
- * of having to deal with a cpu having been in NO_HZ for multiple LOAD_FREQ
+ * of having to deal with a CPU having been in NO_HZ for multiple LOAD_FREQ
* intervals.
*
* When making the ILB scale, we should try to pull this in as well.
@@ -299,7 +295,7 @@ calc_load_n(unsigned long load, unsigned long exp,
}
/*
- * NO_HZ can leave us missing all per-cpu ticks calling
+ * NO_HZ can leave us missing all per-CPU ticks calling
* calc_load_fold_active(), but since a NO_HZ CPU folds its delta into
* calc_load_nohz per calc_load_nohz_start(), all we need to do is fold
* in the pending NO_HZ delta if our NO_HZ period crossed a load cycle boundary.
@@ -363,7 +359,7 @@ void calc_global_load(unsigned long ticks)
return;
/*
- * Fold the 'old' NO_HZ-delta to include all NO_HZ cpus.
+ * Fold the 'old' NO_HZ-delta to include all NO_HZ CPUs.
*/
delta = calc_load_nohz_fold();
if (delta)
diff --git a/kernel/sched/membarrier.c b/kernel/sched/membarrier.c
index 5d0762633639..76e0eaf4654e 100644
--- a/kernel/sched/membarrier.c
+++ b/kernel/sched/membarrier.c
@@ -13,32 +13,25 @@
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*/
-
-#include <linux/syscalls.h>
-#include <linux/membarrier.h>
-#include <linux/tick.h>
-#include <linux/cpumask.h>
-#include <linux/atomic.h>
-
-#include "sched.h" /* for cpu_rq(). */
+#include "sched.h"
/*
* Bitmask made from a "or" of all commands within enum membarrier_cmd,
* except MEMBARRIER_CMD_QUERY.
*/
#ifdef CONFIG_ARCH_HAS_MEMBARRIER_SYNC_CORE
-#define MEMBARRIER_PRIVATE_EXPEDITED_SYNC_CORE_BITMASK \
- (MEMBARRIER_CMD_PRIVATE_EXPEDITED_SYNC_CORE \
+#define MEMBARRIER_PRIVATE_EXPEDITED_SYNC_CORE_BITMASK \
+ (MEMBARRIER_CMD_PRIVATE_EXPEDITED_SYNC_CORE \
| MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_SYNC_CORE)
#else
#define MEMBARRIER_PRIVATE_EXPEDITED_SYNC_CORE_BITMASK 0
#endif
-#define MEMBARRIER_CMD_BITMASK \
- (MEMBARRIER_CMD_GLOBAL | MEMBARRIER_CMD_GLOBAL_EXPEDITED \
- | MEMBARRIER_CMD_REGISTER_GLOBAL_EXPEDITED \
- | MEMBARRIER_CMD_PRIVATE_EXPEDITED \
- | MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED \
+#define MEMBARRIER_CMD_BITMASK \
+ (MEMBARRIER_CMD_GLOBAL | MEMBARRIER_CMD_GLOBAL_EXPEDITED \
+ | MEMBARRIER_CMD_REGISTER_GLOBAL_EXPEDITED \
+ | MEMBARRIER_CMD_PRIVATE_EXPEDITED \
+ | MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED \
| MEMBARRIER_PRIVATE_EXPEDITED_SYNC_CORE_BITMASK)
static void ipi_mb(void *info)
@@ -85,6 +78,7 @@ static int membarrier_global_expedited(void)
*/
if (cpu == raw_smp_processor_id())
continue;
+
rcu_read_lock();
p = task_rcu_dereference(&cpu_rq(cpu)->curr);
if (p && p->mm && (atomic_read(&p->mm->membarrier_state) &
@@ -188,6 +182,7 @@ static int membarrier_private_expedited(int flags)
* rq->curr modification in scheduler.
*/
smp_mb(); /* exit from system call is not a mb */
+
return 0;
}
@@ -219,6 +214,7 @@ static int membarrier_register_global_expedited(void)
}
atomic_or(MEMBARRIER_STATE_GLOBAL_EXPEDITED_READY,
&mm->membarrier_state);
+
return 0;
}
@@ -253,6 +249,7 @@ static int membarrier_register_private_expedited(int flags)
synchronize_sched();
}
atomic_or(state, &mm->membarrier_state);
+
return 0;
}
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index aad49451584e..ef3c4e6f5345 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -3,12 +3,8 @@
* Real-Time Scheduling Class (mapped to the SCHED_FIFO and SCHED_RR
* policies)
*/
-
#include "sched.h"
-#include <linux/slab.h>
-#include <linux/irq_work.h>
-
int sched_rr_timeslice = RR_TIMESLICE;
int sysctl_sched_rr_timeslice = (MSEC_PER_SEC / HZ) * RR_TIMESLICE;
@@ -359,7 +355,7 @@ static DEFINE_PER_CPU(struct callback_head, rt_pull_head);
static void push_rt_tasks(struct rq *);
static void pull_rt_task(struct rq *);
-static inline void queue_push_tasks(struct rq *rq)
+static inline void rt_queue_push_tasks(struct rq *rq)
{
if (!has_pushable_tasks(rq))
return;
@@ -367,7 +363,7 @@ static inline void queue_push_tasks(struct rq *rq)
queue_balance_callback(rq, &per_cpu(rt_push_head, rq->cpu), push_rt_tasks);
}
-static inline void queue_pull_task(struct rq *rq)
+static inline void rt_queue_pull_task(struct rq *rq)
{
queue_balance_callback(rq, &per_cpu(rt_pull_head, rq->cpu), pull_rt_task);
}
@@ -425,7 +421,7 @@ static inline void pull_rt_task(struct rq *this_rq)
{
}
-static inline void queue_push_tasks(struct rq *rq)
+static inline void rt_queue_push_tasks(struct rq *rq)
{
}
#endif /* CONFIG_SMP */
@@ -843,6 +839,8 @@ static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun)
continue;
raw_spin_lock(&rq->lock);
+ update_rq_clock(rq);
+
if (rt_rq->rt_time) {
u64 runtime;
@@ -863,7 +861,7 @@ static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun)
* 'runtime'.
*/
if (rt_rq->rt_nr_running && rq->curr == rq->idle)
- rq_clock_skip_update(rq, false);
+ rq_clock_cancel_skipupdate(rq);
}
if (rt_rq->rt_time || rt_rq->rt_nr_running)
idle = 0;
@@ -961,9 +959,6 @@ static void update_curr_rt(struct rq *rq)
if (unlikely((s64)delta_exec <= 0))
return;
- /* Kick cpufreq (see the comment in kernel/sched/sched.h). */
- cpufreq_update_util(rq, SCHED_CPUFREQ_RT);
-
schedstat_set(curr->se.statistics.exec_max,
max(curr->se.statistics.exec_max, delta_exec));
@@ -1005,6 +1000,9 @@ dequeue_top_rt_rq(struct rt_rq *rt_rq)
sub_nr_running(rq, rt_rq->rt_nr_running);
rt_rq->rt_queued = 0;
+
+ /* Kick cpufreq (see the comment in kernel/sched/sched.h). */
+ cpufreq_update_util(rq, 0);
}
static void
@@ -1021,6 +1019,9 @@ enqueue_top_rt_rq(struct rt_rq *rt_rq)
add_nr_running(rq, rt_rq->rt_nr_running);
rt_rq->rt_queued = 1;
+
+ /* Kick cpufreq (see the comment in kernel/sched/sched.h). */
+ cpufreq_update_util(rq, 0);
}
#if defined CONFIG_SMP
@@ -1453,9 +1454,9 @@ static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p)
return;
/*
- * There appears to be other cpus that can accept
- * current and none to run 'p', so lets reschedule
- * to try and push current away:
+ * There appear to be other CPUs that can accept
+ * the current task but none can run 'p', so lets reschedule
+ * to try and push the current task away:
*/
requeue_task_rt(rq, p, 1);
resched_curr(rq);
@@ -1569,7 +1570,7 @@ pick_next_task_rt(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
/* The running task is never eligible for pushing */
dequeue_pushable_task(rq, p);
- queue_push_tasks(rq);
+ rt_queue_push_tasks(rq);
return p;
}
@@ -1596,12 +1597,13 @@ static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu)
if (!task_running(rq, p) &&
cpumask_test_cpu(cpu, &p->cpus_allowed))
return 1;
+
return 0;
}
/*
* Return the highest pushable rq's task, which is suitable to be executed
- * on the cpu, NULL otherwise
+ * on the CPU, NULL otherwise
*/
static struct task_struct *pick_highest_pushable_task(struct rq *rq, int cpu)
{
@@ -1639,11 +1641,11 @@ static int find_lowest_rq(struct task_struct *task)
return -1; /* No targets found */
/*
- * At this point we have built a mask of cpus representing the
+ * At this point we have built a mask of CPUs representing the
* lowest priority tasks in the system. Now we want to elect
* the best one based on our affinity and topology.
*
- * We prioritize the last cpu that the task executed on since
+ * We prioritize the last CPU that the task executed on since
* it is most likely cache-hot in that location.
*/
if (cpumask_test_cpu(cpu, lowest_mask))
@@ -1651,7 +1653,7 @@ static int find_lowest_rq(struct task_struct *task)
/*
* Otherwise, we consult the sched_domains span maps to figure
- * out which cpu is logically closest to our hot cache data.
+ * out which CPU is logically closest to our hot cache data.
*/
if (!cpumask_test_cpu(this_cpu, lowest_mask))
this_cpu = -1; /* Skip this_cpu opt if not among lowest */
@@ -1692,6 +1694,7 @@ static int find_lowest_rq(struct task_struct *task)
cpu = cpumask_any(lowest_mask);
if (cpu < nr_cpu_ids)
return cpu;
+
return -1;
}
@@ -1827,7 +1830,7 @@ retry:
* The task hasn't migrated, and is still the next
* eligible task, but we failed to find a run-queue
* to push it to. Do not retry in this case, since
- * other cpus will pull from us when ready.
+ * other CPUs will pull from us when ready.
*/
goto out;
}
@@ -1919,7 +1922,7 @@ static int rto_next_cpu(struct root_domain *rd)
* rt_next_cpu() will simply return the first CPU found in
* the rto_mask.
*
- * If rto_next_cpu() is called with rto_cpu is a valid cpu, it
+ * If rto_next_cpu() is called with rto_cpu is a valid CPU, it
* will return the next CPU found in the rto_mask.
*
* If there are no more CPUs left in the rto_mask, then a check is made
@@ -1980,7 +1983,7 @@ static void tell_cpu_to_push(struct rq *rq)
raw_spin_lock(&rq->rd->rto_lock);
/*
- * The rto_cpu is updated under the lock, if it has a valid cpu
+ * The rto_cpu is updated under the lock, if it has a valid CPU
* then the IPI is still running and will continue due to the
* update to loop_next, and nothing needs to be done here.
* Otherwise it is finishing up and an ipi needs to be sent.
@@ -2105,7 +2108,7 @@ static void pull_rt_task(struct rq *this_rq)
/*
* There's a chance that p is higher in priority
- * than what's currently running on its cpu.
+ * than what's currently running on its CPU.
* This is just that p is wakeing up and hasn't
* had a chance to schedule. We only pull
* p if it is lower in priority than the
@@ -2187,7 +2190,7 @@ static void switched_from_rt(struct rq *rq, struct task_struct *p)
if (!task_on_rq_queued(p) || rq->rt.rt_nr_running)
return;
- queue_pull_task(rq);
+ rt_queue_pull_task(rq);
}
void __init init_sched_rt_class(void)
@@ -2218,7 +2221,7 @@ static void switched_to_rt(struct rq *rq, struct task_struct *p)
if (task_on_rq_queued(p) && rq->curr != p) {
#ifdef CONFIG_SMP
if (p->nr_cpus_allowed > 1 && rq->rt.overloaded)
- queue_push_tasks(rq);
+ rt_queue_push_tasks(rq);
#endif /* CONFIG_SMP */
if (p->prio < rq->curr->prio && cpu_online(cpu_of(rq)))
resched_curr(rq);
@@ -2242,7 +2245,7 @@ prio_changed_rt(struct rq *rq, struct task_struct *p, int oldprio)
* may need to pull tasks to this runqueue.
*/
if (oldprio < p->prio)
- queue_pull_task(rq);
+ rt_queue_pull_task(rq);
/*
* If there's a higher priority task waiting to run
@@ -2292,6 +2295,14 @@ static void watchdog(struct rq *rq, struct task_struct *p)
static inline void watchdog(struct rq *rq, struct task_struct *p) { }
#endif
+/*
+ * scheduler tick hitting a task of our scheduling class.
+ *
+ * NOTE: This function can be called remotely by the tick offload that
+ * goes along full dynticks. Therefore no local assumption can be made
+ * and everything must be accessed through the @rq and @curr passed in
+ * parameters.
+ */
static void task_tick_rt(struct rq *rq, struct task_struct *p, int queued)
{
struct sched_rt_entity *rt_se = &p->rt;
@@ -2685,12 +2696,11 @@ int sched_rr_handler(struct ctl_table *table, int write,
msecs_to_jiffies(sysctl_sched_rr_timeslice);
}
mutex_unlock(&mutex);
+
return ret;
}
#ifdef CONFIG_SCHED_DEBUG
-extern void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq);
-
void print_rt_stats(struct seq_file *m, int cpu)
{
rt_rq_iter_t iter;
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index fb5fc458547f..6601baf2361c 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1,39 +1,73 @@
/* SPDX-License-Identifier: GPL-2.0 */
-
+/*
+ * Scheduler internal types and methods:
+ */
#include <linux/sched.h>
+
#include <linux/sched/autogroup.h>
-#include <linux/sched/sysctl.h>
-#include <linux/sched/topology.h>
-#include <linux/sched/rt.h>
-#include <linux/sched/deadline.h>
#include <linux/sched/clock.h>
-#include <linux/sched/wake_q.h>
-#include <linux/sched/signal.h>
-#include <linux/sched/numa_balancing.h>
-#include <linux/sched/mm.h>
+#include <linux/sched/coredump.h>
#include <linux/sched/cpufreq.h>
-#include <linux/sched/stat.h>
-#include <linux/sched/nohz.h>
+#include <linux/sched/cputime.h>
+#include <linux/sched/deadline.h>
#include <linux/sched/debug.h>
#include <linux/sched/hotplug.h>
+#include <linux/sched/idle.h>
+#include <linux/sched/init.h>
+#include <linux/sched/isolation.h>
+#include <linux/sched/jobctl.h>
+#include <linux/sched/loadavg.h>
+#include <linux/sched/mm.h>
+#include <linux/sched/nohz.h>
+#include <linux/sched/numa_balancing.h>
+#include <linux/sched/prio.h>
+#include <linux/sched/rt.h>
+#include <linux/sched/signal.h>
+#include <linux/sched/stat.h>
+#include <linux/sched/sysctl.h>
#include <linux/sched/task.h>
#include <linux/sched/task_stack.h>
-#include <linux/sched/cputime.h>
-#include <linux/sched/init.h>
+#include <linux/sched/topology.h>
+#include <linux/sched/user.h>
+#include <linux/sched/wake_q.h>
+#include <linux/sched/xacct.h>
+
+#include <uapi/linux/sched/types.h>
-#include <linux/u64_stats_sync.h>
-#include <linux/kernel_stat.h>
#include <linux/binfmts.h>
-#include <linux/mutex.h>
-#include <linux/spinlock.h>
+#include <linux/blkdev.h>
+#include <linux/compat.h>
+#include <linux/context_tracking.h>
+#include <linux/cpufreq.h>
+#include <linux/cpuidle.h>
+#include <linux/cpuset.h>
+#include <linux/ctype.h>
+#include <linux/debugfs.h>
+#include <linux/delayacct.h>
+#include <linux/init_task.h>
+#include <linux/kprobes.h>
+#include <linux/kthread.h>
+#include <linux/membarrier.h>
+#include <linux/migrate.h>
+#include <linux/mmu_context.h>
+#include <linux/nmi.h>
+#include <linux/proc_fs.h>
+#include <linux/prefetch.h>
+#include <linux/profile.h>
+#include <linux/rcupdate_wait.h>
+#include <linux/security.h>
+#include <linux/stackprotector.h>
#include <linux/stop_machine.h>
-#include <linux/irq_work.h>
-#include <linux/tick.h>
-#include <linux/slab.h>
-#include <linux/cgroup.h>
+#include <linux/suspend.h>
+#include <linux/swait.h>
+#include <linux/syscalls.h>
+#include <linux/task_work.h>
+#include <linux/tsacct_kern.h>
+
+#include <asm/tlb.h>
#ifdef CONFIG_PARAVIRT
-#include <asm/paravirt.h>
+# include <asm/paravirt.h>
#endif
#include "cpupri.h"
@@ -79,11 +113,11 @@ static inline void cpu_load_update_active(struct rq *this_rq) { }
* and does not change the user-interface for setting shares/weights.
*
* We increase resolution only if we have enough bits to allow this increased
- * resolution (i.e. 64bit). The costs for increasing resolution when 32bit are
- * pretty high and the returns do not justify the increased costs.
+ * resolution (i.e. 64-bit). The costs for increasing resolution when 32-bit
+ * are pretty high and the returns do not justify the increased costs.
*
- * Really only required when CONFIG_FAIR_GROUP_SCHED is also set, but to
- * increase coverage and consistency always enable it on 64bit platforms.
+ * Really only required when CONFIG_FAIR_GROUP_SCHED=y is also set, but to
+ * increase coverage and consistency always enable it on 64-bit platforms.
*/
#ifdef CONFIG_64BIT
# define NICE_0_LOAD_SHIFT (SCHED_FIXEDPOINT_SHIFT + SCHED_FIXEDPOINT_SHIFT)
@@ -111,16 +145,12 @@ static inline void cpu_load_update_active(struct rq *this_rq) { }
* 10 -> just above 1us
* 9 -> just above 0.5us
*/
-#define DL_SCALE (10)
+#define DL_SCALE 10
/*
- * These are the 'tuning knobs' of the scheduler:
+ * Single value that denotes runtime == period, ie unlimited time.
*/
-
-/*
- * single value that denotes runtime == period, ie unlimited time.
- */
-#define RUNTIME_INF ((u64)~0ULL)
+#define RUNTIME_INF ((u64)~0ULL)
static inline int idle_policy(int policy)
{
@@ -235,9 +265,9 @@ void __dl_clear_params(struct task_struct *p);
* control.
*/
struct dl_bandwidth {
- raw_spinlock_t dl_runtime_lock;
- u64 dl_runtime;
- u64 dl_period;
+ raw_spinlock_t dl_runtime_lock;
+ u64 dl_runtime;
+ u64 dl_period;
};
static inline int dl_bandwidth_enabled(void)
@@ -246,8 +276,9 @@ static inline int dl_bandwidth_enabled(void)
}
struct dl_bw {
- raw_spinlock_t lock;
- u64 bw, total_bw;
+ raw_spinlock_t lock;
+ u64 bw;
+ u64 total_bw;
};
static inline void __dl_update(struct dl_bw *dl_b, s64 bw);
@@ -273,20 +304,17 @@ bool __dl_overflow(struct dl_bw *dl_b, int cpus, u64 old_bw, u64 new_bw)
dl_b->bw * cpus < dl_b->total_bw - old_bw + new_bw;
}
-void dl_change_utilization(struct task_struct *p, u64 new_bw);
+extern void dl_change_utilization(struct task_struct *p, u64 new_bw);
extern void init_dl_bw(struct dl_bw *dl_b);
-extern int sched_dl_global_validate(void);
+extern int sched_dl_global_validate(void);
extern void sched_dl_do_global(void);
-extern int sched_dl_overflow(struct task_struct *p, int policy,
- const struct sched_attr *attr);
+extern int sched_dl_overflow(struct task_struct *p, int policy, const struct sched_attr *attr);
extern void __setparam_dl(struct task_struct *p, const struct sched_attr *attr);
extern void __getparam_dl(struct task_struct *p, struct sched_attr *attr);
extern bool __checkparam_dl(const struct sched_attr *attr);
extern bool dl_param_changed(struct task_struct *p, const struct sched_attr *attr);
-extern int dl_task_can_attach(struct task_struct *p,
- const struct cpumask *cs_cpus_allowed);
-extern int dl_cpuset_cpumask_can_shrink(const struct cpumask *cur,
- const struct cpumask *trial);
+extern int dl_task_can_attach(struct task_struct *p, const struct cpumask *cs_cpus_allowed);
+extern int dl_cpuset_cpumask_can_shrink(const struct cpumask *cur, const struct cpumask *trial);
extern bool dl_cpu_busy(unsigned int cpu);
#ifdef CONFIG_CGROUP_SCHED
@@ -300,32 +328,36 @@ extern struct list_head task_groups;
struct cfs_bandwidth {
#ifdef CONFIG_CFS_BANDWIDTH
- raw_spinlock_t lock;
- ktime_t period;
- u64 quota, runtime;
- s64 hierarchical_quota;
- u64 runtime_expires;
-
- int idle, period_active;
- struct hrtimer period_timer, slack_timer;
- struct list_head throttled_cfs_rq;
-
- /* statistics */
- int nr_periods, nr_throttled;
- u64 throttled_time;
+ raw_spinlock_t lock;
+ ktime_t period;
+ u64 quota;
+ u64 runtime;
+ s64 hierarchical_quota;
+ u64 runtime_expires;
+
+ int idle;
+ int period_active;
+ struct hrtimer period_timer;
+ struct hrtimer slack_timer;
+ struct list_head throttled_cfs_rq;
+
+ /* Statistics: */
+ int nr_periods;
+ int nr_throttled;
+ u64 throttled_time;
#endif
};
-/* task group related information */
+/* Task group related information */
struct task_group {
struct cgroup_subsys_state css;
#ifdef CONFIG_FAIR_GROUP_SCHED
- /* schedulable entities of this group on each cpu */
- struct sched_entity **se;
- /* runqueue "owned" by this group on each cpu */
- struct cfs_rq **cfs_rq;
- unsigned long shares;
+ /* schedulable entities of this group on each CPU */
+ struct sched_entity **se;
+ /* runqueue "owned" by this group on each CPU */
+ struct cfs_rq **cfs_rq;
+ unsigned long shares;
#ifdef CONFIG_SMP
/*
@@ -333,29 +365,29 @@ struct task_group {
* it in its own cacheline separated from the fields above which
* will also be accessed at each tick.
*/
- atomic_long_t load_avg ____cacheline_aligned;
+ atomic_long_t load_avg ____cacheline_aligned;
#endif
#endif
#ifdef CONFIG_RT_GROUP_SCHED
- struct sched_rt_entity **rt_se;
- struct rt_rq **rt_rq;
+ struct sched_rt_entity **rt_se;
+ struct rt_rq **rt_rq;
- struct rt_bandwidth rt_bandwidth;
+ struct rt_bandwidth rt_bandwidth;
#endif
- struct rcu_head rcu;
- struct list_head list;
+ struct rcu_head rcu;
+ struct list_head list;
- struct task_group *parent;
- struct list_head siblings;
- struct list_head children;
+ struct task_group *parent;
+ struct list_head siblings;
+ struct list_head children;
#ifdef CONFIG_SCHED_AUTOGROUP
- struct autogroup *autogroup;
+ struct autogroup *autogroup;
#endif
- struct cfs_bandwidth cfs_bandwidth;
+ struct cfs_bandwidth cfs_bandwidth;
};
#ifdef CONFIG_FAIR_GROUP_SCHED
@@ -369,8 +401,8 @@ struct task_group {
* (The default weight is 1024 - so there's no practical
* limitation from this.)
*/
-#define MIN_SHARES (1UL << 1)
-#define MAX_SHARES (1UL << 18)
+#define MIN_SHARES (1UL << 1)
+#define MAX_SHARES (1UL << 18)
#endif
typedef int (*tg_visitor)(struct task_group *, void *);
@@ -443,35 +475,39 @@ struct cfs_bandwidth { };
/* CFS-related fields in a runqueue */
struct cfs_rq {
- struct load_weight load;
- unsigned long runnable_weight;
- unsigned int nr_running, h_nr_running;
+ struct load_weight load;
+ unsigned long runnable_weight;
+ unsigned int nr_running;
+ unsigned int h_nr_running;
- u64 exec_clock;
- u64 min_vruntime;
+ u64 exec_clock;
+ u64 min_vruntime;
#ifndef CONFIG_64BIT
- u64 min_vruntime_copy;
+ u64 min_vruntime_copy;
#endif
- struct rb_root_cached tasks_timeline;
+ struct rb_root_cached tasks_timeline;
/*
* 'curr' points to currently running entity on this cfs_rq.
* It is set to NULL otherwise (i.e when none are currently running).
*/
- struct sched_entity *curr, *next, *last, *skip;
+ struct sched_entity *curr;
+ struct sched_entity *next;
+ struct sched_entity *last;
+ struct sched_entity *skip;
#ifdef CONFIG_SCHED_DEBUG
- unsigned int nr_spread_over;
+ unsigned int nr_spread_over;
#endif
#ifdef CONFIG_SMP
/*
* CFS load tracking
*/
- struct sched_avg avg;
+ struct sched_avg avg;
#ifndef CONFIG_64BIT
- u64 load_last_update_time_copy;
+ u64 load_last_update_time_copy;
#endif
struct {
raw_spinlock_t lock ____cacheline_aligned;
@@ -482,9 +518,9 @@ struct cfs_rq {
} removed;
#ifdef CONFIG_FAIR_GROUP_SCHED
- unsigned long tg_load_avg_contrib;
- long propagate;
- long prop_runnable_sum;
+ unsigned long tg_load_avg_contrib;
+ long propagate;
+ long prop_runnable_sum;
/*
* h_load = weight * f(tg)
@@ -492,36 +528,38 @@ struct cfs_rq {
* Where f(tg) is the recursive weight fraction assigned to
* this group.
*/
- unsigned long h_load;
- u64 last_h_load_update;
- struct sched_entity *h_load_next;
+ unsigned long h_load;
+ u64 last_h_load_update;
+ struct sched_entity *h_load_next;
#endif /* CONFIG_FAIR_GROUP_SCHED */
#endif /* CONFIG_SMP */
#ifdef CONFIG_FAIR_GROUP_SCHED
- struct rq *rq; /* cpu runqueue to which this cfs_rq is attached */
+ struct rq *rq; /* CPU runqueue to which this cfs_rq is attached */
/*
* leaf cfs_rqs are those that hold tasks (lowest schedulable entity in
* a hierarchy). Non-leaf lrqs hold other higher schedulable entities
* (like users, containers etc.)
*
- * leaf_cfs_rq_list ties together list of leaf cfs_rq's in a cpu. This
- * list is used during load balance.
+ * leaf_cfs_rq_list ties together list of leaf cfs_rq's in a CPU.
+ * This list is used during load balance.
*/
- int on_list;
- struct list_head leaf_cfs_rq_list;
- struct task_group *tg; /* group that "owns" this runqueue */
+ int on_list;
+ struct list_head leaf_cfs_rq_list;
+ struct task_group *tg; /* group that "owns" this runqueue */
#ifdef CONFIG_CFS_BANDWIDTH
- int runtime_enabled;
- u64 runtime_expires;
- s64 runtime_remaining;
-
- u64 throttled_clock, throttled_clock_task;
- u64 throttled_clock_task_time;
- int throttled, throttle_count;
- struct list_head throttled_list;
+ int runtime_enabled;
+ u64 runtime_expires;
+ s64 runtime_remaining;
+
+ u64 throttled_clock;
+ u64 throttled_clock_task;
+ u64 throttled_clock_task_time;
+ int throttled;
+ int throttle_count;
+ struct list_head throttled_list;
#endif /* CONFIG_CFS_BANDWIDTH */
#endif /* CONFIG_FAIR_GROUP_SCHED */
};
@@ -538,45 +576,45 @@ static inline int rt_bandwidth_enabled(void)
/* Real-Time classes' related field in a runqueue: */
struct rt_rq {
- struct rt_prio_array active;
- unsigned int rt_nr_running;
- unsigned int rr_nr_running;
+ struct rt_prio_array active;
+ unsigned int rt_nr_running;
+ unsigned int rr_nr_running;
#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
struct {
- int curr; /* highest queued rt task prio */
+ int curr; /* highest queued rt task prio */
#ifdef CONFIG_SMP
- int next; /* next highest */
+ int next; /* next highest */
#endif
} highest_prio;
#endif
#ifdef CONFIG_SMP
- unsigned long rt_nr_migratory;
- unsigned long rt_nr_total;
- int overloaded;
- struct plist_head pushable_tasks;
+ unsigned long rt_nr_migratory;
+ unsigned long rt_nr_total;
+ int overloaded;
+ struct plist_head pushable_tasks;
#endif /* CONFIG_SMP */
- int rt_queued;
+ int rt_queued;
- int rt_throttled;
- u64 rt_time;
- u64 rt_runtime;
+ int rt_throttled;
+ u64 rt_time;
+ u64 rt_runtime;
/* Nests inside the rq lock: */
- raw_spinlock_t rt_runtime_lock;
+ raw_spinlock_t rt_runtime_lock;
#ifdef CONFIG_RT_GROUP_SCHED
- unsigned long rt_nr_boosted;
+ unsigned long rt_nr_boosted;
- struct rq *rq;
- struct task_group *tg;
+ struct rq *rq;
+ struct task_group *tg;
#endif
};
/* Deadline class' related fields in a runqueue */
struct dl_rq {
/* runqueue is an rbtree, ordered by deadline */
- struct rb_root_cached root;
+ struct rb_root_cached root;
- unsigned long dl_nr_running;
+ unsigned long dl_nr_running;
#ifdef CONFIG_SMP
/*
@@ -586,28 +624,28 @@ struct dl_rq {
* should migrate somewhere else.
*/
struct {
- u64 curr;
- u64 next;
+ u64 curr;
+ u64 next;
} earliest_dl;
- unsigned long dl_nr_migratory;
- int overloaded;
+ unsigned long dl_nr_migratory;
+ int overloaded;
/*
* Tasks on this rq that can be pushed away. They are kept in
* an rb-tree, ordered by tasks' deadlines, with caching
* of the leftmost (earliest deadline) element.
*/
- struct rb_root_cached pushable_dl_tasks_root;
+ struct rb_root_cached pushable_dl_tasks_root;
#else
- struct dl_bw dl_bw;
+ struct dl_bw dl_bw;
#endif
/*
* "Active utilization" for this runqueue: increased when a
* task wakes up (becomes TASK_RUNNING) and decreased when a
* task blocks
*/
- u64 running_bw;
+ u64 running_bw;
/*
* Utilization of the tasks "assigned" to this runqueue (including
@@ -618,14 +656,14 @@ struct dl_rq {
* This is needed to compute the "inactive utilization" for the
* runqueue (inactive utilization = this_bw - running_bw).
*/
- u64 this_bw;
- u64 extra_bw;
+ u64 this_bw;
+ u64 extra_bw;
/*
* Inverse of the fraction of CPU utilization that can be reclaimed
* by the GRUB algorithm.
*/
- u64 bw_ratio;
+ u64 bw_ratio;
};
#ifdef CONFIG_SMP
@@ -638,51 +676,51 @@ static inline bool sched_asym_prefer(int a, int b)
/*
* We add the notion of a root-domain which will be used to define per-domain
* variables. Each exclusive cpuset essentially defines an island domain by
- * fully partitioning the member cpus from any other cpuset. Whenever a new
+ * fully partitioning the member CPUs from any other cpuset. Whenever a new
* exclusive cpuset is created, we also create and attach a new root-domain
* object.
*
*/
struct root_domain {
- atomic_t refcount;
- atomic_t rto_count;
- struct rcu_head rcu;
- cpumask_var_t span;
- cpumask_var_t online;
+ atomic_t refcount;
+ atomic_t rto_count;
+ struct rcu_head rcu;
+ cpumask_var_t span;
+ cpumask_var_t online;
/* Indicate more than one runnable task for any CPU */
- bool overload;
+ bool overload;
/*
* The bit corresponding to a CPU gets set here if such CPU has more
* than one runnable -deadline task (as it is below for RT tasks).
*/
- cpumask_var_t dlo_mask;
- atomic_t dlo_count;
- struct dl_bw dl_bw;
- struct cpudl cpudl;
+ cpumask_var_t dlo_mask;
+ atomic_t dlo_count;
+ struct dl_bw dl_bw;
+ struct cpudl cpudl;
#ifdef HAVE_RT_PUSH_IPI
/*
* For IPI pull requests, loop across the rto_mask.
*/
- struct irq_work rto_push_work;
- raw_spinlock_t rto_lock;
+ struct irq_work rto_push_work;
+ raw_spinlock_t rto_lock;
/* These are only updated and read within rto_lock */
- int rto_loop;
- int rto_cpu;
+ int rto_loop;
+ int rto_cpu;
/* These atomics are updated outside of a lock */
- atomic_t rto_loop_next;
- atomic_t rto_loop_start;
+ atomic_t rto_loop_next;
+ atomic_t rto_loop_start;
#endif
/*
* The "RT overload" flag: it gets set if a CPU has more than
* one runnable RT task.
*/
- cpumask_var_t rto_mask;
- struct cpupri cpupri;
+ cpumask_var_t rto_mask;
+ struct cpupri cpupri;
- unsigned long max_cpu_capacity;
+ unsigned long max_cpu_capacity;
};
extern struct root_domain def_root_domain;
@@ -708,41 +746,42 @@ extern void rto_push_irq_work_func(struct irq_work *work);
*/
struct rq {
/* runqueue lock: */
- raw_spinlock_t lock;
+ raw_spinlock_t lock;
/*
* nr_running and cpu_load should be in the same cacheline because
* remote CPUs use both these fields when doing load calculation.
*/
- unsigned int nr_running;
+ unsigned int nr_running;
#ifdef CONFIG_NUMA_BALANCING
- unsigned int nr_numa_running;
- unsigned int nr_preferred_running;
+ unsigned int nr_numa_running;
+ unsigned int nr_preferred_running;
#endif
#define CPU_LOAD_IDX_MAX 5
- unsigned long cpu_load[CPU_LOAD_IDX_MAX];
+ unsigned long cpu_load[CPU_LOAD_IDX_MAX];
#ifdef CONFIG_NO_HZ_COMMON
#ifdef CONFIG_SMP
- unsigned long last_load_update_tick;
+ unsigned long last_load_update_tick;
+ unsigned long last_blocked_load_update_tick;
+ unsigned int has_blocked_load;
#endif /* CONFIG_SMP */
- unsigned long nohz_flags;
+ unsigned int nohz_tick_stopped;
+ atomic_t nohz_flags;
#endif /* CONFIG_NO_HZ_COMMON */
-#ifdef CONFIG_NO_HZ_FULL
- unsigned long last_sched_tick;
-#endif
- /* capture load from *all* tasks on this cpu: */
- struct load_weight load;
- unsigned long nr_load_updates;
- u64 nr_switches;
- struct cfs_rq cfs;
- struct rt_rq rt;
- struct dl_rq dl;
+ /* capture load from *all* tasks on this CPU: */
+ struct load_weight load;
+ unsigned long nr_load_updates;
+ u64 nr_switches;
+
+ struct cfs_rq cfs;
+ struct rt_rq rt;
+ struct dl_rq dl;
#ifdef CONFIG_FAIR_GROUP_SCHED
- /* list of leaf cfs_rq on this cpu: */
- struct list_head leaf_cfs_rq_list;
- struct list_head *tmp_alone_branch;
+ /* list of leaf cfs_rq on this CPU: */
+ struct list_head leaf_cfs_rq_list;
+ struct list_head *tmp_alone_branch;
#endif /* CONFIG_FAIR_GROUP_SCHED */
/*
@@ -751,94 +790,98 @@ struct rq {
* one CPU and if it got migrated afterwards it may decrease
* it on another CPU. Always updated under the runqueue lock:
*/
- unsigned long nr_uninterruptible;
+ unsigned long nr_uninterruptible;
- struct task_struct *curr, *idle, *stop;
- unsigned long next_balance;
- struct mm_struct *prev_mm;
+ struct task_struct *curr;
+ struct task_struct *idle;
+ struct task_struct *stop;
+ unsigned long next_balance;
+ struct mm_struct *prev_mm;
- unsigned int clock_update_flags;
- u64 clock;
- u64 clock_task;
+ unsigned int clock_update_flags;
+ u64 clock;
+ u64 clock_task;
- atomic_t nr_iowait;
+ atomic_t nr_iowait;
#ifdef CONFIG_SMP
- struct root_domain *rd;
- struct sched_domain *sd;
+ struct root_domain *rd;
+ struct sched_domain *sd;
+
+ unsigned long cpu_capacity;
+ unsigned long cpu_capacity_orig;
- unsigned long cpu_capacity;
- unsigned long cpu_capacity_orig;
+ struct callback_head *balance_callback;
- struct callback_head *balance_callback;
+ unsigned char idle_balance;
- unsigned char idle_balance;
/* For active balancing */
- int active_balance;
- int push_cpu;
- struct cpu_stop_work active_balance_work;
- /* cpu of this runqueue: */
- int cpu;
- int online;
+ int active_balance;
+ int push_cpu;
+ struct cpu_stop_work active_balance_work;
+
+ /* CPU of this runqueue: */
+ int cpu;
+ int online;
struct list_head cfs_tasks;
- u64 rt_avg;
- u64 age_stamp;
- u64 idle_stamp;
- u64 avg_idle;
+ u64 rt_avg;
+ u64 age_stamp;
+ u64 idle_stamp;
+ u64 avg_idle;
/* This is used to determine avg_idle's max value */
- u64 max_idle_balance_cost;
+ u64 max_idle_balance_cost;
#endif
#ifdef CONFIG_IRQ_TIME_ACCOUNTING
- u64 prev_irq_time;
+ u64 prev_irq_time;
#endif
#ifdef CONFIG_PARAVIRT
- u64 prev_steal_time;
+ u64 prev_steal_time;
#endif
#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING
- u64 prev_steal_time_rq;
+ u64 prev_steal_time_rq;
#endif
/* calc_load related fields */
- unsigned long calc_load_update;
- long calc_load_active;
+ unsigned long calc_load_update;
+ long calc_load_active;
#ifdef CONFIG_SCHED_HRTICK
#ifdef CONFIG_SMP
- int hrtick_csd_pending;
- call_single_data_t hrtick_csd;
+ int hrtick_csd_pending;
+ call_single_data_t hrtick_csd;
#endif
- struct hrtimer hrtick_timer;
+ struct hrtimer hrtick_timer;
#endif
#ifdef CONFIG_SCHEDSTATS
/* latency stats */
- struct sched_info rq_sched_info;
- unsigned long long rq_cpu_time;
+ struct sched_info rq_sched_info;
+ unsigned long long rq_cpu_time;
/* could above be rq->cfs_rq.exec_clock + rq->rt_rq.rt_runtime ? */
/* sys_sched_yield() stats */
- unsigned int yld_count;
+ unsigned int yld_count;
/* schedule() stats */
- unsigned int sched_count;
- unsigned int sched_goidle;
+ unsigned int sched_count;
+ unsigned int sched_goidle;
/* try_to_wake_up() stats */
- unsigned int ttwu_count;
- unsigned int ttwu_local;
+ unsigned int ttwu_count;
+ unsigned int ttwu_local;
#endif
#ifdef CONFIG_SMP
- struct llist_head wake_list;
+ struct llist_head wake_list;
#endif
#ifdef CONFIG_CPU_IDLE
/* Must be inspected within a rcu lock section */
- struct cpuidle_state *idle_state;
+ struct cpuidle_state *idle_state;
#endif
};
@@ -904,9 +947,9 @@ static inline u64 __rq_clock_broken(struct rq *rq)
* one position though, because the next rq_unpin_lock() will shift it
* back.
*/
-#define RQCF_REQ_SKIP 0x01
-#define RQCF_ACT_SKIP 0x02
-#define RQCF_UPDATED 0x04
+#define RQCF_REQ_SKIP 0x01
+#define RQCF_ACT_SKIP 0x02
+#define RQCF_UPDATED 0x04
static inline void assert_clock_updated(struct rq *rq)
{
@@ -933,13 +976,20 @@ static inline u64 rq_clock_task(struct rq *rq)
return rq->clock_task;
}
-static inline void rq_clock_skip_update(struct rq *rq, bool skip)
+static inline void rq_clock_skip_update(struct rq *rq)
{
lockdep_assert_held(&rq->lock);
- if (skip)
- rq->clock_update_flags |= RQCF_REQ_SKIP;
- else
- rq->clock_update_flags &= ~RQCF_REQ_SKIP;
+ rq->clock_update_flags |= RQCF_REQ_SKIP;
+}
+
+/*
+ * See rt task throttling, which is the only time a skip
+ * request is cancelled.
+ */
+static inline void rq_clock_cancel_skipupdate(struct rq *rq)
+{
+ lockdep_assert_held(&rq->lock);
+ rq->clock_update_flags &= ~RQCF_REQ_SKIP;
}
struct rq_flags {
@@ -1019,6 +1069,12 @@ enum numa_faults_stats {
extern void sched_setnuma(struct task_struct *p, int node);
extern int migrate_task_to(struct task_struct *p, int cpu);
extern int migrate_swap(struct task_struct *, struct task_struct *);
+extern void init_numa_balancing(unsigned long clone_flags, struct task_struct *p);
+#else
+static inline void
+init_numa_balancing(unsigned long clone_flags, struct task_struct *p)
+{
+}
#endif /* CONFIG_NUMA_BALANCING */
#ifdef CONFIG_SMP
@@ -1059,12 +1115,12 @@ extern void sched_ttwu_pending(void);
/**
* highest_flag_domain - Return highest sched_domain containing flag.
- * @cpu: The cpu whose highest level of sched domain is to
+ * @cpu: The CPU whose highest level of sched domain is to
* be returned.
* @flag: The flag to check for the highest sched_domain
- * for the given cpu.
+ * for the given CPU.
*
- * Returns the highest sched_domain of a cpu which contains the given flag.
+ * Returns the highest sched_domain of a CPU which contains the given flag.
*/
static inline struct sched_domain *highest_flag_domain(int cpu, int flag)
{
@@ -1099,30 +1155,30 @@ DECLARE_PER_CPU(struct sched_domain *, sd_numa);
DECLARE_PER_CPU(struct sched_domain *, sd_asym);
struct sched_group_capacity {
- atomic_t ref;
+ atomic_t ref;
/*
* CPU capacity of this group, SCHED_CAPACITY_SCALE being max capacity
* for a single CPU.
*/
- unsigned long capacity;
- unsigned long min_capacity; /* Min per-CPU capacity in group */
- unsigned long next_update;
- int imbalance; /* XXX unrelated to capacity but shared group state */
+ unsigned long capacity;
+ unsigned long min_capacity; /* Min per-CPU capacity in group */
+ unsigned long next_update;
+ int imbalance; /* XXX unrelated to capacity but shared group state */
#ifdef CONFIG_SCHED_DEBUG
- int id;
+ int id;
#endif
- unsigned long cpumask[0]; /* balance mask */
+ unsigned long cpumask[0]; /* Balance mask */
};
struct sched_group {
- struct sched_group *next; /* Must be a circular list */
- atomic_t ref;
+ struct sched_group *next; /* Must be a circular list */
+ atomic_t ref;
- unsigned int group_weight;
+ unsigned int group_weight;
struct sched_group_capacity *sgc;
- int asym_prefer_cpu; /* cpu of highest priority in group */
+ int asym_prefer_cpu; /* CPU of highest priority in group */
/*
* The CPUs this group covers.
@@ -1131,7 +1187,7 @@ struct sched_group {
* by attaching extra space to the end of the structure,
* depending on how many CPUs the kernel has booted up with)
*/
- unsigned long cpumask[0];
+ unsigned long cpumask[0];
};
static inline struct cpumask *sched_group_span(struct sched_group *sg)
@@ -1148,8 +1204,8 @@ static inline struct cpumask *group_balance_mask(struct sched_group *sg)
}
/**
- * group_first_cpu - Returns the first cpu in the cpumask of a sched_group.
- * @group: The group whose first cpu is to be returned.
+ * group_first_cpu - Returns the first CPU in the cpumask of a sched_group.
+ * @group: The group whose first CPU is to be returned.
*/
static inline unsigned int group_first_cpu(struct sched_group *group)
{
@@ -1349,19 +1405,12 @@ static inline int task_on_rq_migrating(struct task_struct *p)
return p->on_rq == TASK_ON_RQ_MIGRATING;
}
-#ifndef prepare_arch_switch
-# define prepare_arch_switch(next) do { } while (0)
-#endif
-#ifndef finish_arch_post_lock_switch
-# define finish_arch_post_lock_switch() do { } while (0)
-#endif
-
/*
* wake flags
*/
-#define WF_SYNC 0x01 /* waker goes to sleep after wakeup */
-#define WF_FORK 0x02 /* child wakeup after fork */
-#define WF_MIGRATED 0x4 /* internal use, task got migrated */
+#define WF_SYNC 0x01 /* Waker goes to sleep after wakeup */
+#define WF_FORK 0x02 /* Child wakeup after fork */
+#define WF_MIGRATED 0x4 /* Internal use, task got migrated */
/*
* To aid in avoiding the subversion of "niceness" due to uneven distribution
@@ -1372,11 +1421,11 @@ static inline int task_on_rq_migrating(struct task_struct *p)
* slice expiry etc.
*/
-#define WEIGHT_IDLEPRIO 3
-#define WMULT_IDLEPRIO 1431655765
+#define WEIGHT_IDLEPRIO 3
+#define WMULT_IDLEPRIO 1431655765
-extern const int sched_prio_to_weight[40];
-extern const u32 sched_prio_to_wmult[40];
+extern const int sched_prio_to_weight[40];
+extern const u32 sched_prio_to_wmult[40];
/*
* {de,en}queue flags:
@@ -1398,9 +1447,9 @@ extern const u32 sched_prio_to_wmult[40];
*/
#define DEQUEUE_SLEEP 0x01
-#define DEQUEUE_SAVE 0x02 /* matches ENQUEUE_RESTORE */
-#define DEQUEUE_MOVE 0x04 /* matches ENQUEUE_MOVE */
-#define DEQUEUE_NOCLOCK 0x08 /* matches ENQUEUE_NOCLOCK */
+#define DEQUEUE_SAVE 0x02 /* Matches ENQUEUE_RESTORE */
+#define DEQUEUE_MOVE 0x04 /* Matches ENQUEUE_MOVE */
+#define DEQUEUE_NOCLOCK 0x08 /* Matches ENQUEUE_NOCLOCK */
#define ENQUEUE_WAKEUP 0x01
#define ENQUEUE_RESTORE 0x02
@@ -1422,10 +1471,10 @@ struct sched_class {
void (*enqueue_task) (struct rq *rq, struct task_struct *p, int flags);
void (*dequeue_task) (struct rq *rq, struct task_struct *p, int flags);
- void (*yield_task) (struct rq *rq);
- bool (*yield_to_task) (struct rq *rq, struct task_struct *p, bool preempt);
+ void (*yield_task) (struct rq *rq);
+ bool (*yield_to_task)(struct rq *rq, struct task_struct *p, bool preempt);
- void (*check_preempt_curr) (struct rq *rq, struct task_struct *p, int flags);
+ void (*check_preempt_curr)(struct rq *rq, struct task_struct *p, int flags);
/*
* It is the responsibility of the pick_next_task() method that will
@@ -1435,16 +1484,16 @@ struct sched_class {
* May return RETRY_TASK when it finds a higher prio class has runnable
* tasks.
*/
- struct task_struct * (*pick_next_task) (struct rq *rq,
- struct task_struct *prev,
- struct rq_flags *rf);
- void (*put_prev_task) (struct rq *rq, struct task_struct *p);
+ struct task_struct * (*pick_next_task)(struct rq *rq,
+ struct task_struct *prev,
+ struct rq_flags *rf);
+ void (*put_prev_task)(struct rq *rq, struct task_struct *p);
#ifdef CONFIG_SMP
int (*select_task_rq)(struct task_struct *p, int task_cpu, int sd_flag, int flags);
void (*migrate_task_rq)(struct task_struct *p);
- void (*task_woken) (struct rq *this_rq, struct task_struct *task);
+ void (*task_woken)(struct rq *this_rq, struct task_struct *task);
void (*set_cpus_allowed)(struct task_struct *p,
const struct cpumask *newmask);
@@ -1453,31 +1502,31 @@ struct sched_class {
void (*rq_offline)(struct rq *rq);
#endif
- void (*set_curr_task) (struct rq *rq);
- void (*task_tick) (struct rq *rq, struct task_struct *p, int queued);
- void (*task_fork) (struct task_struct *p);
- void (*task_dead) (struct task_struct *p);
+ void (*set_curr_task)(struct rq *rq);
+ void (*task_tick)(struct rq *rq, struct task_struct *p, int queued);
+ void (*task_fork)(struct task_struct *p);
+ void (*task_dead)(struct task_struct *p);
/*
* The switched_from() call is allowed to drop rq->lock, therefore we
* cannot assume the switched_from/switched_to pair is serliazed by
* rq->lock. They are however serialized by p->pi_lock.
*/
- void (*switched_from) (struct rq *this_rq, struct task_struct *task);
- void (*switched_to) (struct rq *this_rq, struct task_struct *task);
+ void (*switched_from)(struct rq *this_rq, struct task_struct *task);
+ void (*switched_to) (struct rq *this_rq, struct task_struct *task);
void (*prio_changed) (struct rq *this_rq, struct task_struct *task,
- int oldprio);
+ int oldprio);
- unsigned int (*get_rr_interval) (struct rq *rq,
- struct task_struct *task);
+ unsigned int (*get_rr_interval)(struct rq *rq,
+ struct task_struct *task);
- void (*update_curr) (struct rq *rq);
+ void (*update_curr)(struct rq *rq);
-#define TASK_SET_GROUP 0
-#define TASK_MOVE_GROUP 1
+#define TASK_SET_GROUP 0
+#define TASK_MOVE_GROUP 1
#ifdef CONFIG_FAIR_GROUP_SCHED
- void (*task_change_group) (struct task_struct *p, int type);
+ void (*task_change_group)(struct task_struct *p, int type);
#endif
};
@@ -1526,6 +1575,7 @@ static inline void idle_set_state(struct rq *rq,
static inline struct cpuidle_state *idle_get_state(struct rq *rq)
{
SCHED_WARN_ON(!rcu_read_lock_held());
+
return rq->idle_state;
}
#else
@@ -1564,9 +1614,9 @@ extern void init_dl_task_timer(struct sched_dl_entity *dl_se);
extern void init_dl_inactive_task_timer(struct sched_dl_entity *dl_se);
extern void init_dl_rq_bw_ratio(struct dl_rq *dl_rq);
-#define BW_SHIFT 20
-#define BW_UNIT (1 << BW_SHIFT)
-#define RATIO_SHIFT 8
+#define BW_SHIFT 20
+#define BW_UNIT (1 << BW_SHIFT)
+#define RATIO_SHIFT 8
unsigned long to_ratio(u64 period, u64 runtime);
extern void init_entity_runnable_average(struct sched_entity *se);
@@ -1574,6 +1624,7 @@ extern void post_init_entity_util_avg(struct sched_entity *se);
#ifdef CONFIG_NO_HZ_FULL
extern bool sched_can_stop_tick(struct rq *rq);
+extern int __init sched_tick_offload_init(void);
/*
* Tick may be needed by tasks in the runqueue depending on their policy and
@@ -1598,6 +1649,7 @@ static inline void sched_update_tick_dependency(struct rq *rq)
tick_nohz_dep_set_cpu(cpu, TICK_DEP_BIT_SCHED);
}
#else
+static inline int sched_tick_offload_init(void) { return 0; }
static inline void sched_update_tick_dependency(struct rq *rq) { }
#endif
@@ -1624,13 +1676,6 @@ static inline void sub_nr_running(struct rq *rq, unsigned count)
sched_update_tick_dependency(rq);
}
-static inline void rq_last_tick_reset(struct rq *rq)
-{
-#ifdef CONFIG_NO_HZ_FULL
- rq->last_sched_tick = jiffies;
-#endif
-}
-
extern void update_rq_clock(struct rq *rq);
extern void activate_task(struct rq *rq, struct task_struct *p, int flags);
@@ -1821,8 +1866,8 @@ static inline int _double_lock_balance(struct rq *this_rq, struct rq *busiest)
/*
* Unfair double_lock_balance: Optimizes throughput at the expense of
* latency by eliminating extra atomic operations when the locks are
- * already in proper order on entry. This favors lower cpu-ids and will
- * grant the double lock to lower cpus over higher ids under contention,
+ * already in proper order on entry. This favors lower CPU-ids and will
+ * grant the double lock to lower CPUs over higher ids under contention,
* regardless of entry order into the function.
*/
static inline int _double_lock_balance(struct rq *this_rq, struct rq *busiest)
@@ -1854,7 +1899,7 @@ static inline int _double_lock_balance(struct rq *this_rq, struct rq *busiest)
static inline int double_lock_balance(struct rq *this_rq, struct rq *busiest)
{
if (unlikely(!irqs_disabled())) {
- /* printk() doesn't work good under rq->lock */
+ /* printk() doesn't work well under rq->lock */
raw_spin_unlock(&this_rq->lock);
BUG_ON(1);
}
@@ -1986,8 +2031,9 @@ extern bool sched_debug_enabled;
extern void print_cfs_stats(struct seq_file *m, int cpu);
extern void print_rt_stats(struct seq_file *m, int cpu);
extern void print_dl_stats(struct seq_file *m, int cpu);
-extern void
-print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq);
+extern void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq);
+extern void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq);
+extern void print_dl_rq(struct seq_file *m, int cpu, struct dl_rq *dl_rq);
#ifdef CONFIG_NUMA_BALANCING
extern void
show_numa_stats(struct task_struct *p, struct seq_file *m);
@@ -2005,16 +2051,19 @@ extern void cfs_bandwidth_usage_inc(void);
extern void cfs_bandwidth_usage_dec(void);
#ifdef CONFIG_NO_HZ_COMMON
-enum rq_nohz_flag_bits {
- NOHZ_TICK_STOPPED,
- NOHZ_BALANCE_KICK,
-};
+#define NOHZ_BALANCE_KICK_BIT 0
+#define NOHZ_STATS_KICK_BIT 1
+
+#define NOHZ_BALANCE_KICK BIT(NOHZ_BALANCE_KICK_BIT)
+#define NOHZ_STATS_KICK BIT(NOHZ_STATS_KICK_BIT)
+
+#define NOHZ_KICK_MASK (NOHZ_BALANCE_KICK | NOHZ_STATS_KICK)
#define nohz_flags(cpu) (&cpu_rq(cpu)->nohz_flags)
-extern void nohz_balance_exit_idle(unsigned int cpu);
+extern void nohz_balance_exit_idle(struct rq *rq);
#else
-static inline void nohz_balance_exit_idle(unsigned int cpu) { }
+static inline void nohz_balance_exit_idle(struct rq *rq) { }
#endif
@@ -2113,15 +2162,14 @@ static inline void cpufreq_update_util(struct rq *rq, unsigned int flags) {}
#endif /* CONFIG_CPU_FREQ */
#ifdef arch_scale_freq_capacity
-#ifndef arch_scale_freq_invariant
-#define arch_scale_freq_invariant() (true)
-#endif
-#else /* arch_scale_freq_capacity */
-#define arch_scale_freq_invariant() (false)
+# ifndef arch_scale_freq_invariant
+# define arch_scale_freq_invariant() true
+# endif
+#else
+# define arch_scale_freq_invariant() false
#endif
#ifdef CONFIG_CPU_FREQ_GOV_SCHEDUTIL
-
static inline unsigned long cpu_util_dl(struct rq *rq)
{
return (rq->dl.running_bw * SCHED_CAPACITY_SCALE) >> BW_SHIFT;
@@ -2129,7 +2177,13 @@ static inline unsigned long cpu_util_dl(struct rq *rq)
static inline unsigned long cpu_util_cfs(struct rq *rq)
{
- return rq->cfs.avg.util_avg;
-}
+ unsigned long util = READ_ONCE(rq->cfs.avg.util_avg);
+
+ if (sched_feat(UTIL_EST)) {
+ util = max_t(unsigned long, util,
+ READ_ONCE(rq->cfs.avg.util_est.enqueued));
+ }
+ return util;
+}
#endif
diff --git a/kernel/sched/stats.c b/kernel/sched/stats.c
index 940b1fa1d2ce..750fb3c67eed 100644
--- a/kernel/sched/stats.c
+++ b/kernel/sched/stats.c
@@ -1,14 +1,13 @@
// SPDX-License-Identifier: GPL-2.0
-
-#include <linux/slab.h>
-#include <linux/fs.h>
-#include <linux/seq_file.h>
-#include <linux/proc_fs.h>
-
+/*
+ * /proc/schedstat implementation
+ */
#include "sched.h"
/*
- * bump this up when changing the output format or the meaning of an existing
+ * Current schedstat API version.
+ *
+ * Bump this up when changing the output format or the meaning of an existing
* format, so that tools can adapt (or abort)
*/
#define SCHEDSTAT_VERSION 15
@@ -78,8 +77,8 @@ static int show_schedstat(struct seq_file *seq, void *v)
* This itererator needs some explanation.
* It returns 1 for the header position.
* This means 2 is cpu 0.
- * In a hotplugged system some cpus, including cpu 0, may be missing so we have
- * to use cpumask_* to iterate over the cpus.
+ * In a hotplugged system some CPUs, including cpu 0, may be missing so we have
+ * to use cpumask_* to iterate over the CPUs.
*/
static void *schedstat_start(struct seq_file *file, loff_t *offset)
{
@@ -99,12 +98,14 @@ static void *schedstat_start(struct seq_file *file, loff_t *offset)
if (n < nr_cpu_ids)
return (void *)(unsigned long)(n + 2);
+
return NULL;
}
static void *schedstat_next(struct seq_file *file, void *data, loff_t *offset)
{
(*offset)++;
+
return schedstat_start(file, offset);
}
@@ -119,21 +120,9 @@ static const struct seq_operations schedstat_sops = {
.show = show_schedstat,
};
-static int schedstat_open(struct inode *inode, struct file *file)
-{
- return seq_open(file, &schedstat_sops);
-}
-
-static const struct file_operations proc_schedstat_operations = {
- .open = schedstat_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = seq_release,
-};
-
static int __init proc_schedstat_init(void)
{
- proc_create("schedstat", 0, NULL, &proc_schedstat_operations);
+ proc_create_seq("schedstat", 0, NULL, &schedstat_sops);
return 0;
}
subsys_initcall(proc_schedstat_init);
diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h
index 8e7b58de61e7..8aea199a39b4 100644
--- a/kernel/sched/stats.h
+++ b/kernel/sched/stats.h
@@ -30,35 +30,29 @@ rq_sched_info_dequeued(struct rq *rq, unsigned long long delta)
if (rq)
rq->rq_sched_info.run_delay += delta;
}
-#define schedstat_enabled() static_branch_unlikely(&sched_schedstats)
+#define schedstat_enabled() static_branch_unlikely(&sched_schedstats)
#define __schedstat_inc(var) do { var++; } while (0)
-#define schedstat_inc(var) do { if (schedstat_enabled()) { var++; } } while (0)
+#define schedstat_inc(var) do { if (schedstat_enabled()) { var++; } } while (0)
#define __schedstat_add(var, amt) do { var += (amt); } while (0)
-#define schedstat_add(var, amt) do { if (schedstat_enabled()) { var += (amt); } } while (0)
-#define __schedstat_set(var, val) do { var = (val); } while (0)
-#define schedstat_set(var, val) do { if (schedstat_enabled()) { var = (val); } } while (0)
-#define schedstat_val(var) (var)
-#define schedstat_val_or_zero(var) ((schedstat_enabled()) ? (var) : 0)
-
-#else /* !CONFIG_SCHEDSTATS */
-static inline void
-rq_sched_info_arrive(struct rq *rq, unsigned long long delta)
-{}
-static inline void
-rq_sched_info_dequeued(struct rq *rq, unsigned long long delta)
-{}
-static inline void
-rq_sched_info_depart(struct rq *rq, unsigned long long delta)
-{}
-#define schedstat_enabled() 0
-#define __schedstat_inc(var) do { } while (0)
-#define schedstat_inc(var) do { } while (0)
-#define __schedstat_add(var, amt) do { } while (0)
-#define schedstat_add(var, amt) do { } while (0)
-#define __schedstat_set(var, val) do { } while (0)
-#define schedstat_set(var, val) do { } while (0)
-#define schedstat_val(var) 0
-#define schedstat_val_or_zero(var) 0
+#define schedstat_add(var, amt) do { if (schedstat_enabled()) { var += (amt); } } while (0)
+#define __schedstat_set(var, val) do { var = (val); } while (0)
+#define schedstat_set(var, val) do { if (schedstat_enabled()) { var = (val); } } while (0)
+#define schedstat_val(var) (var)
+#define schedstat_val_or_zero(var) ((schedstat_enabled()) ? (var) : 0)
+
+#else /* !CONFIG_SCHEDSTATS: */
+static inline void rq_sched_info_arrive (struct rq *rq, unsigned long long delta) { }
+static inline void rq_sched_info_dequeued(struct rq *rq, unsigned long long delta) { }
+static inline void rq_sched_info_depart (struct rq *rq, unsigned long long delta) { }
+# define schedstat_enabled() 0
+# define __schedstat_inc(var) do { } while (0)
+# define schedstat_inc(var) do { } while (0)
+# define __schedstat_add(var, amt) do { } while (0)
+# define schedstat_add(var, amt) do { } while (0)
+# define __schedstat_set(var, val) do { } while (0)
+# define schedstat_set(var, val) do { } while (0)
+# define schedstat_val(var) 0
+# define schedstat_val_or_zero(var) 0
#endif /* CONFIG_SCHEDSTATS */
#ifdef CONFIG_SCHED_INFO
@@ -69,9 +63,9 @@ static inline void sched_info_reset_dequeued(struct task_struct *t)
/*
* We are interested in knowing how long it was from the *first* time a
- * task was queued to the time that it finally hit a cpu, we call this routine
- * from dequeue_task() to account for possible rq->clock skew across cpus. The
- * delta taken on each cpu would annul the skew.
+ * task was queued to the time that it finally hit a CPU, we call this routine
+ * from dequeue_task() to account for possible rq->clock skew across CPUs. The
+ * delta taken on each CPU would annul the skew.
*/
static inline void sched_info_dequeued(struct rq *rq, struct task_struct *t)
{
@@ -87,7 +81,7 @@ static inline void sched_info_dequeued(struct rq *rq, struct task_struct *t)
}
/*
- * Called when a task finally hits the cpu. We can now calculate how
+ * Called when a task finally hits the CPU. We can now calculate how
* long it was waiting to run. We also note when it began so that we
* can keep stats on how long its timeslice is.
*/
@@ -112,9 +106,10 @@ static void sched_info_arrive(struct rq *rq, struct task_struct *t)
*/
static inline void sched_info_queued(struct rq *rq, struct task_struct *t)
{
- if (unlikely(sched_info_on()))
+ if (unlikely(sched_info_on())) {
if (!t->sched_info.last_queued)
t->sched_info.last_queued = rq_clock(rq);
+ }
}
/*
@@ -127,8 +122,7 @@ static inline void sched_info_queued(struct rq *rq, struct task_struct *t)
*/
static inline void sched_info_depart(struct rq *rq, struct task_struct *t)
{
- unsigned long long delta = rq_clock(rq) -
- t->sched_info.last_arrival;
+ unsigned long long delta = rq_clock(rq) - t->sched_info.last_arrival;
rq_sched_info_depart(rq, delta);
@@ -142,11 +136,10 @@ static inline void sched_info_depart(struct rq *rq, struct task_struct *t)
* the idle task.) We are only called when prev != next.
*/
static inline void
-__sched_info_switch(struct rq *rq,
- struct task_struct *prev, struct task_struct *next)
+__sched_info_switch(struct rq *rq, struct task_struct *prev, struct task_struct *next)
{
/*
- * prev now departs the cpu. It's not interesting to record
+ * prev now departs the CPU. It's not interesting to record
* stats about how efficient we were at scheduling the idle
* process, however.
*/
@@ -156,18 +149,19 @@ __sched_info_switch(struct rq *rq,
if (next != rq->idle)
sched_info_arrive(rq, next);
}
+
static inline void
-sched_info_switch(struct rq *rq,
- struct task_struct *prev, struct task_struct *next)
+sched_info_switch(struct rq *rq, struct task_struct *prev, struct task_struct *next)
{
if (unlikely(sched_info_on()))
__sched_info_switch(rq, prev, next);
}
-#else
-#define sched_info_queued(rq, t) do { } while (0)
-#define sched_info_reset_dequeued(t) do { } while (0)
-#define sched_info_dequeued(rq, t) do { } while (0)
-#define sched_info_depart(rq, t) do { } while (0)
-#define sched_info_arrive(rq, next) do { } while (0)
-#define sched_info_switch(rq, t, next) do { } while (0)
+
+#else /* !CONFIG_SCHED_INFO: */
+# define sched_info_queued(rq, t) do { } while (0)
+# define sched_info_reset_dequeued(t) do { } while (0)
+# define sched_info_dequeued(rq, t) do { } while (0)
+# define sched_info_depart(rq, t) do { } while (0)
+# define sched_info_arrive(rq, next) do { } while (0)
+# define sched_info_switch(rq, t, next) do { } while (0)
#endif /* CONFIG_SCHED_INFO */
diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c
index 210b1f2146ff..c183b790ca54 100644
--- a/kernel/sched/stop_task.c
+++ b/kernel/sched/stop_task.c
@@ -1,6 +1,4 @@
// SPDX-License-Identifier: GPL-2.0
-#include "sched.h"
-
/*
* stop-task scheduling class.
*
@@ -9,6 +7,7 @@
*
* See kernel/stop_machine.c
*/
+#include "sched.h"
#ifdef CONFIG_SMP
static int
@@ -75,6 +74,14 @@ static void put_prev_task_stop(struct rq *rq, struct task_struct *prev)
cgroup_account_cputime(curr, delta_exec);
}
+/*
+ * scheduler tick hitting a task of our scheduling class.
+ *
+ * NOTE: This function can be called remotely by the tick offload that
+ * goes along full dynticks. Therefore no local assumption can be made
+ * and everything must be accessed through the @rq and @curr passed in
+ * parameters.
+ */
static void task_tick_stop(struct rq *rq, struct task_struct *curr, int queued)
{
}
diff --git a/kernel/sched/swait.c b/kernel/sched/swait.c
index 9ff1555341ed..b6fb2c3b3ff7 100644
--- a/kernel/sched/swait.c
+++ b/kernel/sched/swait.c
@@ -1,6 +1,8 @@
// SPDX-License-Identifier: GPL-2.0
-#include <linux/sched/signal.h>
-#include <linux/swait.h>
+/*
+ * <linux/swait.h> (simple wait queues ) implementation:
+ */
+#include "sched.h"
void __init_swait_queue_head(struct swait_queue_head *q, const char *name,
struct lock_class_key *key)
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index 519b024f4e94..61a1125c1ae4 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -2,10 +2,6 @@
/*
* Scheduler topology setup/handling methods
*/
-#include <linux/sched.h>
-#include <linux/mutex.h>
-#include <linux/sched/isolation.h>
-
#include "sched.h"
DEFINE_MUTEX(sched_domains_mutex);
@@ -41,8 +37,7 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
if (!(sd->flags & SD_LOAD_BALANCE)) {
printk("does not load-balance\n");
if (sd->parent)
- printk(KERN_ERR "ERROR: !SD_LOAD_BALANCE domain"
- " has parent");
+ printk(KERN_ERR "ERROR: !SD_LOAD_BALANCE domain has parent");
return -1;
}
@@ -50,12 +45,10 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
cpumask_pr_args(sched_domain_span(sd)), sd->name);
if (!cpumask_test_cpu(cpu, sched_domain_span(sd))) {
- printk(KERN_ERR "ERROR: domain->span does not contain "
- "CPU%d\n", cpu);
+ printk(KERN_ERR "ERROR: domain->span does not contain CPU%d\n", cpu);
}
if (!cpumask_test_cpu(cpu, sched_group_span(group))) {
- printk(KERN_ERR "ERROR: domain->groups does not contain"
- " CPU%d\n", cpu);
+ printk(KERN_ERR "ERROR: domain->groups does not contain CPU%d\n", cpu);
}
printk(KERN_DEBUG "%*s groups:", level + 1, "");
@@ -115,8 +108,7 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
if (sd->parent &&
!cpumask_subset(groupmask, sched_domain_span(sd->parent)))
- printk(KERN_ERR "ERROR: parent span is not a superset "
- "of domain->span\n");
+ printk(KERN_ERR "ERROR: parent span is not a superset of domain->span\n");
return 0;
}
@@ -595,7 +587,7 @@ int group_balance_cpu(struct sched_group *sg)
* are not.
*
* This leads to a few particularly weird cases where the sched_domain's are
- * not of the same number for each cpu. Consider:
+ * not of the same number for each CPU. Consider:
*
* NUMA-2 0-3 0-3
* groups: {0-2},{1-3} {1-3},{0-2}
@@ -780,7 +772,7 @@ fail:
* ^ ^ ^ ^
* `-' `-'
*
- * The sched_domains are per-cpu and have a two way link (parent & child) and
+ * The sched_domains are per-CPU and have a two way link (parent & child) and
* denote the ever growing mask of CPUs belonging to that level of topology.
*
* Each sched_domain has a circular (double) linked list of sched_group's, each
@@ -1021,6 +1013,7 @@ __visit_domain_allocation_hell(struct s_data *d, const struct cpumask *cpu_map)
d->rd = alloc_rootdomain();
if (!d->rd)
return sa_sd;
+
return sa_rootdomain;
}
@@ -1047,12 +1040,14 @@ static void claim_allocations(int cpu, struct sched_domain *sd)
}
#ifdef CONFIG_NUMA
-static int sched_domains_numa_levels;
enum numa_topology_type sched_numa_topology_type;
-static int *sched_domains_numa_distance;
-int sched_max_numa_distance;
-static struct cpumask ***sched_domains_numa_masks;
-static int sched_domains_curr_level;
+
+static int sched_domains_numa_levels;
+static int sched_domains_curr_level;
+
+int sched_max_numa_distance;
+static int *sched_domains_numa_distance;
+static struct cpumask ***sched_domains_numa_masks;
#endif
/*
@@ -1074,11 +1069,11 @@ static int sched_domains_curr_level;
* SD_ASYM_PACKING - describes SMT quirks
*/
#define TOPOLOGY_SD_FLAGS \
- (SD_SHARE_CPUCAPACITY | \
+ (SD_SHARE_CPUCAPACITY | \
SD_SHARE_PKG_RESOURCES | \
- SD_NUMA | \
- SD_ASYM_PACKING | \
- SD_ASYM_CPUCAPACITY | \
+ SD_NUMA | \
+ SD_ASYM_PACKING | \
+ SD_ASYM_CPUCAPACITY | \
SD_SHARE_POWERDOMAIN)
static struct sched_domain *
@@ -1628,7 +1623,7 @@ static struct sched_domain *build_sched_domain(struct sched_domain_topology_leve
pr_err(" the %s domain not a subset of the %s domain\n",
child->name, sd->name);
#endif
- /* Fixup, ensure @sd has at least @child cpus. */
+ /* Fixup, ensure @sd has at least @child CPUs. */
cpumask_or(sched_domain_span(sd),
sched_domain_span(sd),
sched_domain_span(child));
@@ -1713,13 +1708,14 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att
rcu_read_unlock();
if (rq && sched_debug_enabled) {
- pr_info("span: %*pbl (max cpu_capacity = %lu)\n",
+ pr_info("root domain span: %*pbl (max cpu_capacity = %lu)\n",
cpumask_pr_args(cpu_map), rq->rd->max_cpu_capacity);
}
ret = 0;
error:
__free_domain_allocs(&d, alloc_state, cpu_map);
+
return ret;
}
@@ -1824,6 +1820,7 @@ static int dattrs_equal(struct sched_domain_attr *cur, int idx_cur,
return 1;
tmp = SD_ATTR_INIT;
+
return !memcmp(cur ? (cur + idx_cur) : &tmp,
new ? (new + idx_new) : &tmp,
sizeof(struct sched_domain_attr));
@@ -1929,4 +1926,3 @@ match2:
mutex_unlock(&sched_domains_mutex);
}
-
diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c
index 929ecb7d6b78..928be527477e 100644
--- a/kernel/sched/wait.c
+++ b/kernel/sched/wait.c
@@ -3,14 +3,7 @@
*
* (C) 2004 Nadia Yvette Chambers, Oracle
*/
-#include <linux/init.h>
-#include <linux/export.h>
-#include <linux/sched/signal.h>
-#include <linux/sched/debug.h>
-#include <linux/mm.h>
-#include <linux/wait.h>
-#include <linux/hash.h>
-#include <linux/kthread.h>
+#include "sched.h"
void __init_waitqueue_head(struct wait_queue_head *wq_head, const char *name, struct lock_class_key *key)
{
@@ -107,6 +100,7 @@ static int __wake_up_common(struct wait_queue_head *wq_head, unsigned int mode,
break;
}
}
+
return nr_exclusive;
}
@@ -317,6 +311,7 @@ int do_wait_intr(wait_queue_head_t *wq, wait_queue_entry_t *wait)
spin_unlock(&wq->lock);
schedule();
spin_lock(&wq->lock);
+
return 0;
}
EXPORT_SYMBOL(do_wait_intr);
@@ -333,6 +328,7 @@ int do_wait_intr_irq(wait_queue_head_t *wq, wait_queue_entry_t *wait)
spin_unlock_irq(&wq->lock);
schedule();
spin_lock_irq(&wq->lock);
+
return 0;
}
EXPORT_SYMBOL(do_wait_intr_irq);
@@ -378,6 +374,7 @@ int autoremove_wake_function(struct wait_queue_entry *wq_entry, unsigned mode, i
if (ret)
list_del_init(&wq_entry->entry);
+
return ret;
}
EXPORT_SYMBOL(autoremove_wake_function);
diff --git a/kernel/sched/wait_bit.c b/kernel/sched/wait_bit.c
index 84cb3acd9260..c67c6d24adc2 100644
--- a/kernel/sched/wait_bit.c
+++ b/kernel/sched/wait_bit.c
@@ -1,10 +1,7 @@
/*
* The implementation of the wait_bit*() and related waiting APIs:
*/
-#include <linux/wait_bit.h>
-#include <linux/sched/signal.h>
-#include <linux/sched/debug.h>
-#include <linux/hash.h>
+#include "sched.h"
#define WAIT_TABLE_BITS 8
#define WAIT_TABLE_SIZE (1 << WAIT_TABLE_BITS)
@@ -29,8 +26,8 @@ int wake_bit_function(struct wait_queue_entry *wq_entry, unsigned mode, int sync
wait_bit->key.bit_nr != key->bit_nr ||
test_bit(key->bit_nr, key->flags))
return 0;
- else
- return autoremove_wake_function(wq_entry, mode, sync, key);
+
+ return autoremove_wake_function(wq_entry, mode, sync, key);
}
EXPORT_SYMBOL(wake_bit_function);
@@ -50,7 +47,9 @@ __wait_on_bit(struct wait_queue_head *wq_head, struct wait_bit_queue_entry *wbq_
if (test_bit(wbq_entry->key.bit_nr, wbq_entry->key.flags))
ret = (*action)(&wbq_entry->key, mode);
} while (test_bit(wbq_entry->key.bit_nr, wbq_entry->key.flags) && !ret);
+
finish_wait(wq_head, &wbq_entry->wq_entry);
+
return ret;
}
EXPORT_SYMBOL(__wait_on_bit);
@@ -73,6 +72,7 @@ int __sched out_of_line_wait_on_bit_timeout(
DEFINE_WAIT_BIT(wq_entry, word, bit);
wq_entry.key.timeout = jiffies + timeout;
+
return __wait_on_bit(wq_head, &wq_entry, action, mode);
}
EXPORT_SYMBOL_GPL(out_of_line_wait_on_bit_timeout);
@@ -120,6 +120,7 @@ EXPORT_SYMBOL(out_of_line_wait_on_bit_lock);
void __wake_up_bit(struct wait_queue_head *wq_head, void *word, int bit)
{
struct wait_bit_key key = __WAIT_BIT_KEY_INITIALIZER(word, bit);
+
if (waitqueue_active(wq_head))
__wake_up(wq_head, TASK_NORMAL, 1, &key);
}
@@ -148,108 +149,55 @@ void wake_up_bit(void *word, int bit)
}
EXPORT_SYMBOL(wake_up_bit);
-/*
- * Manipulate the atomic_t address to produce a better bit waitqueue table hash
- * index (we're keying off bit -1, but that would produce a horrible hash
- * value).
- */
-static inline wait_queue_head_t *atomic_t_waitqueue(atomic_t *p)
+wait_queue_head_t *__var_waitqueue(void *p)
{
- if (BITS_PER_LONG == 64) {
- unsigned long q = (unsigned long)p;
- return bit_waitqueue((void *)(q & ~1), q & 1);
- }
- return bit_waitqueue(p, 0);
+ return bit_wait_table + hash_ptr(p, WAIT_TABLE_BITS);
}
+EXPORT_SYMBOL(__var_waitqueue);
-static int wake_atomic_t_function(struct wait_queue_entry *wq_entry, unsigned mode, int sync,
- void *arg)
+static int
+var_wake_function(struct wait_queue_entry *wq_entry, unsigned int mode,
+ int sync, void *arg)
{
struct wait_bit_key *key = arg;
- struct wait_bit_queue_entry *wait_bit = container_of(wq_entry, struct wait_bit_queue_entry, wq_entry);
- atomic_t *val = key->flags;
+ struct wait_bit_queue_entry *wbq_entry =
+ container_of(wq_entry, struct wait_bit_queue_entry, wq_entry);
- if (wait_bit->key.flags != key->flags ||
- wait_bit->key.bit_nr != key->bit_nr ||
- atomic_read(val) != 0)
+ if (wbq_entry->key.flags != key->flags ||
+ wbq_entry->key.bit_nr != key->bit_nr)
return 0;
- return autoremove_wake_function(wq_entry, mode, sync, key);
-}
-/*
- * To allow interruptible waiting and asynchronous (i.e. nonblocking) waiting,
- * the actions of __wait_on_atomic_t() are permitted return codes. Nonzero
- * return codes halt waiting and return.
- */
-static __sched
-int __wait_on_atomic_t(struct wait_queue_head *wq_head, struct wait_bit_queue_entry *wbq_entry,
- wait_atomic_t_action_f action, unsigned int mode)
-{
- atomic_t *val;
- int ret = 0;
-
- do {
- prepare_to_wait(wq_head, &wbq_entry->wq_entry, mode);
- val = wbq_entry->key.flags;
- if (atomic_read(val) == 0)
- break;
- ret = (*action)(val, mode);
- } while (!ret && atomic_read(val) != 0);
- finish_wait(wq_head, &wbq_entry->wq_entry);
- return ret;
+ return autoremove_wake_function(wq_entry, mode, sync, key);
}
-#define DEFINE_WAIT_ATOMIC_T(name, p) \
- struct wait_bit_queue_entry name = { \
- .key = __WAIT_ATOMIC_T_KEY_INITIALIZER(p), \
- .wq_entry = { \
- .private = current, \
- .func = wake_atomic_t_function, \
- .entry = \
- LIST_HEAD_INIT((name).wq_entry.entry), \
- }, \
- }
-
-__sched int out_of_line_wait_on_atomic_t(atomic_t *p,
- wait_atomic_t_action_f action,
- unsigned int mode)
+void init_wait_var_entry(struct wait_bit_queue_entry *wbq_entry, void *var, int flags)
{
- struct wait_queue_head *wq_head = atomic_t_waitqueue(p);
- DEFINE_WAIT_ATOMIC_T(wq_entry, p);
-
- return __wait_on_atomic_t(wq_head, &wq_entry, action, mode);
+ *wbq_entry = (struct wait_bit_queue_entry){
+ .key = {
+ .flags = (var),
+ .bit_nr = -1,
+ },
+ .wq_entry = {
+ .private = current,
+ .func = var_wake_function,
+ .entry = LIST_HEAD_INIT(wbq_entry->wq_entry.entry),
+ },
+ };
}
-EXPORT_SYMBOL(out_of_line_wait_on_atomic_t);
+EXPORT_SYMBOL(init_wait_var_entry);
-__sched int atomic_t_wait(atomic_t *counter, unsigned int mode)
+void wake_up_var(void *var)
{
- schedule();
- if (signal_pending_state(mode, current))
- return -EINTR;
- return 0;
+ __wake_up_bit(__var_waitqueue(var), var, -1);
}
-EXPORT_SYMBOL(atomic_t_wait);
-
-/**
- * wake_up_atomic_t - Wake up a waiter on a atomic_t
- * @p: The atomic_t being waited on, a kernel virtual address
- *
- * Wake up anyone waiting for the atomic_t to go to zero.
- *
- * Abuse the bit-waker function and its waitqueue hash table set (the atomic_t
- * check is done by the waiter's wake function, not the by the waker itself).
- */
-void wake_up_atomic_t(atomic_t *p)
-{
- __wake_up_bit(atomic_t_waitqueue(p), p, WAIT_ATOMIC_T_BIT_NR);
-}
-EXPORT_SYMBOL(wake_up_atomic_t);
+EXPORT_SYMBOL(wake_up_var);
__sched int bit_wait(struct wait_bit_key *word, int mode)
{
schedule();
if (signal_pending_state(mode, current))
return -EINTR;
+
return 0;
}
EXPORT_SYMBOL(bit_wait);
@@ -259,6 +207,7 @@ __sched int bit_wait_io(struct wait_bit_key *word, int mode)
io_schedule();
if (signal_pending_state(mode, current))
return -EINTR;
+
return 0;
}
EXPORT_SYMBOL(bit_wait_io);
@@ -266,11 +215,13 @@ EXPORT_SYMBOL(bit_wait_io);
__sched int bit_wait_timeout(struct wait_bit_key *word, int mode)
{
unsigned long now = READ_ONCE(jiffies);
+
if (time_after_eq(now, word->timeout))
return -EAGAIN;
schedule_timeout(word->timeout - now);
if (signal_pending_state(mode, current))
return -EINTR;
+
return 0;
}
EXPORT_SYMBOL_GPL(bit_wait_timeout);
@@ -278,11 +229,13 @@ EXPORT_SYMBOL_GPL(bit_wait_timeout);
__sched int bit_wait_io_timeout(struct wait_bit_key *word, int mode)
{
unsigned long now = READ_ONCE(jiffies);
+
if (time_after_eq(now, word->timeout))
return -EAGAIN;
io_schedule_timeout(word->timeout - now);
if (signal_pending_state(mode, current))
return -EINTR;
+
return 0;
}
EXPORT_SYMBOL_GPL(bit_wait_io_timeout);
diff --git a/kernel/seccomp.c b/kernel/seccomp.c
index dc77548167ef..fd023ac24e10 100644
--- a/kernel/seccomp.c
+++ b/kernel/seccomp.c
@@ -19,6 +19,8 @@
#include <linux/compat.h>
#include <linux/coredump.h>
#include <linux/kmemleak.h>
+#include <linux/nospec.h>
+#include <linux/prctl.h>
#include <linux/sched.h>
#include <linux/sched/task_stack.h>
#include <linux/seccomp.h>
@@ -227,8 +229,11 @@ static inline bool seccomp_may_assign_mode(unsigned long seccomp_mode)
return true;
}
+void __weak arch_seccomp_spec_mitigate(struct task_struct *task) { }
+
static inline void seccomp_assign_mode(struct task_struct *task,
- unsigned long seccomp_mode)
+ unsigned long seccomp_mode,
+ unsigned long flags)
{
assert_spin_locked(&task->sighand->siglock);
@@ -238,6 +243,9 @@ static inline void seccomp_assign_mode(struct task_struct *task,
* filter) is set.
*/
smp_mb__before_atomic();
+ /* Assume default seccomp processes want spec flaw mitigation. */
+ if ((flags & SECCOMP_FILTER_FLAG_SPEC_ALLOW) == 0)
+ arch_seccomp_spec_mitigate(task);
set_tsk_thread_flag(task, TIF_SECCOMP);
}
@@ -305,7 +313,7 @@ static inline pid_t seccomp_can_sync_threads(void)
* without dropping the locks.
*
*/
-static inline void seccomp_sync_threads(void)
+static inline void seccomp_sync_threads(unsigned long flags)
{
struct task_struct *thread, *caller;
@@ -346,7 +354,8 @@ static inline void seccomp_sync_threads(void)
* allow one thread to transition the other.
*/
if (thread->seccomp.mode == SECCOMP_MODE_DISABLED)
- seccomp_assign_mode(thread, SECCOMP_MODE_FILTER);
+ seccomp_assign_mode(thread, SECCOMP_MODE_FILTER,
+ flags);
}
}
@@ -469,7 +478,7 @@ static long seccomp_attach_filter(unsigned int flags,
/* Now that the new filter is in place, synchronize to all threads. */
if (flags & SECCOMP_FILTER_FLAG_TSYNC)
- seccomp_sync_threads();
+ seccomp_sync_threads(flags);
return 0;
}
@@ -584,18 +593,15 @@ static inline void seccomp_log(unsigned long syscall, long signr, u32 action,
}
/*
- * Force an audit message to be emitted when the action is RET_KILL_*,
- * RET_LOG, or the FILTER_FLAG_LOG bit was set and the action is
- * allowed to be logged by the admin.
+ * Emit an audit message when the action is RET_KILL_*, RET_LOG, or the
+ * FILTER_FLAG_LOG bit was set. The admin has the ability to silence
+ * any action from being logged by removing the action name from the
+ * seccomp_actions_logged sysctl.
*/
- if (log)
- return __audit_seccomp(syscall, signr, action);
+ if (!log)
+ return;
- /*
- * Let the audit subsystem decide if the action should be audited based
- * on whether the current task itself is being audited.
- */
- return audit_seccomp(syscall, signr, action);
+ audit_seccomp(syscall, signr, action);
}
/*
@@ -818,7 +824,7 @@ static long seccomp_set_mode_strict(void)
#ifdef TIF_NOTSC
disable_TSC();
#endif
- seccomp_assign_mode(current, seccomp_mode);
+ seccomp_assign_mode(current, seccomp_mode, 0);
ret = 0;
out:
@@ -876,7 +882,7 @@ static long seccomp_set_mode_filter(unsigned int flags,
/* Do not free the successfully attached filter. */
prepared = NULL;
- seccomp_assign_mode(current, seccomp_mode);
+ seccomp_assign_mode(current, seccomp_mode, flags);
out:
spin_unlock_irq(&current->sighand->siglock);
if (flags & SECCOMP_FILTER_FLAG_TSYNC)
@@ -1135,10 +1141,11 @@ static const struct seccomp_log_name seccomp_log_names[] = {
};
static bool seccomp_names_from_actions_logged(char *names, size_t size,
- u32 actions_logged)
+ u32 actions_logged,
+ const char *sep)
{
const struct seccomp_log_name *cur;
- bool append_space = false;
+ bool append_sep = false;
for (cur = seccomp_log_names; cur->name && size; cur++) {
ssize_t ret;
@@ -1146,15 +1153,15 @@ static bool seccomp_names_from_actions_logged(char *names, size_t size,
if (!(actions_logged & cur->log))
continue;
- if (append_space) {
- ret = strscpy(names, " ", size);
+ if (append_sep) {
+ ret = strscpy(names, sep, size);
if (ret < 0)
return false;
names += ret;
size -= ret;
} else
- append_space = true;
+ append_sep = true;
ret = strscpy(names, cur->name, size);
if (ret < 0)
@@ -1199,46 +1206,102 @@ static bool seccomp_actions_logged_from_names(u32 *actions_logged, char *names)
return true;
}
-static int seccomp_actions_logged_handler(struct ctl_table *ro_table, int write,
- void __user *buffer, size_t *lenp,
- loff_t *ppos)
+static int read_actions_logged(struct ctl_table *ro_table, void __user *buffer,
+ size_t *lenp, loff_t *ppos)
+{
+ char names[sizeof(seccomp_actions_avail)];
+ struct ctl_table table;
+
+ memset(names, 0, sizeof(names));
+
+ if (!seccomp_names_from_actions_logged(names, sizeof(names),
+ seccomp_actions_logged, " "))
+ return -EINVAL;
+
+ table = *ro_table;
+ table.data = names;
+ table.maxlen = sizeof(names);
+ return proc_dostring(&table, 0, buffer, lenp, ppos);
+}
+
+static int write_actions_logged(struct ctl_table *ro_table, void __user *buffer,
+ size_t *lenp, loff_t *ppos, u32 *actions_logged)
{
char names[sizeof(seccomp_actions_avail)];
struct ctl_table table;
int ret;
- if (write && !capable(CAP_SYS_ADMIN))
+ if (!capable(CAP_SYS_ADMIN))
return -EPERM;
memset(names, 0, sizeof(names));
- if (!write) {
- if (!seccomp_names_from_actions_logged(names, sizeof(names),
- seccomp_actions_logged))
- return -EINVAL;
- }
-
table = *ro_table;
table.data = names;
table.maxlen = sizeof(names);
- ret = proc_dostring(&table, write, buffer, lenp, ppos);
+ ret = proc_dostring(&table, 1, buffer, lenp, ppos);
if (ret)
return ret;
- if (write) {
- u32 actions_logged;
+ if (!seccomp_actions_logged_from_names(actions_logged, table.data))
+ return -EINVAL;
- if (!seccomp_actions_logged_from_names(&actions_logged,
- table.data))
- return -EINVAL;
+ if (*actions_logged & SECCOMP_LOG_ALLOW)
+ return -EINVAL;
- if (actions_logged & SECCOMP_LOG_ALLOW)
- return -EINVAL;
+ seccomp_actions_logged = *actions_logged;
+ return 0;
+}
- seccomp_actions_logged = actions_logged;
- }
+static void audit_actions_logged(u32 actions_logged, u32 old_actions_logged,
+ int ret)
+{
+ char names[sizeof(seccomp_actions_avail)];
+ char old_names[sizeof(seccomp_actions_avail)];
+ const char *new = names;
+ const char *old = old_names;
- return 0;
+ if (!audit_enabled)
+ return;
+
+ memset(names, 0, sizeof(names));
+ memset(old_names, 0, sizeof(old_names));
+
+ if (ret)
+ new = "?";
+ else if (!actions_logged)
+ new = "(none)";
+ else if (!seccomp_names_from_actions_logged(names, sizeof(names),
+ actions_logged, ","))
+ new = "?";
+
+ if (!old_actions_logged)
+ old = "(none)";
+ else if (!seccomp_names_from_actions_logged(old_names,
+ sizeof(old_names),
+ old_actions_logged, ","))
+ old = "?";
+
+ return audit_seccomp_actions_logged(new, old, !ret);
+}
+
+static int seccomp_actions_logged_handler(struct ctl_table *ro_table, int write,
+ void __user *buffer, size_t *lenp,
+ loff_t *ppos)
+{
+ int ret;
+
+ if (write) {
+ u32 actions_logged = 0;
+ u32 old_actions_logged = seccomp_actions_logged;
+
+ ret = write_actions_logged(ro_table, buffer, lenp, ppos,
+ &actions_logged);
+ audit_actions_logged(actions_logged, old_actions_logged, ret);
+ } else
+ ret = read_actions_logged(ro_table, buffer, lenp, ppos);
+
+ return ret;
}
static struct ctl_path seccomp_sysctl_path[] = {
diff --git a/kernel/signal.c b/kernel/signal.c
index c6e4c83dc090..0f865d67415d 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -770,7 +770,7 @@ static int check_kill_permission(int sig, struct siginfo *info,
}
}
- return security_task_kill(t, info, sig, 0);
+ return security_task_kill(t, info, sig, NULL);
}
/**
@@ -1361,7 +1361,7 @@ static int kill_as_cred_perm(const struct cred *cred,
/* like kill_pid_info(), but doesn't use uid/euid of "current" */
int kill_pid_info_as_cred(int sig, struct siginfo *info, struct pid *pid,
- const struct cred *cred, u32 secid)
+ const struct cred *cred)
{
int ret = -EINVAL;
struct task_struct *p;
@@ -1380,7 +1380,7 @@ int kill_pid_info_as_cred(int sig, struct siginfo *info, struct pid *pid,
ret = -EPERM;
goto out_unlock;
}
- ret = security_task_kill(p, info, sig, secid);
+ ret = security_task_kill(p, info, sig, cred);
if (ret)
goto out_unlock;
@@ -1539,7 +1539,6 @@ int send_sig_fault(int sig, int code, void __user *addr
return send_sig_info(info.si_signo, &info, t);
}
-#if defined(BUS_MCEERR_AO) && defined(BUS_MCEERR_AR)
int force_sig_mceerr(int code, void __user *addr, short lsb, struct task_struct *t)
{
struct siginfo info;
@@ -1568,9 +1567,7 @@ int send_sig_mceerr(int code, void __user *addr, short lsb, struct task_struct *
return send_sig_info(info.si_signo, &info, t);
}
EXPORT_SYMBOL(send_sig_mceerr);
-#endif
-#ifdef SEGV_BNDERR
int force_sig_bnderr(void __user *addr, void __user *lower, void __user *upper)
{
struct siginfo info;
@@ -1584,7 +1581,6 @@ int force_sig_bnderr(void __user *addr, void __user *lower, void __user *upper)
info.si_upper = upper;
return force_sig_info(info.si_signo, &info, current);
}
-#endif
#ifdef SEGV_PKUERR
int force_sig_pkuerr(void __user *addr, u32 pkey)
@@ -1961,14 +1957,27 @@ static void ptrace_stop(int exit_code, int why, int clear_code, siginfo_t *info)
return;
}
+ set_special_state(TASK_TRACED);
+
/*
* We're committing to trapping. TRACED should be visible before
* TRAPPING is cleared; otherwise, the tracer might fail do_wait().
* Also, transition to TRACED and updates to ->jobctl should be
* atomic with respect to siglock and should be done after the arch
* hook as siglock is released and regrabbed across it.
+ *
+ * TRACER TRACEE
+ *
+ * ptrace_attach()
+ * [L] wait_on_bit(JOBCTL_TRAPPING) [S] set_special_state(TRACED)
+ * do_wait()
+ * set_current_state() smp_wmb();
+ * ptrace_do_wait()
+ * wait_task_stopped()
+ * task_stopped_code()
+ * [L] task_is_traced() [S] task_clear_jobctl_trapping();
*/
- set_current_state(TASK_TRACED);
+ smp_wmb();
current->last_siginfo = info;
current->exit_code = exit_code;
@@ -2176,7 +2185,7 @@ static bool do_signal_stop(int signr)
if (task_participate_group_stop(current))
notify = CLD_STOPPED;
- __set_current_state(TASK_STOPPED);
+ set_special_state(TASK_STOPPED);
spin_unlock_irq(&current->sighand->siglock);
/*
@@ -2824,8 +2833,19 @@ enum siginfo_layout siginfo_layout(int sig, int si_code)
[SIGPOLL] = { NSIGPOLL, SIL_POLL },
[SIGSYS] = { NSIGSYS, SIL_SYS },
};
- if ((sig < ARRAY_SIZE(filter)) && (si_code <= filter[sig].limit))
+ if ((sig < ARRAY_SIZE(filter)) && (si_code <= filter[sig].limit)) {
layout = filter[sig].layout;
+ /* Handle the exceptions */
+ if ((sig == SIGBUS) &&
+ (si_code >= BUS_MCEERR_AR) && (si_code <= BUS_MCEERR_AO))
+ layout = SIL_FAULT_MCEERR;
+ else if ((sig == SIGSEGV) && (si_code == SEGV_BNDERR))
+ layout = SIL_FAULT_BNDERR;
+#ifdef SEGV_PKUERR
+ else if ((sig == SIGSEGV) && (si_code == SEGV_PKUERR))
+ layout = SIL_FAULT_PKUERR;
+#endif
+ }
else if (si_code <= NSIGPOLL)
layout = SIL_POLL;
} else {
@@ -2835,108 +2855,15 @@ enum siginfo_layout siginfo_layout(int sig, int si_code)
layout = SIL_POLL;
else if (si_code < 0)
layout = SIL_RT;
- /* Tests to support buggy kernel ABIs */
-#ifdef TRAP_FIXME
- if ((sig == SIGTRAP) && (si_code == TRAP_FIXME))
- layout = SIL_FAULT;
-#endif
-#ifdef FPE_FIXME
- if ((sig == SIGFPE) && (si_code == FPE_FIXME))
- layout = SIL_FAULT;
-#endif
-#ifdef BUS_FIXME
- if ((sig == SIGBUS) && (si_code == BUS_FIXME))
- layout = SIL_FAULT;
-#endif
}
return layout;
}
int copy_siginfo_to_user(siginfo_t __user *to, const siginfo_t *from)
{
- int err;
-
- if (!access_ok (VERIFY_WRITE, to, sizeof(siginfo_t)))
+ if (copy_to_user(to, from , sizeof(struct siginfo)))
return -EFAULT;
- if (from->si_code < 0)
- return __copy_to_user(to, from, sizeof(siginfo_t))
- ? -EFAULT : 0;
- /*
- * If you change siginfo_t structure, please be sure
- * this code is fixed accordingly.
- * Please remember to update the signalfd_copyinfo() function
- * inside fs/signalfd.c too, in case siginfo_t changes.
- * It should never copy any pad contained in the structure
- * to avoid security leaks, but must copy the generic
- * 3 ints plus the relevant union member.
- */
- err = __put_user(from->si_signo, &to->si_signo);
- err |= __put_user(from->si_errno, &to->si_errno);
- err |= __put_user(from->si_code, &to->si_code);
- switch (siginfo_layout(from->si_signo, from->si_code)) {
- case SIL_KILL:
- err |= __put_user(from->si_pid, &to->si_pid);
- err |= __put_user(from->si_uid, &to->si_uid);
- break;
- case SIL_TIMER:
- /* Unreached SI_TIMER is negative */
- break;
- case SIL_POLL:
- err |= __put_user(from->si_band, &to->si_band);
- err |= __put_user(from->si_fd, &to->si_fd);
- break;
- case SIL_FAULT:
- err |= __put_user(from->si_addr, &to->si_addr);
-#ifdef __ARCH_SI_TRAPNO
- err |= __put_user(from->si_trapno, &to->si_trapno);
-#endif
-#ifdef __ia64__
- err |= __put_user(from->si_imm, &to->si_imm);
- err |= __put_user(from->si_flags, &to->si_flags);
- err |= __put_user(from->si_isr, &to->si_isr);
-#endif
- /*
- * Other callers might not initialize the si_lsb field,
- * so check explicitly for the right codes here.
- */
-#ifdef BUS_MCEERR_AR
- if (from->si_signo == SIGBUS && from->si_code == BUS_MCEERR_AR)
- err |= __put_user(from->si_addr_lsb, &to->si_addr_lsb);
-#endif
-#ifdef BUS_MCEERR_AO
- if (from->si_signo == SIGBUS && from->si_code == BUS_MCEERR_AO)
- err |= __put_user(from->si_addr_lsb, &to->si_addr_lsb);
-#endif
-#ifdef SEGV_BNDERR
- if (from->si_signo == SIGSEGV && from->si_code == SEGV_BNDERR) {
- err |= __put_user(from->si_lower, &to->si_lower);
- err |= __put_user(from->si_upper, &to->si_upper);
- }
-#endif
-#ifdef SEGV_PKUERR
- if (from->si_signo == SIGSEGV && from->si_code == SEGV_PKUERR)
- err |= __put_user(from->si_pkey, &to->si_pkey);
-#endif
- break;
- case SIL_CHLD:
- err |= __put_user(from->si_pid, &to->si_pid);
- err |= __put_user(from->si_uid, &to->si_uid);
- err |= __put_user(from->si_status, &to->si_status);
- err |= __put_user(from->si_utime, &to->si_utime);
- err |= __put_user(from->si_stime, &to->si_stime);
- break;
- case SIL_RT:
- err |= __put_user(from->si_pid, &to->si_pid);
- err |= __put_user(from->si_uid, &to->si_uid);
- err |= __put_user(from->si_ptr, &to->si_ptr);
- break;
- case SIL_SYS:
- err |= __put_user(from->si_call_addr, &to->si_call_addr);
- err |= __put_user(from->si_syscall, &to->si_syscall);
- err |= __put_user(from->si_arch, &to->si_arch);
- break;
- }
- return err;
+ return 0;
}
#ifdef CONFIG_COMPAT
@@ -2975,27 +2902,28 @@ int __copy_siginfo_to_user32(struct compat_siginfo __user *to,
#ifdef __ARCH_SI_TRAPNO
new.si_trapno = from->si_trapno;
#endif
-#ifdef BUS_MCEERR_AR
- if ((from->si_signo == SIGBUS) && (from->si_code == BUS_MCEERR_AR))
- new.si_addr_lsb = from->si_addr_lsb;
-#endif
-#ifdef BUS_MCEERR_AO
- if ((from->si_signo == SIGBUS) && (from->si_code == BUS_MCEERR_AO))
- new.si_addr_lsb = from->si_addr_lsb;
+ break;
+ case SIL_FAULT_MCEERR:
+ new.si_addr = ptr_to_compat(from->si_addr);
+#ifdef __ARCH_SI_TRAPNO
+ new.si_trapno = from->si_trapno;
#endif
-#ifdef SEGV_BNDERR
- if ((from->si_signo == SIGSEGV) &&
- (from->si_code == SEGV_BNDERR)) {
- new.si_lower = ptr_to_compat(from->si_lower);
- new.si_upper = ptr_to_compat(from->si_upper);
- }
+ new.si_addr_lsb = from->si_addr_lsb;
+ break;
+ case SIL_FAULT_BNDERR:
+ new.si_addr = ptr_to_compat(from->si_addr);
+#ifdef __ARCH_SI_TRAPNO
+ new.si_trapno = from->si_trapno;
#endif
-#ifdef SEGV_PKUERR
- if ((from->si_signo == SIGSEGV) &&
- (from->si_code == SEGV_PKUERR))
- new.si_pkey = from->si_pkey;
+ new.si_lower = ptr_to_compat(from->si_lower);
+ new.si_upper = ptr_to_compat(from->si_upper);
+ break;
+ case SIL_FAULT_PKUERR:
+ new.si_addr = ptr_to_compat(from->si_addr);
+#ifdef __ARCH_SI_TRAPNO
+ new.si_trapno = from->si_trapno;
#endif
-
+ new.si_pkey = from->si_pkey;
break;
case SIL_CHLD:
new.si_pid = from->si_pid;
@@ -3061,24 +2989,28 @@ int copy_siginfo_from_user32(struct siginfo *to,
#ifdef __ARCH_SI_TRAPNO
to->si_trapno = from.si_trapno;
#endif
-#ifdef BUS_MCEERR_AR
- if ((from.si_signo == SIGBUS) && (from.si_code == BUS_MCEERR_AR))
- to->si_addr_lsb = from.si_addr_lsb;
-#endif
-#ifdef BUS_MCEER_AO
- if ((from.si_signo == SIGBUS) && (from.si_code == BUS_MCEERR_AO))
- to->si_addr_lsb = from.si_addr_lsb;
+ break;
+ case SIL_FAULT_MCEERR:
+ to->si_addr = compat_ptr(from.si_addr);
+#ifdef __ARCH_SI_TRAPNO
+ to->si_trapno = from.si_trapno;
#endif
-#ifdef SEGV_BNDERR
- if ((from.si_signo == SIGSEGV) && (from.si_code == SEGV_BNDERR)) {
- to->si_lower = compat_ptr(from.si_lower);
- to->si_upper = compat_ptr(from.si_upper);
- }
+ to->si_addr_lsb = from.si_addr_lsb;
+ break;
+ case SIL_FAULT_BNDERR:
+ to->si_addr = compat_ptr(from.si_addr);
+#ifdef __ARCH_SI_TRAPNO
+ to->si_trapno = from.si_trapno;
#endif
-#ifdef SEGV_PKUERR
- if ((from.si_signo == SIGSEGV) && (from.si_code == SEGV_PKUERR))
- to->si_pkey = from.si_pkey;
+ to->si_lower = compat_ptr(from.si_lower);
+ to->si_upper = compat_ptr(from.si_upper);
+ break;
+ case SIL_FAULT_PKUERR:
+ to->si_addr = compat_ptr(from.si_addr);
+#ifdef __ARCH_SI_TRAPNO
+ to->si_trapno = from.si_trapno;
#endif
+ to->si_pkey = from.si_pkey;
break;
case SIL_CHLD:
to->si_pid = from.si_pid;
@@ -3573,9 +3505,8 @@ int __save_altstack(stack_t __user *uss, unsigned long sp)
}
#ifdef CONFIG_COMPAT
-COMPAT_SYSCALL_DEFINE2(sigaltstack,
- const compat_stack_t __user *, uss_ptr,
- compat_stack_t __user *, uoss_ptr)
+static int do_compat_sigaltstack(const compat_stack_t __user *uss_ptr,
+ compat_stack_t __user *uoss_ptr)
{
stack_t uss, uoss;
int ret;
@@ -3602,9 +3533,16 @@ COMPAT_SYSCALL_DEFINE2(sigaltstack,
return ret;
}
+COMPAT_SYSCALL_DEFINE2(sigaltstack,
+ const compat_stack_t __user *, uss_ptr,
+ compat_stack_t __user *, uoss_ptr)
+{
+ return do_compat_sigaltstack(uss_ptr, uoss_ptr);
+}
+
int compat_restore_altstack(const compat_stack_t __user *uss)
{
- int err = compat_sys_sigaltstack(uss, NULL);
+ int err = do_compat_sigaltstack(uss, NULL);
/* squash all but -EFAULT for now */
return err == -EFAULT ? err : 0;
}
@@ -3629,11 +3567,20 @@ int __compat_save_altstack(compat_stack_t __user *uss, unsigned long sp)
/**
* sys_sigpending - examine pending signals
- * @set: where mask of pending signal is returned
+ * @uset: where mask of pending signal is returned
*/
-SYSCALL_DEFINE1(sigpending, old_sigset_t __user *, set)
+SYSCALL_DEFINE1(sigpending, old_sigset_t __user *, uset)
{
- return sys_rt_sigpending((sigset_t __user *)set, sizeof(old_sigset_t));
+ sigset_t set;
+ int err;
+
+ if (sizeof(old_sigset_t) > sizeof(*uset))
+ return -EINVAL;
+
+ err = do_sigpending(&set);
+ if (!err && copy_to_user(uset, &set, sizeof(old_sigset_t)))
+ err = -EFAULT;
+ return err;
}
#ifdef CONFIG_COMPAT
diff --git a/kernel/softirq.c b/kernel/softirq.c
index 24d243ef8e71..de2f57fddc04 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -49,8 +49,8 @@
*/
#ifndef __ARCH_IRQ_STAT
-irq_cpustat_t irq_stat[NR_CPUS] ____cacheline_aligned;
-EXPORT_SYMBOL(irq_stat);
+DEFINE_PER_CPU_ALIGNED(irq_cpustat_t, irq_stat);
+EXPORT_PER_CPU_SYMBOL(irq_stat);
#endif
static struct softirq_action softirq_vec[NR_SOFTIRQS] __cacheline_aligned_in_smp;
@@ -145,8 +145,7 @@ static void __local_bh_enable(unsigned int cnt)
}
/*
- * Special-case - softirqs can safely be enabled in
- * cond_resched_softirq(), or by __do_softirq(),
+ * Special-case - softirqs can safely be enabled by __do_softirq(),
* without processing still-pending softirqs:
*/
void _local_bh_enable(void)
@@ -460,40 +459,46 @@ struct tasklet_head {
static DEFINE_PER_CPU(struct tasklet_head, tasklet_vec);
static DEFINE_PER_CPU(struct tasklet_head, tasklet_hi_vec);
-void __tasklet_schedule(struct tasklet_struct *t)
+static void __tasklet_schedule_common(struct tasklet_struct *t,
+ struct tasklet_head __percpu *headp,
+ unsigned int softirq_nr)
{
+ struct tasklet_head *head;
unsigned long flags;
local_irq_save(flags);
+ head = this_cpu_ptr(headp);
t->next = NULL;
- *__this_cpu_read(tasklet_vec.tail) = t;
- __this_cpu_write(tasklet_vec.tail, &(t->next));
- raise_softirq_irqoff(TASKLET_SOFTIRQ);
+ *head->tail = t;
+ head->tail = &(t->next);
+ raise_softirq_irqoff(softirq_nr);
local_irq_restore(flags);
}
+
+void __tasklet_schedule(struct tasklet_struct *t)
+{
+ __tasklet_schedule_common(t, &tasklet_vec,
+ TASKLET_SOFTIRQ);
+}
EXPORT_SYMBOL(__tasklet_schedule);
void __tasklet_hi_schedule(struct tasklet_struct *t)
{
- unsigned long flags;
-
- local_irq_save(flags);
- t->next = NULL;
- *__this_cpu_read(tasklet_hi_vec.tail) = t;
- __this_cpu_write(tasklet_hi_vec.tail, &(t->next));
- raise_softirq_irqoff(HI_SOFTIRQ);
- local_irq_restore(flags);
+ __tasklet_schedule_common(t, &tasklet_hi_vec,
+ HI_SOFTIRQ);
}
EXPORT_SYMBOL(__tasklet_hi_schedule);
-static __latent_entropy void tasklet_action(struct softirq_action *a)
+static void tasklet_action_common(struct softirq_action *a,
+ struct tasklet_head *tl_head,
+ unsigned int softirq_nr)
{
struct tasklet_struct *list;
local_irq_disable();
- list = __this_cpu_read(tasklet_vec.head);
- __this_cpu_write(tasklet_vec.head, NULL);
- __this_cpu_write(tasklet_vec.tail, this_cpu_ptr(&tasklet_vec.head));
+ list = tl_head->head;
+ tl_head->head = NULL;
+ tl_head->tail = &tl_head->head;
local_irq_enable();
while (list) {
@@ -515,47 +520,21 @@ static __latent_entropy void tasklet_action(struct softirq_action *a)
local_irq_disable();
t->next = NULL;
- *__this_cpu_read(tasklet_vec.tail) = t;
- __this_cpu_write(tasklet_vec.tail, &(t->next));
- __raise_softirq_irqoff(TASKLET_SOFTIRQ);
+ *tl_head->tail = t;
+ tl_head->tail = &t->next;
+ __raise_softirq_irqoff(softirq_nr);
local_irq_enable();
}
}
-static __latent_entropy void tasklet_hi_action(struct softirq_action *a)
+static __latent_entropy void tasklet_action(struct softirq_action *a)
{
- struct tasklet_struct *list;
-
- local_irq_disable();
- list = __this_cpu_read(tasklet_hi_vec.head);
- __this_cpu_write(tasklet_hi_vec.head, NULL);
- __this_cpu_write(tasklet_hi_vec.tail, this_cpu_ptr(&tasklet_hi_vec.head));
- local_irq_enable();
-
- while (list) {
- struct tasklet_struct *t = list;
-
- list = list->next;
-
- if (tasklet_trylock(t)) {
- if (!atomic_read(&t->count)) {
- if (!test_and_clear_bit(TASKLET_STATE_SCHED,
- &t->state))
- BUG();
- t->func(t->data);
- tasklet_unlock(t);
- continue;
- }
- tasklet_unlock(t);
- }
+ tasklet_action_common(a, this_cpu_ptr(&tasklet_vec), TASKLET_SOFTIRQ);
+}
- local_irq_disable();
- t->next = NULL;
- *__this_cpu_read(tasklet_hi_vec.tail) = t;
- __this_cpu_write(tasklet_hi_vec.tail, &(t->next));
- __raise_softirq_irqoff(HI_SOFTIRQ);
- local_irq_enable();
- }
+static __latent_entropy void tasklet_hi_action(struct softirq_action *a)
+{
+ tasklet_action_common(a, this_cpu_ptr(&tasklet_hi_vec), HI_SOFTIRQ);
}
void tasklet_init(struct tasklet_struct *t,
diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c
index b7591261652d..f89014a2c238 100644
--- a/kernel/stop_machine.c
+++ b/kernel/stop_machine.c
@@ -21,6 +21,7 @@
#include <linux/smpboot.h>
#include <linux/atomic.h>
#include <linux/nmi.h>
+#include <linux/sched/wake_q.h>
/*
* Structure to determine completion condition and record errors. May
@@ -36,7 +37,7 @@ struct cpu_stop_done {
struct cpu_stopper {
struct task_struct *thread;
- spinlock_t lock;
+ raw_spinlock_t lock;
bool enabled; /* is this stopper enabled? */
struct list_head works; /* list of pending works */
@@ -65,26 +66,30 @@ static void cpu_stop_signal_done(struct cpu_stop_done *done)
}
static void __cpu_stop_queue_work(struct cpu_stopper *stopper,
- struct cpu_stop_work *work)
+ struct cpu_stop_work *work,
+ struct wake_q_head *wakeq)
{
list_add_tail(&work->list, &stopper->works);
- wake_up_process(stopper->thread);
+ wake_q_add(wakeq, stopper->thread);
}
/* queue @work to @stopper. if offline, @work is completed immediately */
static bool cpu_stop_queue_work(unsigned int cpu, struct cpu_stop_work *work)
{
struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu);
+ DEFINE_WAKE_Q(wakeq);
unsigned long flags;
bool enabled;
- spin_lock_irqsave(&stopper->lock, flags);
+ raw_spin_lock_irqsave(&stopper->lock, flags);
enabled = stopper->enabled;
if (enabled)
- __cpu_stop_queue_work(stopper, work);
+ __cpu_stop_queue_work(stopper, work, &wakeq);
else if (work->done)
cpu_stop_signal_done(work->done);
- spin_unlock_irqrestore(&stopper->lock, flags);
+ raw_spin_unlock_irqrestore(&stopper->lock, flags);
+
+ wake_up_q(&wakeq);
return enabled;
}
@@ -229,10 +234,11 @@ static int cpu_stop_queue_two_works(int cpu1, struct cpu_stop_work *work1,
{
struct cpu_stopper *stopper1 = per_cpu_ptr(&cpu_stopper, cpu1);
struct cpu_stopper *stopper2 = per_cpu_ptr(&cpu_stopper, cpu2);
+ DEFINE_WAKE_Q(wakeq);
int err;
retry:
- spin_lock_irq(&stopper1->lock);
- spin_lock_nested(&stopper2->lock, SINGLE_DEPTH_NESTING);
+ raw_spin_lock_irq(&stopper1->lock);
+ raw_spin_lock_nested(&stopper2->lock, SINGLE_DEPTH_NESTING);
err = -ENOENT;
if (!stopper1->enabled || !stopper2->enabled)
@@ -252,17 +258,20 @@ retry:
goto unlock;
err = 0;
- __cpu_stop_queue_work(stopper1, work1);
- __cpu_stop_queue_work(stopper2, work2);
+ __cpu_stop_queue_work(stopper1, work1, &wakeq);
+ __cpu_stop_queue_work(stopper2, work2, &wakeq);
unlock:
- spin_unlock(&stopper2->lock);
- spin_unlock_irq(&stopper1->lock);
+ raw_spin_unlock(&stopper2->lock);
+ raw_spin_unlock_irq(&stopper1->lock);
if (unlikely(err == -EDEADLK)) {
while (stop_cpus_in_progress)
cpu_relax();
goto retry;
}
+
+ wake_up_q(&wakeq);
+
return err;
}
/**
@@ -448,9 +457,9 @@ static int cpu_stop_should_run(unsigned int cpu)
unsigned long flags;
int run;
- spin_lock_irqsave(&stopper->lock, flags);
+ raw_spin_lock_irqsave(&stopper->lock, flags);
run = !list_empty(&stopper->works);
- spin_unlock_irqrestore(&stopper->lock, flags);
+ raw_spin_unlock_irqrestore(&stopper->lock, flags);
return run;
}
@@ -461,13 +470,13 @@ static void cpu_stopper_thread(unsigned int cpu)
repeat:
work = NULL;
- spin_lock_irq(&stopper->lock);
+ raw_spin_lock_irq(&stopper->lock);
if (!list_empty(&stopper->works)) {
work = list_first_entry(&stopper->works,
struct cpu_stop_work, list);
list_del_init(&work->list);
}
- spin_unlock_irq(&stopper->lock);
+ raw_spin_unlock_irq(&stopper->lock);
if (work) {
cpu_stop_fn_t fn = work->fn;
@@ -541,7 +550,7 @@ static int __init cpu_stop_init(void)
for_each_possible_cpu(cpu) {
struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu);
- spin_lock_init(&stopper->lock);
+ raw_spin_lock_init(&stopper->lock);
INIT_LIST_HEAD(&stopper->works);
}
diff --git a/kernel/sys.c b/kernel/sys.c
index f2289de20e19..d1b2b8d934bb 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -61,6 +61,8 @@
#include <linux/uidgid.h>
#include <linux/cred.h>
+#include <linux/nospec.h>
+
#include <linux/kmsg_dump.h>
/* Move somewhere else to avoid recompiling? */
#include <generated/utsrelease.h>
@@ -69,6 +71,11 @@
#include <asm/io.h>
#include <asm/unistd.h>
+/* Hardening for Spectre-v1 */
+#include <linux/nospec.h>
+
+#include "uid16.h"
+
#ifndef SET_UNALIGN_CTL
# define SET_UNALIGN_CTL(a, b) (-EINVAL)
#endif
@@ -340,7 +347,7 @@ out_unlock:
* operations (as far as semantic preservation is concerned).
*/
#ifdef CONFIG_MULTIUSER
-SYSCALL_DEFINE2(setregid, gid_t, rgid, gid_t, egid)
+long __sys_setregid(gid_t rgid, gid_t egid)
{
struct user_namespace *ns = current_user_ns();
const struct cred *old;
@@ -392,12 +399,17 @@ error:
return retval;
}
+SYSCALL_DEFINE2(setregid, gid_t, rgid, gid_t, egid)
+{
+ return __sys_setregid(rgid, egid);
+}
+
/*
* setgid() is implemented like SysV w/ SAVED_IDS
*
* SMP: Same implicit races as above.
*/
-SYSCALL_DEFINE1(setgid, gid_t, gid)
+long __sys_setgid(gid_t gid)
{
struct user_namespace *ns = current_user_ns();
const struct cred *old;
@@ -429,6 +441,11 @@ error:
return retval;
}
+SYSCALL_DEFINE1(setgid, gid_t, gid)
+{
+ return __sys_setgid(gid);
+}
+
/*
* change the user struct in a credentials set to match the new UID
*/
@@ -473,7 +490,7 @@ static int set_user(struct cred *new)
* 100% compatible with BSD. A program which uses just setuid() will be
* 100% compatible with POSIX with saved IDs.
*/
-SYSCALL_DEFINE2(setreuid, uid_t, ruid, uid_t, euid)
+long __sys_setreuid(uid_t ruid, uid_t euid)
{
struct user_namespace *ns = current_user_ns();
const struct cred *old;
@@ -533,6 +550,11 @@ error:
return retval;
}
+SYSCALL_DEFINE2(setreuid, uid_t, ruid, uid_t, euid)
+{
+ return __sys_setreuid(ruid, euid);
+}
+
/*
* setuid() is implemented like SysV with SAVED_IDS
*
@@ -544,7 +566,7 @@ error:
* will allow a root program to temporarily drop privileges and be able to
* regain them by swapping the real and effective uid.
*/
-SYSCALL_DEFINE1(setuid, uid_t, uid)
+long __sys_setuid(uid_t uid)
{
struct user_namespace *ns = current_user_ns();
const struct cred *old;
@@ -586,12 +608,17 @@ error:
return retval;
}
+SYSCALL_DEFINE1(setuid, uid_t, uid)
+{
+ return __sys_setuid(uid);
+}
+
/*
* This function implements a generic ability to update ruid, euid,
* and suid. This allows you to implement the 4.4 compatible seteuid().
*/
-SYSCALL_DEFINE3(setresuid, uid_t, ruid, uid_t, euid, uid_t, suid)
+long __sys_setresuid(uid_t ruid, uid_t euid, uid_t suid)
{
struct user_namespace *ns = current_user_ns();
const struct cred *old;
@@ -656,6 +683,11 @@ error:
return retval;
}
+SYSCALL_DEFINE3(setresuid, uid_t, ruid, uid_t, euid, uid_t, suid)
+{
+ return __sys_setresuid(ruid, euid, suid);
+}
+
SYSCALL_DEFINE3(getresuid, uid_t __user *, ruidp, uid_t __user *, euidp, uid_t __user *, suidp)
{
const struct cred *cred = current_cred();
@@ -678,7 +710,7 @@ SYSCALL_DEFINE3(getresuid, uid_t __user *, ruidp, uid_t __user *, euidp, uid_t _
/*
* Same as above, but for rgid, egid, sgid.
*/
-SYSCALL_DEFINE3(setresgid, gid_t, rgid, gid_t, egid, gid_t, sgid)
+long __sys_setresgid(gid_t rgid, gid_t egid, gid_t sgid)
{
struct user_namespace *ns = current_user_ns();
const struct cred *old;
@@ -730,6 +762,11 @@ error:
return retval;
}
+SYSCALL_DEFINE3(setresgid, gid_t, rgid, gid_t, egid, gid_t, sgid)
+{
+ return __sys_setresgid(rgid, egid, sgid);
+}
+
SYSCALL_DEFINE3(getresgid, gid_t __user *, rgidp, gid_t __user *, egidp, gid_t __user *, sgidp)
{
const struct cred *cred = current_cred();
@@ -757,7 +794,7 @@ SYSCALL_DEFINE3(getresgid, gid_t __user *, rgidp, gid_t __user *, egidp, gid_t _
* whatever uid it wants to). It normally shadows "euid", except when
* explicitly set by setfsuid() or for access..
*/
-SYSCALL_DEFINE1(setfsuid, uid_t, uid)
+long __sys_setfsuid(uid_t uid)
{
const struct cred *old;
struct cred *new;
@@ -793,10 +830,15 @@ change_okay:
return old_fsuid;
}
+SYSCALL_DEFINE1(setfsuid, uid_t, uid)
+{
+ return __sys_setfsuid(uid);
+}
+
/*
* Samma på svenska..
*/
-SYSCALL_DEFINE1(setfsgid, gid_t, gid)
+long __sys_setfsgid(gid_t gid)
{
const struct cred *old;
struct cred *new;
@@ -830,6 +872,11 @@ change_okay:
commit_creds(new);
return old_fsgid;
}
+
+SYSCALL_DEFINE1(setfsgid, gid_t, gid)
+{
+ return __sys_setfsgid(gid);
+}
#endif /* CONFIG_MULTIUSER */
/**
@@ -1027,7 +1074,7 @@ out:
return err;
}
-SYSCALL_DEFINE1(getpgid, pid_t, pid)
+static int do_getpgid(pid_t pid)
{
struct task_struct *p;
struct pid *grp;
@@ -1055,11 +1102,16 @@ out:
return retval;
}
+SYSCALL_DEFINE1(getpgid, pid_t, pid)
+{
+ return do_getpgid(pid);
+}
+
#ifdef __ARCH_WANT_SYS_GETPGRP
SYSCALL_DEFINE0(getpgrp)
{
- return sys_getpgid(0);
+ return do_getpgid(0);
}
#endif
@@ -1103,7 +1155,7 @@ static void set_special_pids(struct pid *pid)
change_pid(curr, PIDTYPE_PGID, pid);
}
-SYSCALL_DEFINE0(setsid)
+int ksys_setsid(void)
{
struct task_struct *group_leader = current->group_leader;
struct pid *sid = task_pid(group_leader);
@@ -1136,6 +1188,11 @@ out:
return err;
}
+SYSCALL_DEFINE0(setsid)
+{
+ return ksys_setsid();
+}
+
DECLARE_RWSEM(uts_sem);
#ifdef COMPAT_UTS_MACHINE
@@ -1399,6 +1456,7 @@ SYSCALL_DEFINE2(old_getrlimit, unsigned int, resource,
if (resource >= RLIM_NLIMITS)
return -EINVAL;
+ resource = array_index_nospec(resource, RLIM_NLIMITS);
task_lock(current->group_leader);
x = current->signal->rlim[resource];
task_unlock(current->group_leader);
@@ -1418,6 +1476,7 @@ COMPAT_SYSCALL_DEFINE2(old_getrlimit, unsigned int, resource,
if (resource >= RLIM_NLIMITS)
return -EINVAL;
+ resource = array_index_nospec(resource, RLIM_NLIMITS);
task_lock(current->group_leader);
r = current->signal->rlim[resource];
task_unlock(current->group_leader);
@@ -2190,6 +2249,17 @@ static int propagate_has_child_subreaper(struct task_struct *p, void *data)
return 1;
}
+int __weak arch_prctl_spec_ctrl_get(struct task_struct *t, unsigned long which)
+{
+ return -EINVAL;
+}
+
+int __weak arch_prctl_spec_ctrl_set(struct task_struct *t, unsigned long which,
+ unsigned long ctrl)
+{
+ return -EINVAL;
+}
+
SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
unsigned long, arg4, unsigned long, arg5)
{
@@ -2398,6 +2468,16 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
case PR_SVE_GET_VL:
error = SVE_GET_VL();
break;
+ case PR_GET_SPECULATION_CTRL:
+ if (arg3 || arg4 || arg5)
+ return -EINVAL;
+ error = arch_prctl_spec_ctrl_get(me, arg2);
+ break;
+ case PR_SET_SPECULATION_CTRL:
+ if (arg4 || arg5)
+ return -EINVAL;
+ error = arch_prctl_spec_ctrl_set(me, arg2, arg3);
+ break;
default:
error = -EINVAL;
break;
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index b5189762d275..183169c2a75b 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -5,6 +5,11 @@
#include <asm/unistd.h>
+#ifdef CONFIG_ARCH_HAS_SYSCALL_WRAPPER
+/* Architectures may override COND_SYSCALL and COND_SYSCALL_COMPAT */
+#include <asm/syscall_wrapper.h>
+#endif /* CONFIG_ARCH_HAS_SYSCALL_WRAPPER */
+
/* we can't #include <linux/syscalls.h> here,
but tell gcc to not warn with -Wmissing-prototypes */
asmlinkage long sys_ni_syscall(void);
@@ -17,245 +22,413 @@ asmlinkage long sys_ni_syscall(void)
return -ENOSYS;
}
-cond_syscall(sys_quotactl);
-cond_syscall(sys32_quotactl);
-cond_syscall(sys_acct);
-cond_syscall(sys_lookup_dcookie);
-cond_syscall(compat_sys_lookup_dcookie);
-cond_syscall(sys_swapon);
-cond_syscall(sys_swapoff);
-cond_syscall(sys_kexec_load);
-cond_syscall(compat_sys_kexec_load);
-cond_syscall(sys_kexec_file_load);
-cond_syscall(sys_init_module);
-cond_syscall(sys_finit_module);
-cond_syscall(sys_delete_module);
-cond_syscall(sys_socketpair);
-cond_syscall(sys_bind);
-cond_syscall(sys_listen);
-cond_syscall(sys_accept);
-cond_syscall(sys_accept4);
-cond_syscall(sys_connect);
-cond_syscall(sys_getsockname);
-cond_syscall(sys_getpeername);
-cond_syscall(sys_sendto);
-cond_syscall(sys_send);
-cond_syscall(sys_recvfrom);
-cond_syscall(sys_recv);
-cond_syscall(sys_socket);
-cond_syscall(sys_setsockopt);
-cond_syscall(compat_sys_setsockopt);
-cond_syscall(sys_getsockopt);
-cond_syscall(compat_sys_getsockopt);
-cond_syscall(sys_shutdown);
-cond_syscall(sys_sendmsg);
-cond_syscall(sys_sendmmsg);
-cond_syscall(compat_sys_sendmsg);
-cond_syscall(compat_sys_sendmmsg);
-cond_syscall(sys_recvmsg);
-cond_syscall(sys_recvmmsg);
-cond_syscall(compat_sys_recvmsg);
-cond_syscall(compat_sys_recv);
-cond_syscall(compat_sys_recvfrom);
-cond_syscall(compat_sys_recvmmsg);
-cond_syscall(sys_socketcall);
-cond_syscall(sys_futex);
-cond_syscall(compat_sys_futex);
-cond_syscall(sys_set_robust_list);
-cond_syscall(compat_sys_set_robust_list);
-cond_syscall(sys_get_robust_list);
-cond_syscall(compat_sys_get_robust_list);
-cond_syscall(sys_epoll_create);
-cond_syscall(sys_epoll_create1);
-cond_syscall(sys_epoll_ctl);
-cond_syscall(sys_epoll_wait);
-cond_syscall(sys_epoll_pwait);
-cond_syscall(compat_sys_epoll_pwait);
-cond_syscall(sys_semget);
-cond_syscall(sys_semop);
-cond_syscall(sys_semtimedop);
-cond_syscall(compat_sys_semtimedop);
-cond_syscall(sys_semctl);
-cond_syscall(compat_sys_semctl);
-cond_syscall(sys_msgget);
-cond_syscall(sys_msgsnd);
-cond_syscall(compat_sys_msgsnd);
-cond_syscall(sys_msgrcv);
-cond_syscall(compat_sys_msgrcv);
-cond_syscall(sys_msgctl);
-cond_syscall(compat_sys_msgctl);
-cond_syscall(sys_shmget);
-cond_syscall(sys_shmat);
-cond_syscall(compat_sys_shmat);
-cond_syscall(sys_shmdt);
-cond_syscall(sys_shmctl);
-cond_syscall(compat_sys_shmctl);
-cond_syscall(sys_mq_open);
-cond_syscall(sys_mq_unlink);
-cond_syscall(sys_mq_timedsend);
-cond_syscall(sys_mq_timedreceive);
-cond_syscall(sys_mq_notify);
-cond_syscall(sys_mq_getsetattr);
-cond_syscall(compat_sys_mq_open);
-cond_syscall(compat_sys_mq_timedsend);
-cond_syscall(compat_sys_mq_timedreceive);
-cond_syscall(compat_sys_mq_notify);
-cond_syscall(compat_sys_mq_getsetattr);
-cond_syscall(sys_mbind);
-cond_syscall(sys_get_mempolicy);
-cond_syscall(sys_set_mempolicy);
-cond_syscall(compat_sys_mbind);
-cond_syscall(compat_sys_get_mempolicy);
-cond_syscall(compat_sys_set_mempolicy);
-cond_syscall(sys_add_key);
-cond_syscall(sys_request_key);
-cond_syscall(sys_keyctl);
-cond_syscall(compat_sys_keyctl);
-cond_syscall(compat_sys_socketcall);
-cond_syscall(sys_inotify_init);
-cond_syscall(sys_inotify_init1);
-cond_syscall(sys_inotify_add_watch);
-cond_syscall(sys_inotify_rm_watch);
-cond_syscall(sys_migrate_pages);
-cond_syscall(sys_move_pages);
-cond_syscall(sys_chown16);
-cond_syscall(sys_fchown16);
-cond_syscall(sys_getegid16);
-cond_syscall(sys_geteuid16);
-cond_syscall(sys_getgid16);
-cond_syscall(sys_getgroups16);
-cond_syscall(sys_getresgid16);
-cond_syscall(sys_getresuid16);
-cond_syscall(sys_getuid16);
-cond_syscall(sys_lchown16);
-cond_syscall(sys_setfsgid16);
-cond_syscall(sys_setfsuid16);
-cond_syscall(sys_setgid16);
-cond_syscall(sys_setgroups16);
-cond_syscall(sys_setregid16);
-cond_syscall(sys_setresgid16);
-cond_syscall(sys_setresuid16);
-cond_syscall(sys_setreuid16);
-cond_syscall(sys_setuid16);
-cond_syscall(sys_sgetmask);
-cond_syscall(sys_ssetmask);
-cond_syscall(sys_vm86old);
-cond_syscall(sys_vm86);
-cond_syscall(sys_modify_ldt);
-cond_syscall(sys_ipc);
-cond_syscall(compat_sys_ipc);
-cond_syscall(compat_sys_sysctl);
-cond_syscall(sys_flock);
-cond_syscall(sys_io_setup);
-cond_syscall(sys_io_destroy);
-cond_syscall(sys_io_submit);
-cond_syscall(sys_io_cancel);
-cond_syscall(sys_io_getevents);
-cond_syscall(compat_sys_io_setup);
-cond_syscall(compat_sys_io_submit);
-cond_syscall(compat_sys_io_getevents);
-cond_syscall(sys_sysfs);
-cond_syscall(sys_syslog);
-cond_syscall(sys_process_vm_readv);
-cond_syscall(sys_process_vm_writev);
-cond_syscall(compat_sys_process_vm_readv);
-cond_syscall(compat_sys_process_vm_writev);
-cond_syscall(sys_uselib);
-cond_syscall(sys_fadvise64);
-cond_syscall(sys_fadvise64_64);
-cond_syscall(sys_madvise);
-cond_syscall(sys_setuid);
-cond_syscall(sys_setregid);
-cond_syscall(sys_setgid);
-cond_syscall(sys_setreuid);
-cond_syscall(sys_setresuid);
-cond_syscall(sys_getresuid);
-cond_syscall(sys_setresgid);
-cond_syscall(sys_getresgid);
-cond_syscall(sys_setgroups);
-cond_syscall(sys_getgroups);
-cond_syscall(sys_setfsuid);
-cond_syscall(sys_setfsgid);
-cond_syscall(sys_capget);
-cond_syscall(sys_capset);
-cond_syscall(sys_copy_file_range);
-
-/* arch-specific weak syscall entries */
-cond_syscall(sys_pciconfig_read);
-cond_syscall(sys_pciconfig_write);
-cond_syscall(sys_pciconfig_iobase);
-cond_syscall(compat_sys_s390_ipc);
-cond_syscall(ppc_rtas);
-cond_syscall(sys_spu_run);
-cond_syscall(sys_spu_create);
-cond_syscall(sys_subpage_prot);
-cond_syscall(sys_s390_pci_mmio_read);
-cond_syscall(sys_s390_pci_mmio_write);
-
-/* mmu depending weak syscall entries */
-cond_syscall(sys_mprotect);
-cond_syscall(sys_msync);
-cond_syscall(sys_mlock);
-cond_syscall(sys_munlock);
-cond_syscall(sys_mlockall);
-cond_syscall(sys_munlockall);
-cond_syscall(sys_mlock2);
-cond_syscall(sys_mincore);
-cond_syscall(sys_madvise);
-cond_syscall(sys_mremap);
-cond_syscall(sys_remap_file_pages);
-cond_syscall(compat_sys_move_pages);
-cond_syscall(compat_sys_migrate_pages);
-
-/* block-layer dependent */
-cond_syscall(sys_bdflush);
-cond_syscall(sys_ioprio_set);
-cond_syscall(sys_ioprio_get);
-
-/* New file descriptors */
-cond_syscall(sys_signalfd);
-cond_syscall(sys_signalfd4);
-cond_syscall(compat_sys_signalfd);
-cond_syscall(compat_sys_signalfd4);
-cond_syscall(sys_timerfd_create);
-cond_syscall(sys_timerfd_settime);
-cond_syscall(sys_timerfd_gettime);
-cond_syscall(compat_sys_timerfd_settime);
-cond_syscall(compat_sys_timerfd_gettime);
-cond_syscall(sys_eventfd);
-cond_syscall(sys_eventfd2);
-cond_syscall(sys_memfd_create);
-cond_syscall(sys_userfaultfd);
-
-/* performance counters: */
-cond_syscall(sys_perf_event_open);
-
-/* fanotify! */
-cond_syscall(sys_fanotify_init);
-cond_syscall(sys_fanotify_mark);
-cond_syscall(compat_sys_fanotify_mark);
+#ifndef COND_SYSCALL
+#define COND_SYSCALL(name) cond_syscall(sys_##name)
+#endif /* COND_SYSCALL */
+
+#ifndef COND_SYSCALL_COMPAT
+#define COND_SYSCALL_COMPAT(name) cond_syscall(compat_sys_##name)
+#endif /* COND_SYSCALL_COMPAT */
+
+/*
+ * This list is kept in the same order as include/uapi/asm-generic/unistd.h.
+ * Architecture specific entries go below, followed by deprecated or obsolete
+ * system calls.
+ */
+
+COND_SYSCALL(io_setup);
+COND_SYSCALL_COMPAT(io_setup);
+COND_SYSCALL(io_destroy);
+COND_SYSCALL(io_submit);
+COND_SYSCALL_COMPAT(io_submit);
+COND_SYSCALL(io_cancel);
+COND_SYSCALL(io_getevents);
+COND_SYSCALL(io_pgetevents);
+COND_SYSCALL_COMPAT(io_getevents);
+COND_SYSCALL_COMPAT(io_pgetevents);
+
+/* fs/xattr.c */
+
+/* fs/dcache.c */
+
+/* fs/cookies.c */
+COND_SYSCALL(lookup_dcookie);
+COND_SYSCALL_COMPAT(lookup_dcookie);
+
+/* fs/eventfd.c */
+COND_SYSCALL(eventfd2);
+
+/* fs/eventfd.c */
+COND_SYSCALL(epoll_create1);
+COND_SYSCALL(epoll_ctl);
+COND_SYSCALL(epoll_pwait);
+COND_SYSCALL_COMPAT(epoll_pwait);
+
+/* fs/fcntl.c */
+
+/* fs/inotify_user.c */
+COND_SYSCALL(inotify_init1);
+COND_SYSCALL(inotify_add_watch);
+COND_SYSCALL(inotify_rm_watch);
+
+/* fs/ioctl.c */
+
+/* fs/ioprio.c */
+COND_SYSCALL(ioprio_set);
+COND_SYSCALL(ioprio_get);
+
+/* fs/locks.c */
+COND_SYSCALL(flock);
+
+/* fs/namei.c */
+
+/* fs/namespace.c */
+
+/* fs/nfsctl.c */
+
+/* fs/open.c */
+
+/* fs/pipe.c */
+
+/* fs/quota.c */
+COND_SYSCALL(quotactl);
+
+/* fs/readdir.c */
+
+/* fs/read_write.c */
+
+/* fs/sendfile.c */
+
+/* fs/select.c */
+
+/* fs/signalfd.c */
+COND_SYSCALL(signalfd4);
+COND_SYSCALL_COMPAT(signalfd4);
+
+/* fs/splice.c */
+
+/* fs/stat.c */
+
+/* fs/sync.c */
+
+/* fs/timerfd.c */
+COND_SYSCALL(timerfd_create);
+COND_SYSCALL(timerfd_settime);
+COND_SYSCALL_COMPAT(timerfd_settime);
+COND_SYSCALL(timerfd_gettime);
+COND_SYSCALL_COMPAT(timerfd_gettime);
+
+/* fs/utimes.c */
+
+/* kernel/acct.c */
+COND_SYSCALL(acct);
+
+/* kernel/capability.c */
+COND_SYSCALL(capget);
+COND_SYSCALL(capset);
+
+/* kernel/exec_domain.c */
+
+/* kernel/exit.c */
+
+/* kernel/fork.c */
+
+/* kernel/futex.c */
+COND_SYSCALL(futex);
+COND_SYSCALL_COMPAT(futex);
+COND_SYSCALL(set_robust_list);
+COND_SYSCALL_COMPAT(set_robust_list);
+COND_SYSCALL(get_robust_list);
+COND_SYSCALL_COMPAT(get_robust_list);
+
+/* kernel/hrtimer.c */
+
+/* kernel/itimer.c */
+
+/* kernel/kexec.c */
+COND_SYSCALL(kexec_load);
+COND_SYSCALL_COMPAT(kexec_load);
+
+/* kernel/module.c */
+COND_SYSCALL(init_module);
+COND_SYSCALL(delete_module);
+
+/* kernel/posix-timers.c */
+
+/* kernel/printk.c */
+COND_SYSCALL(syslog);
+
+/* kernel/ptrace.c */
+
+/* kernel/sched/core.c */
+
+/* kernel/signal.c */
+
+/* kernel/sys.c */
+COND_SYSCALL(setregid);
+COND_SYSCALL(setgid);
+COND_SYSCALL(setreuid);
+COND_SYSCALL(setuid);
+COND_SYSCALL(setresuid);
+COND_SYSCALL(getresuid);
+COND_SYSCALL(setresgid);
+COND_SYSCALL(getresgid);
+COND_SYSCALL(setfsuid);
+COND_SYSCALL(setfsgid);
+COND_SYSCALL(setgroups);
+COND_SYSCALL(getgroups);
+
+/* kernel/time.c */
+
+/* kernel/timer.c */
+
+/* ipc/mqueue.c */
+COND_SYSCALL(mq_open);
+COND_SYSCALL_COMPAT(mq_open);
+COND_SYSCALL(mq_unlink);
+COND_SYSCALL(mq_timedsend);
+COND_SYSCALL_COMPAT(mq_timedsend);
+COND_SYSCALL(mq_timedreceive);
+COND_SYSCALL_COMPAT(mq_timedreceive);
+COND_SYSCALL(mq_notify);
+COND_SYSCALL_COMPAT(mq_notify);
+COND_SYSCALL(mq_getsetattr);
+COND_SYSCALL_COMPAT(mq_getsetattr);
+
+/* ipc/msg.c */
+COND_SYSCALL(msgget);
+COND_SYSCALL(msgctl);
+COND_SYSCALL_COMPAT(msgctl);
+COND_SYSCALL(msgrcv);
+COND_SYSCALL_COMPAT(msgrcv);
+COND_SYSCALL(msgsnd);
+COND_SYSCALL_COMPAT(msgsnd);
+
+/* ipc/sem.c */
+COND_SYSCALL(semget);
+COND_SYSCALL(semctl);
+COND_SYSCALL_COMPAT(semctl);
+COND_SYSCALL(semtimedop);
+COND_SYSCALL_COMPAT(semtimedop);
+COND_SYSCALL(semop);
+
+/* ipc/shm.c */
+COND_SYSCALL(shmget);
+COND_SYSCALL(shmctl);
+COND_SYSCALL_COMPAT(shmctl);
+COND_SYSCALL(shmat);
+COND_SYSCALL_COMPAT(shmat);
+COND_SYSCALL(shmdt);
+
+/* net/socket.c */
+COND_SYSCALL(socket);
+COND_SYSCALL(socketpair);
+COND_SYSCALL(bind);
+COND_SYSCALL(listen);
+COND_SYSCALL(accept);
+COND_SYSCALL(connect);
+COND_SYSCALL(getsockname);
+COND_SYSCALL(getpeername);
+COND_SYSCALL(setsockopt);
+COND_SYSCALL_COMPAT(setsockopt);
+COND_SYSCALL(getsockopt);
+COND_SYSCALL_COMPAT(getsockopt);
+COND_SYSCALL(sendto);
+COND_SYSCALL(shutdown);
+COND_SYSCALL(recvfrom);
+COND_SYSCALL_COMPAT(recvfrom);
+COND_SYSCALL(sendmsg);
+COND_SYSCALL_COMPAT(sendmsg);
+COND_SYSCALL(recvmsg);
+COND_SYSCALL_COMPAT(recvmsg);
+
+/* mm/filemap.c */
+
+/* mm/nommu.c, also with MMU */
+COND_SYSCALL(mremap);
+
+/* security/keys/keyctl.c */
+COND_SYSCALL(add_key);
+COND_SYSCALL(request_key);
+COND_SYSCALL(keyctl);
+COND_SYSCALL_COMPAT(keyctl);
+
+/* arch/example/kernel/sys_example.c */
+
+/* mm/fadvise.c */
+COND_SYSCALL(fadvise64_64);
+
+/* mm/, CONFIG_MMU only */
+COND_SYSCALL(swapon);
+COND_SYSCALL(swapoff);
+COND_SYSCALL(mprotect);
+COND_SYSCALL(msync);
+COND_SYSCALL(mlock);
+COND_SYSCALL(munlock);
+COND_SYSCALL(mlockall);
+COND_SYSCALL(munlockall);
+COND_SYSCALL(mincore);
+COND_SYSCALL(madvise);
+COND_SYSCALL(remap_file_pages);
+COND_SYSCALL(mbind);
+COND_SYSCALL_COMPAT(mbind);
+COND_SYSCALL(get_mempolicy);
+COND_SYSCALL_COMPAT(get_mempolicy);
+COND_SYSCALL(set_mempolicy);
+COND_SYSCALL_COMPAT(set_mempolicy);
+COND_SYSCALL(migrate_pages);
+COND_SYSCALL_COMPAT(migrate_pages);
+COND_SYSCALL(move_pages);
+COND_SYSCALL_COMPAT(move_pages);
+
+COND_SYSCALL(perf_event_open);
+COND_SYSCALL(accept4);
+COND_SYSCALL(recvmmsg);
+COND_SYSCALL_COMPAT(recvmmsg);
+
+/*
+ * Architecture specific syscalls: see further below
+ */
+
+/* fanotify */
+COND_SYSCALL(fanotify_init);
+COND_SYSCALL(fanotify_mark);
/* open by handle */
-cond_syscall(sys_name_to_handle_at);
-cond_syscall(sys_open_by_handle_at);
-cond_syscall(compat_sys_open_by_handle_at);
+COND_SYSCALL(name_to_handle_at);
+COND_SYSCALL(open_by_handle_at);
+COND_SYSCALL_COMPAT(open_by_handle_at);
+
+COND_SYSCALL(sendmmsg);
+COND_SYSCALL_COMPAT(sendmmsg);
+COND_SYSCALL(process_vm_readv);
+COND_SYSCALL_COMPAT(process_vm_readv);
+COND_SYSCALL(process_vm_writev);
+COND_SYSCALL_COMPAT(process_vm_writev);
/* compare kernel pointers */
-cond_syscall(sys_kcmp);
+COND_SYSCALL(kcmp);
+
+COND_SYSCALL(finit_module);
/* operate on Secure Computing state */
-cond_syscall(sys_seccomp);
+COND_SYSCALL(seccomp);
+
+COND_SYSCALL(memfd_create);
/* access BPF programs and maps */
-cond_syscall(sys_bpf);
+COND_SYSCALL(bpf);
/* execveat */
-cond_syscall(sys_execveat);
+COND_SYSCALL(execveat);
+
+COND_SYSCALL(userfaultfd);
/* membarrier */
-cond_syscall(sys_membarrier);
+COND_SYSCALL(membarrier);
+
+COND_SYSCALL(mlock2);
+
+COND_SYSCALL(copy_file_range);
/* memory protection keys */
-cond_syscall(sys_pkey_mprotect);
-cond_syscall(sys_pkey_alloc);
-cond_syscall(sys_pkey_free);
+COND_SYSCALL(pkey_mprotect);
+COND_SYSCALL(pkey_alloc);
+COND_SYSCALL(pkey_free);
+
+
+/*
+ * Architecture specific weak syscall entries.
+ */
+
+/* pciconfig: alpha, arm, arm64, ia64, sparc */
+COND_SYSCALL(pciconfig_read);
+COND_SYSCALL(pciconfig_write);
+COND_SYSCALL(pciconfig_iobase);
+
+/* sys_socketcall: arm, mips, x86, ... */
+COND_SYSCALL(socketcall);
+COND_SYSCALL_COMPAT(socketcall);
+
+/* compat syscalls for arm64, x86, ... */
+COND_SYSCALL_COMPAT(sysctl);
+COND_SYSCALL_COMPAT(fanotify_mark);
+
+/* x86 */
+COND_SYSCALL(vm86old);
+COND_SYSCALL(modify_ldt);
+COND_SYSCALL_COMPAT(quotactl32);
+COND_SYSCALL(vm86);
+COND_SYSCALL(kexec_file_load);
+
+/* s390 */
+COND_SYSCALL(s390_pci_mmio_read);
+COND_SYSCALL(s390_pci_mmio_write);
+COND_SYSCALL_COMPAT(s390_ipc);
+
+/* powerpc */
+cond_syscall(ppc_rtas);
+COND_SYSCALL(spu_run);
+COND_SYSCALL(spu_create);
+COND_SYSCALL(subpage_prot);
+
+
+/*
+ * Deprecated system calls which are still defined in
+ * include/uapi/asm-generic/unistd.h and wanted by >= 1 arch
+ */
+
+/* __ARCH_WANT_SYSCALL_NO_FLAGS */
+COND_SYSCALL(epoll_create);
+COND_SYSCALL(inotify_init);
+COND_SYSCALL(eventfd);
+COND_SYSCALL(signalfd);
+COND_SYSCALL_COMPAT(signalfd);
+
+/* __ARCH_WANT_SYSCALL_OFF_T */
+COND_SYSCALL(fadvise64);
+
+/* __ARCH_WANT_SYSCALL_DEPRECATED */
+COND_SYSCALL(epoll_wait);
+COND_SYSCALL(recv);
+COND_SYSCALL_COMPAT(recv);
+COND_SYSCALL(send);
+COND_SYSCALL(bdflush);
+COND_SYSCALL(uselib);
+
+
+/*
+ * The syscalls below are not found in include/uapi/asm-generic/unistd.h
+ */
+
+/* obsolete: SGETMASK_SYSCALL */
+COND_SYSCALL(sgetmask);
+COND_SYSCALL(ssetmask);
+
+/* obsolete: SYSFS_SYSCALL */
+COND_SYSCALL(sysfs);
+
+/* obsolete: __ARCH_WANT_SYS_IPC */
+COND_SYSCALL(ipc);
+COND_SYSCALL_COMPAT(ipc);
+
+/* obsolete: UID16 */
+COND_SYSCALL(chown16);
+COND_SYSCALL(fchown16);
+COND_SYSCALL(getegid16);
+COND_SYSCALL(geteuid16);
+COND_SYSCALL(getgid16);
+COND_SYSCALL(getgroups16);
+COND_SYSCALL(getresgid16);
+COND_SYSCALL(getresuid16);
+COND_SYSCALL(getuid16);
+COND_SYSCALL(lchown16);
+COND_SYSCALL(setfsgid16);
+COND_SYSCALL(setfsuid16);
+COND_SYSCALL(setgid16);
+COND_SYSCALL(setgroups16);
+COND_SYSCALL(setregid16);
+COND_SYSCALL(setresgid16);
+COND_SYSCALL(setresuid16);
+COND_SYSCALL(setreuid16);
+COND_SYSCALL(setuid16);
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index f98f28c12020..6a78cf70761d 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -253,6 +253,10 @@ extern struct ctl_table random_table[];
extern struct ctl_table epoll_table[];
#endif
+#ifdef CONFIG_FW_LOADER_USER_HELPER
+extern struct ctl_table firmware_config_table[];
+#endif
+
#ifdef HAVE_ARCH_PICK_MMAP_LAYOUT
int sysctl_legacy_va_layout;
#endif
@@ -748,6 +752,13 @@ static struct ctl_table kern_table[] = {
.mode = 0555,
.child = usermodehelper_table,
},
+#ifdef CONFIG_FW_LOADER_USER_HELPER
+ {
+ .procname = "firmware_config",
+ .mode = 0555,
+ .child = firmware_config_table,
+ },
+#endif
{
.procname = "overflowuid",
.data = &overflowuid,
@@ -1329,7 +1340,7 @@ static struct ctl_table vm_table[] = {
{
.procname = "dirtytime_expire_seconds",
.data = &dirtytime_expire_interval,
- .maxlen = sizeof(dirty_expire_interval),
+ .maxlen = sizeof(dirtytime_expire_interval),
.mode = 0644,
.proc_handler = dirtytime_interval_handler,
.extra1 = &zero,
@@ -2500,6 +2511,15 @@ static int proc_dointvec_minmax_sysadmin(struct ctl_table *table, int write,
}
#endif
+/**
+ * struct do_proc_dointvec_minmax_conv_param - proc_dointvec_minmax() range checking structure
+ * @min: pointer to minimum allowable value
+ * @max: pointer to maximum allowable value
+ *
+ * The do_proc_dointvec_minmax_conv_param structure provides the
+ * minimum and maximum values for doing range checking for those sysctl
+ * parameters that use the proc_dointvec_minmax() handler.
+ */
struct do_proc_dointvec_minmax_conv_param {
int *min;
int *max;
@@ -2543,7 +2563,7 @@ static int do_proc_dointvec_minmax_conv(bool *negp, unsigned long *lvalp,
* This routine will ensure the values are within the range specified by
* table->extra1 (min) and table->extra2 (max).
*
- * Returns 0 on success.
+ * Returns 0 on success or -EINVAL on write when the range check fails.
*/
int proc_dointvec_minmax(struct ctl_table *table, int write,
void __user *buffer, size_t *lenp, loff_t *ppos)
@@ -2556,6 +2576,15 @@ int proc_dointvec_minmax(struct ctl_table *table, int write,
do_proc_dointvec_minmax_conv, &param);
}
+/**
+ * struct do_proc_douintvec_minmax_conv_param - proc_douintvec_minmax() range checking structure
+ * @min: pointer to minimum allowable value
+ * @max: pointer to maximum allowable value
+ *
+ * The do_proc_douintvec_minmax_conv_param structure provides the
+ * minimum and maximum values for doing range checking for those sysctl
+ * parameters that use the proc_douintvec_minmax() handler.
+ */
struct do_proc_douintvec_minmax_conv_param {
unsigned int *min;
unsigned int *max;
@@ -2603,7 +2632,7 @@ static int do_proc_douintvec_minmax_conv(unsigned long *lvalp,
* check for UINT_MAX to avoid having to support wrap around uses from
* userspace.
*
- * Returns 0 on success.
+ * Returns 0 on success or -ERANGE on write when the range check fails.
*/
int proc_douintvec_minmax(struct ctl_table *table, int write,
void __user *buffer, size_t *lenp, loff_t *ppos)
diff --git a/kernel/sysctl_binary.c b/kernel/sysctl_binary.c
index e8c0dab4fd65..07148b497451 100644
--- a/kernel/sysctl_binary.c
+++ b/kernel/sysctl_binary.c
@@ -704,24 +704,6 @@ static const struct bin_table bin_net_netfilter_table[] = {
{}
};
-static const struct bin_table bin_net_irda_table[] = {
- { CTL_INT, NET_IRDA_DISCOVERY, "discovery" },
- { CTL_STR, NET_IRDA_DEVNAME, "devname" },
- { CTL_INT, NET_IRDA_DEBUG, "debug" },
- { CTL_INT, NET_IRDA_FAST_POLL, "fast_poll_increase" },
- { CTL_INT, NET_IRDA_DISCOVERY_SLOTS, "discovery_slots" },
- { CTL_INT, NET_IRDA_DISCOVERY_TIMEOUT, "discovery_timeout" },
- { CTL_INT, NET_IRDA_SLOT_TIMEOUT, "slot_timeout" },
- { CTL_INT, NET_IRDA_MAX_BAUD_RATE, "max_baud_rate" },
- { CTL_INT, NET_IRDA_MIN_TX_TURN_TIME, "min_tx_turn_time" },
- { CTL_INT, NET_IRDA_MAX_TX_DATA_SIZE, "max_tx_data_size" },
- { CTL_INT, NET_IRDA_MAX_TX_WINDOW, "max_tx_window" },
- { CTL_INT, NET_IRDA_MAX_NOREPLY_TIME, "max_noreply_time" },
- { CTL_INT, NET_IRDA_WARN_NOREPLY_TIME, "warn_noreply_time" },
- { CTL_INT, NET_IRDA_LAP_KEEPALIVE_TIME, "lap_keepalive_time" },
- {}
-};
-
static const struct bin_table bin_net_table[] = {
{ CTL_DIR, NET_CORE, "core", bin_net_core_table },
/* NET_ETHER not used */
@@ -743,7 +725,7 @@ static const struct bin_table bin_net_table[] = {
{ CTL_DIR, NET_LLC, "llc", bin_net_llc_table },
{ CTL_DIR, NET_NETFILTER, "netfilter", bin_net_netfilter_table },
/* NET_DCCP "dccp" no longer used */
- { CTL_DIR, NET_IRDA, "irda", bin_net_irda_table },
+ /* NET_IRDA "irda" no longer used */
{ CTL_INT, 2089, "nf_conntrack_max" },
{}
};
diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig
index f6b5f19223d6..78eabc41eaa6 100644
--- a/kernel/time/Kconfig
+++ b/kernel/time/Kconfig
@@ -113,16 +113,6 @@ config NO_HZ_FULL
endchoice
-config NO_HZ_FULL_ALL
- bool "Full dynticks system on all CPUs by default (except CPU 0)"
- depends on NO_HZ_FULL
- help
- If the user doesn't pass the nohz_full boot option to
- define the range of full dynticks CPUs, consider that all
- CPUs in the system are full dynticks by default.
- Note the boot CPU will still be kept outside the range to
- handle the timekeeping duty.
-
config NO_HZ
bool "Old Idle dynticks config"
depends on !ARCH_USES_GETTIMEOFFSET && GENERIC_CLOCKEVENTS
diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c
index ec09ce9a6012..639321bf2e39 100644
--- a/kernel/time/alarmtimer.c
+++ b/kernel/time/alarmtimer.c
@@ -326,6 +326,17 @@ static int alarmtimer_resume(struct device *dev)
}
#endif
+static void
+__alarm_init(struct alarm *alarm, enum alarmtimer_type type,
+ enum alarmtimer_restart (*function)(struct alarm *, ktime_t))
+{
+ timerqueue_init(&alarm->node);
+ alarm->timer.function = alarmtimer_fired;
+ alarm->function = function;
+ alarm->type = type;
+ alarm->state = ALARMTIMER_STATE_INACTIVE;
+}
+
/**
* alarm_init - Initialize an alarm structure
* @alarm: ptr to alarm to be initialized
@@ -335,13 +346,9 @@ static int alarmtimer_resume(struct device *dev)
void alarm_init(struct alarm *alarm, enum alarmtimer_type type,
enum alarmtimer_restart (*function)(struct alarm *, ktime_t))
{
- timerqueue_init(&alarm->node);
hrtimer_init(&alarm->timer, alarm_bases[type].base_clockid,
- HRTIMER_MODE_ABS);
- alarm->timer.function = alarmtimer_fired;
- alarm->function = function;
- alarm->type = type;
- alarm->state = ALARMTIMER_STATE_INACTIVE;
+ HRTIMER_MODE_ABS);
+ __alarm_init(alarm, type, function);
}
EXPORT_SYMBOL_GPL(alarm_init);
@@ -719,6 +726,8 @@ static int alarmtimer_do_nsleep(struct alarm *alarm, ktime_t absexp,
__set_current_state(TASK_RUNNING);
+ destroy_hrtimer_on_stack(&alarm->timer);
+
if (!alarm->data)
return 0;
@@ -740,6 +749,15 @@ static int alarmtimer_do_nsleep(struct alarm *alarm, ktime_t absexp,
return -ERESTART_RESTARTBLOCK;
}
+static void
+alarm_init_on_stack(struct alarm *alarm, enum alarmtimer_type type,
+ enum alarmtimer_restart (*function)(struct alarm *, ktime_t))
+{
+ hrtimer_init_on_stack(&alarm->timer, alarm_bases[type].base_clockid,
+ HRTIMER_MODE_ABS);
+ __alarm_init(alarm, type, function);
+}
+
/**
* alarm_timer_nsleep_restart - restartblock alarmtimer nsleep
* @restart: ptr to restart block
@@ -752,7 +770,7 @@ static long __sched alarm_timer_nsleep_restart(struct restart_block *restart)
ktime_t exp = restart->nanosleep.expires;
struct alarm alarm;
- alarm_init(&alarm, type, alarmtimer_nsleep_wakeup);
+ alarm_init_on_stack(&alarm, type, alarmtimer_nsleep_wakeup);
return alarmtimer_do_nsleep(&alarm, exp, type);
}
@@ -784,7 +802,7 @@ static int alarm_timer_nsleep(const clockid_t which_clock, int flags,
if (!capable(CAP_WAKE_ALARM))
return -EPERM;
- alarm_init(&alarm, type, alarmtimer_nsleep_wakeup);
+ alarm_init_on_stack(&alarm, type, alarmtimer_nsleep_wakeup);
exp = timespec64_to_ktime(*tsreq);
/* Convert (if necessary) to absolute time */
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index 65f9e3f24dde..f89a78e2792b 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -119,8 +119,15 @@ static DEFINE_SPINLOCK(watchdog_lock);
static int watchdog_running;
static atomic_t watchdog_reset_pending;
-static int clocksource_watchdog_kthread(void *data);
-static void __clocksource_change_rating(struct clocksource *cs, int rating);
+static void inline clocksource_watchdog_lock(unsigned long *flags)
+{
+ spin_lock_irqsave(&watchdog_lock, *flags);
+}
+
+static void inline clocksource_watchdog_unlock(unsigned long *flags)
+{
+ spin_unlock_irqrestore(&watchdog_lock, *flags);
+}
/*
* Interval: 0.5sec Threshold: 0.0625s
@@ -128,23 +135,24 @@ static void __clocksource_change_rating(struct clocksource *cs, int rating);
#define WATCHDOG_INTERVAL (HZ >> 1)
#define WATCHDOG_THRESHOLD (NSEC_PER_SEC >> 4)
-static void clocksource_watchdog_work(struct work_struct *work)
-{
- /*
- * If kthread_run fails the next watchdog scan over the
- * watchdog_list will find the unstable clock again.
- */
- kthread_run(clocksource_watchdog_kthread, NULL, "kwatchdog");
-}
-
static void __clocksource_unstable(struct clocksource *cs)
{
cs->flags &= ~(CLOCK_SOURCE_VALID_FOR_HRES | CLOCK_SOURCE_WATCHDOG);
cs->flags |= CLOCK_SOURCE_UNSTABLE;
+ /*
+ * If the clocksource is registered clocksource_watchdog_work() will
+ * re-rate and re-select.
+ */
+ if (list_empty(&cs->list)) {
+ cs->rating = 0;
+ return;
+ }
+
if (cs->mark_unstable)
cs->mark_unstable(cs);
+ /* kick clocksource_watchdog_work() */
if (finished_booting)
schedule_work(&watchdog_work);
}
@@ -153,10 +161,8 @@ static void __clocksource_unstable(struct clocksource *cs)
* clocksource_mark_unstable - mark clocksource unstable via watchdog
* @cs: clocksource to be marked unstable
*
- * This function is called instead of clocksource_change_rating from
- * cpu hotplug code to avoid a deadlock between the clocksource mutex
- * and the cpu hotplug mutex. It defers the update of the clocksource
- * to the watchdog thread.
+ * This function is called by the x86 TSC code to mark clocksources as unstable;
+ * it defers demotion and re-selection to a work.
*/
void clocksource_mark_unstable(struct clocksource *cs)
{
@@ -164,7 +170,7 @@ void clocksource_mark_unstable(struct clocksource *cs)
spin_lock_irqsave(&watchdog_lock, flags);
if (!(cs->flags & CLOCK_SOURCE_UNSTABLE)) {
- if (list_empty(&cs->wd_list))
+ if (!list_empty(&cs->list) && list_empty(&cs->wd_list))
list_add(&cs->wd_list, &watchdog_list);
__clocksource_unstable(cs);
}
@@ -319,9 +325,8 @@ static void clocksource_resume_watchdog(void)
static void clocksource_enqueue_watchdog(struct clocksource *cs)
{
- unsigned long flags;
+ INIT_LIST_HEAD(&cs->wd_list);
- spin_lock_irqsave(&watchdog_lock, flags);
if (cs->flags & CLOCK_SOURCE_MUST_VERIFY) {
/* cs is a clocksource to be watched. */
list_add(&cs->wd_list, &watchdog_list);
@@ -331,7 +336,6 @@ static void clocksource_enqueue_watchdog(struct clocksource *cs)
if (cs->flags & CLOCK_SOURCE_IS_CONTINUOUS)
cs->flags |= CLOCK_SOURCE_VALID_FOR_HRES;
}
- spin_unlock_irqrestore(&watchdog_lock, flags);
}
static void clocksource_select_watchdog(bool fallback)
@@ -373,9 +377,6 @@ static void clocksource_select_watchdog(bool fallback)
static void clocksource_dequeue_watchdog(struct clocksource *cs)
{
- unsigned long flags;
-
- spin_lock_irqsave(&watchdog_lock, flags);
if (cs != watchdog) {
if (cs->flags & CLOCK_SOURCE_MUST_VERIFY) {
/* cs is a watched clocksource. */
@@ -384,21 +385,21 @@ static void clocksource_dequeue_watchdog(struct clocksource *cs)
clocksource_stop_watchdog();
}
}
- spin_unlock_irqrestore(&watchdog_lock, flags);
}
-static int __clocksource_watchdog_kthread(void)
+static void __clocksource_change_rating(struct clocksource *cs, int rating);
+
+static int __clocksource_watchdog_work(void)
{
struct clocksource *cs, *tmp;
unsigned long flags;
- LIST_HEAD(unstable);
int select = 0;
spin_lock_irqsave(&watchdog_lock, flags);
list_for_each_entry_safe(cs, tmp, &watchdog_list, wd_list) {
if (cs->flags & CLOCK_SOURCE_UNSTABLE) {
list_del_init(&cs->wd_list);
- list_add(&cs->wd_list, &unstable);
+ __clocksource_change_rating(cs, 0);
select = 1;
}
if (cs->flags & CLOCK_SOURCE_RESELECT) {
@@ -410,21 +411,15 @@ static int __clocksource_watchdog_kthread(void)
clocksource_stop_watchdog();
spin_unlock_irqrestore(&watchdog_lock, flags);
- /* Needs to be done outside of watchdog lock */
- list_for_each_entry_safe(cs, tmp, &unstable, wd_list) {
- list_del_init(&cs->wd_list);
- __clocksource_change_rating(cs, 0);
- }
return select;
}
-static int clocksource_watchdog_kthread(void *data)
+static void clocksource_watchdog_work(struct work_struct *work)
{
mutex_lock(&clocksource_mutex);
- if (__clocksource_watchdog_kthread())
+ if (__clocksource_watchdog_work())
clocksource_select();
mutex_unlock(&clocksource_mutex);
- return 0;
}
static bool clocksource_is_watchdog(struct clocksource *cs)
@@ -443,10 +438,13 @@ static void clocksource_enqueue_watchdog(struct clocksource *cs)
static void clocksource_select_watchdog(bool fallback) { }
static inline void clocksource_dequeue_watchdog(struct clocksource *cs) { }
static inline void clocksource_resume_watchdog(void) { }
-static inline int __clocksource_watchdog_kthread(void) { return 0; }
+static inline int __clocksource_watchdog_work(void) { return 0; }
static bool clocksource_is_watchdog(struct clocksource *cs) { return false; }
void clocksource_mark_unstable(struct clocksource *cs) { }
+static inline void clocksource_watchdog_lock(unsigned long *flags) { }
+static inline void clocksource_watchdog_unlock(unsigned long *flags) { }
+
#endif /* CONFIG_CLOCKSOURCE_WATCHDOG */
/**
@@ -594,6 +592,9 @@ static void __clocksource_select(bool skipcur)
if (!best)
return;
+ if (!strlen(override_name))
+ goto found;
+
/* Check for the override clocksource. */
list_for_each_entry(cs, &clocksource_list, list) {
if (skipcur && cs == curr_clocksource)
@@ -625,6 +626,7 @@ static void __clocksource_select(bool skipcur)
break;
}
+found:
if (curr_clocksource != best && !timekeeping_notify(best)) {
pr_info("Switched to clocksource %s\n", best->name);
curr_clocksource = best;
@@ -670,7 +672,7 @@ static int __init clocksource_done_booting(void)
/*
* Run the watchdog first to eliminate unstable clock sources
*/
- __clocksource_watchdog_kthread();
+ __clocksource_watchdog_work();
clocksource_select();
mutex_unlock(&clocksource_mutex);
return 0;
@@ -775,14 +777,19 @@ EXPORT_SYMBOL_GPL(__clocksource_update_freq_scale);
*/
int __clocksource_register_scale(struct clocksource *cs, u32 scale, u32 freq)
{
+ unsigned long flags;
/* Initialize mult/shift and max_idle_ns */
__clocksource_update_freq_scale(cs, scale, freq);
/* Add clocksource to the clocksource list */
mutex_lock(&clocksource_mutex);
+
+ clocksource_watchdog_lock(&flags);
clocksource_enqueue(cs);
clocksource_enqueue_watchdog(cs);
+ clocksource_watchdog_unlock(&flags);
+
clocksource_select();
clocksource_select_watchdog(false);
mutex_unlock(&clocksource_mutex);
@@ -804,8 +811,13 @@ static void __clocksource_change_rating(struct clocksource *cs, int rating)
*/
void clocksource_change_rating(struct clocksource *cs, int rating)
{
+ unsigned long flags;
+
mutex_lock(&clocksource_mutex);
+ clocksource_watchdog_lock(&flags);
__clocksource_change_rating(cs, rating);
+ clocksource_watchdog_unlock(&flags);
+
clocksource_select();
clocksource_select_watchdog(false);
mutex_unlock(&clocksource_mutex);
@@ -817,6 +829,8 @@ EXPORT_SYMBOL(clocksource_change_rating);
*/
static int clocksource_unbind(struct clocksource *cs)
{
+ unsigned long flags;
+
if (clocksource_is_watchdog(cs)) {
/* Select and try to install a replacement watchdog. */
clocksource_select_watchdog(true);
@@ -830,8 +844,12 @@ static int clocksource_unbind(struct clocksource *cs)
if (curr_clocksource == cs)
return -EBUSY;
}
+
+ clocksource_watchdog_lock(&flags);
clocksource_dequeue_watchdog(cs);
list_del_init(&cs->list);
+ clocksource_watchdog_unlock(&flags);
+
return 0;
}
@@ -853,16 +871,16 @@ EXPORT_SYMBOL(clocksource_unregister);
#ifdef CONFIG_SYSFS
/**
- * sysfs_show_current_clocksources - sysfs interface for current clocksource
+ * current_clocksource_show - sysfs interface for current clocksource
* @dev: unused
* @attr: unused
* @buf: char buffer to be filled with clocksource list
*
* Provides sysfs interface for listing current clocksource.
*/
-static ssize_t
-sysfs_show_current_clocksources(struct device *dev,
- struct device_attribute *attr, char *buf)
+static ssize_t current_clocksource_show(struct device *dev,
+ struct device_attribute *attr,
+ char *buf)
{
ssize_t count = 0;
@@ -891,7 +909,7 @@ ssize_t sysfs_get_uname(const char *buf, char *dst, size_t cnt)
}
/**
- * sysfs_override_clocksource - interface for manually overriding clocksource
+ * current_clocksource_store - interface for manually overriding clocksource
* @dev: unused
* @attr: unused
* @buf: name of override clocksource
@@ -900,9 +918,9 @@ ssize_t sysfs_get_uname(const char *buf, char *dst, size_t cnt)
* Takes input from sysfs interface for manually overriding the default
* clocksource selection.
*/
-static ssize_t sysfs_override_clocksource(struct device *dev,
- struct device_attribute *attr,
- const char *buf, size_t count)
+static ssize_t current_clocksource_store(struct device *dev,
+ struct device_attribute *attr,
+ const char *buf, size_t count)
{
ssize_t ret;
@@ -916,9 +934,10 @@ static ssize_t sysfs_override_clocksource(struct device *dev,
return ret;
}
+static DEVICE_ATTR_RW(current_clocksource);
/**
- * sysfs_unbind_current_clocksource - interface for manually unbinding clocksource
+ * unbind_clocksource_store - interface for manually unbinding clocksource
* @dev: unused
* @attr: unused
* @buf: unused
@@ -926,7 +945,7 @@ static ssize_t sysfs_override_clocksource(struct device *dev,
*
* Takes input from sysfs interface for manually unbinding a clocksource.
*/
-static ssize_t sysfs_unbind_clocksource(struct device *dev,
+static ssize_t unbind_clocksource_store(struct device *dev,
struct device_attribute *attr,
const char *buf, size_t count)
{
@@ -950,19 +969,19 @@ static ssize_t sysfs_unbind_clocksource(struct device *dev,
return ret ? ret : count;
}
+static DEVICE_ATTR_WO(unbind_clocksource);
/**
- * sysfs_show_available_clocksources - sysfs interface for listing clocksource
+ * available_clocksource_show - sysfs interface for listing clocksource
* @dev: unused
* @attr: unused
* @buf: char buffer to be filled with clocksource list
*
* Provides sysfs interface for listing registered clocksources
*/
-static ssize_t
-sysfs_show_available_clocksources(struct device *dev,
- struct device_attribute *attr,
- char *buf)
+static ssize_t available_clocksource_show(struct device *dev,
+ struct device_attribute *attr,
+ char *buf)
{
struct clocksource *src;
ssize_t count = 0;
@@ -986,17 +1005,15 @@ sysfs_show_available_clocksources(struct device *dev,
return count;
}
+static DEVICE_ATTR_RO(available_clocksource);
-/*
- * Sysfs setup bits:
- */
-static DEVICE_ATTR(current_clocksource, 0644, sysfs_show_current_clocksources,
- sysfs_override_clocksource);
-
-static DEVICE_ATTR(unbind_clocksource, 0200, NULL, sysfs_unbind_clocksource);
-
-static DEVICE_ATTR(available_clocksource, 0444,
- sysfs_show_available_clocksources, NULL);
+static struct attribute *clocksource_attrs[] = {
+ &dev_attr_current_clocksource.attr,
+ &dev_attr_unbind_clocksource.attr,
+ &dev_attr_available_clocksource.attr,
+ NULL
+};
+ATTRIBUTE_GROUPS(clocksource);
static struct bus_type clocksource_subsys = {
.name = "clocksource",
@@ -1006,6 +1023,7 @@ static struct bus_type clocksource_subsys = {
static struct device device_clocksource = {
.id = 0,
.bus = &clocksource_subsys,
+ .groups = clocksource_groups,
};
static int __init init_clocksource_sysfs(void)
@@ -1014,17 +1032,7 @@ static int __init init_clocksource_sysfs(void)
if (!error)
error = device_register(&device_clocksource);
- if (!error)
- error = device_create_file(
- &device_clocksource,
- &dev_attr_current_clocksource);
- if (!error)
- error = device_create_file(&device_clocksource,
- &dev_attr_unbind_clocksource);
- if (!error)
- error = device_create_file(
- &device_clocksource,
- &dev_attr_available_clocksource);
+
return error;
}
diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c
index 23788100e214..055a4a728c00 100644
--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -490,6 +490,7 @@ __next_base(struct hrtimer_cpu_base *cpu_base, unsigned int *active)
while ((base = __next_base((cpu_base), &(active))))
static ktime_t __hrtimer_next_event_base(struct hrtimer_cpu_base *cpu_base,
+ const struct hrtimer *exclude,
unsigned int active,
ktime_t expires_next)
{
@@ -502,9 +503,22 @@ static ktime_t __hrtimer_next_event_base(struct hrtimer_cpu_base *cpu_base,
next = timerqueue_getnext(&base->active);
timer = container_of(next, struct hrtimer, node);
+ if (timer == exclude) {
+ /* Get to the next timer in the queue. */
+ next = timerqueue_iterate_next(next);
+ if (!next)
+ continue;
+
+ timer = container_of(next, struct hrtimer, node);
+ }
expires = ktime_sub(hrtimer_get_expires(timer), base->offset);
if (expires < expires_next) {
expires_next = expires;
+
+ /* Skip cpu_base update if a timer is being excluded. */
+ if (exclude)
+ continue;
+
if (timer->is_soft)
cpu_base->softirq_next_timer = timer;
else
@@ -548,7 +562,8 @@ __hrtimer_get_next_event(struct hrtimer_cpu_base *cpu_base, unsigned int active_
if (!cpu_base->softirq_activated && (active_mask & HRTIMER_ACTIVE_SOFT)) {
active = cpu_base->active_bases & HRTIMER_ACTIVE_SOFT;
cpu_base->softirq_next_timer = NULL;
- expires_next = __hrtimer_next_event_base(cpu_base, active, KTIME_MAX);
+ expires_next = __hrtimer_next_event_base(cpu_base, NULL,
+ active, KTIME_MAX);
next_timer = cpu_base->softirq_next_timer;
}
@@ -556,7 +571,8 @@ __hrtimer_get_next_event(struct hrtimer_cpu_base *cpu_base, unsigned int active_
if (active_mask & HRTIMER_ACTIVE_HARD) {
active = cpu_base->active_bases & HRTIMER_ACTIVE_HARD;
cpu_base->next_timer = next_timer;
- expires_next = __hrtimer_next_event_base(cpu_base, active, expires_next);
+ expires_next = __hrtimer_next_event_base(cpu_base, NULL, active,
+ expires_next);
}
return expires_next;
@@ -1202,6 +1218,39 @@ u64 hrtimer_get_next_event(void)
return expires;
}
+
+/**
+ * hrtimer_next_event_without - time until next expiry event w/o one timer
+ * @exclude: timer to exclude
+ *
+ * Returns the next expiry time over all timers except for the @exclude one or
+ * KTIME_MAX if none of them is pending.
+ */
+u64 hrtimer_next_event_without(const struct hrtimer *exclude)
+{
+ struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases);
+ u64 expires = KTIME_MAX;
+ unsigned long flags;
+
+ raw_spin_lock_irqsave(&cpu_base->lock, flags);
+
+ if (__hrtimer_hres_active(cpu_base)) {
+ unsigned int active;
+
+ if (!cpu_base->softirq_activated) {
+ active = cpu_base->active_bases & HRTIMER_ACTIVE_SOFT;
+ expires = __hrtimer_next_event_base(cpu_base, exclude,
+ active, KTIME_MAX);
+ }
+ active = cpu_base->active_bases & HRTIMER_ACTIVE_HARD;
+ expires = __hrtimer_next_event_base(cpu_base, exclude, active,
+ expires);
+ }
+
+ raw_spin_unlock_irqrestore(&cpu_base->lock, flags);
+
+ return expires;
+}
#endif
static inline int hrtimer_clockid_to_base(clockid_t clock_id)
@@ -1710,8 +1759,10 @@ out:
return ret;
}
-SYSCALL_DEFINE2(nanosleep, struct timespec __user *, rqtp,
- struct timespec __user *, rmtp)
+#if !defined(CONFIG_64BIT_TIME) || defined(CONFIG_64BIT)
+
+SYSCALL_DEFINE2(nanosleep, struct __kernel_timespec __user *, rqtp,
+ struct __kernel_timespec __user *, rmtp)
{
struct timespec64 tu;
@@ -1726,7 +1777,9 @@ SYSCALL_DEFINE2(nanosleep, struct timespec __user *, rqtp,
return hrtimer_nanosleep(&tu, HRTIMER_MODE_REL, CLOCK_MONOTONIC);
}
-#ifdef CONFIG_COMPAT
+#endif
+
+#ifdef CONFIG_COMPAT_32BIT_TIME
COMPAT_SYSCALL_DEFINE2(nanosleep, struct compat_timespec __user *, rqtp,
struct compat_timespec __user *, rmtp)
diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c
index 8d70da1b9a0d..a09ded765f6c 100644
--- a/kernel/time/ntp.c
+++ b/kernel/time/ntp.c
@@ -31,7 +31,7 @@
/* USER_HZ period (usecs): */
-unsigned long tick_usec = TICK_USEC;
+unsigned long tick_usec = USER_TICK_USEC;
/* SHIFTED_HZ period (nsecs): */
unsigned long tick_nsec;
diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c
index 2541bd89f20e..5a6251ac6f7a 100644
--- a/kernel/time/posix-cpu-timers.c
+++ b/kernel/time/posix-cpu-timers.c
@@ -1205,10 +1205,12 @@ void set_process_cpu_timer(struct task_struct *tsk, unsigned int clock_idx,
u64 *newval, u64 *oldval)
{
u64 now;
+ int ret;
WARN_ON_ONCE(clock_idx == CPUCLOCK_SCHED);
+ ret = cpu_timer_sample_group(clock_idx, tsk, &now);
- if (oldval && cpu_timer_sample_group(clock_idx, tsk, &now) != -EINVAL) {
+ if (oldval && ret != -EINVAL) {
/*
* We are setting itimer. The *oldval is absolute and we update
* it to be relative, *newval argument is relative and we update
diff --git a/kernel/time/posix-stubs.c b/kernel/time/posix-stubs.c
index b258bee13b02..26aa9569e24a 100644
--- a/kernel/time/posix-stubs.c
+++ b/kernel/time/posix-stubs.c
@@ -19,6 +19,11 @@
#include <linux/posix-timers.h>
#include <linux/compat.h>
+#ifdef CONFIG_ARCH_HAS_SYSCALL_WRAPPER
+/* Architectures may override SYS_NI and COMPAT_SYS_NI */
+#include <asm/syscall_wrapper.h>
+#endif
+
asmlinkage long sys_ni_posix_timers(void)
{
pr_err_once("process %d (%s) attempted a POSIX timer syscall "
@@ -27,8 +32,13 @@ asmlinkage long sys_ni_posix_timers(void)
return -ENOSYS;
}
+#ifndef SYS_NI
#define SYS_NI(name) SYSCALL_ALIAS(sys_##name, sys_ni_posix_timers)
+#endif
+
+#ifndef COMPAT_SYS_NI
#define COMPAT_SYS_NI(name) SYSCALL_ALIAS(compat_sys_##name, sys_ni_posix_timers)
+#endif
SYS_NI(timer_create);
SYS_NI(timer_gettime);
@@ -49,7 +59,7 @@ SYS_NI(alarm);
*/
SYSCALL_DEFINE2(clock_settime, const clockid_t, which_clock,
- const struct timespec __user *, tp)
+ const struct __kernel_timespec __user *, tp)
{
struct timespec64 new_tp;
@@ -80,7 +90,7 @@ int do_clock_gettime(clockid_t which_clock, struct timespec64 *tp)
return 0;
}
SYSCALL_DEFINE2(clock_gettime, const clockid_t, which_clock,
- struct timespec __user *, tp)
+ struct __kernel_timespec __user *, tp)
{
int ret;
struct timespec64 kernel_tp;
@@ -94,7 +104,7 @@ SYSCALL_DEFINE2(clock_gettime, const clockid_t, which_clock,
return 0;
}
-SYSCALL_DEFINE2(clock_getres, const clockid_t, which_clock, struct timespec __user *, tp)
+SYSCALL_DEFINE2(clock_getres, const clockid_t, which_clock, struct __kernel_timespec __user *, tp)
{
struct timespec64 rtn_tp = {
.tv_sec = 0,
@@ -114,8 +124,8 @@ SYSCALL_DEFINE2(clock_getres, const clockid_t, which_clock, struct timespec __us
}
SYSCALL_DEFINE4(clock_nanosleep, const clockid_t, which_clock, int, flags,
- const struct timespec __user *, rqtp,
- struct timespec __user *, rmtp)
+ const struct __kernel_timespec __user *, rqtp,
+ struct __kernel_timespec __user *, rmtp)
{
struct timespec64 t;
@@ -148,7 +158,9 @@ COMPAT_SYS_NI(timer_settime);
COMPAT_SYS_NI(timer_gettime);
COMPAT_SYS_NI(getitimer);
COMPAT_SYS_NI(setitimer);
+#endif
+#ifdef CONFIG_COMPAT_32BIT_TIME
COMPAT_SYSCALL_DEFINE2(clock_settime, const clockid_t, which_clock,
struct compat_timespec __user *, tp)
{
diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c
index 75043046914e..e08ce3f27447 100644
--- a/kernel/time/posix-timers.c
+++ b/kernel/time/posix-timers.c
@@ -50,6 +50,7 @@
#include <linux/export.h>
#include <linux/hashtable.h>
#include <linux/compat.h>
+#include <linux/nospec.h>
#include "timekeeping.h"
#include "posix-timers.h"
@@ -1039,7 +1040,7 @@ void exit_itimers(struct signal_struct *sig)
}
SYSCALL_DEFINE2(clock_settime, const clockid_t, which_clock,
- const struct timespec __user *, tp)
+ const struct __kernel_timespec __user *, tp)
{
const struct k_clock *kc = clockid_to_kclock(which_clock);
struct timespec64 new_tp;
@@ -1054,7 +1055,7 @@ SYSCALL_DEFINE2(clock_settime, const clockid_t, which_clock,
}
SYSCALL_DEFINE2(clock_gettime, const clockid_t, which_clock,
- struct timespec __user *,tp)
+ struct __kernel_timespec __user *, tp)
{
const struct k_clock *kc = clockid_to_kclock(which_clock);
struct timespec64 kernel_tp;
@@ -1095,7 +1096,7 @@ SYSCALL_DEFINE2(clock_adjtime, const clockid_t, which_clock,
}
SYSCALL_DEFINE2(clock_getres, const clockid_t, which_clock,
- struct timespec __user *, tp)
+ struct __kernel_timespec __user *, tp)
{
const struct k_clock *kc = clockid_to_kclock(which_clock);
struct timespec64 rtn_tp;
@@ -1112,7 +1113,7 @@ SYSCALL_DEFINE2(clock_getres, const clockid_t, which_clock,
return error;
}
-#ifdef CONFIG_COMPAT
+#ifdef CONFIG_COMPAT_32BIT_TIME
COMPAT_SYSCALL_DEFINE2(clock_settime, clockid_t, which_clock,
struct compat_timespec __user *, tp)
@@ -1147,6 +1148,10 @@ COMPAT_SYSCALL_DEFINE2(clock_gettime, clockid_t, which_clock,
return err;
}
+#endif
+
+#ifdef CONFIG_COMPAT
+
COMPAT_SYSCALL_DEFINE2(clock_adjtime, clockid_t, which_clock,
struct compat_timex __user *, utp)
{
@@ -1171,6 +1176,10 @@ COMPAT_SYSCALL_DEFINE2(clock_adjtime, clockid_t, which_clock,
return err;
}
+#endif
+
+#ifdef CONFIG_COMPAT_32BIT_TIME
+
COMPAT_SYSCALL_DEFINE2(clock_getres, clockid_t, which_clock,
struct compat_timespec __user *, tp)
{
@@ -1202,8 +1211,8 @@ static int common_nsleep(const clockid_t which_clock, int flags,
}
SYSCALL_DEFINE4(clock_nanosleep, const clockid_t, which_clock, int, flags,
- const struct timespec __user *, rqtp,
- struct timespec __user *, rmtp)
+ const struct __kernel_timespec __user *, rqtp,
+ struct __kernel_timespec __user *, rmtp)
{
const struct k_clock *kc = clockid_to_kclock(which_clock);
struct timespec64 t;
@@ -1226,7 +1235,8 @@ SYSCALL_DEFINE4(clock_nanosleep, const clockid_t, which_clock, int, flags,
return kc->nsleep(which_clock, flags, &t);
}
-#ifdef CONFIG_COMPAT
+#ifdef CONFIG_COMPAT_32BIT_TIME
+
COMPAT_SYSCALL_DEFINE4(clock_nanosleep, clockid_t, which_clock, int, flags,
struct compat_timespec __user *, rqtp,
struct compat_timespec __user *, rmtp)
@@ -1251,6 +1261,7 @@ COMPAT_SYSCALL_DEFINE4(clock_nanosleep, clockid_t, which_clock, int, flags,
return kc->nsleep(which_clock, flags, &t);
}
+
#endif
static const struct k_clock clock_realtime = {
@@ -1346,11 +1357,15 @@ static const struct k_clock * const posix_clocks[] = {
static const struct k_clock *clockid_to_kclock(const clockid_t id)
{
- if (id < 0)
+ clockid_t idx = id;
+
+ if (id < 0) {
return (id & CLOCKFD_MASK) == CLOCKFD ?
&clock_posix_dynamic : &clock_posix_cpu;
+ }
- if (id >= ARRAY_SIZE(posix_clocks) || !posix_clocks[id])
+ if (id >= ARRAY_SIZE(posix_clocks))
return NULL;
- return posix_clocks[id];
+
+ return posix_clocks[array_index_nospec(idx, ARRAY_SIZE(posix_clocks))];
}
diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c
index b398c2ea69b2..aa2094d5dd27 100644
--- a/kernel/time/tick-broadcast.c
+++ b/kernel/time/tick-broadcast.c
@@ -612,6 +612,14 @@ static void tick_handle_oneshot_broadcast(struct clock_event_device *dev)
now = ktime_get();
/* Find all expired events */
for_each_cpu(cpu, tick_broadcast_oneshot_mask) {
+ /*
+ * Required for !SMP because for_each_cpu() reports
+ * unconditionally CPU0 as set on UP kernels.
+ */
+ if (!IS_ENABLED(CONFIG_SMP) &&
+ cpumask_empty(tick_broadcast_oneshot_mask))
+ break;
+
td = &per_cpu(tick_cpu_device, cpu);
if (td->evtdev->next_event <= now) {
cpumask_set_cpu(cpu, tmpmask);
diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c
index 49edc1c4f3e6..b7005dd21ec1 100644
--- a/kernel/time/tick-common.c
+++ b/kernel/time/tick-common.c
@@ -277,7 +277,8 @@ static bool tick_check_preferred(struct clock_event_device *curdev,
*/
return !curdev ||
newdev->rating > curdev->rating ||
- !cpumask_equal(curdev->cpumask, newdev->cpumask);
+ (!cpumask_equal(curdev->cpumask, newdev->cpumask) &&
+ !tick_check_percpu(curdev, newdev, smp_processor_id()));
}
/*
@@ -490,6 +491,7 @@ void tick_freeze(void)
if (tick_freeze_depth == num_online_cpus()) {
trace_suspend_resume(TPS("timekeeping_freeze"),
smp_processor_id(), true);
+ system_state = SYSTEM_SUSPEND;
timekeeping_suspend();
} else {
tick_suspend_local();
@@ -513,6 +515,7 @@ void tick_unfreeze(void)
if (tick_freeze_depth == num_online_cpus()) {
timekeeping_resume();
+ system_state = SYSTEM_RUNNING;
trace_suspend_resume(TPS("timekeeping_freeze"),
smp_processor_id(), false);
} else {
diff --git a/kernel/time/tick-oneshot.c b/kernel/time/tick-oneshot.c
index c1f518e7aa80..6fe615d57ebb 100644
--- a/kernel/time/tick-oneshot.c
+++ b/kernel/time/tick-oneshot.c
@@ -82,16 +82,15 @@ int tick_switch_to_oneshot(void (*handler)(struct clock_event_device *))
if (!dev || !(dev->features & CLOCK_EVT_FEAT_ONESHOT) ||
!tick_device_is_functional(dev)) {
- printk(KERN_INFO "Clockevents: "
- "could not switch to one-shot mode:");
+ pr_info("Clockevents: could not switch to one-shot mode:");
if (!dev) {
- printk(" no tick device\n");
+ pr_cont(" no tick device\n");
} else {
if (!tick_device_is_functional(dev))
- printk(" %s is not functional.\n", dev->name);
+ pr_cont(" %s is not functional.\n", dev->name);
else
- printk(" %s does not support one-shot mode.\n",
- dev->name);
+ pr_cont(" %s does not support one-shot mode.\n",
+ dev->name);
}
return -EINVAL;
}
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 29a5733eff83..da9455a6b42b 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -113,8 +113,7 @@ static ktime_t tick_init_jiffy_update(void)
return period;
}
-
-static void tick_sched_do_timer(ktime_t now)
+static void tick_sched_do_timer(struct tick_sched *ts, ktime_t now)
{
int cpu = smp_processor_id();
@@ -134,6 +133,9 @@ static void tick_sched_do_timer(ktime_t now)
/* Check, if the jiffies need an update */
if (tick_do_timer_cpu == cpu)
tick_do_update_jiffies64(now);
+
+ if (ts->inidle)
+ ts->got_idle_tick = 1;
}
static void tick_sched_handle(struct tick_sched *ts, struct pt_regs *regs)
@@ -405,30 +407,12 @@ static int tick_nohz_cpu_down(unsigned int cpu)
return 0;
}
-static int tick_nohz_init_all(void)
-{
- int err = -1;
-
-#ifdef CONFIG_NO_HZ_FULL_ALL
- if (!alloc_cpumask_var(&tick_nohz_full_mask, GFP_KERNEL)) {
- WARN(1, "NO_HZ: Can't allocate full dynticks cpumask\n");
- return err;
- }
- err = 0;
- cpumask_setall(tick_nohz_full_mask);
- tick_nohz_full_running = true;
-#endif
- return err;
-}
-
void __init tick_nohz_init(void)
{
int cpu, ret;
- if (!tick_nohz_full_running) {
- if (tick_nohz_init_all() < 0)
- return;
- }
+ if (!tick_nohz_full_running)
+ return;
/*
* Full dynticks uses irq work to drive the tick rescheduling on safe
@@ -481,9 +465,18 @@ static int __init setup_tick_nohz(char *str)
__setup("nohz=", setup_tick_nohz);
-int tick_nohz_tick_stopped(void)
+bool tick_nohz_tick_stopped(void)
{
- return __this_cpu_read(tick_cpu_sched.tick_stopped);
+ struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched);
+
+ return ts->tick_stopped;
+}
+
+bool tick_nohz_tick_stopped_cpu(int cpu)
+{
+ struct tick_sched *ts = per_cpu_ptr(&tick_cpu_sched, cpu);
+
+ return ts->tick_stopped;
}
/**
@@ -539,14 +532,11 @@ static void tick_nohz_stop_idle(struct tick_sched *ts, ktime_t now)
sched_clock_idle_wakeup_event();
}
-static ktime_t tick_nohz_start_idle(struct tick_sched *ts)
+static void tick_nohz_start_idle(struct tick_sched *ts)
{
- ktime_t now = ktime_get();
-
- ts->idle_entrytime = now;
+ ts->idle_entrytime = ktime_get();
ts->idle_active = 1;
sched_clock_idle_sleep_event();
- return now;
}
/**
@@ -655,13 +645,10 @@ static inline bool local_timer_softirq_pending(void)
return local_softirq_pending() & TIMER_SOFTIRQ;
}
-static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts,
- ktime_t now, int cpu)
+static ktime_t tick_nohz_next_event(struct tick_sched *ts, int cpu)
{
- struct clock_event_device *dev = __this_cpu_read(tick_cpu_device.evtdev);
u64 basemono, next_tick, next_tmr, next_rcu, delta, expires;
unsigned long seq, basejiff;
- ktime_t tick;
/* Read jiffies and the time when jiffies were updated last */
do {
@@ -670,6 +657,7 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts,
basejiff = jiffies;
} while (read_seqretry(&jiffies_lock, seq));
ts->last_jiffies = basejiff;
+ ts->timer_expires_base = basemono;
/*
* Keep the periodic tick, when RCU, architecture or irq_work
@@ -714,53 +702,63 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts,
* next period, so no point in stopping it either, bail.
*/
if (!ts->tick_stopped) {
- tick = 0;
+ ts->timer_expires = 0;
goto out;
}
}
/*
+ * If this CPU is the one which had the do_timer() duty last, we limit
+ * the sleep time to the timekeeping max_deferment value.
+ * Otherwise we can sleep as long as we want.
+ */
+ delta = timekeeping_max_deferment();
+ if (cpu != tick_do_timer_cpu &&
+ (tick_do_timer_cpu != TICK_DO_TIMER_NONE || !ts->do_timer_last))
+ delta = KTIME_MAX;
+
+ /* Calculate the next expiry time */
+ if (delta < (KTIME_MAX - basemono))
+ expires = basemono + delta;
+ else
+ expires = KTIME_MAX;
+
+ ts->timer_expires = min_t(u64, expires, next_tick);
+
+out:
+ return ts->timer_expires;
+}
+
+static void tick_nohz_stop_tick(struct tick_sched *ts, int cpu)
+{
+ struct clock_event_device *dev = __this_cpu_read(tick_cpu_device.evtdev);
+ u64 basemono = ts->timer_expires_base;
+ u64 expires = ts->timer_expires;
+ ktime_t tick = expires;
+
+ /* Make sure we won't be trying to stop it twice in a row. */
+ ts->timer_expires_base = 0;
+
+ /*
* If this CPU is the one which updates jiffies, then give up
* the assignment and let it be taken by the CPU which runs
* the tick timer next, which might be this CPU as well. If we
* don't drop this here the jiffies might be stale and
* do_timer() never invoked. Keep track of the fact that it
- * was the one which had the do_timer() duty last. If this CPU
- * is the one which had the do_timer() duty last, we limit the
- * sleep time to the timekeeping max_deferment value.
- * Otherwise we can sleep as long as we want.
+ * was the one which had the do_timer() duty last.
*/
- delta = timekeeping_max_deferment();
if (cpu == tick_do_timer_cpu) {
tick_do_timer_cpu = TICK_DO_TIMER_NONE;
ts->do_timer_last = 1;
} else if (tick_do_timer_cpu != TICK_DO_TIMER_NONE) {
- delta = KTIME_MAX;
ts->do_timer_last = 0;
- } else if (!ts->do_timer_last) {
- delta = KTIME_MAX;
}
-#ifdef CONFIG_NO_HZ_FULL
- /* Limit the tick delta to the maximum scheduler deferment */
- if (!ts->inidle)
- delta = min(delta, scheduler_tick_max_deferment());
-#endif
-
- /* Calculate the next expiry time */
- if (delta < (KTIME_MAX - basemono))
- expires = basemono + delta;
- else
- expires = KTIME_MAX;
-
- expires = min_t(u64, expires, next_tick);
- tick = expires;
-
/* Skip reprogram of event if its not changed */
if (ts->tick_stopped && (expires == ts->next_tick)) {
/* Sanity check: make sure clockevent is actually programmed */
if (tick == KTIME_MAX || ts->next_tick == hrtimer_get_expires(&ts->sched_timer))
- goto out;
+ return;
WARN_ON_ONCE(1);
printk_once("basemono: %llu ts->next_tick: %llu dev->next_event: %llu timer->active: %d timer->expires: %llu\n",
@@ -794,23 +792,31 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts,
if (unlikely(expires == KTIME_MAX)) {
if (ts->nohz_mode == NOHZ_MODE_HIGHRES)
hrtimer_cancel(&ts->sched_timer);
- goto out;
+ return;
}
- hrtimer_set_expires(&ts->sched_timer, tick);
+ if (ts->nohz_mode == NOHZ_MODE_HIGHRES) {
+ hrtimer_start(&ts->sched_timer, tick, HRTIMER_MODE_ABS_PINNED);
+ } else {
+ hrtimer_set_expires(&ts->sched_timer, tick);
+ tick_program_event(tick, 1);
+ }
+}
- if (ts->nohz_mode == NOHZ_MODE_HIGHRES)
- hrtimer_start_expires(&ts->sched_timer, HRTIMER_MODE_ABS_PINNED);
+static void tick_nohz_retain_tick(struct tick_sched *ts)
+{
+ ts->timer_expires_base = 0;
+}
+
+#ifdef CONFIG_NO_HZ_FULL
+static void tick_nohz_stop_sched_tick(struct tick_sched *ts, int cpu)
+{
+ if (tick_nohz_next_event(ts, cpu))
+ tick_nohz_stop_tick(ts, cpu);
else
- tick_program_event(tick, 1);
-out:
- /*
- * Update the estimated sleep length until the next timer
- * (not only the tick).
- */
- ts->sleep_length = ktime_sub(dev->next_event, now);
- return tick;
+ tick_nohz_retain_tick(ts);
}
+#endif /* CONFIG_NO_HZ_FULL */
static void tick_nohz_restart_sched_tick(struct tick_sched *ts, ktime_t now)
{
@@ -847,7 +853,7 @@ static void tick_nohz_full_update_tick(struct tick_sched *ts)
return;
if (can_stop_full_tick(cpu, ts))
- tick_nohz_stop_sched_tick(ts, ktime_get(), cpu);
+ tick_nohz_stop_sched_tick(ts, cpu);
else if (ts->tick_stopped)
tick_nohz_restart_sched_tick(ts, ktime_get());
#endif
@@ -873,10 +879,8 @@ static bool can_stop_idle_tick(int cpu, struct tick_sched *ts)
return false;
}
- if (unlikely(ts->nohz_mode == NOHZ_MODE_INACTIVE)) {
- ts->sleep_length = NSEC_PER_SEC / HZ;
+ if (unlikely(ts->nohz_mode == NOHZ_MODE_INACTIVE))
return false;
- }
if (need_resched())
return false;
@@ -911,61 +915,80 @@ static bool can_stop_idle_tick(int cpu, struct tick_sched *ts)
return true;
}
-static void __tick_nohz_idle_enter(struct tick_sched *ts)
+static void __tick_nohz_idle_stop_tick(struct tick_sched *ts)
{
- ktime_t now, expires;
+ ktime_t expires;
int cpu = smp_processor_id();
- now = tick_nohz_start_idle(ts);
+ /*
+ * If tick_nohz_get_sleep_length() ran tick_nohz_next_event(), the
+ * tick timer expiration time is known already.
+ */
+ if (ts->timer_expires_base)
+ expires = ts->timer_expires;
+ else if (can_stop_idle_tick(cpu, ts))
+ expires = tick_nohz_next_event(ts, cpu);
+ else
+ return;
- if (can_stop_idle_tick(cpu, ts)) {
+ ts->idle_calls++;
+
+ if (expires > 0LL) {
int was_stopped = ts->tick_stopped;
- ts->idle_calls++;
+ tick_nohz_stop_tick(ts, cpu);
- expires = tick_nohz_stop_sched_tick(ts, now, cpu);
- if (expires > 0LL) {
- ts->idle_sleeps++;
- ts->idle_expires = expires;
- }
+ ts->idle_sleeps++;
+ ts->idle_expires = expires;
if (!was_stopped && ts->tick_stopped) {
ts->idle_jiffies = ts->last_jiffies;
nohz_balance_enter_idle(cpu);
}
+ } else {
+ tick_nohz_retain_tick(ts);
}
}
/**
- * tick_nohz_idle_enter - stop the idle tick from the idle task
+ * tick_nohz_idle_stop_tick - stop the idle tick from the idle task
*
* When the next event is more than a tick into the future, stop the idle tick
- * Called when we start the idle loop.
- *
- * The arch is responsible of calling:
+ */
+void tick_nohz_idle_stop_tick(void)
+{
+ __tick_nohz_idle_stop_tick(this_cpu_ptr(&tick_cpu_sched));
+}
+
+void tick_nohz_idle_retain_tick(void)
+{
+ tick_nohz_retain_tick(this_cpu_ptr(&tick_cpu_sched));
+ /*
+ * Undo the effect of get_next_timer_interrupt() called from
+ * tick_nohz_next_event().
+ */
+ timer_clear_idle();
+}
+
+/**
+ * tick_nohz_idle_enter - prepare for entering idle on the current CPU
*
- * - rcu_idle_enter() after its last use of RCU before the CPU is put
- * to sleep.
- * - rcu_idle_exit() before the first use of RCU after the CPU is woken up.
+ * Called when we start the idle loop.
*/
void tick_nohz_idle_enter(void)
{
struct tick_sched *ts;
lockdep_assert_irqs_enabled();
- /*
- * Update the idle state in the scheduler domain hierarchy
- * when tick_nohz_stop_sched_tick() is called from the idle loop.
- * State will be updated to busy during the first busy tick after
- * exiting idle.
- */
- set_cpu_sd_state_idle();
local_irq_disable();
ts = this_cpu_ptr(&tick_cpu_sched);
+
+ WARN_ON_ONCE(ts->timer_expires_base);
+
ts->inidle = 1;
- __tick_nohz_idle_enter(ts);
+ tick_nohz_start_idle(ts);
local_irq_enable();
}
@@ -983,21 +1006,62 @@ void tick_nohz_irq_exit(void)
struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched);
if (ts->inidle)
- __tick_nohz_idle_enter(ts);
+ tick_nohz_start_idle(ts);
else
tick_nohz_full_update_tick(ts);
}
/**
- * tick_nohz_get_sleep_length - return the length of the current sleep
+ * tick_nohz_idle_got_tick - Check whether or not the tick handler has run
+ */
+bool tick_nohz_idle_got_tick(void)
+{
+ struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched);
+
+ if (ts->got_idle_tick) {
+ ts->got_idle_tick = 0;
+ return true;
+ }
+ return false;
+}
+
+/**
+ * tick_nohz_get_sleep_length - return the expected length of the current sleep
+ * @delta_next: duration until the next event if the tick cannot be stopped
*
* Called from power state control code with interrupts disabled
*/
-ktime_t tick_nohz_get_sleep_length(void)
+ktime_t tick_nohz_get_sleep_length(ktime_t *delta_next)
{
+ struct clock_event_device *dev = __this_cpu_read(tick_cpu_device.evtdev);
struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched);
+ int cpu = smp_processor_id();
+ /*
+ * The idle entry time is expected to be a sufficient approximation of
+ * the current time at this point.
+ */
+ ktime_t now = ts->idle_entrytime;
+ ktime_t next_event;
+
+ WARN_ON_ONCE(!ts->inidle);
+
+ *delta_next = ktime_sub(dev->next_event, now);
- return ts->sleep_length;
+ if (!can_stop_idle_tick(cpu, ts))
+ return *delta_next;
+
+ next_event = tick_nohz_next_event(ts, cpu);
+ if (!next_event)
+ return *delta_next;
+
+ /*
+ * If the next highres timer to expire is earlier than next_event, the
+ * idle governor needs to know that.
+ */
+ next_event = min_t(u64, next_event,
+ hrtimer_next_event_without(&ts->sched_timer));
+
+ return ktime_sub(next_event, now);
}
/**
@@ -1046,6 +1110,20 @@ static void tick_nohz_account_idle_ticks(struct tick_sched *ts)
#endif
}
+static void __tick_nohz_idle_restart_tick(struct tick_sched *ts, ktime_t now)
+{
+ tick_nohz_restart_sched_tick(ts, now);
+ tick_nohz_account_idle_ticks(ts);
+}
+
+void tick_nohz_idle_restart_tick(void)
+{
+ struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched);
+
+ if (ts->tick_stopped)
+ __tick_nohz_idle_restart_tick(ts, ktime_get());
+}
+
/**
* tick_nohz_idle_exit - restart the idle tick from the idle task
*
@@ -1056,24 +1134,26 @@ static void tick_nohz_account_idle_ticks(struct tick_sched *ts)
void tick_nohz_idle_exit(void)
{
struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched);
+ bool idle_active, tick_stopped;
ktime_t now;
local_irq_disable();
WARN_ON_ONCE(!ts->inidle);
+ WARN_ON_ONCE(ts->timer_expires_base);
ts->inidle = 0;
+ idle_active = ts->idle_active;
+ tick_stopped = ts->tick_stopped;
- if (ts->idle_active || ts->tick_stopped)
+ if (idle_active || tick_stopped)
now = ktime_get();
- if (ts->idle_active)
+ if (idle_active)
tick_nohz_stop_idle(ts, now);
- if (ts->tick_stopped) {
- tick_nohz_restart_sched_tick(ts, now);
- tick_nohz_account_idle_ticks(ts);
- }
+ if (tick_stopped)
+ __tick_nohz_idle_restart_tick(ts, now);
local_irq_enable();
}
@@ -1089,7 +1169,7 @@ static void tick_nohz_handler(struct clock_event_device *dev)
dev->next_event = KTIME_MAX;
- tick_sched_do_timer(now);
+ tick_sched_do_timer(ts, now);
tick_sched_handle(ts, regs);
/* No need to reprogram if we are running tickless */
@@ -1184,7 +1264,7 @@ static enum hrtimer_restart tick_sched_timer(struct hrtimer *timer)
struct pt_regs *regs = get_irq_regs();
ktime_t now = ktime_get();
- tick_sched_do_timer(now);
+ tick_sched_do_timer(ts, now);
/*
* Do not call, when we are not in irq context and have
diff --git a/kernel/time/tick-sched.h b/kernel/time/tick-sched.h
index 954b43dbf21c..6de959a854b2 100644
--- a/kernel/time/tick-sched.h
+++ b/kernel/time/tick-sched.h
@@ -38,31 +38,37 @@ enum tick_nohz_mode {
* @idle_exittime: Time when the idle state was left
* @idle_sleeptime: Sum of the time slept in idle with sched tick stopped
* @iowait_sleeptime: Sum of the time slept in idle with sched tick stopped, with IO outstanding
- * @sleep_length: Duration of the current idle sleep
+ * @timer_expires: Anticipated timer expiration time (in case sched tick is stopped)
+ * @timer_expires_base: Base time clock monotonic for @timer_expires
* @do_timer_lst: CPU was the last one doing do_timer before going idle
+ * @got_idle_tick: Tick timer function has run with @inidle set
*/
struct tick_sched {
struct hrtimer sched_timer;
unsigned long check_clocks;
enum tick_nohz_mode nohz_mode;
+
+ unsigned int inidle : 1;
+ unsigned int tick_stopped : 1;
+ unsigned int idle_active : 1;
+ unsigned int do_timer_last : 1;
+ unsigned int got_idle_tick : 1;
+
ktime_t last_tick;
ktime_t next_tick;
- int inidle;
- int tick_stopped;
unsigned long idle_jiffies;
unsigned long idle_calls;
unsigned long idle_sleeps;
- int idle_active;
ktime_t idle_entrytime;
ktime_t idle_waketime;
ktime_t idle_exittime;
ktime_t idle_sleeptime;
ktime_t iowait_sleeptime;
- ktime_t sleep_length;
unsigned long last_jiffies;
+ u64 timer_expires;
+ u64 timer_expires_base;
u64 next_timer;
ktime_t idle_expires;
- int do_timer_last;
atomic_t tick_dep_mask;
};
diff --git a/kernel/time/time.c b/kernel/time/time.c
index bd4e6c7dd689..6fa99213fc72 100644
--- a/kernel/time/time.c
+++ b/kernel/time/time.c
@@ -407,7 +407,6 @@ time64_t mktime64(const unsigned int year0, const unsigned int mon0,
}
EXPORT_SYMBOL(mktime64);
-#if __BITS_PER_LONG == 32
/**
* set_normalized_timespec - set timespec sec and nsec parts and normalize
*
@@ -468,7 +467,6 @@ struct timespec ns_to_timespec(const s64 nsec)
return ts;
}
EXPORT_SYMBOL(ns_to_timespec);
-#endif
/**
* ns_to_timeval - Convert nanoseconds to timeval
@@ -488,6 +486,18 @@ struct timeval ns_to_timeval(const s64 nsec)
}
EXPORT_SYMBOL(ns_to_timeval);
+struct __kernel_old_timeval ns_to_kernel_old_timeval(const s64 nsec)
+{
+ struct timespec64 ts = ns_to_timespec64(nsec);
+ struct __kernel_old_timeval tv;
+
+ tv.tv_sec = ts.tv_sec;
+ tv.tv_usec = (suseconds_t)ts.tv_nsec / 1000;
+
+ return tv;
+}
+EXPORT_SYMBOL(ns_to_kernel_old_timeval);
+
/**
* set_normalized_timespec - set timespec sec and nsec parts and normalize
*
@@ -841,9 +851,9 @@ struct timespec64 timespec64_add_safe(const struct timespec64 lhs,
}
int get_timespec64(struct timespec64 *ts,
- const struct timespec __user *uts)
+ const struct __kernel_timespec __user *uts)
{
- struct timespec kts;
+ struct __kernel_timespec kts;
int ret;
ret = copy_from_user(&kts, uts, sizeof(kts));
@@ -851,6 +861,11 @@ int get_timespec64(struct timespec64 *ts,
return -EFAULT;
ts->tv_sec = kts.tv_sec;
+
+ /* Zero out the padding for 32 bit systems or in compat mode */
+ if (IS_ENABLED(CONFIG_64BIT_TIME) && (!IS_ENABLED(CONFIG_64BIT) || in_compat_syscall()))
+ kts.tv_nsec &= 0xFFFFFFFFUL;
+
ts->tv_nsec = kts.tv_nsec;
return 0;
@@ -858,16 +873,61 @@ int get_timespec64(struct timespec64 *ts,
EXPORT_SYMBOL_GPL(get_timespec64);
int put_timespec64(const struct timespec64 *ts,
- struct timespec __user *uts)
+ struct __kernel_timespec __user *uts)
{
- struct timespec kts = {
+ struct __kernel_timespec kts = {
.tv_sec = ts->tv_sec,
.tv_nsec = ts->tv_nsec
};
+
return copy_to_user(uts, &kts, sizeof(kts)) ? -EFAULT : 0;
}
EXPORT_SYMBOL_GPL(put_timespec64);
+int __compat_get_timespec64(struct timespec64 *ts64,
+ const struct compat_timespec __user *cts)
+{
+ struct compat_timespec ts;
+ int ret;
+
+ ret = copy_from_user(&ts, cts, sizeof(ts));
+ if (ret)
+ return -EFAULT;
+
+ ts64->tv_sec = ts.tv_sec;
+ ts64->tv_nsec = ts.tv_nsec;
+
+ return 0;
+}
+
+int __compat_put_timespec64(const struct timespec64 *ts64,
+ struct compat_timespec __user *cts)
+{
+ struct compat_timespec ts = {
+ .tv_sec = ts64->tv_sec,
+ .tv_nsec = ts64->tv_nsec
+ };
+ return copy_to_user(cts, &ts, sizeof(ts)) ? -EFAULT : 0;
+}
+
+int compat_get_timespec64(struct timespec64 *ts, const void __user *uts)
+{
+ if (COMPAT_USE_64BIT_TIME)
+ return copy_from_user(ts, uts, sizeof(*ts)) ? -EFAULT : 0;
+ else
+ return __compat_get_timespec64(ts, uts);
+}
+EXPORT_SYMBOL_GPL(compat_get_timespec64);
+
+int compat_put_timespec64(const struct timespec64 *ts, void __user *uts)
+{
+ if (COMPAT_USE_64BIT_TIME)
+ return copy_to_user(uts, ts, sizeof(*ts)) ? -EFAULT : 0;
+ else
+ return __compat_put_timespec64(ts, uts);
+}
+EXPORT_SYMBOL_GPL(compat_put_timespec64);
+
int get_itimerspec64(struct itimerspec64 *it,
const struct itimerspec __user *uit)
{
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index cd03317e7b57..4786df904c22 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -332,6 +332,7 @@ static void tk_setup_internals(struct timekeeper *tk, struct clocksource *clock)
tk->tkr_mono.mult = clock->mult;
tk->tkr_raw.mult = clock->mult;
tk->ntp_err_mult = 0;
+ tk->skip_second_overflow = 0;
}
/* Timekeeper helper functions. */
@@ -704,18 +705,19 @@ static void timekeeping_forward_now(struct timekeeper *tk)
}
/**
- * __getnstimeofday64 - Returns the time of day in a timespec64.
+ * ktime_get_real_ts64 - Returns the time of day in a timespec64.
* @ts: pointer to the timespec to be set
*
- * Updates the time of day in the timespec.
- * Returns 0 on success, or -ve when suspended (timespec will be undefined).
+ * Returns the time of day in a timespec64 (WARN if suspended).
*/
-int __getnstimeofday64(struct timespec64 *ts)
+void ktime_get_real_ts64(struct timespec64 *ts)
{
struct timekeeper *tk = &tk_core.timekeeper;
unsigned long seq;
u64 nsecs;
+ WARN_ON(timekeeping_suspended);
+
do {
seq = read_seqcount_begin(&tk_core.seq);
@@ -726,28 +728,8 @@ int __getnstimeofday64(struct timespec64 *ts)
ts->tv_nsec = 0;
timespec64_add_ns(ts, nsecs);
-
- /*
- * Do not bail out early, in case there were callers still using
- * the value, even in the face of the WARN_ON.
- */
- if (unlikely(timekeeping_suspended))
- return -EAGAIN;
- return 0;
-}
-EXPORT_SYMBOL(__getnstimeofday64);
-
-/**
- * getnstimeofday64 - Returns the time of day in a timespec64.
- * @ts: pointer to the timespec64 to be set
- *
- * Returns the time of day in a timespec64 (WARN if suspended).
- */
-void getnstimeofday64(struct timespec64 *ts)
-{
- WARN_ON(__getnstimeofday64(ts));
}
-EXPORT_SYMBOL(getnstimeofday64);
+EXPORT_SYMBOL(ktime_get_real_ts64);
ktime_t ktime_get(void)
{
@@ -813,6 +795,25 @@ ktime_t ktime_get_with_offset(enum tk_offsets offs)
}
EXPORT_SYMBOL_GPL(ktime_get_with_offset);
+ktime_t ktime_get_coarse_with_offset(enum tk_offsets offs)
+{
+ struct timekeeper *tk = &tk_core.timekeeper;
+ unsigned int seq;
+ ktime_t base, *offset = offsets[offs];
+
+ WARN_ON(timekeeping_suspended);
+
+ do {
+ seq = read_seqcount_begin(&tk_core.seq);
+ base = ktime_add(tk->tkr_mono.base, *offset);
+
+ } while (read_seqcount_retry(&tk_core.seq, seq));
+
+ return base;
+
+}
+EXPORT_SYMBOL_GPL(ktime_get_coarse_with_offset);
+
/**
* ktime_mono_to_any() - convert mononotic time to any other time
* @tmono: time to convert.
@@ -1409,12 +1410,12 @@ int timekeeping_notify(struct clocksource *clock)
}
/**
- * getrawmonotonic64 - Returns the raw monotonic time in a timespec
+ * ktime_get_raw_ts64 - Returns the raw monotonic time in a timespec
* @ts: pointer to the timespec64 to be set
*
* Returns the raw monotonic time (completely un-modified by ntp)
*/
-void getrawmonotonic64(struct timespec64 *ts)
+void ktime_get_raw_ts64(struct timespec64 *ts)
{
struct timekeeper *tk = &tk_core.timekeeper;
unsigned long seq;
@@ -1430,7 +1431,7 @@ void getrawmonotonic64(struct timespec64 *ts)
ts->tv_nsec = 0;
timespec64_add_ns(ts, nsecs);
}
-EXPORT_SYMBOL(getrawmonotonic64);
+EXPORT_SYMBOL(ktime_get_raw_ts64);
/**
@@ -1799,20 +1800,19 @@ device_initcall(timekeeping_init_ops);
*/
static __always_inline void timekeeping_apply_adjustment(struct timekeeper *tk,
s64 offset,
- bool negative,
- int adj_scale)
+ s32 mult_adj)
{
s64 interval = tk->cycle_interval;
- s32 mult_adj = 1;
- if (negative) {
- mult_adj = -mult_adj;
+ if (mult_adj == 0) {
+ return;
+ } else if (mult_adj == -1) {
interval = -interval;
- offset = -offset;
+ offset = -offset;
+ } else if (mult_adj != 1) {
+ interval *= mult_adj;
+ offset *= mult_adj;
}
- mult_adj <<= adj_scale;
- interval <<= adj_scale;
- offset <<= adj_scale;
/*
* So the following can be confusing.
@@ -1860,8 +1860,6 @@ static __always_inline void timekeeping_apply_adjustment(struct timekeeper *tk,
* xtime_nsec_2 = xtime_nsec_1 - offset
* Which simplfies to:
* xtime_nsec -= offset
- *
- * XXX - TODO: Doc ntp_error calculation.
*/
if ((mult_adj > 0) && (tk->tkr_mono.mult + mult_adj < mult_adj)) {
/* NTP adjustment caused clocksource mult overflow */
@@ -1872,89 +1870,38 @@ static __always_inline void timekeeping_apply_adjustment(struct timekeeper *tk,
tk->tkr_mono.mult += mult_adj;
tk->xtime_interval += interval;
tk->tkr_mono.xtime_nsec -= offset;
- tk->ntp_error -= (interval - offset) << tk->ntp_error_shift;
}
/*
- * Calculate the multiplier adjustment needed to match the frequency
- * specified by NTP
+ * Adjust the timekeeper's multiplier to the correct frequency
+ * and also to reduce the accumulated error value.
*/
-static __always_inline void timekeeping_freqadjust(struct timekeeper *tk,
- s64 offset)
+static void timekeeping_adjust(struct timekeeper *tk, s64 offset)
{
- s64 interval = tk->cycle_interval;
- s64 xinterval = tk->xtime_interval;
- u32 base = tk->tkr_mono.clock->mult;
- u32 max = tk->tkr_mono.clock->maxadj;
- u32 cur_adj = tk->tkr_mono.mult;
- s64 tick_error;
- bool negative;
- u32 adj_scale;
-
- /* Remove any current error adj from freq calculation */
- if (tk->ntp_err_mult)
- xinterval -= tk->cycle_interval;
-
- tk->ntp_tick = ntp_tick_length();
-
- /* Calculate current error per tick */
- tick_error = ntp_tick_length() >> tk->ntp_error_shift;
- tick_error -= (xinterval + tk->xtime_remainder);
-
- /* Don't worry about correcting it if its small */
- if (likely((tick_error >= 0) && (tick_error <= interval)))
- return;
+ u32 mult;
- /* preserve the direction of correction */
- negative = (tick_error < 0);
-
- /* If any adjustment would pass the max, just return */
- if (negative && (cur_adj - 1) <= (base - max))
- return;
- if (!negative && (cur_adj + 1) >= (base + max))
- return;
/*
- * Sort out the magnitude of the correction, but
- * avoid making so large a correction that we go
- * over the max adjustment.
+ * Determine the multiplier from the current NTP tick length.
+ * Avoid expensive division when the tick length doesn't change.
*/
- adj_scale = 0;
- tick_error = abs(tick_error);
- while (tick_error > interval) {
- u32 adj = 1 << (adj_scale + 1);
-
- /* Check if adjustment gets us within 1 unit from the max */
- if (negative && (cur_adj - adj) <= (base - max))
- break;
- if (!negative && (cur_adj + adj) >= (base + max))
- break;
-
- adj_scale++;
- tick_error >>= 1;
+ if (likely(tk->ntp_tick == ntp_tick_length())) {
+ mult = tk->tkr_mono.mult - tk->ntp_err_mult;
+ } else {
+ tk->ntp_tick = ntp_tick_length();
+ mult = div64_u64((tk->ntp_tick >> tk->ntp_error_shift) -
+ tk->xtime_remainder, tk->cycle_interval);
}
- /* scale the corrections */
- timekeeping_apply_adjustment(tk, offset, negative, adj_scale);
-}
+ /*
+ * If the clock is behind the NTP time, increase the multiplier by 1
+ * to catch up with it. If it's ahead and there was a remainder in the
+ * tick division, the clock will slow down. Otherwise it will stay
+ * ahead until the tick length changes to a non-divisible value.
+ */
+ tk->ntp_err_mult = tk->ntp_error > 0 ? 1 : 0;
+ mult += tk->ntp_err_mult;
-/*
- * Adjust the timekeeper's multiplier to the correct frequency
- * and also to reduce the accumulated error value.
- */
-static void timekeeping_adjust(struct timekeeper *tk, s64 offset)
-{
- /* Correct for the current frequency error */
- timekeeping_freqadjust(tk, offset);
-
- /* Next make a small adjustment to fix any cumulative error */
- if (!tk->ntp_err_mult && (tk->ntp_error > 0)) {
- tk->ntp_err_mult = 1;
- timekeeping_apply_adjustment(tk, offset, 0, 0);
- } else if (tk->ntp_err_mult && (tk->ntp_error <= 0)) {
- /* Undo any existing error adjustment */
- timekeeping_apply_adjustment(tk, offset, 1, 0);
- tk->ntp_err_mult = 0;
- }
+ timekeeping_apply_adjustment(tk, offset, mult - tk->tkr_mono.mult);
if (unlikely(tk->tkr_mono.clock->maxadj &&
(abs(tk->tkr_mono.mult - tk->tkr_mono.clock->mult)
@@ -1971,18 +1918,15 @@ static void timekeeping_adjust(struct timekeeper *tk, s64 offset)
* in the code above, its possible the required corrective factor to
* xtime_nsec could cause it to underflow.
*
- * Now, since we already accumulated the second, cannot simply roll
- * the accumulated second back, since the NTP subsystem has been
- * notified via second_overflow. So instead we push xtime_nsec forward
- * by the amount we underflowed, and add that amount into the error.
- *
- * We'll correct this error next time through this function, when
- * xtime_nsec is not as small.
+ * Now, since we have already accumulated the second and the NTP
+ * subsystem has been notified via second_overflow(), we need to skip
+ * the next update.
*/
if (unlikely((s64)tk->tkr_mono.xtime_nsec < 0)) {
- s64 neg = -(s64)tk->tkr_mono.xtime_nsec;
- tk->tkr_mono.xtime_nsec = 0;
- tk->ntp_error += neg << tk->ntp_error_shift;
+ tk->tkr_mono.xtime_nsec += (u64)NSEC_PER_SEC <<
+ tk->tkr_mono.shift;
+ tk->xtime_sec--;
+ tk->skip_second_overflow = 1;
}
}
@@ -2005,6 +1949,15 @@ static inline unsigned int accumulate_nsecs_to_secs(struct timekeeper *tk)
tk->tkr_mono.xtime_nsec -= nsecps;
tk->xtime_sec++;
+ /*
+ * Skip NTP update if this second was accumulated before,
+ * i.e. xtime_nsec underflowed in timekeeping_adjust()
+ */
+ if (unlikely(tk->skip_second_overflow)) {
+ tk->skip_second_overflow = 0;
+ continue;
+ }
+
/* Figure out if its a leap sec and apply if needed */
leap = second_overflow(tk->xtime_sec);
if (unlikely(leap)) {
@@ -2121,7 +2074,7 @@ void update_wall_time(void)
shift--;
}
- /* correct the clock when NTP error is too big */
+ /* Adjust the multiplier to correct NTP error */
timekeeping_adjust(tk, offset);
/*
@@ -2180,30 +2133,20 @@ unsigned long get_seconds(void)
}
EXPORT_SYMBOL(get_seconds);
-struct timespec __current_kernel_time(void)
+void ktime_get_coarse_real_ts64(struct timespec64 *ts)
{
struct timekeeper *tk = &tk_core.timekeeper;
-
- return timespec64_to_timespec(tk_xtime(tk));
-}
-
-struct timespec64 current_kernel_time64(void)
-{
- struct timekeeper *tk = &tk_core.timekeeper;
- struct timespec64 now;
unsigned long seq;
do {
seq = read_seqcount_begin(&tk_core.seq);
- now = tk_xtime(tk);
+ *ts = tk_xtime(tk);
} while (read_seqcount_retry(&tk_core.seq, seq));
-
- return now;
}
-EXPORT_SYMBOL(current_kernel_time64);
+EXPORT_SYMBOL(ktime_get_coarse_real_ts64);
-struct timespec64 get_monotonic_coarse64(void)
+void ktime_get_coarse_ts64(struct timespec64 *ts)
{
struct timekeeper *tk = &tk_core.timekeeper;
struct timespec64 now, mono;
@@ -2216,12 +2159,10 @@ struct timespec64 get_monotonic_coarse64(void)
mono = tk->wall_to_monotonic;
} while (read_seqcount_retry(&tk_core.seq, seq));
- set_normalized_timespec64(&now, now.tv_sec + mono.tv_sec,
+ set_normalized_timespec64(ts, now.tv_sec + mono.tv_sec,
now.tv_nsec + mono.tv_nsec);
-
- return now;
}
-EXPORT_SYMBOL(get_monotonic_coarse64);
+EXPORT_SYMBOL(ktime_get_coarse_ts64);
/*
* Must hold jiffies_lock
diff --git a/kernel/time/timekeeping_internal.h b/kernel/time/timekeeping_internal.h
index fdbeeb02dde9..cf5c0828ee31 100644
--- a/kernel/time/timekeeping_internal.h
+++ b/kernel/time/timekeeping_internal.h
@@ -31,6 +31,4 @@ static inline u64 clocksource_delta(u64 now, u64 last, u64 mask)
}
#endif
-extern time64_t __ktime_get_real_seconds(void);
-
#endif /* _TIMEKEEPING_INTERNAL_H */
diff --git a/kernel/time/timer.c b/kernel/time/timer.c
index 4a4fd567fb26..cc2d23e6ff61 100644
--- a/kernel/time/timer.c
+++ b/kernel/time/timer.c
@@ -1251,18 +1251,18 @@ EXPORT_SYMBOL(try_to_del_timer_sync);
*
* Note: For !irqsafe timers, you must not hold locks that are held in
* interrupt context while calling this function. Even if the lock has
- * nothing to do with the timer in question. Here's why:
+ * nothing to do with the timer in question. Here's why::
*
* CPU0 CPU1
* ---- ----
- * <SOFTIRQ>
- * call_timer_fn();
- * base->running_timer = mytimer;
- * spin_lock_irq(somelock);
+ * <SOFTIRQ>
+ * call_timer_fn();
+ * base->running_timer = mytimer;
+ * spin_lock_irq(somelock);
* <IRQ>
* spin_lock(somelock);
- * del_timer_sync(mytimer);
- * while (base->running_timer == mytimer);
+ * del_timer_sync(mytimer);
+ * while (base->running_timer == mytimer);
*
* Now del_timer_sync() will never return and never release somelock.
* The interrupt on the other CPU is waiting to grab somelock but
diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c
index 0ed768b56c60..d647dabdac97 100644
--- a/kernel/time/timer_list.c
+++ b/kernel/time/timer_list.c
@@ -28,8 +28,6 @@ struct timer_list_iter {
u64 now;
};
-typedef void (*print_fn_t)(struct seq_file *m, unsigned int *classes);
-
/*
* This allows printing both to /proc/timer_list and
* to the console (on SysRq-Q):
@@ -372,24 +370,12 @@ static const struct seq_operations timer_list_sops = {
.show = timer_list_show,
};
-static int timer_list_open(struct inode *inode, struct file *filp)
-{
- return seq_open_private(filp, &timer_list_sops,
- sizeof(struct timer_list_iter));
-}
-
-static const struct file_operations timer_list_fops = {
- .open = timer_list_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = seq_release_private,
-};
-
static int __init init_timer_list_procfs(void)
{
struct proc_dir_entry *pe;
- pe = proc_create("timer_list", 0400, NULL, &timer_list_fops);
+ pe = proc_create_seq_private("timer_list", 0400, NULL, &timer_list_sops,
+ sizeof(struct timer_list_iter), NULL);
if (!pe)
return -ENOMEM;
return 0;
diff --git a/kernel/torture.c b/kernel/torture.c
index 37b94012a3f8..3de1efbecd6a 100644
--- a/kernel/torture.c
+++ b/kernel/torture.c
@@ -574,7 +574,7 @@ void stutter_wait(const char *title)
{
int spt;
- cond_resched_rcu_qs();
+ cond_resched_tasks_rcu_qs();
spt = READ_ONCE(stutter_pause_test);
for (; spt; spt = READ_ONCE(stutter_pause_test)) {
if (spt == 1) {
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index 0b249e2f0c3c..c4f0f2e4126e 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -606,7 +606,10 @@ config HIST_TRIGGERS
event activity as an initial guide for further investigation
using more advanced tools.
- See Documentation/trace/events.txt.
+ Inter-event tracing of quantities such as latencies is also
+ supported using hist triggers under this option.
+
+ See Documentation/trace/histogram.txt.
If in doubt, say N.
config MMIOTRACE_TEST
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index c0a9e310d715..0ae6829804bc 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -14,12 +14,14 @@
#include <linux/uaccess.h>
#include <linux/ctype.h>
#include <linux/kprobes.h>
+#include <linux/syscalls.h>
#include <linux/error-injection.h>
#include "trace_probe.h"
#include "trace.h"
u64 bpf_get_stackid(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5);
+u64 bpf_get_stack(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5);
/**
* trace_call_bpf - invoke BPF program
@@ -474,8 +476,6 @@ BPF_CALL_2(bpf_current_task_under_cgroup, struct bpf_map *, map, u32, idx)
struct bpf_array *array = container_of(map, struct bpf_array, map);
struct cgroup *cgrp;
- if (unlikely(in_interrupt()))
- return -EINVAL;
if (unlikely(idx >= array->map.max_entries))
return -E2BIG;
@@ -524,7 +524,8 @@ static const struct bpf_func_proto bpf_probe_read_str_proto = {
.arg3_type = ARG_ANYTHING,
};
-static const struct bpf_func_proto *tracing_func_proto(enum bpf_func_id func_id)
+static const struct bpf_func_proto *
+tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
{
switch (func_id) {
case BPF_FUNC_map_lookup_elem:
@@ -563,18 +564,25 @@ static const struct bpf_func_proto *tracing_func_proto(enum bpf_func_id func_id)
return &bpf_get_prandom_u32_proto;
case BPF_FUNC_probe_read_str:
return &bpf_probe_read_str_proto;
+#ifdef CONFIG_CGROUPS
+ case BPF_FUNC_get_current_cgroup_id:
+ return &bpf_get_current_cgroup_id_proto;
+#endif
default:
return NULL;
}
}
-static const struct bpf_func_proto *kprobe_prog_func_proto(enum bpf_func_id func_id)
+static const struct bpf_func_proto *
+kprobe_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
{
switch (func_id) {
case BPF_FUNC_perf_event_output:
return &bpf_perf_event_output_proto;
case BPF_FUNC_get_stackid:
return &bpf_get_stackid_proto;
+ case BPF_FUNC_get_stack:
+ return &bpf_get_stack_proto;
case BPF_FUNC_perf_event_read_value:
return &bpf_perf_event_read_value_proto;
#ifdef CONFIG_BPF_KPROBE_OVERRIDE
@@ -582,12 +590,13 @@ static const struct bpf_func_proto *kprobe_prog_func_proto(enum bpf_func_id func
return &bpf_override_return_proto;
#endif
default:
- return tracing_func_proto(func_id);
+ return tracing_func_proto(func_id, prog);
}
}
/* bpf+kprobe programs can access fields of 'struct pt_regs' */
static bool kprobe_prog_is_valid_access(int off, int size, enum bpf_access_type type,
+ const struct bpf_prog *prog,
struct bpf_insn_access_aux *info)
{
if (off < 0 || off >= sizeof(struct pt_regs))
@@ -661,7 +670,64 @@ static const struct bpf_func_proto bpf_get_stackid_proto_tp = {
.arg3_type = ARG_ANYTHING,
};
-BPF_CALL_3(bpf_perf_prog_read_value_tp, struct bpf_perf_event_data_kern *, ctx,
+BPF_CALL_4(bpf_get_stack_tp, void *, tp_buff, void *, buf, u32, size,
+ u64, flags)
+{
+ struct pt_regs *regs = *(struct pt_regs **)tp_buff;
+
+ return bpf_get_stack((unsigned long) regs, (unsigned long) buf,
+ (unsigned long) size, flags, 0);
+}
+
+static const struct bpf_func_proto bpf_get_stack_proto_tp = {
+ .func = bpf_get_stack_tp,
+ .gpl_only = true,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_PTR_TO_UNINIT_MEM,
+ .arg3_type = ARG_CONST_SIZE_OR_ZERO,
+ .arg4_type = ARG_ANYTHING,
+};
+
+static const struct bpf_func_proto *
+tp_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
+{
+ switch (func_id) {
+ case BPF_FUNC_perf_event_output:
+ return &bpf_perf_event_output_proto_tp;
+ case BPF_FUNC_get_stackid:
+ return &bpf_get_stackid_proto_tp;
+ case BPF_FUNC_get_stack:
+ return &bpf_get_stack_proto_tp;
+ default:
+ return tracing_func_proto(func_id, prog);
+ }
+}
+
+static bool tp_prog_is_valid_access(int off, int size, enum bpf_access_type type,
+ const struct bpf_prog *prog,
+ struct bpf_insn_access_aux *info)
+{
+ if (off < sizeof(void *) || off >= PERF_MAX_TRACE_SIZE)
+ return false;
+ if (type != BPF_READ)
+ return false;
+ if (off % size != 0)
+ return false;
+
+ BUILD_BUG_ON(PERF_MAX_TRACE_SIZE % sizeof(__u64));
+ return true;
+}
+
+const struct bpf_verifier_ops tracepoint_verifier_ops = {
+ .get_func_proto = tp_prog_func_proto,
+ .is_valid_access = tp_prog_is_valid_access,
+};
+
+const struct bpf_prog_ops tracepoint_prog_ops = {
+};
+
+BPF_CALL_3(bpf_perf_prog_read_value, struct bpf_perf_event_data_kern *, ctx,
struct bpf_perf_event_value *, buf, u32, size)
{
int err = -EINVAL;
@@ -678,8 +744,8 @@ clear:
return err;
}
-static const struct bpf_func_proto bpf_perf_prog_read_value_proto_tp = {
- .func = bpf_perf_prog_read_value_tp,
+static const struct bpf_func_proto bpf_perf_prog_read_value_proto = {
+ .func = bpf_perf_prog_read_value,
.gpl_only = true,
.ret_type = RET_INTEGER,
.arg1_type = ARG_PTR_TO_CTX,
@@ -687,59 +753,155 @@ static const struct bpf_func_proto bpf_perf_prog_read_value_proto_tp = {
.arg3_type = ARG_CONST_SIZE,
};
-static const struct bpf_func_proto *tp_prog_func_proto(enum bpf_func_id func_id)
+static const struct bpf_func_proto *
+pe_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
{
switch (func_id) {
case BPF_FUNC_perf_event_output:
return &bpf_perf_event_output_proto_tp;
case BPF_FUNC_get_stackid:
return &bpf_get_stackid_proto_tp;
+ case BPF_FUNC_get_stack:
+ return &bpf_get_stack_proto_tp;
case BPF_FUNC_perf_prog_read_value:
- return &bpf_perf_prog_read_value_proto_tp;
+ return &bpf_perf_prog_read_value_proto;
default:
- return tracing_func_proto(func_id);
+ return tracing_func_proto(func_id, prog);
}
}
-static bool tp_prog_is_valid_access(int off, int size, enum bpf_access_type type,
- struct bpf_insn_access_aux *info)
+/*
+ * bpf_raw_tp_regs are separate from bpf_pt_regs used from skb/xdp
+ * to avoid potential recursive reuse issue when/if tracepoints are added
+ * inside bpf_*_event_output, bpf_get_stackid and/or bpf_get_stack
+ */
+static DEFINE_PER_CPU(struct pt_regs, bpf_raw_tp_regs);
+BPF_CALL_5(bpf_perf_event_output_raw_tp, struct bpf_raw_tracepoint_args *, args,
+ struct bpf_map *, map, u64, flags, void *, data, u64, size)
{
- if (off < sizeof(void *) || off >= PERF_MAX_TRACE_SIZE)
+ struct pt_regs *regs = this_cpu_ptr(&bpf_raw_tp_regs);
+
+ perf_fetch_caller_regs(regs);
+ return ____bpf_perf_event_output(regs, map, flags, data, size);
+}
+
+static const struct bpf_func_proto bpf_perf_event_output_proto_raw_tp = {
+ .func = bpf_perf_event_output_raw_tp,
+ .gpl_only = true,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_CONST_MAP_PTR,
+ .arg3_type = ARG_ANYTHING,
+ .arg4_type = ARG_PTR_TO_MEM,
+ .arg5_type = ARG_CONST_SIZE_OR_ZERO,
+};
+
+BPF_CALL_3(bpf_get_stackid_raw_tp, struct bpf_raw_tracepoint_args *, args,
+ struct bpf_map *, map, u64, flags)
+{
+ struct pt_regs *regs = this_cpu_ptr(&bpf_raw_tp_regs);
+
+ perf_fetch_caller_regs(regs);
+ /* similar to bpf_perf_event_output_tp, but pt_regs fetched differently */
+ return bpf_get_stackid((unsigned long) regs, (unsigned long) map,
+ flags, 0, 0);
+}
+
+static const struct bpf_func_proto bpf_get_stackid_proto_raw_tp = {
+ .func = bpf_get_stackid_raw_tp,
+ .gpl_only = true,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_CONST_MAP_PTR,
+ .arg3_type = ARG_ANYTHING,
+};
+
+BPF_CALL_4(bpf_get_stack_raw_tp, struct bpf_raw_tracepoint_args *, args,
+ void *, buf, u32, size, u64, flags)
+{
+ struct pt_regs *regs = this_cpu_ptr(&bpf_raw_tp_regs);
+
+ perf_fetch_caller_regs(regs);
+ return bpf_get_stack((unsigned long) regs, (unsigned long) buf,
+ (unsigned long) size, flags, 0);
+}
+
+static const struct bpf_func_proto bpf_get_stack_proto_raw_tp = {
+ .func = bpf_get_stack_raw_tp,
+ .gpl_only = true,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_PTR_TO_MEM,
+ .arg3_type = ARG_CONST_SIZE_OR_ZERO,
+ .arg4_type = ARG_ANYTHING,
+};
+
+static const struct bpf_func_proto *
+raw_tp_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
+{
+ switch (func_id) {
+ case BPF_FUNC_perf_event_output:
+ return &bpf_perf_event_output_proto_raw_tp;
+ case BPF_FUNC_get_stackid:
+ return &bpf_get_stackid_proto_raw_tp;
+ case BPF_FUNC_get_stack:
+ return &bpf_get_stack_proto_raw_tp;
+ default:
+ return tracing_func_proto(func_id, prog);
+ }
+}
+
+static bool raw_tp_prog_is_valid_access(int off, int size,
+ enum bpf_access_type type,
+ const struct bpf_prog *prog,
+ struct bpf_insn_access_aux *info)
+{
+ /* largest tracepoint in the kernel has 12 args */
+ if (off < 0 || off >= sizeof(__u64) * 12)
return false;
if (type != BPF_READ)
return false;
if (off % size != 0)
return false;
-
- BUILD_BUG_ON(PERF_MAX_TRACE_SIZE % sizeof(__u64));
return true;
}
-const struct bpf_verifier_ops tracepoint_verifier_ops = {
- .get_func_proto = tp_prog_func_proto,
- .is_valid_access = tp_prog_is_valid_access,
+const struct bpf_verifier_ops raw_tracepoint_verifier_ops = {
+ .get_func_proto = raw_tp_prog_func_proto,
+ .is_valid_access = raw_tp_prog_is_valid_access,
};
-const struct bpf_prog_ops tracepoint_prog_ops = {
+const struct bpf_prog_ops raw_tracepoint_prog_ops = {
};
static bool pe_prog_is_valid_access(int off, int size, enum bpf_access_type type,
+ const struct bpf_prog *prog,
struct bpf_insn_access_aux *info)
{
- const int size_sp = FIELD_SIZEOF(struct bpf_perf_event_data,
- sample_period);
+ const int size_u64 = sizeof(u64);
if (off < 0 || off >= sizeof(struct bpf_perf_event_data))
return false;
if (type != BPF_READ)
return false;
- if (off % size != 0)
- return false;
+ if (off % size != 0) {
+ if (sizeof(unsigned long) != 4)
+ return false;
+ if (size != 8)
+ return false;
+ if (off % size != 4)
+ return false;
+ }
switch (off) {
case bpf_ctx_range(struct bpf_perf_event_data, sample_period):
- bpf_ctx_record_field_size(info, size_sp);
- if (!bpf_ctx_narrow_access_ok(off, size, size_sp))
+ bpf_ctx_record_field_size(info, size_u64);
+ if (!bpf_ctx_narrow_access_ok(off, size, size_u64))
+ return false;
+ break;
+ case bpf_ctx_range(struct bpf_perf_event_data, addr):
+ bpf_ctx_record_field_size(info, size_u64);
+ if (!bpf_ctx_narrow_access_ok(off, size, size_u64))
return false;
break;
default:
@@ -766,6 +928,14 @@ static u32 pe_prog_convert_ctx_access(enum bpf_access_type type,
bpf_target_off(struct perf_sample_data, period, 8,
target_size));
break;
+ case offsetof(struct bpf_perf_event_data, addr):
+ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_perf_event_data_kern,
+ data), si->dst_reg, si->src_reg,
+ offsetof(struct bpf_perf_event_data_kern, data));
+ *insn++ = BPF_LDX_MEM(BPF_DW, si->dst_reg, si->dst_reg,
+ bpf_target_off(struct perf_sample_data, addr, 8,
+ target_size));
+ break;
default:
*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_perf_event_data_kern,
regs), si->dst_reg, si->src_reg,
@@ -779,7 +949,7 @@ static u32 pe_prog_convert_ctx_access(enum bpf_access_type type,
}
const struct bpf_verifier_ops perf_event_verifier_ops = {
- .get_func_proto = tp_prog_func_proto,
+ .get_func_proto = pe_prog_func_proto,
.is_valid_access = pe_prog_is_valid_access,
.convert_ctx_access = pe_prog_convert_ctx_access,
};
@@ -846,6 +1016,8 @@ void perf_event_detach_bpf_prog(struct perf_event *event)
old_array = event->tp_event->prog_array;
ret = bpf_prog_array_copy(old_array, event->prog, NULL, &new_array);
+ if (ret == -ENOENT)
+ goto unlock;
if (ret < 0) {
bpf_prog_array_delete_safe(old_array, event->prog);
} else {
@@ -864,6 +1036,7 @@ int perf_event_query_prog_array(struct perf_event *event, void __user *info)
{
struct perf_event_query_bpf __user *uquery = info;
struct perf_event_query_bpf query = {};
+ u32 *ids, prog_cnt, ids_len;
int ret;
if (!capable(CAP_SYS_ADMIN))
@@ -872,15 +1045,181 @@ int perf_event_query_prog_array(struct perf_event *event, void __user *info)
return -EINVAL;
if (copy_from_user(&query, uquery, sizeof(query)))
return -EFAULT;
- if (query.ids_len > BPF_TRACE_MAX_PROGS)
+
+ ids_len = query.ids_len;
+ if (ids_len > BPF_TRACE_MAX_PROGS)
return -E2BIG;
+ ids = kcalloc(ids_len, sizeof(u32), GFP_USER | __GFP_NOWARN);
+ if (!ids)
+ return -ENOMEM;
+ /*
+ * The above kcalloc returns ZERO_SIZE_PTR when ids_len = 0, which
+ * is required when user only wants to check for uquery->prog_cnt.
+ * There is no need to check for it since the case is handled
+ * gracefully in bpf_prog_array_copy_info.
+ */
mutex_lock(&bpf_event_mutex);
ret = bpf_prog_array_copy_info(event->tp_event->prog_array,
- uquery->ids,
- query.ids_len,
- &uquery->prog_cnt);
+ ids,
+ ids_len,
+ &prog_cnt);
mutex_unlock(&bpf_event_mutex);
+ if (copy_to_user(&uquery->prog_cnt, &prog_cnt, sizeof(prog_cnt)) ||
+ copy_to_user(uquery->ids, ids, ids_len * sizeof(u32)))
+ ret = -EFAULT;
+
+ kfree(ids);
return ret;
}
+
+extern struct bpf_raw_event_map __start__bpf_raw_tp[];
+extern struct bpf_raw_event_map __stop__bpf_raw_tp[];
+
+struct bpf_raw_event_map *bpf_find_raw_tracepoint(const char *name)
+{
+ struct bpf_raw_event_map *btp = __start__bpf_raw_tp;
+
+ for (; btp < __stop__bpf_raw_tp; btp++) {
+ if (!strcmp(btp->tp->name, name))
+ return btp;
+ }
+ return NULL;
+}
+
+static __always_inline
+void __bpf_trace_run(struct bpf_prog *prog, u64 *args)
+{
+ rcu_read_lock();
+ preempt_disable();
+ (void) BPF_PROG_RUN(prog, args);
+ preempt_enable();
+ rcu_read_unlock();
+}
+
+#define UNPACK(...) __VA_ARGS__
+#define REPEAT_1(FN, DL, X, ...) FN(X)
+#define REPEAT_2(FN, DL, X, ...) FN(X) UNPACK DL REPEAT_1(FN, DL, __VA_ARGS__)
+#define REPEAT_3(FN, DL, X, ...) FN(X) UNPACK DL REPEAT_2(FN, DL, __VA_ARGS__)
+#define REPEAT_4(FN, DL, X, ...) FN(X) UNPACK DL REPEAT_3(FN, DL, __VA_ARGS__)
+#define REPEAT_5(FN, DL, X, ...) FN(X) UNPACK DL REPEAT_4(FN, DL, __VA_ARGS__)
+#define REPEAT_6(FN, DL, X, ...) FN(X) UNPACK DL REPEAT_5(FN, DL, __VA_ARGS__)
+#define REPEAT_7(FN, DL, X, ...) FN(X) UNPACK DL REPEAT_6(FN, DL, __VA_ARGS__)
+#define REPEAT_8(FN, DL, X, ...) FN(X) UNPACK DL REPEAT_7(FN, DL, __VA_ARGS__)
+#define REPEAT_9(FN, DL, X, ...) FN(X) UNPACK DL REPEAT_8(FN, DL, __VA_ARGS__)
+#define REPEAT_10(FN, DL, X, ...) FN(X) UNPACK DL REPEAT_9(FN, DL, __VA_ARGS__)
+#define REPEAT_11(FN, DL, X, ...) FN(X) UNPACK DL REPEAT_10(FN, DL, __VA_ARGS__)
+#define REPEAT_12(FN, DL, X, ...) FN(X) UNPACK DL REPEAT_11(FN, DL, __VA_ARGS__)
+#define REPEAT(X, FN, DL, ...) REPEAT_##X(FN, DL, __VA_ARGS__)
+
+#define SARG(X) u64 arg##X
+#define COPY(X) args[X] = arg##X
+
+#define __DL_COM (,)
+#define __DL_SEM (;)
+
+#define __SEQ_0_11 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11
+
+#define BPF_TRACE_DEFN_x(x) \
+ void bpf_trace_run##x(struct bpf_prog *prog, \
+ REPEAT(x, SARG, __DL_COM, __SEQ_0_11)) \
+ { \
+ u64 args[x]; \
+ REPEAT(x, COPY, __DL_SEM, __SEQ_0_11); \
+ __bpf_trace_run(prog, args); \
+ } \
+ EXPORT_SYMBOL_GPL(bpf_trace_run##x)
+BPF_TRACE_DEFN_x(1);
+BPF_TRACE_DEFN_x(2);
+BPF_TRACE_DEFN_x(3);
+BPF_TRACE_DEFN_x(4);
+BPF_TRACE_DEFN_x(5);
+BPF_TRACE_DEFN_x(6);
+BPF_TRACE_DEFN_x(7);
+BPF_TRACE_DEFN_x(8);
+BPF_TRACE_DEFN_x(9);
+BPF_TRACE_DEFN_x(10);
+BPF_TRACE_DEFN_x(11);
+BPF_TRACE_DEFN_x(12);
+
+static int __bpf_probe_register(struct bpf_raw_event_map *btp, struct bpf_prog *prog)
+{
+ struct tracepoint *tp = btp->tp;
+
+ /*
+ * check that program doesn't access arguments beyond what's
+ * available in this tracepoint
+ */
+ if (prog->aux->max_ctx_offset > btp->num_args * sizeof(u64))
+ return -EINVAL;
+
+ return tracepoint_probe_register(tp, (void *)btp->bpf_func, prog);
+}
+
+int bpf_probe_register(struct bpf_raw_event_map *btp, struct bpf_prog *prog)
+{
+ int err;
+
+ mutex_lock(&bpf_event_mutex);
+ err = __bpf_probe_register(btp, prog);
+ mutex_unlock(&bpf_event_mutex);
+ return err;
+}
+
+int bpf_probe_unregister(struct bpf_raw_event_map *btp, struct bpf_prog *prog)
+{
+ int err;
+
+ mutex_lock(&bpf_event_mutex);
+ err = tracepoint_probe_unregister(btp->tp, (void *)btp->bpf_func, prog);
+ mutex_unlock(&bpf_event_mutex);
+ return err;
+}
+
+int bpf_get_perf_event_info(const struct perf_event *event, u32 *prog_id,
+ u32 *fd_type, const char **buf,
+ u64 *probe_offset, u64 *probe_addr)
+{
+ bool is_tracepoint, is_syscall_tp;
+ struct bpf_prog *prog;
+ int flags, err = 0;
+
+ prog = event->prog;
+ if (!prog)
+ return -ENOENT;
+
+ /* not supporting BPF_PROG_TYPE_PERF_EVENT yet */
+ if (prog->type == BPF_PROG_TYPE_PERF_EVENT)
+ return -EOPNOTSUPP;
+
+ *prog_id = prog->aux->id;
+ flags = event->tp_event->flags;
+ is_tracepoint = flags & TRACE_EVENT_FL_TRACEPOINT;
+ is_syscall_tp = is_syscall_trace_event(event->tp_event);
+
+ if (is_tracepoint || is_syscall_tp) {
+ *buf = is_tracepoint ? event->tp_event->tp->name
+ : event->tp_event->name;
+ *fd_type = BPF_FD_TYPE_TRACEPOINT;
+ *probe_offset = 0x0;
+ *probe_addr = 0x0;
+ } else {
+ /* kprobe/uprobe */
+ err = -EOPNOTSUPP;
+#ifdef CONFIG_KPROBE_EVENTS
+ if (flags & TRACE_EVENT_FL_KPROBE)
+ err = bpf_get_kprobe_info(event, fd_type, buf,
+ probe_offset, probe_addr,
+ event->attr.type == PERF_TYPE_TRACEPOINT);
+#endif
+#ifdef CONFIG_UPROBE_EVENTS
+ if (flags & TRACE_EVENT_FL_UPROBE)
+ err = bpf_get_uprobe_info(event, fd_type, buf,
+ probe_offset,
+ event->attr.type == PERF_TYPE_TRACEPOINT);
+#endif
+ }
+
+ return err;
+}
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index eac9ce2c57a2..8d83bcf9ef69 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -3902,14 +3902,13 @@ static bool module_exists(const char *module)
{
/* All modules have the symbol __this_module */
const char this_mod[] = "__this_module";
- const int modname_size = MAX_PARAM_PREFIX_LEN + sizeof(this_mod) + 1;
- char modname[modname_size + 1];
+ char modname[MAX_PARAM_PREFIX_LEN + sizeof(this_mod) + 2];
unsigned long val;
int n;
- n = snprintf(modname, modname_size + 1, "%s:%s", module, this_mod);
+ n = snprintf(modname, sizeof(modname), "%s:%s", module, this_mod);
- if (n > modname_size)
+ if (n > sizeof(modname) - 1)
return false;
val = module_kallsyms_lookup_name(modname);
@@ -5515,10 +5514,10 @@ static __init int ftrace_init_dyn_tracefs(struct dentry *d_tracer)
ftrace_create_filter_files(&global_ops, d_tracer);
#ifdef CONFIG_FUNCTION_GRAPH_TRACER
- trace_create_file("set_graph_function", 0444, d_tracer,
+ trace_create_file("set_graph_function", 0644, d_tracer,
NULL,
&ftrace_graph_fops);
- trace_create_file("set_graph_notrace", 0444, d_tracer,
+ trace_create_file("set_graph_notrace", 0644, d_tracer,
NULL,
&ftrace_graph_notrace_fops);
#endif /* CONFIG_FUNCTION_GRAPH_TRACER */
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index dcf1c4dd3efe..6a46af21765c 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -22,6 +22,7 @@
#include <linux/hash.h>
#include <linux/list.h>
#include <linux/cpu.h>
+#include <linux/oom.h>
#include <asm/local.h>
@@ -41,6 +42,8 @@ int ring_buffer_print_entry_header(struct trace_seq *s)
RINGBUF_TYPE_PADDING);
trace_seq_printf(s, "\ttime_extend : type == %d\n",
RINGBUF_TYPE_TIME_EXTEND);
+ trace_seq_printf(s, "\ttime_stamp : type == %d\n",
+ RINGBUF_TYPE_TIME_STAMP);
trace_seq_printf(s, "\tdata max type_len == %d\n",
RINGBUF_TYPE_DATA_TYPE_LEN_MAX);
@@ -140,12 +143,15 @@ int ring_buffer_print_entry_header(struct trace_seq *s)
enum {
RB_LEN_TIME_EXTEND = 8,
- RB_LEN_TIME_STAMP = 16,
+ RB_LEN_TIME_STAMP = 8,
};
#define skip_time_extend(event) \
((struct ring_buffer_event *)((char *)event + RB_LEN_TIME_EXTEND))
+#define extended_time(event) \
+ (event->type_len >= RINGBUF_TYPE_TIME_EXTEND)
+
static inline int rb_null_event(struct ring_buffer_event *event)
{
return event->type_len == RINGBUF_TYPE_PADDING && !event->time_delta;
@@ -209,7 +215,7 @@ rb_event_ts_length(struct ring_buffer_event *event)
{
unsigned len = 0;
- if (event->type_len == RINGBUF_TYPE_TIME_EXTEND) {
+ if (extended_time(event)) {
/* time extends include the data event after it */
len = RB_LEN_TIME_EXTEND;
event = skip_time_extend(event);
@@ -231,7 +237,7 @@ unsigned ring_buffer_event_length(struct ring_buffer_event *event)
{
unsigned length;
- if (event->type_len == RINGBUF_TYPE_TIME_EXTEND)
+ if (extended_time(event))
event = skip_time_extend(event);
length = rb_event_length(event);
@@ -248,7 +254,7 @@ EXPORT_SYMBOL_GPL(ring_buffer_event_length);
static __always_inline void *
rb_event_data(struct ring_buffer_event *event)
{
- if (event->type_len == RINGBUF_TYPE_TIME_EXTEND)
+ if (extended_time(event))
event = skip_time_extend(event);
BUG_ON(event->type_len > RINGBUF_TYPE_DATA_TYPE_LEN_MAX);
/* If length is in len field, then array[0] has the data */
@@ -275,6 +281,27 @@ EXPORT_SYMBOL_GPL(ring_buffer_event_data);
#define TS_MASK ((1ULL << TS_SHIFT) - 1)
#define TS_DELTA_TEST (~TS_MASK)
+/**
+ * ring_buffer_event_time_stamp - return the event's extended timestamp
+ * @event: the event to get the timestamp of
+ *
+ * Returns the extended timestamp associated with a data event.
+ * An extended time_stamp is a 64-bit timestamp represented
+ * internally in a special way that makes the best use of space
+ * contained within a ring buffer event. This function decodes
+ * it and maps it to a straight u64 value.
+ */
+u64 ring_buffer_event_time_stamp(struct ring_buffer_event *event)
+{
+ u64 ts;
+
+ ts = event->array[0];
+ ts <<= TS_SHIFT;
+ ts += event->time_delta;
+
+ return ts;
+}
+
/* Flag when events were overwritten */
#define RB_MISSED_EVENTS (1 << 31)
/* Missed count stored at end */
@@ -451,6 +478,7 @@ struct ring_buffer_per_cpu {
struct buffer_page *reader_page;
unsigned long lost_events;
unsigned long last_overrun;
+ unsigned long nest;
local_t entries_bytes;
local_t entries;
local_t overrun;
@@ -488,6 +516,7 @@ struct ring_buffer {
u64 (*clock)(void);
struct rb_irq_work irq_work;
+ bool time_stamp_abs;
};
struct ring_buffer_iter {
@@ -780,7 +809,7 @@ EXPORT_SYMBOL_GPL(ring_buffer_normalize_time_stamp);
*
* You can see, it is legitimate for the previous pointer of
* the head (or any page) not to point back to itself. But only
- * temporarially.
+ * temporarily.
*/
#define RB_PAGE_NORMAL 0UL
@@ -877,7 +906,7 @@ static void rb_list_head_clear(struct list_head *list)
}
/*
- * rb_head_page_dactivate - clears head page ptr (for free list)
+ * rb_head_page_deactivate - clears head page ptr (for free list)
*/
static void
rb_head_page_deactivate(struct ring_buffer_per_cpu *cpu_buffer)
@@ -1134,30 +1163,60 @@ static int rb_check_pages(struct ring_buffer_per_cpu *cpu_buffer)
static int __rb_allocate_pages(long nr_pages, struct list_head *pages, int cpu)
{
struct buffer_page *bpage, *tmp;
+ bool user_thread = current->mm != NULL;
+ gfp_t mflags;
long i;
+ /*
+ * Check if the available memory is there first.
+ * Note, si_mem_available() only gives us a rough estimate of available
+ * memory. It may not be accurate. But we don't care, we just want
+ * to prevent doing any allocation when it is obvious that it is
+ * not going to succeed.
+ */
+ i = si_mem_available();
+ if (i < nr_pages)
+ return -ENOMEM;
+
+ /*
+ * __GFP_RETRY_MAYFAIL flag makes sure that the allocation fails
+ * gracefully without invoking oom-killer and the system is not
+ * destabilized.
+ */
+ mflags = GFP_KERNEL | __GFP_RETRY_MAYFAIL;
+
+ /*
+ * If a user thread allocates too much, and si_mem_available()
+ * reports there's enough memory, even though there is not.
+ * Make sure the OOM killer kills this thread. This can happen
+ * even with RETRY_MAYFAIL because another task may be doing
+ * an allocation after this task has taken all memory.
+ * This is the task the OOM killer needs to take out during this
+ * loop, even if it was triggered by an allocation somewhere else.
+ */
+ if (user_thread)
+ set_current_oom_origin();
for (i = 0; i < nr_pages; i++) {
struct page *page;
- /*
- * __GFP_RETRY_MAYFAIL flag makes sure that the allocation fails
- * gracefully without invoking oom-killer and the system is not
- * destabilized.
- */
+
bpage = kzalloc_node(ALIGN(sizeof(*bpage), cache_line_size()),
- GFP_KERNEL | __GFP_RETRY_MAYFAIL,
- cpu_to_node(cpu));
+ mflags, cpu_to_node(cpu));
if (!bpage)
goto free_pages;
list_add(&bpage->list, pages);
- page = alloc_pages_node(cpu_to_node(cpu),
- GFP_KERNEL | __GFP_RETRY_MAYFAIL, 0);
+ page = alloc_pages_node(cpu_to_node(cpu), mflags, 0);
if (!page)
goto free_pages;
bpage->page = page_address(page);
rb_init_page(bpage->page);
+
+ if (user_thread && fatal_signal_pending(current))
+ goto free_pages;
}
+ if (user_thread)
+ clear_current_oom_origin();
return 0;
@@ -1166,6 +1225,8 @@ free_pages:
list_del_init(&bpage->list);
free_buffer_page(bpage);
}
+ if (user_thread)
+ clear_current_oom_origin();
return -ENOMEM;
}
@@ -1382,6 +1443,16 @@ void ring_buffer_set_clock(struct ring_buffer *buffer,
buffer->clock = clock;
}
+void ring_buffer_set_time_stamp_abs(struct ring_buffer *buffer, bool abs)
+{
+ buffer->time_stamp_abs = abs;
+}
+
+bool ring_buffer_time_stamp_abs(struct ring_buffer *buffer)
+{
+ return buffer->time_stamp_abs;
+}
+
static void rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer);
static inline unsigned long rb_page_entries(struct buffer_page *bpage)
@@ -1709,7 +1780,7 @@ int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size,
put_online_cpus();
} else {
- /* Make sure this CPU has been intitialized */
+ /* Make sure this CPU has been initialized */
if (!cpumask_test_cpu(cpu_id, buffer->cpumask))
goto out;
@@ -2206,12 +2277,15 @@ rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer,
/* Slow path, do not inline */
static noinline struct ring_buffer_event *
-rb_add_time_stamp(struct ring_buffer_event *event, u64 delta)
+rb_add_time_stamp(struct ring_buffer_event *event, u64 delta, bool abs)
{
- event->type_len = RINGBUF_TYPE_TIME_EXTEND;
+ if (abs)
+ event->type_len = RINGBUF_TYPE_TIME_STAMP;
+ else
+ event->type_len = RINGBUF_TYPE_TIME_EXTEND;
- /* Not the first event on the page? */
- if (rb_event_index(event)) {
+ /* Not the first event on the page, or not delta? */
+ if (abs || rb_event_index(event)) {
event->time_delta = delta & TS_MASK;
event->array[0] = delta >> TS_SHIFT;
} else {
@@ -2251,10 +2325,12 @@ rb_update_event(struct ring_buffer_per_cpu *cpu_buffer,
/*
* If we need to add a timestamp, then we
- * add it to the start of the resevered space.
+ * add it to the start of the reserved space.
*/
if (unlikely(info->add_timestamp)) {
- event = rb_add_time_stamp(event, delta);
+ bool abs = ring_buffer_time_stamp_abs(cpu_buffer->buffer);
+
+ event = rb_add_time_stamp(event, info->delta, abs);
length -= RB_LEN_TIME_EXTEND;
delta = 0;
}
@@ -2442,7 +2518,7 @@ static __always_inline void rb_end_commit(struct ring_buffer_per_cpu *cpu_buffer
static inline void rb_event_discard(struct ring_buffer_event *event)
{
- if (event->type_len == RINGBUF_TYPE_TIME_EXTEND)
+ if (extended_time(event))
event = skip_time_extend(event);
/* array[0] holds the actual length for the discarded event */
@@ -2486,10 +2562,11 @@ rb_update_write_stamp(struct ring_buffer_per_cpu *cpu_buffer,
cpu_buffer->write_stamp =
cpu_buffer->commit_page->page->time_stamp;
else if (event->type_len == RINGBUF_TYPE_TIME_EXTEND) {
- delta = event->array[0];
- delta <<= TS_SHIFT;
- delta += event->time_delta;
+ delta = ring_buffer_event_time_stamp(event);
cpu_buffer->write_stamp += delta;
+ } else if (event->type_len == RINGBUF_TYPE_TIME_STAMP) {
+ delta = ring_buffer_event_time_stamp(event);
+ cpu_buffer->write_stamp = delta;
} else
cpu_buffer->write_stamp += event->time_delta;
}
@@ -2581,10 +2658,10 @@ trace_recursive_lock(struct ring_buffer_per_cpu *cpu_buffer)
bit = pc & NMI_MASK ? RB_CTX_NMI :
pc & HARDIRQ_MASK ? RB_CTX_IRQ : RB_CTX_SOFTIRQ;
- if (unlikely(val & (1 << bit)))
+ if (unlikely(val & (1 << (bit + cpu_buffer->nest))))
return 1;
- val |= (1 << bit);
+ val |= (1 << (bit + cpu_buffer->nest));
cpu_buffer->current_context = val;
return 0;
@@ -2593,7 +2670,57 @@ trace_recursive_lock(struct ring_buffer_per_cpu *cpu_buffer)
static __always_inline void
trace_recursive_unlock(struct ring_buffer_per_cpu *cpu_buffer)
{
- cpu_buffer->current_context &= cpu_buffer->current_context - 1;
+ cpu_buffer->current_context &=
+ cpu_buffer->current_context - (1 << cpu_buffer->nest);
+}
+
+/* The recursive locking above uses 4 bits */
+#define NESTED_BITS 4
+
+/**
+ * ring_buffer_nest_start - Allow to trace while nested
+ * @buffer: The ring buffer to modify
+ *
+ * The ring buffer has a safety mechanism to prevent recursion.
+ * But there may be a case where a trace needs to be done while
+ * tracing something else. In this case, calling this function
+ * will allow this function to nest within a currently active
+ * ring_buffer_lock_reserve().
+ *
+ * Call this function before calling another ring_buffer_lock_reserve() and
+ * call ring_buffer_nest_end() after the nested ring_buffer_unlock_commit().
+ */
+void ring_buffer_nest_start(struct ring_buffer *buffer)
+{
+ struct ring_buffer_per_cpu *cpu_buffer;
+ int cpu;
+
+ /* Enabled by ring_buffer_nest_end() */
+ preempt_disable_notrace();
+ cpu = raw_smp_processor_id();
+ cpu_buffer = buffer->buffers[cpu];
+ /* This is the shift value for the above recursive locking */
+ cpu_buffer->nest += NESTED_BITS;
+}
+
+/**
+ * ring_buffer_nest_end - Allow to trace while nested
+ * @buffer: The ring buffer to modify
+ *
+ * Must be called after ring_buffer_nest_start() and after the
+ * ring_buffer_unlock_commit().
+ */
+void ring_buffer_nest_end(struct ring_buffer *buffer)
+{
+ struct ring_buffer_per_cpu *cpu_buffer;
+ int cpu;
+
+ /* disabled by ring_buffer_nest_start() */
+ cpu = raw_smp_processor_id();
+ cpu_buffer = buffer->buffers[cpu];
+ /* This is the shift value for the above recursive locking */
+ cpu_buffer->nest -= NESTED_BITS;
+ preempt_enable_notrace();
}
/**
@@ -2637,7 +2764,8 @@ rb_handle_timestamp(struct ring_buffer_per_cpu *cpu_buffer,
sched_clock_stable() ? "" :
"If you just came from a suspend/resume,\n"
"please switch to the trace global clock:\n"
- " echo global > /sys/kernel/debug/tracing/trace_clock\n");
+ " echo global > /sys/kernel/debug/tracing/trace_clock\n"
+ "or add trace_clock=global to the kernel command line\n");
info->add_timestamp = 1;
}
@@ -2669,7 +2797,7 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
* If this is the first commit on the page, then it has the same
* timestamp as the page itself.
*/
- if (!tail)
+ if (!tail && !ring_buffer_time_stamp_abs(cpu_buffer->buffer))
info->delta = 0;
/* See if we shot pass the end of this buffer page */
@@ -2746,8 +2874,11 @@ rb_reserve_next_event(struct ring_buffer *buffer,
/* make sure this diff is calculated here */
barrier();
- /* Did the write stamp get updated already? */
- if (likely(info.ts >= cpu_buffer->write_stamp)) {
+ if (ring_buffer_time_stamp_abs(buffer)) {
+ info.delta = info.ts;
+ rb_handle_timestamp(cpu_buffer, &info);
+ } else /* Did the write stamp get updated already? */
+ if (likely(info.ts >= cpu_buffer->write_stamp)) {
info.delta = diff;
if (unlikely(test_time_stamp(info.delta)))
rb_handle_timestamp(cpu_buffer, &info);
@@ -2776,7 +2907,7 @@ rb_reserve_next_event(struct ring_buffer *buffer,
* @buffer: the ring buffer to reserve from
* @length: the length of the data to reserve (excluding event header)
*
- * Returns a reseverd event on the ring buffer to copy directly to.
+ * Returns a reserved event on the ring buffer to copy directly to.
* The user of this interface will need to get the body to write into
* and can use the ring_buffer_event_data() interface.
*
@@ -2878,7 +3009,7 @@ rb_decrement_entry(struct ring_buffer_per_cpu *cpu_buffer,
* This function lets the user discard an event in the ring buffer
* and then that event will not be read later.
*
- * This function only works if it is called before the the item has been
+ * This function only works if it is called before the item has been
* committed. It will try to free the event from the ring buffer
* if another event has not been added behind it.
*
@@ -3429,14 +3560,13 @@ rb_update_read_stamp(struct ring_buffer_per_cpu *cpu_buffer,
return;
case RINGBUF_TYPE_TIME_EXTEND:
- delta = event->array[0];
- delta <<= TS_SHIFT;
- delta += event->time_delta;
+ delta = ring_buffer_event_time_stamp(event);
cpu_buffer->read_stamp += delta;
return;
case RINGBUF_TYPE_TIME_STAMP:
- /* FIXME: not implemented */
+ delta = ring_buffer_event_time_stamp(event);
+ cpu_buffer->read_stamp = delta;
return;
case RINGBUF_TYPE_DATA:
@@ -3460,14 +3590,13 @@ rb_update_iter_read_stamp(struct ring_buffer_iter *iter,
return;
case RINGBUF_TYPE_TIME_EXTEND:
- delta = event->array[0];
- delta <<= TS_SHIFT;
- delta += event->time_delta;
+ delta = ring_buffer_event_time_stamp(event);
iter->read_stamp += delta;
return;
case RINGBUF_TYPE_TIME_STAMP:
- /* FIXME: not implemented */
+ delta = ring_buffer_event_time_stamp(event);
+ iter->read_stamp = delta;
return;
case RINGBUF_TYPE_DATA:
@@ -3691,6 +3820,8 @@ rb_buffer_peek(struct ring_buffer_per_cpu *cpu_buffer, u64 *ts,
struct buffer_page *reader;
int nr_loops = 0;
+ if (ts)
+ *ts = 0;
again:
/*
* We repeat when a time extend is encountered.
@@ -3727,12 +3858,17 @@ rb_buffer_peek(struct ring_buffer_per_cpu *cpu_buffer, u64 *ts,
goto again;
case RINGBUF_TYPE_TIME_STAMP:
- /* FIXME: not implemented */
+ if (ts) {
+ *ts = ring_buffer_event_time_stamp(event);
+ ring_buffer_normalize_time_stamp(cpu_buffer->buffer,
+ cpu_buffer->cpu, ts);
+ }
+ /* Internal data, OK to advance */
rb_advance_reader(cpu_buffer);
goto again;
case RINGBUF_TYPE_DATA:
- if (ts) {
+ if (ts && !(*ts)) {
*ts = cpu_buffer->read_stamp + event->time_delta;
ring_buffer_normalize_time_stamp(cpu_buffer->buffer,
cpu_buffer->cpu, ts);
@@ -3757,6 +3893,9 @@ rb_iter_peek(struct ring_buffer_iter *iter, u64 *ts)
struct ring_buffer_event *event;
int nr_loops = 0;
+ if (ts)
+ *ts = 0;
+
cpu_buffer = iter->cpu_buffer;
buffer = cpu_buffer->buffer;
@@ -3809,12 +3948,17 @@ rb_iter_peek(struct ring_buffer_iter *iter, u64 *ts)
goto again;
case RINGBUF_TYPE_TIME_STAMP:
- /* FIXME: not implemented */
+ if (ts) {
+ *ts = ring_buffer_event_time_stamp(event);
+ ring_buffer_normalize_time_stamp(cpu_buffer->buffer,
+ cpu_buffer->cpu, ts);
+ }
+ /* Internal data, OK to advance */
rb_advance_iter(iter);
goto again;
case RINGBUF_TYPE_DATA:
- if (ts) {
+ if (ts && !(*ts)) {
*ts = iter->read_stamp + event->time_delta;
ring_buffer_normalize_time_stamp(buffer,
cpu_buffer->cpu, ts);
@@ -3983,7 +4127,7 @@ EXPORT_SYMBOL_GPL(ring_buffer_consume);
* through the buffer. Memory is allocated, buffer recording
* is disabled, and the iterator pointer is returned to the caller.
*
- * Disabling buffer recordng prevents the reading from being
+ * Disabling buffer recording prevents the reading from being
* corrupted. This is not a consuming read, so a producer is not
* expected.
*
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 20a2300ae4e8..108ce3e1dc13 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -41,6 +41,7 @@
#include <linux/nmi.h>
#include <linux/fs.h>
#include <linux/trace.h>
+#include <linux/sched/clock.h>
#include <linux/sched/rt.h>
#include "trace.h"
@@ -892,7 +893,7 @@ int __trace_bputs(unsigned long ip, const char *str)
EXPORT_SYMBOL_GPL(__trace_bputs);
#ifdef CONFIG_TRACER_SNAPSHOT
-static void tracing_snapshot_instance(struct trace_array *tr)
+void tracing_snapshot_instance(struct trace_array *tr)
{
struct tracer *tracer = tr->current_trace;
unsigned long flags;
@@ -948,7 +949,7 @@ static int resize_buffer_duplicate_size(struct trace_buffer *trace_buf,
struct trace_buffer *size_buf, int cpu_id);
static void set_buffer_entries(struct trace_buffer *buf, unsigned long val);
-static int alloc_snapshot(struct trace_array *tr)
+int tracing_alloc_snapshot_instance(struct trace_array *tr)
{
int ret;
@@ -994,7 +995,7 @@ int tracing_alloc_snapshot(void)
struct trace_array *tr = &global_trace;
int ret;
- ret = alloc_snapshot(tr);
+ ret = tracing_alloc_snapshot_instance(tr);
WARN_ON(ret < 0);
return ret;
@@ -1168,6 +1169,14 @@ static struct {
ARCH_TRACE_CLOCKS
};
+bool trace_clock_in_ns(struct trace_array *tr)
+{
+ if (trace_clocks[tr->clock_id].in_ns)
+ return true;
+
+ return false;
+}
+
/*
* trace_parser_get_init - gets the buffer for trace parser
*/
@@ -2269,7 +2278,7 @@ trace_event_buffer_lock_reserve(struct ring_buffer **current_rb,
*current_rb = trace_file->tr->trace_buffer.buffer;
- if ((trace_file->flags &
+ if (!ring_buffer_time_stamp_abs(*current_rb) && (trace_file->flags &
(EVENT_FILE_FL_SOFT_DISABLED | EVENT_FILE_FL_FILTERED)) &&
(entry = this_cpu_read(trace_buffered_event))) {
/* Try to use the per cpu buffer first */
@@ -2380,7 +2389,7 @@ EXPORT_SYMBOL_GPL(trace_event_buffer_commit);
* trace_buffer_unlock_commit_regs()
* trace_event_buffer_commit()
* trace_event_raw_event_xxx()
-*/
+ */
# define STACK_SKIP 3
void trace_buffer_unlock_commit_regs(struct trace_array *tr,
@@ -4386,8 +4395,7 @@ static int trace_set_options(struct trace_array *tr, char *option)
{
char *cmp;
int neg = 0;
- int ret = -ENODEV;
- int i;
+ int ret;
size_t orig_len = strlen(option);
cmp = strstrip(option);
@@ -4399,16 +4407,12 @@ static int trace_set_options(struct trace_array *tr, char *option)
mutex_lock(&trace_types_lock);
- for (i = 0; trace_options[i]; i++) {
- if (strcmp(cmp, trace_options[i]) == 0) {
- ret = set_tracer_flag(tr, 1 << i, !neg);
- break;
- }
- }
-
+ ret = match_string(trace_options, -1, cmp);
/* If no option could be set, test the specific tracer options */
- if (!trace_options[i])
+ if (ret < 0)
ret = set_tracer_option(tr, cmp, neg);
+ else
+ ret = set_tracer_flag(tr, 1 << ret, !neg);
mutex_unlock(&trace_types_lock);
@@ -4515,6 +4519,9 @@ static const char readme_msg[] =
#ifdef CONFIG_X86_64
" x86-tsc: TSC cycle counter\n"
#endif
+ "\n timestamp_mode\t-view the mode used to timestamp events\n"
+ " delta: Delta difference against a buffer-wide timestamp\n"
+ " absolute: Absolute (standalone) timestamp\n"
"\n trace_marker\t\t- Writes into this file writes into the kernel buffer\n"
"\n trace_marker_raw\t\t- Writes into this file writes binary data into the kernel buffer\n"
" tracing_cpumask\t- Limit which CPUs to trace\n"
@@ -4691,8 +4698,9 @@ static const char readme_msg[] =
"\t .sym display an address as a symbol\n"
"\t .sym-offset display an address as a symbol and offset\n"
"\t .execname display a common_pid as a program name\n"
- "\t .syscall display a syscall id as a syscall name\n\n"
- "\t .log2 display log2 value rather than raw number\n\n"
+ "\t .syscall display a syscall id as a syscall name\n"
+ "\t .log2 display log2 value rather than raw number\n"
+ "\t .usecs display a common_timestamp in microseconds\n\n"
"\t The 'pause' parameter can be used to pause an existing hist\n"
"\t trigger or to start a hist trigger but not log any events\n"
"\t until told to do so. 'continue' can be used to start or\n"
@@ -5395,7 +5403,7 @@ static int tracing_set_tracer(struct trace_array *tr, const char *buf)
#ifdef CONFIG_TRACER_MAX_TRACE
if (t->use_max_tr && !had_max_tr) {
- ret = alloc_snapshot(tr);
+ ret = tracing_alloc_snapshot_instance(tr);
if (ret < 0)
goto out;
}
@@ -6061,6 +6069,7 @@ tracing_mark_write(struct file *filp, const char __user *ubuf,
{
struct trace_array *tr = filp->private_data;
struct ring_buffer_event *event;
+ enum event_trigger_type tt = ETT_NONE;
struct ring_buffer *buffer;
struct print_entry *entry;
unsigned long irq_flags;
@@ -6109,6 +6118,12 @@ tracing_mark_write(struct file *filp, const char __user *ubuf,
written = cnt;
len = cnt;
+ if (tr->trace_marker_file && !list_empty(&tr->trace_marker_file->triggers)) {
+ /* do not add \n before testing triggers, but add \0 */
+ entry->buf[cnt] = '\0';
+ tt = event_triggers_call(tr->trace_marker_file, entry, event);
+ }
+
if (entry->buf[cnt - 1] != '\n') {
entry->buf[cnt] = '\n';
entry->buf[cnt + 1] = '\0';
@@ -6117,6 +6132,9 @@ tracing_mark_write(struct file *filp, const char __user *ubuf,
__buffer_unlock_commit(buffer, event);
+ if (tt)
+ event_triggers_post_call(tr->trace_marker_file, tt);
+
if (written > 0)
*fpos += written;
@@ -6202,7 +6220,7 @@ static int tracing_clock_show(struct seq_file *m, void *v)
return 0;
}
-static int tracing_set_clock(struct trace_array *tr, const char *clockstr)
+int tracing_set_clock(struct trace_array *tr, const char *clockstr)
{
int i;
@@ -6282,6 +6300,71 @@ static int tracing_clock_open(struct inode *inode, struct file *file)
return ret;
}
+static int tracing_time_stamp_mode_show(struct seq_file *m, void *v)
+{
+ struct trace_array *tr = m->private;
+
+ mutex_lock(&trace_types_lock);
+
+ if (ring_buffer_time_stamp_abs(tr->trace_buffer.buffer))
+ seq_puts(m, "delta [absolute]\n");
+ else
+ seq_puts(m, "[delta] absolute\n");
+
+ mutex_unlock(&trace_types_lock);
+
+ return 0;
+}
+
+static int tracing_time_stamp_mode_open(struct inode *inode, struct file *file)
+{
+ struct trace_array *tr = inode->i_private;
+ int ret;
+
+ if (tracing_disabled)
+ return -ENODEV;
+
+ if (trace_array_get(tr))
+ return -ENODEV;
+
+ ret = single_open(file, tracing_time_stamp_mode_show, inode->i_private);
+ if (ret < 0)
+ trace_array_put(tr);
+
+ return ret;
+}
+
+int tracing_set_time_stamp_abs(struct trace_array *tr, bool abs)
+{
+ int ret = 0;
+
+ mutex_lock(&trace_types_lock);
+
+ if (abs && tr->time_stamp_abs_ref++)
+ goto out;
+
+ if (!abs) {
+ if (WARN_ON_ONCE(!tr->time_stamp_abs_ref)) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ if (--tr->time_stamp_abs_ref)
+ goto out;
+ }
+
+ ring_buffer_set_time_stamp_abs(tr->trace_buffer.buffer, abs);
+
+#ifdef CONFIG_TRACER_MAX_TRACE
+ if (tr->max_buffer.buffer)
+ ring_buffer_set_time_stamp_abs(tr->max_buffer.buffer, abs);
+#endif
+ out:
+ mutex_unlock(&trace_types_lock);
+
+ return ret;
+}
+
struct ftrace_buffer_info {
struct trace_iterator iter;
void *spare;
@@ -6373,7 +6456,7 @@ tracing_snapshot_write(struct file *filp, const char __user *ubuf, size_t cnt,
}
#endif
if (!tr->allocated_snapshot) {
- ret = alloc_snapshot(tr);
+ ret = tracing_alloc_snapshot_instance(tr);
if (ret < 0)
break;
}
@@ -6529,6 +6612,13 @@ static const struct file_operations trace_clock_fops = {
.write = tracing_clock_write,
};
+static const struct file_operations trace_time_stamp_mode_fops = {
+ .open = tracing_time_stamp_mode_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = tracing_single_release_tr,
+};
+
#ifdef CONFIG_TRACER_SNAPSHOT
static const struct file_operations snapshot_fops = {
.open = tracing_snapshot_open,
@@ -7094,7 +7184,7 @@ ftrace_trace_snapshot_callback(struct trace_array *tr, struct ftrace_hash *hash,
return ret;
out_reg:
- ret = alloc_snapshot(tr);
+ ret = tracing_alloc_snapshot_instance(tr);
if (ret < 0)
goto out;
@@ -7699,6 +7789,7 @@ static int instance_mkdir(const char *name)
INIT_LIST_HEAD(&tr->systems);
INIT_LIST_HEAD(&tr->events);
+ INIT_LIST_HEAD(&tr->hist_vars);
if (allocate_trace_buffers(tr, trace_buf_size) < 0)
goto out_free_tr;
@@ -7810,6 +7901,7 @@ static __init void create_trace_instances(struct dentry *d_tracer)
static void
init_tracer_tracefs(struct trace_array *tr, struct dentry *d_tracer)
{
+ struct trace_event_file *file;
int cpu;
trace_create_file("available_tracers", 0444, d_tracer,
@@ -7842,6 +7934,12 @@ init_tracer_tracefs(struct trace_array *tr, struct dentry *d_tracer)
trace_create_file("trace_marker", 0220, d_tracer,
tr, &tracing_mark_fops);
+ file = __find_event_file(tr, "ftrace", "print");
+ if (file && file->dir)
+ trace_create_file("trigger", 0644, file->dir, file,
+ &event_trigger_fops);
+ tr->trace_marker_file = file;
+
trace_create_file("trace_marker_raw", 0220, d_tracer,
tr, &tracing_mark_raw_fops);
@@ -7851,6 +7949,9 @@ init_tracer_tracefs(struct trace_array *tr, struct dentry *d_tracer)
trace_create_file("tracing_on", 0644, d_tracer,
tr, &rb_simple_fops);
+ trace_create_file("timestamp_mode", 0444, d_tracer, tr,
+ &trace_time_stamp_mode_fops);
+
create_trace_options_dir(tr);
#if defined(CONFIG_TRACER_MAX_TRACE) || defined(CONFIG_HWLAT_TRACER)
@@ -8022,6 +8123,8 @@ static __init int tracer_init_tracefs(void)
if (IS_ERR(d_tracer))
return 0;
+ event_trace_init();
+
init_tracer_tracefs(&global_trace, d_tracer);
ftrace_init_tracefs_toplevel(&global_trace, d_tracer);
@@ -8446,6 +8549,7 @@ __init static int tracer_alloc_buffers(void)
INIT_LIST_HEAD(&global_trace.systems);
INIT_LIST_HEAD(&global_trace.events);
+ INIT_LIST_HEAD(&global_trace.hist_vars);
list_add(&global_trace.list, &ftrace_trace_arrays);
apply_trace_boot_options();
@@ -8507,3 +8611,21 @@ __init static int clear_boot_tracer(void)
fs_initcall(tracer_init_tracefs);
late_initcall_sync(clear_boot_tracer);
+
+#ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK
+__init static int tracing_set_default_clock(void)
+{
+ /* sched_clock_stable() is determined in late_initcall */
+ if (!trace_boot_clock && !sched_clock_stable()) {
+ printk(KERN_WARNING
+ "Unstable clock detected, switching default tracing clock to \"global\"\n"
+ "If you want to keep using the local clock, then add:\n"
+ " \"trace_clock=local\"\n"
+ "on the kernel command line\n");
+ tracing_set_clock(&global_trace, "global");
+ }
+
+ return 0;
+}
+late_initcall_sync(tracing_set_default_clock);
+#endif
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 2a6d0325a761..630c5a24b2b2 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -259,6 +259,7 @@ struct trace_array {
struct trace_options *topts;
struct list_head systems;
struct list_head events;
+ struct trace_event_file *trace_marker_file;
cpumask_var_t tracing_cpumask; /* only trace on set CPUs */
int ref;
#ifdef CONFIG_FUNCTION_TRACER
@@ -273,6 +274,8 @@ struct trace_array {
/* function tracing enabled */
int function_enabled;
#endif
+ int time_stamp_abs_ref;
+ struct list_head hist_vars;
};
enum {
@@ -286,6 +289,11 @@ extern struct mutex trace_types_lock;
extern int trace_array_get(struct trace_array *tr);
extern void trace_array_put(struct trace_array *tr);
+extern int tracing_set_time_stamp_abs(struct trace_array *tr, bool abs);
+extern int tracing_set_clock(struct trace_array *tr, const char *clockstr);
+
+extern bool trace_clock_in_ns(struct trace_array *tr);
+
/*
* The global tracer (top) should be the first trace array added,
* but we check the flag anyway.
@@ -1209,12 +1217,11 @@ struct ftrace_event_field {
int is_signed;
};
+struct prog_entry;
+
struct event_filter {
- int n_preds; /* Number assigned */
- int a_preds; /* allocated */
- struct filter_pred __rcu *preds;
- struct filter_pred __rcu *root;
- char *filter_string;
+ struct prog_entry __rcu *prog;
+ char *filter_string;
};
struct event_subsystem {
@@ -1291,7 +1298,7 @@ __event_trigger_test_discard(struct trace_event_file *file,
unsigned long eflags = file->flags;
if (eflags & EVENT_FILE_FL_TRIGGER_COND)
- *tt = event_triggers_call(file, entry);
+ *tt = event_triggers_call(file, entry, event);
if (test_bit(EVENT_FILE_FL_SOFT_DISABLED_BIT, &file->flags) ||
(unlikely(file->flags & EVENT_FILE_FL_FILTERED) &&
@@ -1328,7 +1335,7 @@ event_trigger_unlock_commit(struct trace_event_file *file,
trace_buffer_unlock_commit(file->tr, buffer, event, irq_flags, pc);
if (tt)
- event_triggers_post_call(file, tt, entry);
+ event_triggers_post_call(file, tt);
}
/**
@@ -1361,7 +1368,7 @@ event_trigger_unlock_commit_regs(struct trace_event_file *file,
irq_flags, pc, regs);
if (tt)
- event_triggers_post_call(file, tt, entry);
+ event_triggers_post_call(file, tt);
}
#define FILTER_PRED_INVALID ((unsigned short)-1)
@@ -1406,12 +1413,8 @@ struct filter_pred {
unsigned short *ops;
struct ftrace_event_field *field;
int offset;
- int not;
+ int not;
int op;
- unsigned short index;
- unsigned short parent;
- unsigned short left;
- unsigned short right;
};
static inline bool is_string_field(struct ftrace_event_field *field)
@@ -1449,9 +1452,13 @@ trace_find_event_field(struct trace_event_call *call, char *name);
extern void trace_event_enable_cmd_record(bool enable);
extern void trace_event_enable_tgid_record(bool enable);
+extern int event_trace_init(void);
extern int event_trace_add_tracer(struct dentry *parent, struct trace_array *tr);
extern int event_trace_del_tracer(struct trace_array *tr);
+extern struct trace_event_file *__find_event_file(struct trace_array *tr,
+ const char *system,
+ const char *event);
extern struct trace_event_file *find_event_file(struct trace_array *tr,
const char *system,
const char *event);
@@ -1543,6 +1550,8 @@ extern void pause_named_trigger(struct event_trigger_data *data);
extern void unpause_named_trigger(struct event_trigger_data *data);
extern void set_named_trigger_data(struct event_trigger_data *data,
struct event_trigger_data *named_data);
+extern struct event_trigger_data *
+get_named_trigger_data(struct event_trigger_data *data);
extern int register_event_command(struct event_command *cmd);
extern int unregister_event_command(struct event_command *cmd);
extern int register_trigger_hist_enable_disable_cmds(void);
@@ -1586,7 +1595,8 @@ extern int register_trigger_hist_enable_disable_cmds(void);
*/
struct event_trigger_ops {
void (*func)(struct event_trigger_data *data,
- void *rec);
+ void *rec,
+ struct ring_buffer_event *rbe);
int (*init)(struct event_trigger_ops *ops,
struct event_trigger_data *data);
void (*free)(struct event_trigger_ops *ops,
@@ -1812,6 +1822,17 @@ static inline void __init trace_event_init(void) { }
static inline void trace_event_eval_update(struct trace_eval_map **map, int len) { }
#endif
+#ifdef CONFIG_TRACER_SNAPSHOT
+void tracing_snapshot_instance(struct trace_array *tr);
+int tracing_alloc_snapshot_instance(struct trace_array *tr);
+#else
+static inline void tracing_snapshot_instance(struct trace_array *tr) { }
+static inline int tracing_alloc_snapshot_instance(struct trace_array *tr)
+{
+ return 0;
+}
+#endif
+
extern struct trace_iterator *tracepoint_print_iter;
#endif /* _LINUX_KERNEL_TRACE_H */
diff --git a/kernel/trace/trace_benchmark.c b/kernel/trace/trace_benchmark.c
index 22fee766081b..80e0b2aca703 100644
--- a/kernel/trace/trace_benchmark.c
+++ b/kernel/trace/trace_benchmark.c
@@ -159,13 +159,13 @@ static int benchmark_event_kthread(void *arg)
* wants to run, schedule in, but if the CPU is idle,
* we'll keep burning cycles.
*
- * Note the _rcu_qs() version of cond_resched() will
+ * Note the tasks_rcu_qs() version of cond_resched() will
* notify synchronize_rcu_tasks() that this thread has
* passed a quiescent state for rcu_tasks. Otherwise
* this thread will never voluntarily schedule which would
* block synchronize_rcu_tasks() indefinitely.
*/
- cond_resched();
+ cond_resched_tasks_rcu_qs();
}
return 0;
diff --git a/kernel/trace/trace_clock.c b/kernel/trace/trace_clock.c
index 5fdc779f411d..d8a188e0418a 100644
--- a/kernel/trace/trace_clock.c
+++ b/kernel/trace/trace_clock.c
@@ -96,7 +96,7 @@ u64 notrace trace_clock_global(void)
int this_cpu;
u64 now;
- local_irq_save(flags);
+ raw_local_irq_save(flags);
this_cpu = raw_smp_processor_id();
now = sched_clock_cpu(this_cpu);
@@ -122,7 +122,7 @@ u64 notrace trace_clock_global(void)
arch_spin_unlock(&trace_clock_struct.lock);
out:
- local_irq_restore(flags);
+ raw_local_irq_restore(flags);
return now;
}
diff --git a/kernel/trace/trace_entries.h b/kernel/trace/trace_entries.h
index e954ae3d82c0..1d67464ed95e 100644
--- a/kernel/trace/trace_entries.h
+++ b/kernel/trace/trace_entries.h
@@ -230,7 +230,7 @@ FTRACE_ENTRY(bprint, bprint_entry,
FILTER_OTHER
);
-FTRACE_ENTRY(print, print_entry,
+FTRACE_ENTRY_REG(print, print_entry,
TRACE_PRINT,
@@ -242,7 +242,9 @@ FTRACE_ENTRY(print, print_entry,
F_printk("%ps: %s",
(void *)__entry->ip, __entry->buf),
- FILTER_OTHER
+ FILTER_OTHER,
+
+ ftrace_event_register
);
FTRACE_ENTRY(raw_data, raw_data_entry,
@@ -356,7 +358,7 @@ FTRACE_ENTRY(hwlat, hwlat_entry,
__field( unsigned int, seqnum )
),
- F_printk("cnt:%u\tts:%010llu.%010lu\tinner:%llu\touter:%llunmi-ts:%llu\tnmi-count:%u\n",
+ F_printk("cnt:%u\tts:%010llu.%010lu\tinner:%llu\touter:%llu\tnmi-ts:%llu\tnmi-count:%u\n",
__entry->seqnum,
__entry->tv_sec,
__entry->tv_nsec,
diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c
index 55d6dff37daf..c79193e598f5 100644
--- a/kernel/trace/trace_event_perf.c
+++ b/kernel/trace/trace_event_perf.c
@@ -8,6 +8,7 @@
#include <linux/module.h>
#include <linux/kprobes.h>
#include "trace.h"
+#include "trace_probe.h"
static char __percpu *perf_trace_buf[PERF_NR_CONTEXTS];
@@ -237,6 +238,111 @@ void perf_trace_destroy(struct perf_event *p_event)
mutex_unlock(&event_mutex);
}
+#ifdef CONFIG_KPROBE_EVENTS
+int perf_kprobe_init(struct perf_event *p_event, bool is_retprobe)
+{
+ int ret;
+ char *func = NULL;
+ struct trace_event_call *tp_event;
+
+ if (p_event->attr.kprobe_func) {
+ func = kzalloc(KSYM_NAME_LEN, GFP_KERNEL);
+ if (!func)
+ return -ENOMEM;
+ ret = strncpy_from_user(
+ func, u64_to_user_ptr(p_event->attr.kprobe_func),
+ KSYM_NAME_LEN);
+ if (ret == KSYM_NAME_LEN)
+ ret = -E2BIG;
+ if (ret < 0)
+ goto out;
+
+ if (func[0] == '\0') {
+ kfree(func);
+ func = NULL;
+ }
+ }
+
+ tp_event = create_local_trace_kprobe(
+ func, (void *)(unsigned long)(p_event->attr.kprobe_addr),
+ p_event->attr.probe_offset, is_retprobe);
+ if (IS_ERR(tp_event)) {
+ ret = PTR_ERR(tp_event);
+ goto out;
+ }
+
+ ret = perf_trace_event_init(tp_event, p_event);
+ if (ret)
+ destroy_local_trace_kprobe(tp_event);
+out:
+ kfree(func);
+ return ret;
+}
+
+void perf_kprobe_destroy(struct perf_event *p_event)
+{
+ perf_trace_event_close(p_event);
+ perf_trace_event_unreg(p_event);
+
+ destroy_local_trace_kprobe(p_event->tp_event);
+}
+#endif /* CONFIG_KPROBE_EVENTS */
+
+#ifdef CONFIG_UPROBE_EVENTS
+int perf_uprobe_init(struct perf_event *p_event, bool is_retprobe)
+{
+ int ret;
+ char *path = NULL;
+ struct trace_event_call *tp_event;
+
+ if (!p_event->attr.uprobe_path)
+ return -EINVAL;
+ path = kzalloc(PATH_MAX, GFP_KERNEL);
+ if (!path)
+ return -ENOMEM;
+ ret = strncpy_from_user(
+ path, u64_to_user_ptr(p_event->attr.uprobe_path), PATH_MAX);
+ if (ret == PATH_MAX)
+ return -E2BIG;
+ if (ret < 0)
+ goto out;
+ if (path[0] == '\0') {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ tp_event = create_local_trace_uprobe(
+ path, p_event->attr.probe_offset, is_retprobe);
+ if (IS_ERR(tp_event)) {
+ ret = PTR_ERR(tp_event);
+ goto out;
+ }
+
+ /*
+ * local trace_uprobe need to hold event_mutex to call
+ * uprobe_buffer_enable() and uprobe_buffer_disable().
+ * event_mutex is not required for local trace_kprobes.
+ */
+ mutex_lock(&event_mutex);
+ ret = perf_trace_event_init(tp_event, p_event);
+ if (ret)
+ destroy_local_trace_uprobe(tp_event);
+ mutex_unlock(&event_mutex);
+out:
+ kfree(path);
+ return ret;
+}
+
+void perf_uprobe_destroy(struct perf_event *p_event)
+{
+ mutex_lock(&event_mutex);
+ perf_trace_event_close(p_event);
+ perf_trace_event_unreg(p_event);
+ mutex_unlock(&event_mutex);
+ destroy_local_trace_uprobe(p_event->tp_event);
+}
+#endif /* CONFIG_UPROBE_EVENTS */
+
int perf_trace_add(struct perf_event *p_event, int flags)
{
struct trace_event_call *tp_event = p_event->tp_event;
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 05c7172c6667..14ff4ff3caab 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -2007,16 +2007,18 @@ event_create_dir(struct dentry *parent, struct trace_event_file *file)
return -1;
}
}
- trace_create_file("filter", 0644, file->dir, file,
- &ftrace_event_filter_fops);
/*
* Only event directories that can be enabled should have
- * triggers.
+ * triggers or filters.
*/
- if (!(call->flags & TRACE_EVENT_FL_IGNORE_ENABLE))
+ if (!(call->flags & TRACE_EVENT_FL_IGNORE_ENABLE)) {
+ trace_create_file("filter", 0644, file->dir, file,
+ &ftrace_event_filter_fops);
+
trace_create_file("trigger", 0644, file->dir, file,
&event_trigger_fops);
+ }
#ifdef CONFIG_HIST_TRIGGERS
trace_create_file("hist", 0444, file->dir, file,
@@ -2473,8 +2475,9 @@ __trace_add_event_dirs(struct trace_array *tr)
}
}
+/* Returns any file that matches the system and event */
struct trace_event_file *
-find_event_file(struct trace_array *tr, const char *system, const char *event)
+__find_event_file(struct trace_array *tr, const char *system, const char *event)
{
struct trace_event_file *file;
struct trace_event_call *call;
@@ -2485,10 +2488,7 @@ find_event_file(struct trace_array *tr, const char *system, const char *event)
call = file->event_call;
name = trace_event_name(call);
- if (!name || !call->class || !call->class->reg)
- continue;
-
- if (call->flags & TRACE_EVENT_FL_IGNORE_ENABLE)
+ if (!name || !call->class)
continue;
if (strcmp(event, name) == 0 &&
@@ -2498,6 +2498,20 @@ find_event_file(struct trace_array *tr, const char *system, const char *event)
return NULL;
}
+/* Returns valid trace event files that match system and event */
+struct trace_event_file *
+find_event_file(struct trace_array *tr, const char *system, const char *event)
+{
+ struct trace_event_file *file;
+
+ file = __find_event_file(tr, system, event);
+ if (!file || !file->event_call->class->reg ||
+ file->event_call->flags & TRACE_EVENT_FL_IGNORE_ENABLE)
+ return NULL;
+
+ return file;
+}
+
#ifdef CONFIG_DYNAMIC_FTRACE
/* Avoid typos */
@@ -3132,7 +3146,7 @@ static __init int event_trace_enable_again(void)
early_initcall(event_trace_enable_again);
-static __init int event_trace_init(void)
+__init int event_trace_init(void)
{
struct trace_array *tr;
struct dentry *d_tracer;
@@ -3177,8 +3191,6 @@ void __init trace_event_init(void)
event_trace_enable();
}
-fs_initcall(event_trace_init);
-
#ifdef CONFIG_FTRACE_STARTUP_TEST
static DEFINE_SPINLOCK(test_spinlock);
diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index a764aec3c9a1..0171407d231f 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -33,163 +33,595 @@
"# Only events with the given fields will be affected.\n" \
"# If no events are modified, an error message will be displayed here"
-enum filter_op_ids
-{
- OP_OR,
- OP_AND,
- OP_GLOB,
- OP_NE,
- OP_EQ,
- OP_LT,
- OP_LE,
- OP_GT,
- OP_GE,
- OP_BAND,
- OP_NOT,
- OP_NONE,
- OP_OPEN_PAREN,
-};
+/* Due to token parsing '<=' must be before '<' and '>=' must be before '>' */
+#define OPS \
+ C( OP_GLOB, "~" ), \
+ C( OP_NE, "!=" ), \
+ C( OP_EQ, "==" ), \
+ C( OP_LE, "<=" ), \
+ C( OP_LT, "<" ), \
+ C( OP_GE, ">=" ), \
+ C( OP_GT, ">" ), \
+ C( OP_BAND, "&" ), \
+ C( OP_MAX, NULL )
-struct filter_op {
- int id;
- char *string;
- int precedence;
-};
+#undef C
+#define C(a, b) a
-/* Order must be the same as enum filter_op_ids above */
-static struct filter_op filter_ops[] = {
- { OP_OR, "||", 1 },
- { OP_AND, "&&", 2 },
- { OP_GLOB, "~", 4 },
- { OP_NE, "!=", 4 },
- { OP_EQ, "==", 4 },
- { OP_LT, "<", 5 },
- { OP_LE, "<=", 5 },
- { OP_GT, ">", 5 },
- { OP_GE, ">=", 5 },
- { OP_BAND, "&", 6 },
- { OP_NOT, "!", 6 },
- { OP_NONE, "OP_NONE", 0 },
- { OP_OPEN_PAREN, "(", 0 },
-};
+enum filter_op_ids { OPS };
-enum {
- FILT_ERR_NONE,
- FILT_ERR_INVALID_OP,
- FILT_ERR_UNBALANCED_PAREN,
- FILT_ERR_TOO_MANY_OPERANDS,
- FILT_ERR_OPERAND_TOO_LONG,
- FILT_ERR_FIELD_NOT_FOUND,
- FILT_ERR_ILLEGAL_FIELD_OP,
- FILT_ERR_ILLEGAL_INTVAL,
- FILT_ERR_BAD_SUBSYS_FILTER,
- FILT_ERR_TOO_MANY_PREDS,
- FILT_ERR_MISSING_FIELD,
- FILT_ERR_INVALID_FILTER,
- FILT_ERR_IP_FIELD_ONLY,
- FILT_ERR_ILLEGAL_NOT_OP,
-};
+#undef C
+#define C(a, b) b
-static char *err_text[] = {
- "No error",
- "Invalid operator",
- "Unbalanced parens",
- "Too many operands",
- "Operand too long",
- "Field not found",
- "Illegal operation for field type",
- "Illegal integer value",
- "Couldn't find or set field in one of a subsystem's events",
- "Too many terms in predicate expression",
- "Missing field name and/or value",
- "Meaningless filter expression",
- "Only 'ip' field is supported for function trace",
- "Illegal use of '!'",
-};
+static const char * ops[] = { OPS };
-struct opstack_op {
- enum filter_op_ids op;
- struct list_head list;
-};
+/*
+ * pred functions are OP_LE, OP_LT, OP_GE, OP_GT, and OP_BAND
+ * pred_funcs_##type below must match the order of them above.
+ */
+#define PRED_FUNC_START OP_LE
+#define PRED_FUNC_MAX (OP_BAND - PRED_FUNC_START)
+
+#define ERRORS \
+ C(NONE, "No error"), \
+ C(INVALID_OP, "Invalid operator"), \
+ C(TOO_MANY_OPEN, "Too many '('"), \
+ C(TOO_MANY_CLOSE, "Too few '('"), \
+ C(MISSING_QUOTE, "Missing matching quote"), \
+ C(OPERAND_TOO_LONG, "Operand too long"), \
+ C(EXPECT_STRING, "Expecting string field"), \
+ C(EXPECT_DIGIT, "Expecting numeric field"), \
+ C(ILLEGAL_FIELD_OP, "Illegal operation for field type"), \
+ C(FIELD_NOT_FOUND, "Field not found"), \
+ C(ILLEGAL_INTVAL, "Illegal integer value"), \
+ C(BAD_SUBSYS_FILTER, "Couldn't find or set field in one of a subsystem's events"), \
+ C(TOO_MANY_PREDS, "Too many terms in predicate expression"), \
+ C(INVALID_FILTER, "Meaningless filter expression"), \
+ C(IP_FIELD_ONLY, "Only 'ip' field is supported for function trace"), \
+ C(INVALID_VALUE, "Invalid value (did you forget quotes)?"),
+
+#undef C
+#define C(a, b) FILT_ERR_##a
+
+enum { ERRORS };
+
+#undef C
+#define C(a, b) b
+
+static char *err_text[] = { ERRORS };
+
+/* Called after a '!' character but "!=" and "!~" are not "not"s */
+static bool is_not(const char *str)
+{
+ switch (str[1]) {
+ case '=':
+ case '~':
+ return false;
+ }
+ return true;
+}
-struct postfix_elt {
- enum filter_op_ids op;
- char *operand;
- struct list_head list;
+/**
+ * prog_entry - a singe entry in the filter program
+ * @target: Index to jump to on a branch (actually one minus the index)
+ * @when_to_branch: The value of the result of the predicate to do a branch
+ * @pred: The predicate to execute.
+ */
+struct prog_entry {
+ int target;
+ int when_to_branch;
+ struct filter_pred *pred;
};
-struct filter_parse_state {
- struct filter_op *ops;
- struct list_head opstack;
- struct list_head postfix;
+/**
+ * update_preds- assign a program entry a label target
+ * @prog: The program array
+ * @N: The index of the current entry in @prog
+ * @when_to_branch: What to assign a program entry for its branch condition
+ *
+ * The program entry at @N has a target that points to the index of a program
+ * entry that can have its target and when_to_branch fields updated.
+ * Update the current program entry denoted by index @N target field to be
+ * that of the updated entry. This will denote the entry to update if
+ * we are processing an "||" after an "&&"
+ */
+static void update_preds(struct prog_entry *prog, int N, int invert)
+{
+ int t, s;
+
+ t = prog[N].target;
+ s = prog[t].target;
+ prog[t].when_to_branch = invert;
+ prog[t].target = N;
+ prog[N].target = s;
+}
+
+struct filter_parse_error {
int lasterr;
int lasterr_pos;
-
- struct {
- char *string;
- unsigned int cnt;
- unsigned int tail;
- } infix;
-
- struct {
- char string[MAX_FILTER_STR_VAL];
- int pos;
- unsigned int tail;
- } operand;
};
-struct pred_stack {
- struct filter_pred **preds;
- int index;
+static void parse_error(struct filter_parse_error *pe, int err, int pos)
+{
+ pe->lasterr = err;
+ pe->lasterr_pos = pos;
+}
+
+typedef int (*parse_pred_fn)(const char *str, void *data, int pos,
+ struct filter_parse_error *pe,
+ struct filter_pred **pred);
+
+enum {
+ INVERT = 1,
+ PROCESS_AND = 2,
+ PROCESS_OR = 4,
};
-/* If not of not match is equal to not of not, then it is a match */
+/*
+ * Without going into a formal proof, this explains the method that is used in
+ * parsing the logical expressions.
+ *
+ * For example, if we have: "a && !(!b || (c && g)) || d || e && !f"
+ * The first pass will convert it into the following program:
+ *
+ * n1: r=a; l1: if (!r) goto l4;
+ * n2: r=b; l2: if (!r) goto l4;
+ * n3: r=c; r=!r; l3: if (r) goto l4;
+ * n4: r=g; r=!r; l4: if (r) goto l5;
+ * n5: r=d; l5: if (r) goto T
+ * n6: r=e; l6: if (!r) goto l7;
+ * n7: r=f; r=!r; l7: if (!r) goto F
+ * T: return TRUE
+ * F: return FALSE
+ *
+ * To do this, we use a data structure to represent each of the above
+ * predicate and conditions that has:
+ *
+ * predicate, when_to_branch, invert, target
+ *
+ * The "predicate" will hold the function to determine the result "r".
+ * The "when_to_branch" denotes what "r" should be if a branch is to be taken
+ * "&&" would contain "!r" or (0) and "||" would contain "r" or (1).
+ * The "invert" holds whether the value should be reversed before testing.
+ * The "target" contains the label "l#" to jump to.
+ *
+ * A stack is created to hold values when parentheses are used.
+ *
+ * To simplify the logic, the labels will start at 0 and not 1.
+ *
+ * The possible invert values are 1 and 0. The number of "!"s that are in scope
+ * before the predicate determines the invert value, if the number is odd then
+ * the invert value is 1 and 0 otherwise. This means the invert value only
+ * needs to be toggled when a new "!" is introduced compared to what is stored
+ * on the stack, where parentheses were used.
+ *
+ * The top of the stack and "invert" are initialized to zero.
+ *
+ * ** FIRST PASS **
+ *
+ * #1 A loop through all the tokens is done:
+ *
+ * #2 If the token is an "(", the stack is push, and the current stack value
+ * gets the current invert value, and the loop continues to the next token.
+ * The top of the stack saves the "invert" value to keep track of what
+ * the current inversion is. As "!(a && !b || c)" would require all
+ * predicates being affected separately by the "!" before the parentheses.
+ * And that would end up being equivalent to "(!a || b) && !c"
+ *
+ * #3 If the token is an "!", the current "invert" value gets inverted, and
+ * the loop continues. Note, if the next token is a predicate, then
+ * this "invert" value is only valid for the current program entry,
+ * and does not affect other predicates later on.
+ *
+ * The only other acceptable token is the predicate string.
+ *
+ * #4 A new entry into the program is added saving: the predicate and the
+ * current value of "invert". The target is currently assigned to the
+ * previous program index (this will not be its final value).
+ *
+ * #5 We now enter another loop and look at the next token. The only valid
+ * tokens are ")", "&&", "||" or end of the input string "\0".
+ *
+ * #6 The invert variable is reset to the current value saved on the top of
+ * the stack.
+ *
+ * #7 The top of the stack holds not only the current invert value, but also
+ * if a "&&" or "||" needs to be processed. Note, the "&&" takes higher
+ * precedence than "||". That is "a && b || c && d" is equivalent to
+ * "(a && b) || (c && d)". Thus the first thing to do is to see if "&&" needs
+ * to be processed. This is the case if an "&&" was the last token. If it was
+ * then we call update_preds(). This takes the program, the current index in
+ * the program, and the current value of "invert". More will be described
+ * below about this function.
+ *
+ * #8 If the next token is "&&" then we set a flag in the top of the stack
+ * that denotes that "&&" needs to be processed, break out of this loop
+ * and continue with the outer loop.
+ *
+ * #9 Otherwise, if a "||" needs to be processed then update_preds() is called.
+ * This is called with the program, the current index in the program, but
+ * this time with an inverted value of "invert" (that is !invert). This is
+ * because the value taken will become the "when_to_branch" value of the
+ * program.
+ * Note, this is called when the next token is not an "&&". As stated before,
+ * "&&" takes higher precedence, and "||" should not be processed yet if the
+ * next logical operation is "&&".
+ *
+ * #10 If the next token is "||" then we set a flag in the top of the stack
+ * that denotes that "||" needs to be processed, break out of this loop
+ * and continue with the outer loop.
+ *
+ * #11 If this is the end of the input string "\0" then we break out of both
+ * loops.
+ *
+ * #12 Otherwise, the next token is ")", where we pop the stack and continue
+ * this inner loop.
+ *
+ * Now to discuss the update_pred() function, as that is key to the setting up
+ * of the program. Remember the "target" of the program is initialized to the
+ * previous index and not the "l" label. The target holds the index into the
+ * program that gets affected by the operand. Thus if we have something like
+ * "a || b && c", when we process "a" the target will be "-1" (undefined).
+ * When we process "b", its target is "0", which is the index of "a", as that's
+ * the predicate that is affected by "||". But because the next token after "b"
+ * is "&&" we don't call update_preds(). Instead continue to "c". As the
+ * next token after "c" is not "&&" but the end of input, we first process the
+ * "&&" by calling update_preds() for the "&&" then we process the "||" by
+ * callin updates_preds() with the values for processing "||".
+ *
+ * What does that mean? What update_preds() does is to first save the "target"
+ * of the program entry indexed by the current program entry's "target"
+ * (remember the "target" is initialized to previous program entry), and then
+ * sets that "target" to the current index which represents the label "l#".
+ * That entry's "when_to_branch" is set to the value passed in (the "invert"
+ * or "!invert"). Then it sets the current program entry's target to the saved
+ * "target" value (the old value of the program that had its "target" updated
+ * to the label).
+ *
+ * Looking back at "a || b && c", we have the following steps:
+ * "a" - prog[0] = { "a", X, -1 } // pred, when_to_branch, target
+ * "||" - flag that we need to process "||"; continue outer loop
+ * "b" - prog[1] = { "b", X, 0 }
+ * "&&" - flag that we need to process "&&"; continue outer loop
+ * (Notice we did not process "||")
+ * "c" - prog[2] = { "c", X, 1 }
+ * update_preds(prog, 2, 0); // invert = 0 as we are processing "&&"
+ * t = prog[2].target; // t = 1
+ * s = prog[t].target; // s = 0
+ * prog[t].target = 2; // Set target to "l2"
+ * prog[t].when_to_branch = 0;
+ * prog[2].target = s;
+ * update_preds(prog, 2, 1); // invert = 1 as we are now processing "||"
+ * t = prog[2].target; // t = 0
+ * s = prog[t].target; // s = -1
+ * prog[t].target = 2; // Set target to "l2"
+ * prog[t].when_to_branch = 1;
+ * prog[2].target = s;
+ *
+ * #13 Which brings us to the final step of the first pass, which is to set
+ * the last program entry's when_to_branch and target, which will be
+ * when_to_branch = 0; target = N; ( the label after the program entry after
+ * the last program entry processed above).
+ *
+ * If we denote "TRUE" to be the entry after the last program entry processed,
+ * and "FALSE" the program entry after that, we are now done with the first
+ * pass.
+ *
+ * Making the above "a || b && c" have a progam of:
+ * prog[0] = { "a", 1, 2 }
+ * prog[1] = { "b", 0, 2 }
+ * prog[2] = { "c", 0, 3 }
+ *
+ * Which translates into:
+ * n0: r = a; l0: if (r) goto l2;
+ * n1: r = b; l1: if (!r) goto l2;
+ * n2: r = c; l2: if (!r) goto l3; // Which is the same as "goto F;"
+ * T: return TRUE; l3:
+ * F: return FALSE
+ *
+ * Although, after the first pass, the program is correct, it is
+ * inefficient. The simple sample of "a || b && c" could be easily been
+ * converted into:
+ * n0: r = a; if (r) goto T
+ * n1: r = b; if (!r) goto F
+ * n2: r = c; if (!r) goto F
+ * T: return TRUE;
+ * F: return FALSE;
+ *
+ * The First Pass is over the input string. The next too passes are over
+ * the program itself.
+ *
+ * ** SECOND PASS **
+ *
+ * Which brings us to the second pass. If a jump to a label has the
+ * same condition as that label, it can instead jump to its target.
+ * The original example of "a && !(!b || (c && g)) || d || e && !f"
+ * where the first pass gives us:
+ *
+ * n1: r=a; l1: if (!r) goto l4;
+ * n2: r=b; l2: if (!r) goto l4;
+ * n3: r=c; r=!r; l3: if (r) goto l4;
+ * n4: r=g; r=!r; l4: if (r) goto l5;
+ * n5: r=d; l5: if (r) goto T
+ * n6: r=e; l6: if (!r) goto l7;
+ * n7: r=f; r=!r; l7: if (!r) goto F:
+ * T: return TRUE;
+ * F: return FALSE
+ *
+ * We can see that "l3: if (r) goto l4;" and at l4, we have "if (r) goto l5;".
+ * And "l5: if (r) goto T", we could optimize this by converting l3 and l4
+ * to go directly to T. To accomplish this, we start from the last
+ * entry in the program and work our way back. If the target of the entry
+ * has the same "when_to_branch" then we could use that entry's target.
+ * Doing this, the above would end up as:
+ *
+ * n1: r=a; l1: if (!r) goto l4;
+ * n2: r=b; l2: if (!r) goto l4;
+ * n3: r=c; r=!r; l3: if (r) goto T;
+ * n4: r=g; r=!r; l4: if (r) goto T;
+ * n5: r=d; l5: if (r) goto T;
+ * n6: r=e; l6: if (!r) goto F;
+ * n7: r=f; r=!r; l7: if (!r) goto F;
+ * T: return TRUE
+ * F: return FALSE
+ *
+ * In that same pass, if the "when_to_branch" doesn't match, we can simply
+ * go to the program entry after the label. That is, "l2: if (!r) goto l4;"
+ * where "l4: if (r) goto T;", then we can convert l2 to be:
+ * "l2: if (!r) goto n5;".
+ *
+ * This will have the second pass give us:
+ * n1: r=a; l1: if (!r) goto n5;
+ * n2: r=b; l2: if (!r) goto n5;
+ * n3: r=c; r=!r; l3: if (r) goto T;
+ * n4: r=g; r=!r; l4: if (r) goto T;
+ * n5: r=d; l5: if (r) goto T
+ * n6: r=e; l6: if (!r) goto F;
+ * n7: r=f; r=!r; l7: if (!r) goto F
+ * T: return TRUE
+ * F: return FALSE
+ *
+ * Notice, all the "l#" labels are no longer used, and they can now
+ * be discarded.
+ *
+ * ** THIRD PASS **
+ *
+ * For the third pass we deal with the inverts. As they simply just
+ * make the "when_to_branch" get inverted, a simple loop over the
+ * program to that does: "when_to_branch ^= invert;" will do the
+ * job, leaving us with:
+ * n1: r=a; if (!r) goto n5;
+ * n2: r=b; if (!r) goto n5;
+ * n3: r=c: if (!r) goto T;
+ * n4: r=g; if (!r) goto T;
+ * n5: r=d; if (r) goto T
+ * n6: r=e; if (!r) goto F;
+ * n7: r=f; if (r) goto F
+ * T: return TRUE
+ * F: return FALSE
+ *
+ * As "r = a; if (!r) goto n5;" is obviously the same as
+ * "if (!a) goto n5;" without doing anything we can interperate the
+ * program as:
+ * n1: if (!a) goto n5;
+ * n2: if (!b) goto n5;
+ * n3: if (!c) goto T;
+ * n4: if (!g) goto T;
+ * n5: if (d) goto T
+ * n6: if (!e) goto F;
+ * n7: if (f) goto F
+ * T: return TRUE
+ * F: return FALSE
+ *
+ * Since the inverts are discarded at the end, there's no reason to store
+ * them in the program array (and waste memory). A separate array to hold
+ * the inverts is used and freed at the end.
+ */
+static struct prog_entry *
+predicate_parse(const char *str, int nr_parens, int nr_preds,
+ parse_pred_fn parse_pred, void *data,
+ struct filter_parse_error *pe)
+{
+ struct prog_entry *prog_stack;
+ struct prog_entry *prog;
+ const char *ptr = str;
+ char *inverts = NULL;
+ int *op_stack;
+ int *top;
+ int invert = 0;
+ int ret = -ENOMEM;
+ int len;
+ int N = 0;
+ int i;
+
+ nr_preds += 2; /* For TRUE and FALSE */
+
+ op_stack = kmalloc(sizeof(*op_stack) * nr_parens, GFP_KERNEL);
+ if (!op_stack)
+ return ERR_PTR(-ENOMEM);
+ prog_stack = kmalloc(sizeof(*prog_stack) * nr_preds, GFP_KERNEL);
+ if (!prog_stack) {
+ parse_error(pe, -ENOMEM, 0);
+ goto out_free;
+ }
+ inverts = kmalloc(sizeof(*inverts) * nr_preds, GFP_KERNEL);
+ if (!inverts) {
+ parse_error(pe, -ENOMEM, 0);
+ goto out_free;
+ }
+
+ top = op_stack;
+ prog = prog_stack;
+ *top = 0;
+
+ /* First pass */
+ while (*ptr) { /* #1 */
+ const char *next = ptr++;
+
+ if (isspace(*next))
+ continue;
+
+ switch (*next) {
+ case '(': /* #2 */
+ if (top - op_stack > nr_parens)
+ return ERR_PTR(-EINVAL);
+ *(++top) = invert;
+ continue;
+ case '!': /* #3 */
+ if (!is_not(next))
+ break;
+ invert = !invert;
+ continue;
+ }
+
+ if (N >= nr_preds) {
+ parse_error(pe, FILT_ERR_TOO_MANY_PREDS, next - str);
+ goto out_free;
+ }
+
+ inverts[N] = invert; /* #4 */
+ prog[N].target = N-1;
+
+ len = parse_pred(next, data, ptr - str, pe, &prog[N].pred);
+ if (len < 0) {
+ ret = len;
+ goto out_free;
+ }
+ ptr = next + len;
+
+ N++;
+
+ ret = -1;
+ while (1) { /* #5 */
+ next = ptr++;
+ if (isspace(*next))
+ continue;
+
+ switch (*next) {
+ case ')':
+ case '\0':
+ break;
+ case '&':
+ case '|':
+ if (next[1] == next[0]) {
+ ptr++;
+ break;
+ }
+ default:
+ parse_error(pe, FILT_ERR_TOO_MANY_PREDS,
+ next - str);
+ goto out_free;
+ }
+
+ invert = *top & INVERT;
+
+ if (*top & PROCESS_AND) { /* #7 */
+ update_preds(prog, N - 1, invert);
+ *top &= ~PROCESS_AND;
+ }
+ if (*next == '&') { /* #8 */
+ *top |= PROCESS_AND;
+ break;
+ }
+ if (*top & PROCESS_OR) { /* #9 */
+ update_preds(prog, N - 1, !invert);
+ *top &= ~PROCESS_OR;
+ }
+ if (*next == '|') { /* #10 */
+ *top |= PROCESS_OR;
+ break;
+ }
+ if (!*next) /* #11 */
+ goto out;
+
+ if (top == op_stack) {
+ ret = -1;
+ /* Too few '(' */
+ parse_error(pe, FILT_ERR_TOO_MANY_CLOSE, ptr - str);
+ goto out_free;
+ }
+ top--; /* #12 */
+ }
+ }
+ out:
+ if (top != op_stack) {
+ /* Too many '(' */
+ parse_error(pe, FILT_ERR_TOO_MANY_OPEN, ptr - str);
+ goto out_free;
+ }
+
+ prog[N].pred = NULL; /* #13 */
+ prog[N].target = 1; /* TRUE */
+ prog[N+1].pred = NULL;
+ prog[N+1].target = 0; /* FALSE */
+ prog[N-1].target = N;
+ prog[N-1].when_to_branch = false;
+
+ /* Second Pass */
+ for (i = N-1 ; i--; ) {
+ int target = prog[i].target;
+ if (prog[i].when_to_branch == prog[target].when_to_branch)
+ prog[i].target = prog[target].target;
+ }
+
+ /* Third Pass */
+ for (i = 0; i < N; i++) {
+ invert = inverts[i] ^ prog[i].when_to_branch;
+ prog[i].when_to_branch = invert;
+ /* Make sure the program always moves forward */
+ if (WARN_ON(prog[i].target <= i)) {
+ ret = -EINVAL;
+ goto out_free;
+ }
+ }
+
+ return prog;
+out_free:
+ kfree(op_stack);
+ kfree(prog_stack);
+ kfree(inverts);
+ return ERR_PTR(ret);
+}
+
#define DEFINE_COMPARISON_PRED(type) \
static int filter_pred_LT_##type(struct filter_pred *pred, void *event) \
{ \
type *addr = (type *)(event + pred->offset); \
type val = (type)pred->val; \
- int match = (*addr < val); \
- return !!match == !pred->not; \
+ return *addr < val; \
} \
static int filter_pred_LE_##type(struct filter_pred *pred, void *event) \
{ \
type *addr = (type *)(event + pred->offset); \
type val = (type)pred->val; \
- int match = (*addr <= val); \
- return !!match == !pred->not; \
+ return *addr <= val; \
} \
static int filter_pred_GT_##type(struct filter_pred *pred, void *event) \
{ \
type *addr = (type *)(event + pred->offset); \
type val = (type)pred->val; \
- int match = (*addr > val); \
- return !!match == !pred->not; \
+ return *addr > val; \
} \
static int filter_pred_GE_##type(struct filter_pred *pred, void *event) \
{ \
type *addr = (type *)(event + pred->offset); \
type val = (type)pred->val; \
- int match = (*addr >= val); \
- return !!match == !pred->not; \
+ return *addr >= val; \
} \
static int filter_pred_BAND_##type(struct filter_pred *pred, void *event) \
{ \
type *addr = (type *)(event + pred->offset); \
type val = (type)pred->val; \
- int match = !!(*addr & val); \
- return match == !pred->not; \
+ return !!(*addr & val); \
} \
static const filter_pred_fn_t pred_funcs_##type[] = { \
- filter_pred_LT_##type, \
filter_pred_LE_##type, \
- filter_pred_GT_##type, \
+ filter_pred_LT_##type, \
filter_pred_GE_##type, \
+ filter_pred_GT_##type, \
filter_pred_BAND_##type, \
};
-#define PRED_FUNC_START OP_LT
-
#define DEFINE_EQUALITY_PRED(size) \
static int filter_pred_##size(struct filter_pred *pred, void *event) \
{ \
@@ -272,44 +704,36 @@ static int filter_pred_strloc(struct filter_pred *pred, void *event)
static int filter_pred_cpu(struct filter_pred *pred, void *event)
{
int cpu, cmp;
- int match = 0;
cpu = raw_smp_processor_id();
cmp = pred->val;
switch (pred->op) {
case OP_EQ:
- match = cpu == cmp;
- break;
+ return cpu == cmp;
+ case OP_NE:
+ return cpu != cmp;
case OP_LT:
- match = cpu < cmp;
- break;
+ return cpu < cmp;
case OP_LE:
- match = cpu <= cmp;
- break;
+ return cpu <= cmp;
case OP_GT:
- match = cpu > cmp;
- break;
+ return cpu > cmp;
case OP_GE:
- match = cpu >= cmp;
- break;
+ return cpu >= cmp;
default:
- break;
+ return 0;
}
-
- return !!match == !pred->not;
}
/* Filter predicate for COMM. */
static int filter_pred_comm(struct filter_pred *pred, void *event)
{
- int cmp, match;
+ int cmp;
cmp = pred->regex.match(current->comm, &pred->regex,
- pred->regex.field_len);
- match = cmp ^ pred->not;
-
- return match;
+ TASK_COMM_LEN);
+ return cmp ^ pred->not;
}
static int filter_pred_none(struct filter_pred *pred, void *event)
@@ -326,28 +750,32 @@ static int filter_pred_none(struct filter_pred *pred, void *event)
*
* Note:
* - @str might not be NULL-terminated if it's of type DYN_STRING
- * or STATIC_STRING
+ * or STATIC_STRING, unless @len is zero.
*/
static int regex_match_full(char *str, struct regex *r, int len)
{
- if (strncmp(str, r->pattern, len) == 0)
- return 1;
- return 0;
+ /* len of zero means str is dynamic and ends with '\0' */
+ if (!len)
+ return strcmp(str, r->pattern) == 0;
+
+ return strncmp(str, r->pattern, len) == 0;
}
static int regex_match_front(char *str, struct regex *r, int len)
{
- if (strncmp(str, r->pattern, r->len) == 0)
- return 1;
- return 0;
+ if (len && len < r->len)
+ return 0;
+
+ return strncmp(str, r->pattern, r->len) == 0;
}
static int regex_match_middle(char *str, struct regex *r, int len)
{
- if (strnstr(str, r->pattern, len))
- return 1;
- return 0;
+ if (!len)
+ return strstr(str, r->pattern) != NULL;
+
+ return strnstr(str, r->pattern, len) != NULL;
}
static int regex_match_end(char *str, struct regex *r, int len)
@@ -366,6 +794,7 @@ static int regex_match_glob(char *str, struct regex *r, int len __maybe_unused)
return 1;
return 0;
}
+
/**
* filter_parse_regex - parse a basic regex
* @buff: the raw regex
@@ -426,10 +855,9 @@ static void filter_build_regex(struct filter_pred *pred)
struct regex *r = &pred->regex;
char *search;
enum regex_type type = MATCH_FULL;
- int not = 0;
if (pred->op == OP_GLOB) {
- type = filter_parse_regex(r->pattern, r->len, &search, &not);
+ type = filter_parse_regex(r->pattern, r->len, &search, &pred->not);
r->len = strlen(search);
memmove(r->pattern, search, r->len+1);
}
@@ -451,210 +879,32 @@ static void filter_build_regex(struct filter_pred *pred)
r->match = regex_match_glob;
break;
}
-
- pred->not ^= not;
-}
-
-enum move_type {
- MOVE_DOWN,
- MOVE_UP_FROM_LEFT,
- MOVE_UP_FROM_RIGHT
-};
-
-static struct filter_pred *
-get_pred_parent(struct filter_pred *pred, struct filter_pred *preds,
- int index, enum move_type *move)
-{
- if (pred->parent & FILTER_PRED_IS_RIGHT)
- *move = MOVE_UP_FROM_RIGHT;
- else
- *move = MOVE_UP_FROM_LEFT;
- pred = &preds[pred->parent & ~FILTER_PRED_IS_RIGHT];
-
- return pred;
-}
-
-enum walk_return {
- WALK_PRED_ABORT,
- WALK_PRED_PARENT,
- WALK_PRED_DEFAULT,
-};
-
-typedef int (*filter_pred_walkcb_t) (enum move_type move,
- struct filter_pred *pred,
- int *err, void *data);
-
-static int walk_pred_tree(struct filter_pred *preds,
- struct filter_pred *root,
- filter_pred_walkcb_t cb, void *data)
-{
- struct filter_pred *pred = root;
- enum move_type move = MOVE_DOWN;
- int done = 0;
-
- if (!preds)
- return -EINVAL;
-
- do {
- int err = 0, ret;
-
- ret = cb(move, pred, &err, data);
- if (ret == WALK_PRED_ABORT)
- return err;
- if (ret == WALK_PRED_PARENT)
- goto get_parent;
-
- switch (move) {
- case MOVE_DOWN:
- if (pred->left != FILTER_PRED_INVALID) {
- pred = &preds[pred->left];
- continue;
- }
- goto get_parent;
- case MOVE_UP_FROM_LEFT:
- pred = &preds[pred->right];
- move = MOVE_DOWN;
- continue;
- case MOVE_UP_FROM_RIGHT:
- get_parent:
- if (pred == root)
- break;
- pred = get_pred_parent(pred, preds,
- pred->parent,
- &move);
- continue;
- }
- done = 1;
- } while (!done);
-
- /* We are fine. */
- return 0;
-}
-
-/*
- * A series of AND or ORs where found together. Instead of
- * climbing up and down the tree branches, an array of the
- * ops were made in order of checks. We can just move across
- * the array and short circuit if needed.
- */
-static int process_ops(struct filter_pred *preds,
- struct filter_pred *op, void *rec)
-{
- struct filter_pred *pred;
- int match = 0;
- int type;
- int i;
-
- /*
- * Micro-optimization: We set type to true if op
- * is an OR and false otherwise (AND). Then we
- * just need to test if the match is equal to
- * the type, and if it is, we can short circuit the
- * rest of the checks:
- *
- * if ((match && op->op == OP_OR) ||
- * (!match && op->op == OP_AND))
- * return match;
- */
- type = op->op == OP_OR;
-
- for (i = 0; i < op->val; i++) {
- pred = &preds[op->ops[i]];
- if (!WARN_ON_ONCE(!pred->fn))
- match = pred->fn(pred, rec);
- if (!!match == type)
- break;
- }
- /* If not of not match is equal to not of not, then it is a match */
- return !!match == !op->not;
-}
-
-struct filter_match_preds_data {
- struct filter_pred *preds;
- int match;
- void *rec;
-};
-
-static int filter_match_preds_cb(enum move_type move, struct filter_pred *pred,
- int *err, void *data)
-{
- struct filter_match_preds_data *d = data;
-
- *err = 0;
- switch (move) {
- case MOVE_DOWN:
- /* only AND and OR have children */
- if (pred->left != FILTER_PRED_INVALID) {
- /* If ops is set, then it was folded. */
- if (!pred->ops)
- return WALK_PRED_DEFAULT;
- /* We can treat folded ops as a leaf node */
- d->match = process_ops(d->preds, pred, d->rec);
- } else {
- if (!WARN_ON_ONCE(!pred->fn))
- d->match = pred->fn(pred, d->rec);
- }
-
- return WALK_PRED_PARENT;
- case MOVE_UP_FROM_LEFT:
- /*
- * Check for short circuits.
- *
- * Optimization: !!match == (pred->op == OP_OR)
- * is the same as:
- * if ((match && pred->op == OP_OR) ||
- * (!match && pred->op == OP_AND))
- */
- if (!!d->match == (pred->op == OP_OR))
- return WALK_PRED_PARENT;
- break;
- case MOVE_UP_FROM_RIGHT:
- break;
- }
-
- return WALK_PRED_DEFAULT;
}
/* return 1 if event matches, 0 otherwise (discard) */
int filter_match_preds(struct event_filter *filter, void *rec)
{
- struct filter_pred *preds;
- struct filter_pred *root;
- struct filter_match_preds_data data = {
- /* match is currently meaningless */
- .match = -1,
- .rec = rec,
- };
- int n_preds, ret;
+ struct prog_entry *prog;
+ int i;
/* no filter is considered a match */
if (!filter)
return 1;
- n_preds = filter->n_preds;
- if (!n_preds)
- return 1;
-
- /*
- * n_preds, root and filter->preds are protect with preemption disabled.
- */
- root = rcu_dereference_sched(filter->root);
- if (!root)
+ prog = rcu_dereference_sched(filter->prog);
+ if (!prog)
return 1;
- data.preds = preds = rcu_dereference_sched(filter->preds);
- ret = walk_pred_tree(preds, root, filter_match_preds_cb, &data);
- WARN_ON(ret);
- return data.match;
+ for (i = 0; prog[i].pred; i++) {
+ struct filter_pred *pred = prog[i].pred;
+ int match = pred->fn(pred, rec);
+ if (match == prog[i].when_to_branch)
+ i = prog[i].target;
+ }
+ return prog[i].target;
}
EXPORT_SYMBOL_GPL(filter_match_preds);
-static void parse_error(struct filter_parse_state *ps, int err, int pos)
-{
- ps->lasterr = err;
- ps->lasterr_pos = pos;
-}
-
static void remove_filter_string(struct event_filter *filter)
{
if (!filter)
@@ -664,57 +914,44 @@ static void remove_filter_string(struct event_filter *filter)
filter->filter_string = NULL;
}
-static int replace_filter_string(struct event_filter *filter,
- char *filter_string)
-{
- kfree(filter->filter_string);
- filter->filter_string = kstrdup(filter_string, GFP_KERNEL);
- if (!filter->filter_string)
- return -ENOMEM;
-
- return 0;
-}
-
-static int append_filter_string(struct event_filter *filter,
- char *string)
-{
- int newlen;
- char *new_filter_string;
-
- BUG_ON(!filter->filter_string);
- newlen = strlen(filter->filter_string) + strlen(string) + 1;
- new_filter_string = kmalloc(newlen, GFP_KERNEL);
- if (!new_filter_string)
- return -ENOMEM;
-
- strcpy(new_filter_string, filter->filter_string);
- strcat(new_filter_string, string);
- kfree(filter->filter_string);
- filter->filter_string = new_filter_string;
-
- return 0;
-}
-
-static void append_filter_err(struct filter_parse_state *ps,
+static void append_filter_err(struct filter_parse_error *pe,
struct event_filter *filter)
{
- int pos = ps->lasterr_pos;
- char *buf, *pbuf;
+ struct trace_seq *s;
+ int pos = pe->lasterr_pos;
+ char *buf;
+ int len;
+
+ if (WARN_ON(!filter->filter_string))
+ return;
- buf = (char *)__get_free_page(GFP_KERNEL);
- if (!buf)
+ s = kmalloc(sizeof(*s), GFP_KERNEL);
+ if (!s)
return;
+ trace_seq_init(s);
+
+ len = strlen(filter->filter_string);
+ if (pos > len)
+ pos = len;
- append_filter_string(filter, "\n");
- memset(buf, ' ', PAGE_SIZE);
- if (pos > PAGE_SIZE - 128)
- pos = 0;
- buf[pos] = '^';
- pbuf = &buf[pos] + 1;
+ /* indexing is off by one */
+ if (pos)
+ pos++;
- sprintf(pbuf, "\nparse_error: %s\n", err_text[ps->lasterr]);
- append_filter_string(filter, buf);
- free_page((unsigned long) buf);
+ trace_seq_puts(s, filter->filter_string);
+ if (pe->lasterr > 0) {
+ trace_seq_printf(s, "\n%*s", pos, "^");
+ trace_seq_printf(s, "\nparse_error: %s\n", err_text[pe->lasterr]);
+ } else {
+ trace_seq_printf(s, "\nError: (%d)\n", pe->lasterr);
+ }
+ trace_seq_putc(s, 0);
+ buf = kmemdup_nul(s->buffer, s->seq.len, GFP_KERNEL);
+ if (buf) {
+ kfree(filter->filter_string);
+ filter->filter_string = buf;
+ }
+ kfree(s);
}
static inline struct event_filter *event_filter(struct trace_event_file *file)
@@ -747,108 +984,18 @@ void print_subsystem_event_filter(struct event_subsystem *system,
mutex_unlock(&event_mutex);
}
-static int __alloc_pred_stack(struct pred_stack *stack, int n_preds)
-{
- stack->preds = kcalloc(n_preds + 1, sizeof(*stack->preds), GFP_KERNEL);
- if (!stack->preds)
- return -ENOMEM;
- stack->index = n_preds;
- return 0;
-}
-
-static void __free_pred_stack(struct pred_stack *stack)
-{
- kfree(stack->preds);
- stack->index = 0;
-}
-
-static int __push_pred_stack(struct pred_stack *stack,
- struct filter_pred *pred)
-{
- int index = stack->index;
-
- if (WARN_ON(index == 0))
- return -ENOSPC;
-
- stack->preds[--index] = pred;
- stack->index = index;
- return 0;
-}
-
-static struct filter_pred *
-__pop_pred_stack(struct pred_stack *stack)
-{
- struct filter_pred *pred;
- int index = stack->index;
-
- pred = stack->preds[index++];
- if (!pred)
- return NULL;
-
- stack->index = index;
- return pred;
-}
-
-static int filter_set_pred(struct event_filter *filter,
- int idx,
- struct pred_stack *stack,
- struct filter_pred *src)
-{
- struct filter_pred *dest = &filter->preds[idx];
- struct filter_pred *left;
- struct filter_pred *right;
-
- *dest = *src;
- dest->index = idx;
-
- if (dest->op == OP_OR || dest->op == OP_AND) {
- right = __pop_pred_stack(stack);
- left = __pop_pred_stack(stack);
- if (!left || !right)
- return -EINVAL;
- /*
- * If both children can be folded
- * and they are the same op as this op or a leaf,
- * then this op can be folded.
- */
- if (left->index & FILTER_PRED_FOLD &&
- ((left->op == dest->op && !left->not) ||
- left->left == FILTER_PRED_INVALID) &&
- right->index & FILTER_PRED_FOLD &&
- ((right->op == dest->op && !right->not) ||
- right->left == FILTER_PRED_INVALID))
- dest->index |= FILTER_PRED_FOLD;
-
- dest->left = left->index & ~FILTER_PRED_FOLD;
- dest->right = right->index & ~FILTER_PRED_FOLD;
- left->parent = dest->index & ~FILTER_PRED_FOLD;
- right->parent = dest->index | FILTER_PRED_IS_RIGHT;
- } else {
- /*
- * Make dest->left invalid to be used as a quick
- * way to know this is a leaf node.
- */
- dest->left = FILTER_PRED_INVALID;
-
- /* All leafs allow folding the parent ops. */
- dest->index |= FILTER_PRED_FOLD;
- }
-
- return __push_pred_stack(stack, dest);
-}
-
-static void __free_preds(struct event_filter *filter)
+static void free_prog(struct event_filter *filter)
{
+ struct prog_entry *prog;
int i;
- if (filter->preds) {
- for (i = 0; i < filter->n_preds; i++)
- kfree(filter->preds[i].ops);
- kfree(filter->preds);
- filter->preds = NULL;
- }
- filter->a_preds = 0;
- filter->n_preds = 0;
+ prog = rcu_access_pointer(filter->prog);
+ if (!prog)
+ return;
+
+ for (i = 0; prog[i].pred; i++)
+ kfree(prog[i].pred);
+ kfree(prog);
}
static void filter_disable(struct trace_event_file *file)
@@ -866,7 +1013,7 @@ static void __free_filter(struct event_filter *filter)
if (!filter)
return;
- __free_preds(filter);
+ free_prog(filter);
kfree(filter->filter_string);
kfree(filter);
}
@@ -876,38 +1023,6 @@ void free_event_filter(struct event_filter *filter)
__free_filter(filter);
}
-static struct event_filter *__alloc_filter(void)
-{
- struct event_filter *filter;
-
- filter = kzalloc(sizeof(*filter), GFP_KERNEL);
- return filter;
-}
-
-static int __alloc_preds(struct event_filter *filter, int n_preds)
-{
- struct filter_pred *pred;
- int i;
-
- if (filter->preds)
- __free_preds(filter);
-
- filter->preds = kcalloc(n_preds, sizeof(*filter->preds), GFP_KERNEL);
-
- if (!filter->preds)
- return -ENOMEM;
-
- filter->a_preds = n_preds;
- filter->n_preds = 0;
-
- for (i = 0; i < n_preds; i++) {
- pred = &filter->preds[i];
- pred->fn = filter_pred_none;
- }
-
- return 0;
-}
-
static inline void __remove_filter(struct trace_event_file *file)
{
filter_disable(file);
@@ -944,27 +1059,6 @@ static void filter_free_subsystem_filters(struct trace_subsystem_dir *dir,
}
}
-static int filter_add_pred(struct filter_parse_state *ps,
- struct event_filter *filter,
- struct filter_pred *pred,
- struct pred_stack *stack)
-{
- int err;
-
- if (WARN_ON(filter->n_preds == filter->a_preds)) {
- parse_error(ps, FILT_ERR_TOO_MANY_PREDS, 0);
- return -ENOSPC;
- }
-
- err = filter_set_pred(filter, filter->n_preds, stack, pred);
- if (err)
- return err;
-
- filter->n_preds++;
-
- return 0;
-}
-
int filter_assign_type(const char *type)
{
if (strstr(type, "__data_loc") && strstr(type, "char"))
@@ -976,761 +1070,449 @@ int filter_assign_type(const char *type)
return FILTER_OTHER;
}
-static bool is_legal_op(struct ftrace_event_field *field, enum filter_op_ids op)
-{
- if (is_string_field(field) &&
- (op != OP_EQ && op != OP_NE && op != OP_GLOB))
- return false;
- if (!is_string_field(field) && op == OP_GLOB)
- return false;
-
- return true;
-}
-
static filter_pred_fn_t select_comparison_fn(enum filter_op_ids op,
int field_size, int field_is_signed)
{
filter_pred_fn_t fn = NULL;
+ int pred_func_index = -1;
+
+ switch (op) {
+ case OP_EQ:
+ case OP_NE:
+ break;
+ default:
+ if (WARN_ON_ONCE(op < PRED_FUNC_START))
+ return NULL;
+ pred_func_index = op - PRED_FUNC_START;
+ if (WARN_ON_ONCE(pred_func_index > PRED_FUNC_MAX))
+ return NULL;
+ }
switch (field_size) {
case 8:
- if (op == OP_EQ || op == OP_NE)
+ if (pred_func_index < 0)
fn = filter_pred_64;
else if (field_is_signed)
- fn = pred_funcs_s64[op - PRED_FUNC_START];
+ fn = pred_funcs_s64[pred_func_index];
else
- fn = pred_funcs_u64[op - PRED_FUNC_START];
+ fn = pred_funcs_u64[pred_func_index];
break;
case 4:
- if (op == OP_EQ || op == OP_NE)
+ if (pred_func_index < 0)
fn = filter_pred_32;
else if (field_is_signed)
- fn = pred_funcs_s32[op - PRED_FUNC_START];
+ fn = pred_funcs_s32[pred_func_index];
else
- fn = pred_funcs_u32[op - PRED_FUNC_START];
+ fn = pred_funcs_u32[pred_func_index];
break;
case 2:
- if (op == OP_EQ || op == OP_NE)
+ if (pred_func_index < 0)
fn = filter_pred_16;
else if (field_is_signed)
- fn = pred_funcs_s16[op - PRED_FUNC_START];
+ fn = pred_funcs_s16[pred_func_index];
else
- fn = pred_funcs_u16[op - PRED_FUNC_START];
+ fn = pred_funcs_u16[pred_func_index];
break;
case 1:
- if (op == OP_EQ || op == OP_NE)
+ if (pred_func_index < 0)
fn = filter_pred_8;
else if (field_is_signed)
- fn = pred_funcs_s8[op - PRED_FUNC_START];
+ fn = pred_funcs_s8[pred_func_index];
else
- fn = pred_funcs_u8[op - PRED_FUNC_START];
+ fn = pred_funcs_u8[pred_func_index];
break;
}
return fn;
}
-static int init_pred(struct filter_parse_state *ps,
- struct ftrace_event_field *field,
- struct filter_pred *pred)
-
+/* Called when a predicate is encountered by predicate_parse() */
+static int parse_pred(const char *str, void *data,
+ int pos, struct filter_parse_error *pe,
+ struct filter_pred **pred_ptr)
{
- filter_pred_fn_t fn = filter_pred_none;
- unsigned long long val;
+ struct trace_event_call *call = data;
+ struct ftrace_event_field *field;
+ struct filter_pred *pred = NULL;
+ char num_buf[24]; /* Big enough to hold an address */
+ char *field_name;
+ char q;
+ u64 val;
+ int len;
int ret;
+ int op;
+ int s;
+ int i = 0;
- pred->offset = field->offset;
-
- if (!is_legal_op(field, pred->op)) {
- parse_error(ps, FILT_ERR_ILLEGAL_FIELD_OP, 0);
- return -EINVAL;
- }
-
- if (field->filter_type == FILTER_COMM) {
- filter_build_regex(pred);
- fn = filter_pred_comm;
- pred->regex.field_len = TASK_COMM_LEN;
- } else if (is_string_field(field)) {
- filter_build_regex(pred);
-
- if (field->filter_type == FILTER_STATIC_STRING) {
- fn = filter_pred_string;
- pred->regex.field_len = field->size;
- } else if (field->filter_type == FILTER_DYN_STRING)
- fn = filter_pred_strloc;
- else
- fn = filter_pred_pchar;
- } else if (is_function_field(field)) {
- if (strcmp(field->name, "ip")) {
- parse_error(ps, FILT_ERR_IP_FIELD_ONLY, 0);
- return -EINVAL;
- }
- } else {
- if (field->is_signed)
- ret = kstrtoll(pred->regex.pattern, 0, &val);
- else
- ret = kstrtoull(pred->regex.pattern, 0, &val);
- if (ret) {
- parse_error(ps, FILT_ERR_ILLEGAL_INTVAL, 0);
- return -EINVAL;
- }
- pred->val = val;
+ /* First find the field to associate to */
+ while (isspace(str[i]))
+ i++;
+ s = i;
- if (field->filter_type == FILTER_CPU)
- fn = filter_pred_cpu;
- else
- fn = select_comparison_fn(pred->op, field->size,
- field->is_signed);
- if (!fn) {
- parse_error(ps, FILT_ERR_INVALID_OP, 0);
- return -EINVAL;
- }
- }
+ while (isalnum(str[i]) || str[i] == '_')
+ i++;
- if (pred->op == OP_NE)
- pred->not ^= 1;
-
- pred->fn = fn;
- return 0;
-}
-
-static void parse_init(struct filter_parse_state *ps,
- struct filter_op *ops,
- char *infix_string)
-{
- memset(ps, '\0', sizeof(*ps));
+ len = i - s;
- ps->infix.string = infix_string;
- ps->infix.cnt = strlen(infix_string);
- ps->ops = ops;
+ if (!len)
+ return -1;
- INIT_LIST_HEAD(&ps->opstack);
- INIT_LIST_HEAD(&ps->postfix);
-}
-
-static char infix_next(struct filter_parse_state *ps)
-{
- if (!ps->infix.cnt)
- return 0;
-
- ps->infix.cnt--;
-
- return ps->infix.string[ps->infix.tail++];
-}
-
-static char infix_peek(struct filter_parse_state *ps)
-{
- if (ps->infix.tail == strlen(ps->infix.string))
- return 0;
-
- return ps->infix.string[ps->infix.tail];
-}
-
-static void infix_advance(struct filter_parse_state *ps)
-{
- if (!ps->infix.cnt)
- return;
-
- ps->infix.cnt--;
- ps->infix.tail++;
-}
-
-static inline int is_precedence_lower(struct filter_parse_state *ps,
- int a, int b)
-{
- return ps->ops[a].precedence < ps->ops[b].precedence;
-}
+ field_name = kmemdup_nul(str + s, len, GFP_KERNEL);
+ if (!field_name)
+ return -ENOMEM;
-static inline int is_op_char(struct filter_parse_state *ps, char c)
-{
- int i;
+ /* Make sure that the field exists */
- for (i = 0; strcmp(ps->ops[i].string, "OP_NONE"); i++) {
- if (ps->ops[i].string[0] == c)
- return 1;
+ field = trace_find_event_field(call, field_name);
+ kfree(field_name);
+ if (!field) {
+ parse_error(pe, FILT_ERR_FIELD_NOT_FOUND, pos + i);
+ return -EINVAL;
}
- return 0;
-}
+ while (isspace(str[i]))
+ i++;
-static int infix_get_op(struct filter_parse_state *ps, char firstc)
-{
- char nextc = infix_peek(ps);
- char opstr[3];
- int i;
-
- opstr[0] = firstc;
- opstr[1] = nextc;
- opstr[2] = '\0';
-
- for (i = 0; strcmp(ps->ops[i].string, "OP_NONE"); i++) {
- if (!strcmp(opstr, ps->ops[i].string)) {
- infix_advance(ps);
- return ps->ops[i].id;
- }
+ /* Make sure this op is supported */
+ for (op = 0; ops[op]; op++) {
+ /* This is why '<=' must come before '<' in ops[] */
+ if (strncmp(str + i, ops[op], strlen(ops[op])) == 0)
+ break;
}
- opstr[1] = '\0';
-
- for (i = 0; strcmp(ps->ops[i].string, "OP_NONE"); i++) {
- if (!strcmp(opstr, ps->ops[i].string))
- return ps->ops[i].id;
+ if (!ops[op]) {
+ parse_error(pe, FILT_ERR_INVALID_OP, pos + i);
+ goto err_free;
}
- return OP_NONE;
-}
-
-static inline void clear_operand_string(struct filter_parse_state *ps)
-{
- memset(ps->operand.string, '\0', MAX_FILTER_STR_VAL);
- ps->operand.tail = 0;
-}
+ i += strlen(ops[op]);
-static inline int append_operand_char(struct filter_parse_state *ps, char c)
-{
- if (ps->operand.tail == MAX_FILTER_STR_VAL - 1)
- return -EINVAL;
-
- ps->operand.string[ps->operand.tail++] = c;
-
- return 0;
-}
+ while (isspace(str[i]))
+ i++;
-static int filter_opstack_push(struct filter_parse_state *ps,
- enum filter_op_ids op)
-{
- struct opstack_op *opstack_op;
+ s = i;
- opstack_op = kmalloc(sizeof(*opstack_op), GFP_KERNEL);
- if (!opstack_op)
+ pred = kzalloc(sizeof(*pred), GFP_KERNEL);
+ if (!pred)
return -ENOMEM;
- opstack_op->op = op;
- list_add(&opstack_op->list, &ps->opstack);
-
- return 0;
-}
+ pred->field = field;
+ pred->offset = field->offset;
+ pred->op = op;
-static int filter_opstack_empty(struct filter_parse_state *ps)
-{
- return list_empty(&ps->opstack);
-}
+ if (ftrace_event_is_function(call)) {
+ /*
+ * Perf does things different with function events.
+ * It only allows an "ip" field, and expects a string.
+ * But the string does not need to be surrounded by quotes.
+ * If it is a string, the assigned function as a nop,
+ * (perf doesn't use it) and grab everything.
+ */
+ if (strcmp(field->name, "ip") != 0) {
+ parse_error(pe, FILT_ERR_IP_FIELD_ONLY, pos + i);
+ goto err_free;
+ }
+ pred->fn = filter_pred_none;
+
+ /*
+ * Quotes are not required, but if they exist then we need
+ * to read them till we hit a matching one.
+ */
+ if (str[i] == '\'' || str[i] == '"')
+ q = str[i];
+ else
+ q = 0;
+
+ for (i++; str[i]; i++) {
+ if (q && str[i] == q)
+ break;
+ if (!q && (str[i] == ')' || str[i] == '&' ||
+ str[i] == '|'))
+ break;
+ }
+ /* Skip quotes */
+ if (q)
+ s++;
+ len = i - s;
+ if (len >= MAX_FILTER_STR_VAL) {
+ parse_error(pe, FILT_ERR_OPERAND_TOO_LONG, pos + i);
+ goto err_free;
+ }
-static int filter_opstack_top(struct filter_parse_state *ps)
-{
- struct opstack_op *opstack_op;
+ pred->regex.len = len;
+ strncpy(pred->regex.pattern, str + s, len);
+ pred->regex.pattern[len] = 0;
+
+ /* This is either a string, or an integer */
+ } else if (str[i] == '\'' || str[i] == '"') {
+ char q = str[i];
+
+ /* Make sure the op is OK for strings */
+ switch (op) {
+ case OP_NE:
+ pred->not = 1;
+ /* Fall through */
+ case OP_GLOB:
+ case OP_EQ:
+ break;
+ default:
+ parse_error(pe, FILT_ERR_ILLEGAL_FIELD_OP, pos + i);
+ goto err_free;
+ }
- if (filter_opstack_empty(ps))
- return OP_NONE;
+ /* Make sure the field is OK for strings */
+ if (!is_string_field(field)) {
+ parse_error(pe, FILT_ERR_EXPECT_DIGIT, pos + i);
+ goto err_free;
+ }
- opstack_op = list_first_entry(&ps->opstack, struct opstack_op, list);
+ for (i++; str[i]; i++) {
+ if (str[i] == q)
+ break;
+ }
+ if (!str[i]) {
+ parse_error(pe, FILT_ERR_MISSING_QUOTE, pos + i);
+ goto err_free;
+ }
- return opstack_op->op;
-}
+ /* Skip quotes */
+ s++;
+ len = i - s;
+ if (len >= MAX_FILTER_STR_VAL) {
+ parse_error(pe, FILT_ERR_OPERAND_TOO_LONG, pos + i);
+ goto err_free;
+ }
-static int filter_opstack_pop(struct filter_parse_state *ps)
-{
- struct opstack_op *opstack_op;
- enum filter_op_ids op;
+ pred->regex.len = len;
+ strncpy(pred->regex.pattern, str + s, len);
+ pred->regex.pattern[len] = 0;
- if (filter_opstack_empty(ps))
- return OP_NONE;
+ filter_build_regex(pred);
- opstack_op = list_first_entry(&ps->opstack, struct opstack_op, list);
- op = opstack_op->op;
- list_del(&opstack_op->list);
+ if (field->filter_type == FILTER_COMM) {
+ pred->fn = filter_pred_comm;
- kfree(opstack_op);
+ } else if (field->filter_type == FILTER_STATIC_STRING) {
+ pred->fn = filter_pred_string;
+ pred->regex.field_len = field->size;
- return op;
-}
+ } else if (field->filter_type == FILTER_DYN_STRING)
+ pred->fn = filter_pred_strloc;
+ else
+ pred->fn = filter_pred_pchar;
+ /* go past the last quote */
+ i++;
-static void filter_opstack_clear(struct filter_parse_state *ps)
-{
- while (!filter_opstack_empty(ps))
- filter_opstack_pop(ps);
-}
+ } else if (isdigit(str[i])) {
-static char *curr_operand(struct filter_parse_state *ps)
-{
- return ps->operand.string;
-}
+ /* Make sure the field is not a string */
+ if (is_string_field(field)) {
+ parse_error(pe, FILT_ERR_EXPECT_STRING, pos + i);
+ goto err_free;
+ }
-static int postfix_append_operand(struct filter_parse_state *ps, char *operand)
-{
- struct postfix_elt *elt;
+ if (op == OP_GLOB) {
+ parse_error(pe, FILT_ERR_ILLEGAL_FIELD_OP, pos + i);
+ goto err_free;
+ }
- elt = kmalloc(sizeof(*elt), GFP_KERNEL);
- if (!elt)
- return -ENOMEM;
+ /* We allow 0xDEADBEEF */
+ while (isalnum(str[i]))
+ i++;
- elt->op = OP_NONE;
- elt->operand = kstrdup(operand, GFP_KERNEL);
- if (!elt->operand) {
- kfree(elt);
- return -ENOMEM;
- }
+ len = i - s;
+ /* 0xfeedfacedeadbeef is 18 chars max */
+ if (len >= sizeof(num_buf)) {
+ parse_error(pe, FILT_ERR_OPERAND_TOO_LONG, pos + i);
+ goto err_free;
+ }
- list_add_tail(&elt->list, &ps->postfix);
+ strncpy(num_buf, str + s, len);
+ num_buf[len] = 0;
- return 0;
-}
+ /* Make sure it is a value */
+ if (field->is_signed)
+ ret = kstrtoll(num_buf, 0, &val);
+ else
+ ret = kstrtoull(num_buf, 0, &val);
+ if (ret) {
+ parse_error(pe, FILT_ERR_ILLEGAL_INTVAL, pos + s);
+ goto err_free;
+ }
-static int postfix_append_op(struct filter_parse_state *ps, enum filter_op_ids op)
-{
- struct postfix_elt *elt;
+ pred->val = val;
- elt = kmalloc(sizeof(*elt), GFP_KERNEL);
- if (!elt)
- return -ENOMEM;
+ if (field->filter_type == FILTER_CPU)
+ pred->fn = filter_pred_cpu;
+ else {
+ pred->fn = select_comparison_fn(pred->op, field->size,
+ field->is_signed);
+ if (pred->op == OP_NE)
+ pred->not = 1;
+ }
- elt->op = op;
- elt->operand = NULL;
+ } else {
+ parse_error(pe, FILT_ERR_INVALID_VALUE, pos + i);
+ goto err_free;
+ }
- list_add_tail(&elt->list, &ps->postfix);
+ *pred_ptr = pred;
+ return i;
- return 0;
+err_free:
+ kfree(pred);
+ return -EINVAL;
}
-static void postfix_clear(struct filter_parse_state *ps)
-{
- struct postfix_elt *elt;
+enum {
+ TOO_MANY_CLOSE = -1,
+ TOO_MANY_OPEN = -2,
+ MISSING_QUOTE = -3,
+};
- while (!list_empty(&ps->postfix)) {
- elt = list_first_entry(&ps->postfix, struct postfix_elt, list);
- list_del(&elt->list);
- kfree(elt->operand);
- kfree(elt);
- }
-}
+/*
+ * Read the filter string once to calculate the number of predicates
+ * as well as how deep the parentheses go.
+ *
+ * Returns:
+ * 0 - everything is fine (err is undefined)
+ * -1 - too many ')'
+ * -2 - too many '('
+ * -3 - No matching quote
+ */
+static int calc_stack(const char *str, int *parens, int *preds, int *err)
+{
+ bool is_pred = false;
+ int nr_preds = 0;
+ int open = 1; /* Count the expression as "(E)" */
+ int last_quote = 0;
+ int max_open = 1;
+ int quote = 0;
+ int i;
-static int filter_parse(struct filter_parse_state *ps)
-{
- enum filter_op_ids op, top_op;
- int in_string = 0;
- char ch;
+ *err = 0;
- while ((ch = infix_next(ps))) {
- if (ch == '"') {
- in_string ^= 1;
+ for (i = 0; str[i]; i++) {
+ if (isspace(str[i]))
continue;
- }
-
- if (in_string)
- goto parse_operand;
-
- if (isspace(ch))
+ if (quote) {
+ if (str[i] == quote)
+ quote = 0;
continue;
+ }
- if (is_op_char(ps, ch)) {
- op = infix_get_op(ps, ch);
- if (op == OP_NONE) {
- parse_error(ps, FILT_ERR_INVALID_OP, 0);
- return -EINVAL;
- }
-
- if (strlen(curr_operand(ps))) {
- postfix_append_operand(ps, curr_operand(ps));
- clear_operand_string(ps);
- }
-
- while (!filter_opstack_empty(ps)) {
- top_op = filter_opstack_top(ps);
- if (!is_precedence_lower(ps, top_op, op)) {
- top_op = filter_opstack_pop(ps);
- postfix_append_op(ps, top_op);
- continue;
- }
+ switch (str[i]) {
+ case '\'':
+ case '"':
+ quote = str[i];
+ last_quote = i;
+ break;
+ case '|':
+ case '&':
+ if (str[i+1] != str[i])
break;
- }
-
- filter_opstack_push(ps, op);
+ is_pred = false;
continue;
- }
-
- if (ch == '(') {
- filter_opstack_push(ps, OP_OPEN_PAREN);
+ case '(':
+ is_pred = false;
+ open++;
+ if (open > max_open)
+ max_open = open;
continue;
- }
-
- if (ch == ')') {
- if (strlen(curr_operand(ps))) {
- postfix_append_operand(ps, curr_operand(ps));
- clear_operand_string(ps);
- }
-
- top_op = filter_opstack_pop(ps);
- while (top_op != OP_NONE) {
- if (top_op == OP_OPEN_PAREN)
- break;
- postfix_append_op(ps, top_op);
- top_op = filter_opstack_pop(ps);
- }
- if (top_op == OP_NONE) {
- parse_error(ps, FILT_ERR_UNBALANCED_PAREN, 0);
- return -EINVAL;
+ case ')':
+ is_pred = false;
+ if (open == 1) {
+ *err = i;
+ return TOO_MANY_CLOSE;
}
+ open--;
continue;
}
-parse_operand:
- if (append_operand_char(ps, ch)) {
- parse_error(ps, FILT_ERR_OPERAND_TOO_LONG, 0);
- return -EINVAL;
- }
- }
-
- if (strlen(curr_operand(ps)))
- postfix_append_operand(ps, curr_operand(ps));
-
- while (!filter_opstack_empty(ps)) {
- top_op = filter_opstack_pop(ps);
- if (top_op == OP_NONE)
- break;
- if (top_op == OP_OPEN_PAREN) {
- parse_error(ps, FILT_ERR_UNBALANCED_PAREN, 0);
- return -EINVAL;
+ if (!is_pred) {
+ nr_preds++;
+ is_pred = true;
}
- postfix_append_op(ps, top_op);
- }
-
- return 0;
-}
-
-static struct filter_pred *create_pred(struct filter_parse_state *ps,
- struct trace_event_call *call,
- enum filter_op_ids op,
- char *operand1, char *operand2)
-{
- struct ftrace_event_field *field;
- static struct filter_pred pred;
-
- memset(&pred, 0, sizeof(pred));
- pred.op = op;
-
- if (op == OP_AND || op == OP_OR)
- return &pred;
-
- if (!operand1 || !operand2) {
- parse_error(ps, FILT_ERR_MISSING_FIELD, 0);
- return NULL;
}
- field = trace_find_event_field(call, operand1);
- if (!field) {
- parse_error(ps, FILT_ERR_FIELD_NOT_FOUND, 0);
- return NULL;
+ if (quote) {
+ *err = last_quote;
+ return MISSING_QUOTE;
}
- strcpy(pred.regex.pattern, operand2);
- pred.regex.len = strlen(pred.regex.pattern);
- pred.field = field;
- return init_pred(ps, field, &pred) ? NULL : &pred;
-}
+ if (open != 1) {
+ int level = open;
-static int check_preds(struct filter_parse_state *ps)
-{
- int n_normal_preds = 0, n_logical_preds = 0;
- struct postfix_elt *elt;
- int cnt = 0;
-
- list_for_each_entry(elt, &ps->postfix, list) {
- if (elt->op == OP_NONE) {
- cnt++;
- continue;
- }
-
- if (elt->op == OP_AND || elt->op == OP_OR) {
- n_logical_preds++;
- cnt--;
- continue;
+ /* find the bad open */
+ for (i--; i; i--) {
+ if (quote) {
+ if (str[i] == quote)
+ quote = 0;
+ continue;
+ }
+ switch (str[i]) {
+ case '(':
+ if (level == open) {
+ *err = i;
+ return TOO_MANY_OPEN;
+ }
+ level--;
+ break;
+ case ')':
+ level++;
+ break;
+ case '\'':
+ case '"':
+ quote = str[i];
+ break;
+ }
}
- if (elt->op != OP_NOT)
- cnt--;
- n_normal_preds++;
- /* all ops should have operands */
- if (cnt < 0)
- break;
- }
-
- if (cnt != 1 || !n_normal_preds || n_logical_preds >= n_normal_preds) {
- parse_error(ps, FILT_ERR_INVALID_FILTER, 0);
- return -EINVAL;
+ /* First character is the '(' with missing ')' */
+ *err = 0;
+ return TOO_MANY_OPEN;
}
+ /* Set the size of the required stacks */
+ *parens = max_open;
+ *preds = nr_preds;
return 0;
}
-static int count_preds(struct filter_parse_state *ps)
-{
- struct postfix_elt *elt;
- int n_preds = 0;
-
- list_for_each_entry(elt, &ps->postfix, list) {
- if (elt->op == OP_NONE)
- continue;
- n_preds++;
- }
-
- return n_preds;
-}
-
-struct check_pred_data {
- int count;
- int max;
-};
-
-static int check_pred_tree_cb(enum move_type move, struct filter_pred *pred,
- int *err, void *data)
-{
- struct check_pred_data *d = data;
-
- if (WARN_ON(d->count++ > d->max)) {
- *err = -EINVAL;
- return WALK_PRED_ABORT;
- }
- return WALK_PRED_DEFAULT;
-}
-
-/*
- * The tree is walked at filtering of an event. If the tree is not correctly
- * built, it may cause an infinite loop. Check here that the tree does
- * indeed terminate.
- */
-static int check_pred_tree(struct event_filter *filter,
- struct filter_pred *root)
-{
- struct check_pred_data data = {
- /*
- * The max that we can hit a node is three times.
- * Once going down, once coming up from left, and
- * once coming up from right. This is more than enough
- * since leafs are only hit a single time.
- */
- .max = 3 * filter->n_preds,
- .count = 0,
- };
-
- return walk_pred_tree(filter->preds, root,
- check_pred_tree_cb, &data);
-}
-
-static int count_leafs_cb(enum move_type move, struct filter_pred *pred,
- int *err, void *data)
-{
- int *count = data;
-
- if ((move == MOVE_DOWN) &&
- (pred->left == FILTER_PRED_INVALID))
- (*count)++;
-
- return WALK_PRED_DEFAULT;
-}
-
-static int count_leafs(struct filter_pred *preds, struct filter_pred *root)
-{
- int count = 0, ret;
-
- ret = walk_pred_tree(preds, root, count_leafs_cb, &count);
- WARN_ON(ret);
- return count;
-}
-
-struct fold_pred_data {
- struct filter_pred *root;
- int count;
- int children;
-};
-
-static int fold_pred_cb(enum move_type move, struct filter_pred *pred,
- int *err, void *data)
-{
- struct fold_pred_data *d = data;
- struct filter_pred *root = d->root;
-
- if (move != MOVE_DOWN)
- return WALK_PRED_DEFAULT;
- if (pred->left != FILTER_PRED_INVALID)
- return WALK_PRED_DEFAULT;
-
- if (WARN_ON(d->count == d->children)) {
- *err = -EINVAL;
- return WALK_PRED_ABORT;
- }
-
- pred->index &= ~FILTER_PRED_FOLD;
- root->ops[d->count++] = pred->index;
- return WALK_PRED_DEFAULT;
-}
-
-static int fold_pred(struct filter_pred *preds, struct filter_pred *root)
-{
- struct fold_pred_data data = {
- .root = root,
- .count = 0,
- };
- int children;
-
- /* No need to keep the fold flag */
- root->index &= ~FILTER_PRED_FOLD;
-
- /* If the root is a leaf then do nothing */
- if (root->left == FILTER_PRED_INVALID)
- return 0;
-
- /* count the children */
- children = count_leafs(preds, &preds[root->left]);
- children += count_leafs(preds, &preds[root->right]);
-
- root->ops = kcalloc(children, sizeof(*root->ops), GFP_KERNEL);
- if (!root->ops)
- return -ENOMEM;
-
- root->val = children;
- data.children = children;
- return walk_pred_tree(preds, root, fold_pred_cb, &data);
-}
-
-static int fold_pred_tree_cb(enum move_type move, struct filter_pred *pred,
- int *err, void *data)
-{
- struct filter_pred *preds = data;
-
- if (move != MOVE_DOWN)
- return WALK_PRED_DEFAULT;
- if (!(pred->index & FILTER_PRED_FOLD))
- return WALK_PRED_DEFAULT;
-
- *err = fold_pred(preds, pred);
- if (*err)
- return WALK_PRED_ABORT;
-
- /* eveyrhing below is folded, continue with parent */
- return WALK_PRED_PARENT;
-}
-
-/*
- * To optimize the processing of the ops, if we have several "ors" or
- * "ands" together, we can put them in an array and process them all
- * together speeding up the filter logic.
- */
-static int fold_pred_tree(struct event_filter *filter,
- struct filter_pred *root)
-{
- return walk_pred_tree(filter->preds, root, fold_pred_tree_cb,
- filter->preds);
-}
-
-static int replace_preds(struct trace_event_call *call,
+static int process_preds(struct trace_event_call *call,
+ const char *filter_string,
struct event_filter *filter,
- struct filter_parse_state *ps,
- bool dry_run)
+ struct filter_parse_error *pe)
{
- char *operand1 = NULL, *operand2 = NULL;
- struct filter_pred *pred;
- struct filter_pred *root;
- struct postfix_elt *elt;
- struct pred_stack stack = { }; /* init to NULL */
- int err;
- int n_preds = 0;
-
- n_preds = count_preds(ps);
- if (n_preds >= MAX_FILTER_PRED) {
- parse_error(ps, FILT_ERR_TOO_MANY_PREDS, 0);
- return -ENOSPC;
- }
-
- err = check_preds(ps);
- if (err)
- return err;
-
- if (!dry_run) {
- err = __alloc_pred_stack(&stack, n_preds);
- if (err)
- return err;
- err = __alloc_preds(filter, n_preds);
- if (err)
- goto fail;
- }
-
- n_preds = 0;
- list_for_each_entry(elt, &ps->postfix, list) {
- if (elt->op == OP_NONE) {
- if (!operand1)
- operand1 = elt->operand;
- else if (!operand2)
- operand2 = elt->operand;
- else {
- parse_error(ps, FILT_ERR_TOO_MANY_OPERANDS, 0);
- err = -EINVAL;
- goto fail;
- }
- continue;
- }
-
- if (elt->op == OP_NOT) {
- if (!n_preds || operand1 || operand2) {
- parse_error(ps, FILT_ERR_ILLEGAL_NOT_OP, 0);
- err = -EINVAL;
- goto fail;
- }
- if (!dry_run)
- filter->preds[n_preds - 1].not ^= 1;
- continue;
- }
-
- if (WARN_ON(n_preds++ == MAX_FILTER_PRED)) {
- parse_error(ps, FILT_ERR_TOO_MANY_PREDS, 0);
- err = -ENOSPC;
- goto fail;
- }
-
- pred = create_pred(ps, call, elt->op, operand1, operand2);
- if (!pred) {
- err = -EINVAL;
- goto fail;
- }
+ struct prog_entry *prog;
+ int nr_parens;
+ int nr_preds;
+ int index;
+ int ret;
- if (!dry_run) {
- err = filter_add_pred(ps, filter, pred, &stack);
- if (err)
- goto fail;
+ ret = calc_stack(filter_string, &nr_parens, &nr_preds, &index);
+ if (ret < 0) {
+ switch (ret) {
+ case MISSING_QUOTE:
+ parse_error(pe, FILT_ERR_MISSING_QUOTE, index);
+ break;
+ case TOO_MANY_OPEN:
+ parse_error(pe, FILT_ERR_TOO_MANY_OPEN, index);
+ break;
+ default:
+ parse_error(pe, FILT_ERR_TOO_MANY_CLOSE, index);
}
-
- operand1 = operand2 = NULL;
+ return ret;
}
- if (!dry_run) {
- /* We should have one item left on the stack */
- pred = __pop_pred_stack(&stack);
- if (!pred)
- return -EINVAL;
- /* This item is where we start from in matching */
- root = pred;
- /* Make sure the stack is empty */
- pred = __pop_pred_stack(&stack);
- if (WARN_ON(pred)) {
- err = -EINVAL;
- filter->root = NULL;
- goto fail;
- }
- err = check_pred_tree(filter, root);
- if (err)
- goto fail;
-
- /* Optimize the tree */
- err = fold_pred_tree(filter, root);
- if (err)
- goto fail;
-
- /* We don't set root until we know it works */
- barrier();
- filter->root = root;
- }
+ if (!nr_preds)
+ return -EINVAL;
- err = 0;
-fail:
- __free_pred_stack(&stack);
- return err;
+ prog = predicate_parse(filter_string, nr_parens, nr_preds,
+ parse_pred, call, pe);
+ if (IS_ERR(prog))
+ return PTR_ERR(prog);
+
+ rcu_assign_pointer(filter->prog, prog);
+ return 0;
}
static inline void event_set_filtered_flag(struct trace_event_file *file)
@@ -1780,72 +1562,53 @@ struct filter_list {
struct event_filter *filter;
};
-static int replace_system_preds(struct trace_subsystem_dir *dir,
+static int process_system_preds(struct trace_subsystem_dir *dir,
struct trace_array *tr,
- struct filter_parse_state *ps,
+ struct filter_parse_error *pe,
char *filter_string)
{
struct trace_event_file *file;
struct filter_list *filter_item;
+ struct event_filter *filter = NULL;
struct filter_list *tmp;
LIST_HEAD(filter_list);
bool fail = true;
int err;
list_for_each_entry(file, &tr->events, list) {
- if (file->system != dir)
- continue;
-
- /*
- * Try to see if the filter can be applied
- * (filter arg is ignored on dry_run)
- */
- err = replace_preds(file->event_call, NULL, ps, true);
- if (err)
- event_set_no_set_filter_flag(file);
- else
- event_clear_no_set_filter_flag(file);
- }
-
- list_for_each_entry(file, &tr->events, list) {
- struct event_filter *filter;
if (file->system != dir)
continue;
- if (event_no_set_filter_flag(file))
- continue;
-
- filter_item = kzalloc(sizeof(*filter_item), GFP_KERNEL);
- if (!filter_item)
- goto fail_mem;
-
- list_add_tail(&filter_item->list, &filter_list);
-
- filter_item->filter = __alloc_filter();
- if (!filter_item->filter)
+ filter = kzalloc(sizeof(*filter), GFP_KERNEL);
+ if (!filter)
goto fail_mem;
- filter = filter_item->filter;
- /* Can only fail on no memory */
- err = replace_filter_string(filter, filter_string);
- if (err)
+ filter->filter_string = kstrdup(filter_string, GFP_KERNEL);
+ if (!filter->filter_string)
goto fail_mem;
- err = replace_preds(file->event_call, filter, ps, false);
+ err = process_preds(file->event_call, filter_string, filter, pe);
if (err) {
filter_disable(file);
- parse_error(ps, FILT_ERR_BAD_SUBSYS_FILTER, 0);
- append_filter_err(ps, filter);
+ parse_error(pe, FILT_ERR_BAD_SUBSYS_FILTER, 0);
+ append_filter_err(pe, filter);
} else
event_set_filtered_flag(file);
+
+
+ filter_item = kzalloc(sizeof(*filter_item), GFP_KERNEL);
+ if (!filter_item)
+ goto fail_mem;
+
+ list_add_tail(&filter_item->list, &filter_list);
/*
* Regardless of if this returned an error, we still
* replace the filter for the call.
*/
- filter = event_filter(file);
- event_set_filter(file, filter_item->filter);
- filter_item->filter = filter;
+ filter_item->filter = event_filter(file);
+ event_set_filter(file, filter);
+ filter = NULL;
fail = false;
}
@@ -1871,9 +1634,10 @@ static int replace_system_preds(struct trace_subsystem_dir *dir,
list_del(&filter_item->list);
kfree(filter_item);
}
- parse_error(ps, FILT_ERR_BAD_SUBSYS_FILTER, 0);
+ parse_error(pe, FILT_ERR_BAD_SUBSYS_FILTER, 0);
return -EINVAL;
fail_mem:
+ kfree(filter);
/* If any call succeeded, we still need to sync */
if (!fail)
synchronize_sched();
@@ -1885,47 +1649,42 @@ static int replace_system_preds(struct trace_subsystem_dir *dir,
return -ENOMEM;
}
-static int create_filter_start(char *filter_str, bool set_str,
- struct filter_parse_state **psp,
+static int create_filter_start(char *filter_string, bool set_str,
+ struct filter_parse_error **pse,
struct event_filter **filterp)
{
struct event_filter *filter;
- struct filter_parse_state *ps = NULL;
+ struct filter_parse_error *pe = NULL;
int err = 0;
- WARN_ON_ONCE(*psp || *filterp);
+ if (WARN_ON_ONCE(*pse || *filterp))
+ return -EINVAL;
- /* allocate everything, and if any fails, free all and fail */
- filter = __alloc_filter();
- if (filter && set_str)
- err = replace_filter_string(filter, filter_str);
+ filter = kzalloc(sizeof(*filter), GFP_KERNEL);
+ if (filter && set_str) {
+ filter->filter_string = kstrdup(filter_string, GFP_KERNEL);
+ if (!filter->filter_string)
+ err = -ENOMEM;
+ }
- ps = kzalloc(sizeof(*ps), GFP_KERNEL);
+ pe = kzalloc(sizeof(*pe), GFP_KERNEL);
- if (!filter || !ps || err) {
- kfree(ps);
+ if (!filter || !pe || err) {
+ kfree(pe);
__free_filter(filter);
return -ENOMEM;
}
/* we're committed to creating a new filter */
*filterp = filter;
- *psp = ps;
+ *pse = pe;
- parse_init(ps, filter_ops, filter_str);
- err = filter_parse(ps);
- if (err && set_str)
- append_filter_err(ps, filter);
- return err;
+ return 0;
}
-static void create_filter_finish(struct filter_parse_state *ps)
+static void create_filter_finish(struct filter_parse_error *pe)
{
- if (ps) {
- filter_opstack_clear(ps);
- postfix_clear(ps);
- kfree(ps);
- }
+ kfree(pe);
}
/**
@@ -1945,26 +1704,20 @@ static void create_filter_finish(struct filter_parse_state *ps)
* freeing it.
*/
static int create_filter(struct trace_event_call *call,
- char *filter_str, bool set_str,
+ char *filter_string, bool set_str,
struct event_filter **filterp)
{
- struct event_filter *filter = NULL;
- struct filter_parse_state *ps = NULL;
+ struct filter_parse_error *pe = NULL;
int err;
- err = create_filter_start(filter_str, set_str, &ps, &filter);
- if (!err) {
- err = replace_preds(call, filter, ps, false);
- if (err && set_str)
- append_filter_err(ps, filter);
- }
- if (err && !set_str) {
- free_event_filter(filter);
- filter = NULL;
- }
- create_filter_finish(ps);
+ err = create_filter_start(filter_string, set_str, &pe, filterp);
+ if (err)
+ return err;
+
+ err = process_preds(call, filter_string, *filterp, pe);
+ if (err && set_str)
+ append_filter_err(pe, *filterp);
- *filterp = filter;
return err;
}
@@ -1988,24 +1741,22 @@ static int create_system_filter(struct trace_subsystem_dir *dir,
struct trace_array *tr,
char *filter_str, struct event_filter **filterp)
{
- struct event_filter *filter = NULL;
- struct filter_parse_state *ps = NULL;
+ struct filter_parse_error *pe = NULL;
int err;
- err = create_filter_start(filter_str, true, &ps, &filter);
+ err = create_filter_start(filter_str, true, &pe, filterp);
if (!err) {
- err = replace_system_preds(dir, tr, ps, filter_str);
+ err = process_system_preds(dir, tr, pe, filter_str);
if (!err) {
/* System filters just show a default message */
- kfree(filter->filter_string);
- filter->filter_string = NULL;
+ kfree((*filterp)->filter_string);
+ (*filterp)->filter_string = NULL;
} else {
- append_filter_err(ps, filter);
+ append_filter_err(pe, *filterp);
}
}
- create_filter_finish(ps);
+ create_filter_finish(pe);
- *filterp = filter;
return err;
}
@@ -2013,7 +1764,7 @@ static int create_system_filter(struct trace_subsystem_dir *dir,
int apply_event_filter(struct trace_event_file *file, char *filter_string)
{
struct trace_event_call *call = file->event_call;
- struct event_filter *filter;
+ struct event_filter *filter = NULL;
int err;
if (!strcmp(strstrip(filter_string), "0")) {
@@ -2066,7 +1817,7 @@ int apply_subsystem_event_filter(struct trace_subsystem_dir *dir,
{
struct event_subsystem *system = dir->subsystem;
struct trace_array *tr = dir->tr;
- struct event_filter *filter;
+ struct event_filter *filter = NULL;
int err = 0;
mutex_lock(&event_mutex);
@@ -2186,66 +1937,80 @@ static int __ftrace_function_set_filter(int filter, char *buf, int len,
return ret;
}
-static int ftrace_function_check_pred(struct filter_pred *pred, int leaf)
+static int ftrace_function_check_pred(struct filter_pred *pred)
{
struct ftrace_event_field *field = pred->field;
- if (leaf) {
- /*
- * Check the leaf predicate for function trace, verify:
- * - only '==' and '!=' is used
- * - the 'ip' field is used
- */
- if ((pred->op != OP_EQ) && (pred->op != OP_NE))
- return -EINVAL;
+ /*
+ * Check the predicate for function trace, verify:
+ * - only '==' and '!=' is used
+ * - the 'ip' field is used
+ */
+ if ((pred->op != OP_EQ) && (pred->op != OP_NE))
+ return -EINVAL;
- if (strcmp(field->name, "ip"))
- return -EINVAL;
- } else {
- /*
- * Check the non leaf predicate for function trace, verify:
- * - only '||' is used
- */
- if (pred->op != OP_OR)
- return -EINVAL;
- }
+ if (strcmp(field->name, "ip"))
+ return -EINVAL;
return 0;
}
-static int ftrace_function_set_filter_cb(enum move_type move,
- struct filter_pred *pred,
- int *err, void *data)
+static int ftrace_function_set_filter_pred(struct filter_pred *pred,
+ struct function_filter_data *data)
{
+ int ret;
+
/* Checking the node is valid for function trace. */
- if ((move != MOVE_DOWN) ||
- (pred->left != FILTER_PRED_INVALID)) {
- *err = ftrace_function_check_pred(pred, 0);
- } else {
- *err = ftrace_function_check_pred(pred, 1);
- if (*err)
- return WALK_PRED_ABORT;
-
- *err = __ftrace_function_set_filter(pred->op == OP_EQ,
- pred->regex.pattern,
- pred->regex.len,
- data);
- }
+ ret = ftrace_function_check_pred(pred);
+ if (ret)
+ return ret;
- return (*err) ? WALK_PRED_ABORT : WALK_PRED_DEFAULT;
+ return __ftrace_function_set_filter(pred->op == OP_EQ,
+ pred->regex.pattern,
+ pred->regex.len,
+ data);
+}
+
+static bool is_or(struct prog_entry *prog, int i)
+{
+ int target;
+
+ /*
+ * Only "||" is allowed for function events, thus,
+ * all true branches should jump to true, and any
+ * false branch should jump to false.
+ */
+ target = prog[i].target + 1;
+ /* True and false have NULL preds (all prog entries should jump to one */
+ if (prog[target].pred)
+ return false;
+
+ /* prog[target].target is 1 for TRUE, 0 for FALSE */
+ return prog[i].when_to_branch == prog[target].target;
}
static int ftrace_function_set_filter(struct perf_event *event,
struct event_filter *filter)
{
+ struct prog_entry *prog = rcu_dereference_protected(filter->prog,
+ lockdep_is_held(&event_mutex));
struct function_filter_data data = {
.first_filter = 1,
.first_notrace = 1,
.ops = &event->ftrace_ops,
};
+ int i;
+
+ for (i = 0; prog[i].pred; i++) {
+ struct filter_pred *pred = prog[i].pred;
- return walk_pred_tree(filter->preds, filter->root,
- ftrace_function_set_filter_cb, &data);
+ if (!is_or(prog, i))
+ return -EINVAL;
+
+ if (ftrace_function_set_filter_pred(pred, &data) < 0)
+ return -EINVAL;
+ }
+ return 0;
}
#else
static int ftrace_function_set_filter(struct perf_event *event,
@@ -2259,7 +2024,7 @@ int ftrace_profile_set_filter(struct perf_event *event, int event_id,
char *filter_str)
{
int err;
- struct event_filter *filter;
+ struct event_filter *filter = NULL;
struct trace_event_call *call;
mutex_lock(&event_mutex);
@@ -2375,7 +2140,7 @@ static struct test_filter_data_t {
#undef YES
#undef NO
-#define DATA_CNT (sizeof(test_filter_data)/sizeof(struct test_filter_data_t))
+#define DATA_CNT ARRAY_SIZE(test_filter_data)
static int test_pred_visited;
@@ -2388,26 +2153,28 @@ static int test_pred_visited_fn(struct filter_pred *pred, void *event)
return 1;
}
-static int test_walk_pred_cb(enum move_type move, struct filter_pred *pred,
- int *err, void *data)
+static void update_pred_fn(struct event_filter *filter, char *fields)
{
- char *fields = data;
+ struct prog_entry *prog = rcu_dereference_protected(filter->prog,
+ lockdep_is_held(&event_mutex));
+ int i;
- if ((move == MOVE_DOWN) &&
- (pred->left == FILTER_PRED_INVALID)) {
+ for (i = 0; prog[i].pred; i++) {
+ struct filter_pred *pred = prog[i].pred;
struct ftrace_event_field *field = pred->field;
+ WARN_ON_ONCE(!pred->fn);
+
if (!field) {
- WARN(1, "all leafs should have field defined");
- return WALK_PRED_DEFAULT;
+ WARN_ONCE(1, "all leafs should have field defined %d", i);
+ continue;
}
+
if (!strchr(fields, *field->name))
- return WALK_PRED_DEFAULT;
+ continue;
- WARN_ON(!pred->fn);
pred->fn = test_pred_visited_fn;
}
- return WALK_PRED_DEFAULT;
}
static __init int ftrace_test_event_filter(void)
@@ -2431,20 +2198,22 @@ static __init int ftrace_test_event_filter(void)
break;
}
+ /* Needed to dereference filter->prog */
+ mutex_lock(&event_mutex);
/*
* The preemption disabling is not really needed for self
* tests, but the rcu dereference will complain without it.
*/
preempt_disable();
if (*d->not_visited)
- walk_pred_tree(filter->preds, filter->root,
- test_walk_pred_cb,
- d->not_visited);
+ update_pred_fn(filter, d->not_visited);
test_pred_visited = 0;
err = filter_match_preds(filter, &d->rec);
preempt_enable();
+ mutex_unlock(&event_mutex);
+
__free_filter(filter);
if (test_pred_visited) {
diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c
index 1e1558c99d56..046c716a6536 100644
--- a/kernel/trace/trace_events_hist.c
+++ b/kernel/trace/trace_events_hist.c
@@ -20,15 +20,39 @@
#include <linux/slab.h>
#include <linux/stacktrace.h>
#include <linux/rculist.h>
+#include <linux/tracefs.h>
#include "tracing_map.h"
#include "trace.h"
+#define SYNTH_SYSTEM "synthetic"
+#define SYNTH_FIELDS_MAX 16
+
+#define STR_VAR_LEN_MAX 32 /* must be multiple of sizeof(u64) */
+
struct hist_field;
-typedef u64 (*hist_field_fn_t) (struct hist_field *field, void *event);
+typedef u64 (*hist_field_fn_t) (struct hist_field *field,
+ struct tracing_map_elt *elt,
+ struct ring_buffer_event *rbe,
+ void *event);
#define HIST_FIELD_OPERANDS_MAX 2
+#define HIST_FIELDS_MAX (TRACING_MAP_FIELDS_MAX + TRACING_MAP_VARS_MAX)
+#define HIST_ACTIONS_MAX 8
+
+enum field_op_id {
+ FIELD_OP_NONE,
+ FIELD_OP_PLUS,
+ FIELD_OP_MINUS,
+ FIELD_OP_UNARY_MINUS,
+};
+
+struct hist_var {
+ char *name;
+ struct hist_trigger_data *hist_data;
+ unsigned int idx;
+};
struct hist_field {
struct ftrace_event_field *field;
@@ -37,27 +61,49 @@ struct hist_field {
unsigned int size;
unsigned int offset;
unsigned int is_signed;
+ const char *type;
struct hist_field *operands[HIST_FIELD_OPERANDS_MAX];
+ struct hist_trigger_data *hist_data;
+ struct hist_var var;
+ enum field_op_id operator;
+ char *system;
+ char *event_name;
+ char *name;
+ unsigned int var_idx;
+ unsigned int var_ref_idx;
+ bool read_once;
};
-static u64 hist_field_none(struct hist_field *field, void *event)
+static u64 hist_field_none(struct hist_field *field,
+ struct tracing_map_elt *elt,
+ struct ring_buffer_event *rbe,
+ void *event)
{
return 0;
}
-static u64 hist_field_counter(struct hist_field *field, void *event)
+static u64 hist_field_counter(struct hist_field *field,
+ struct tracing_map_elt *elt,
+ struct ring_buffer_event *rbe,
+ void *event)
{
return 1;
}
-static u64 hist_field_string(struct hist_field *hist_field, void *event)
+static u64 hist_field_string(struct hist_field *hist_field,
+ struct tracing_map_elt *elt,
+ struct ring_buffer_event *rbe,
+ void *event)
{
char *addr = (char *)(event + hist_field->field->offset);
return (u64)(unsigned long)addr;
}
-static u64 hist_field_dynstring(struct hist_field *hist_field, void *event)
+static u64 hist_field_dynstring(struct hist_field *hist_field,
+ struct tracing_map_elt *elt,
+ struct ring_buffer_event *rbe,
+ void *event)
{
u32 str_item = *(u32 *)(event + hist_field->field->offset);
int str_loc = str_item & 0xffff;
@@ -66,24 +112,74 @@ static u64 hist_field_dynstring(struct hist_field *hist_field, void *event)
return (u64)(unsigned long)addr;
}
-static u64 hist_field_pstring(struct hist_field *hist_field, void *event)
+static u64 hist_field_pstring(struct hist_field *hist_field,
+ struct tracing_map_elt *elt,
+ struct ring_buffer_event *rbe,
+ void *event)
{
char **addr = (char **)(event + hist_field->field->offset);
return (u64)(unsigned long)*addr;
}
-static u64 hist_field_log2(struct hist_field *hist_field, void *event)
+static u64 hist_field_log2(struct hist_field *hist_field,
+ struct tracing_map_elt *elt,
+ struct ring_buffer_event *rbe,
+ void *event)
{
struct hist_field *operand = hist_field->operands[0];
- u64 val = operand->fn(operand, event);
+ u64 val = operand->fn(operand, elt, rbe, event);
return (u64) ilog2(roundup_pow_of_two(val));
}
+static u64 hist_field_plus(struct hist_field *hist_field,
+ struct tracing_map_elt *elt,
+ struct ring_buffer_event *rbe,
+ void *event)
+{
+ struct hist_field *operand1 = hist_field->operands[0];
+ struct hist_field *operand2 = hist_field->operands[1];
+
+ u64 val1 = operand1->fn(operand1, elt, rbe, event);
+ u64 val2 = operand2->fn(operand2, elt, rbe, event);
+
+ return val1 + val2;
+}
+
+static u64 hist_field_minus(struct hist_field *hist_field,
+ struct tracing_map_elt *elt,
+ struct ring_buffer_event *rbe,
+ void *event)
+{
+ struct hist_field *operand1 = hist_field->operands[0];
+ struct hist_field *operand2 = hist_field->operands[1];
+
+ u64 val1 = operand1->fn(operand1, elt, rbe, event);
+ u64 val2 = operand2->fn(operand2, elt, rbe, event);
+
+ return val1 - val2;
+}
+
+static u64 hist_field_unary_minus(struct hist_field *hist_field,
+ struct tracing_map_elt *elt,
+ struct ring_buffer_event *rbe,
+ void *event)
+{
+ struct hist_field *operand = hist_field->operands[0];
+
+ s64 sval = (s64)operand->fn(operand, elt, rbe, event);
+ u64 val = (u64)-sval;
+
+ return val;
+}
+
#define DEFINE_HIST_FIELD_FN(type) \
-static u64 hist_field_##type(struct hist_field *hist_field, void *event)\
+ static u64 hist_field_##type(struct hist_field *hist_field, \
+ struct tracing_map_elt *elt, \
+ struct ring_buffer_event *rbe, \
+ void *event) \
{ \
type *addr = (type *)(event + hist_field->field->offset); \
\
@@ -126,6 +222,19 @@ enum hist_field_flags {
HIST_FIELD_FL_SYSCALL = 1 << 7,
HIST_FIELD_FL_STACKTRACE = 1 << 8,
HIST_FIELD_FL_LOG2 = 1 << 9,
+ HIST_FIELD_FL_TIMESTAMP = 1 << 10,
+ HIST_FIELD_FL_TIMESTAMP_USECS = 1 << 11,
+ HIST_FIELD_FL_VAR = 1 << 12,
+ HIST_FIELD_FL_EXPR = 1 << 13,
+ HIST_FIELD_FL_VAR_REF = 1 << 14,
+ HIST_FIELD_FL_CPU = 1 << 15,
+ HIST_FIELD_FL_ALIAS = 1 << 16,
+};
+
+struct var_defs {
+ unsigned int n_vars;
+ char *name[TRACING_MAP_VARS_MAX];
+ char *expr[TRACING_MAP_VARS_MAX];
};
struct hist_trigger_attrs {
@@ -133,25 +242,1437 @@ struct hist_trigger_attrs {
char *vals_str;
char *sort_key_str;
char *name;
+ char *clock;
bool pause;
bool cont;
bool clear;
+ bool ts_in_usecs;
unsigned int map_bits;
+
+ char *assignment_str[TRACING_MAP_VARS_MAX];
+ unsigned int n_assignments;
+
+ char *action_str[HIST_ACTIONS_MAX];
+ unsigned int n_actions;
+
+ struct var_defs var_defs;
+};
+
+struct field_var {
+ struct hist_field *var;
+ struct hist_field *val;
+};
+
+struct field_var_hist {
+ struct hist_trigger_data *hist_data;
+ char *cmd;
};
struct hist_trigger_data {
- struct hist_field *fields[TRACING_MAP_FIELDS_MAX];
+ struct hist_field *fields[HIST_FIELDS_MAX];
unsigned int n_vals;
unsigned int n_keys;
unsigned int n_fields;
+ unsigned int n_vars;
unsigned int key_size;
struct tracing_map_sort_key sort_keys[TRACING_MAP_SORT_KEYS_MAX];
unsigned int n_sort_keys;
struct trace_event_file *event_file;
struct hist_trigger_attrs *attrs;
struct tracing_map *map;
+ bool enable_timestamps;
+ bool remove;
+ struct hist_field *var_refs[TRACING_MAP_VARS_MAX];
+ unsigned int n_var_refs;
+
+ struct action_data *actions[HIST_ACTIONS_MAX];
+ unsigned int n_actions;
+
+ struct hist_field *synth_var_refs[SYNTH_FIELDS_MAX];
+ unsigned int n_synth_var_refs;
+ struct field_var *field_vars[SYNTH_FIELDS_MAX];
+ unsigned int n_field_vars;
+ unsigned int n_field_var_str;
+ struct field_var_hist *field_var_hists[SYNTH_FIELDS_MAX];
+ unsigned int n_field_var_hists;
+
+ struct field_var *max_vars[SYNTH_FIELDS_MAX];
+ unsigned int n_max_vars;
+ unsigned int n_max_var_str;
+};
+
+struct synth_field {
+ char *type;
+ char *name;
+ size_t size;
+ bool is_signed;
+ bool is_string;
+};
+
+struct synth_event {
+ struct list_head list;
+ int ref;
+ char *name;
+ struct synth_field **fields;
+ unsigned int n_fields;
+ unsigned int n_u64;
+ struct trace_event_class class;
+ struct trace_event_call call;
+ struct tracepoint *tp;
+};
+
+struct action_data;
+
+typedef void (*action_fn_t) (struct hist_trigger_data *hist_data,
+ struct tracing_map_elt *elt, void *rec,
+ struct ring_buffer_event *rbe,
+ struct action_data *data, u64 *var_ref_vals);
+
+struct action_data {
+ action_fn_t fn;
+ unsigned int n_params;
+ char *params[SYNTH_FIELDS_MAX];
+
+ union {
+ struct {
+ unsigned int var_ref_idx;
+ char *match_event;
+ char *match_event_system;
+ char *synth_event_name;
+ struct synth_event *synth_event;
+ } onmatch;
+
+ struct {
+ char *var_str;
+ char *fn_name;
+ unsigned int max_var_ref_idx;
+ struct hist_field *max_var;
+ struct hist_field *var;
+ } onmax;
+ };
+};
+
+
+static char last_hist_cmd[MAX_FILTER_STR_VAL];
+static char hist_err_str[MAX_FILTER_STR_VAL];
+
+static void last_cmd_set(char *str)
+{
+ if (!str)
+ return;
+
+ strncpy(last_hist_cmd, str, MAX_FILTER_STR_VAL - 1);
+}
+
+static void hist_err(char *str, char *var)
+{
+ int maxlen = MAX_FILTER_STR_VAL - 1;
+
+ if (!str)
+ return;
+
+ if (strlen(hist_err_str))
+ return;
+
+ if (!var)
+ var = "";
+
+ if (strlen(hist_err_str) + strlen(str) + strlen(var) > maxlen)
+ return;
+
+ strcat(hist_err_str, str);
+ strcat(hist_err_str, var);
+}
+
+static void hist_err_event(char *str, char *system, char *event, char *var)
+{
+ char err[MAX_FILTER_STR_VAL];
+
+ if (system && var)
+ snprintf(err, MAX_FILTER_STR_VAL, "%s.%s.%s", system, event, var);
+ else if (system)
+ snprintf(err, MAX_FILTER_STR_VAL, "%s.%s", system, event);
+ else
+ strncpy(err, var, MAX_FILTER_STR_VAL);
+
+ hist_err(str, err);
+}
+
+static void hist_err_clear(void)
+{
+ hist_err_str[0] = '\0';
+}
+
+static bool have_hist_err(void)
+{
+ if (strlen(hist_err_str))
+ return true;
+
+ return false;
+}
+
+static LIST_HEAD(synth_event_list);
+static DEFINE_MUTEX(synth_event_mutex);
+
+struct synth_trace_event {
+ struct trace_entry ent;
+ u64 fields[];
+};
+
+static int synth_event_define_fields(struct trace_event_call *call)
+{
+ struct synth_trace_event trace;
+ int offset = offsetof(typeof(trace), fields);
+ struct synth_event *event = call->data;
+ unsigned int i, size, n_u64;
+ char *name, *type;
+ bool is_signed;
+ int ret = 0;
+
+ for (i = 0, n_u64 = 0; i < event->n_fields; i++) {
+ size = event->fields[i]->size;
+ is_signed = event->fields[i]->is_signed;
+ type = event->fields[i]->type;
+ name = event->fields[i]->name;
+ ret = trace_define_field(call, type, name, offset, size,
+ is_signed, FILTER_OTHER);
+ if (ret)
+ break;
+
+ if (event->fields[i]->is_string) {
+ offset += STR_VAR_LEN_MAX;
+ n_u64 += STR_VAR_LEN_MAX / sizeof(u64);
+ } else {
+ offset += sizeof(u64);
+ n_u64++;
+ }
+ }
+
+ event->n_u64 = n_u64;
+
+ return ret;
+}
+
+static bool synth_field_signed(char *type)
+{
+ if (strncmp(type, "u", 1) == 0)
+ return false;
+
+ return true;
+}
+
+static int synth_field_is_string(char *type)
+{
+ if (strstr(type, "char[") != NULL)
+ return true;
+
+ return false;
+}
+
+static int synth_field_string_size(char *type)
+{
+ char buf[4], *end, *start;
+ unsigned int len;
+ int size, err;
+
+ start = strstr(type, "char[");
+ if (start == NULL)
+ return -EINVAL;
+ start += strlen("char[");
+
+ end = strchr(type, ']');
+ if (!end || end < start)
+ return -EINVAL;
+
+ len = end - start;
+ if (len > 3)
+ return -EINVAL;
+
+ strncpy(buf, start, len);
+ buf[len] = '\0';
+
+ err = kstrtouint(buf, 0, &size);
+ if (err)
+ return err;
+
+ if (size > STR_VAR_LEN_MAX)
+ return -EINVAL;
+
+ return size;
+}
+
+static int synth_field_size(char *type)
+{
+ int size = 0;
+
+ if (strcmp(type, "s64") == 0)
+ size = sizeof(s64);
+ else if (strcmp(type, "u64") == 0)
+ size = sizeof(u64);
+ else if (strcmp(type, "s32") == 0)
+ size = sizeof(s32);
+ else if (strcmp(type, "u32") == 0)
+ size = sizeof(u32);
+ else if (strcmp(type, "s16") == 0)
+ size = sizeof(s16);
+ else if (strcmp(type, "u16") == 0)
+ size = sizeof(u16);
+ else if (strcmp(type, "s8") == 0)
+ size = sizeof(s8);
+ else if (strcmp(type, "u8") == 0)
+ size = sizeof(u8);
+ else if (strcmp(type, "char") == 0)
+ size = sizeof(char);
+ else if (strcmp(type, "unsigned char") == 0)
+ size = sizeof(unsigned char);
+ else if (strcmp(type, "int") == 0)
+ size = sizeof(int);
+ else if (strcmp(type, "unsigned int") == 0)
+ size = sizeof(unsigned int);
+ else if (strcmp(type, "long") == 0)
+ size = sizeof(long);
+ else if (strcmp(type, "unsigned long") == 0)
+ size = sizeof(unsigned long);
+ else if (strcmp(type, "pid_t") == 0)
+ size = sizeof(pid_t);
+ else if (synth_field_is_string(type))
+ size = synth_field_string_size(type);
+
+ return size;
+}
+
+static const char *synth_field_fmt(char *type)
+{
+ const char *fmt = "%llu";
+
+ if (strcmp(type, "s64") == 0)
+ fmt = "%lld";
+ else if (strcmp(type, "u64") == 0)
+ fmt = "%llu";
+ else if (strcmp(type, "s32") == 0)
+ fmt = "%d";
+ else if (strcmp(type, "u32") == 0)
+ fmt = "%u";
+ else if (strcmp(type, "s16") == 0)
+ fmt = "%d";
+ else if (strcmp(type, "u16") == 0)
+ fmt = "%u";
+ else if (strcmp(type, "s8") == 0)
+ fmt = "%d";
+ else if (strcmp(type, "u8") == 0)
+ fmt = "%u";
+ else if (strcmp(type, "char") == 0)
+ fmt = "%d";
+ else if (strcmp(type, "unsigned char") == 0)
+ fmt = "%u";
+ else if (strcmp(type, "int") == 0)
+ fmt = "%d";
+ else if (strcmp(type, "unsigned int") == 0)
+ fmt = "%u";
+ else if (strcmp(type, "long") == 0)
+ fmt = "%ld";
+ else if (strcmp(type, "unsigned long") == 0)
+ fmt = "%lu";
+ else if (strcmp(type, "pid_t") == 0)
+ fmt = "%d";
+ else if (synth_field_is_string(type))
+ fmt = "%s";
+
+ return fmt;
+}
+
+static enum print_line_t print_synth_event(struct trace_iterator *iter,
+ int flags,
+ struct trace_event *event)
+{
+ struct trace_array *tr = iter->tr;
+ struct trace_seq *s = &iter->seq;
+ struct synth_trace_event *entry;
+ struct synth_event *se;
+ unsigned int i, n_u64;
+ char print_fmt[32];
+ const char *fmt;
+
+ entry = (struct synth_trace_event *)iter->ent;
+ se = container_of(event, struct synth_event, call.event);
+
+ trace_seq_printf(s, "%s: ", se->name);
+
+ for (i = 0, n_u64 = 0; i < se->n_fields; i++) {
+ if (trace_seq_has_overflowed(s))
+ goto end;
+
+ fmt = synth_field_fmt(se->fields[i]->type);
+
+ /* parameter types */
+ if (tr->trace_flags & TRACE_ITER_VERBOSE)
+ trace_seq_printf(s, "%s ", fmt);
+
+ snprintf(print_fmt, sizeof(print_fmt), "%%s=%s%%s", fmt);
+
+ /* parameter values */
+ if (se->fields[i]->is_string) {
+ trace_seq_printf(s, print_fmt, se->fields[i]->name,
+ (char *)&entry->fields[n_u64],
+ i == se->n_fields - 1 ? "" : " ");
+ n_u64 += STR_VAR_LEN_MAX / sizeof(u64);
+ } else {
+ trace_seq_printf(s, print_fmt, se->fields[i]->name,
+ entry->fields[n_u64],
+ i == se->n_fields - 1 ? "" : " ");
+ n_u64++;
+ }
+ }
+end:
+ trace_seq_putc(s, '\n');
+
+ return trace_handle_return(s);
+}
+
+static struct trace_event_functions synth_event_funcs = {
+ .trace = print_synth_event
+};
+
+static notrace void trace_event_raw_event_synth(void *__data,
+ u64 *var_ref_vals,
+ unsigned int var_ref_idx)
+{
+ struct trace_event_file *trace_file = __data;
+ struct synth_trace_event *entry;
+ struct trace_event_buffer fbuffer;
+ struct ring_buffer *buffer;
+ struct synth_event *event;
+ unsigned int i, n_u64;
+ int fields_size = 0;
+
+ event = trace_file->event_call->data;
+
+ if (trace_trigger_soft_disabled(trace_file))
+ return;
+
+ fields_size = event->n_u64 * sizeof(u64);
+
+ /*
+ * Avoid ring buffer recursion detection, as this event
+ * is being performed within another event.
+ */
+ buffer = trace_file->tr->trace_buffer.buffer;
+ ring_buffer_nest_start(buffer);
+
+ entry = trace_event_buffer_reserve(&fbuffer, trace_file,
+ sizeof(*entry) + fields_size);
+ if (!entry)
+ goto out;
+
+ for (i = 0, n_u64 = 0; i < event->n_fields; i++) {
+ if (event->fields[i]->is_string) {
+ char *str_val = (char *)(long)var_ref_vals[var_ref_idx + i];
+ char *str_field = (char *)&entry->fields[n_u64];
+
+ strscpy(str_field, str_val, STR_VAR_LEN_MAX);
+ n_u64 += STR_VAR_LEN_MAX / sizeof(u64);
+ } else {
+ entry->fields[n_u64] = var_ref_vals[var_ref_idx + i];
+ n_u64++;
+ }
+ }
+
+ trace_event_buffer_commit(&fbuffer);
+out:
+ ring_buffer_nest_end(buffer);
+}
+
+static void free_synth_event_print_fmt(struct trace_event_call *call)
+{
+ if (call) {
+ kfree(call->print_fmt);
+ call->print_fmt = NULL;
+ }
+}
+
+static int __set_synth_event_print_fmt(struct synth_event *event,
+ char *buf, int len)
+{
+ const char *fmt;
+ int pos = 0;
+ int i;
+
+ /* When len=0, we just calculate the needed length */
+#define LEN_OR_ZERO (len ? len - pos : 0)
+
+ pos += snprintf(buf + pos, LEN_OR_ZERO, "\"");
+ for (i = 0; i < event->n_fields; i++) {
+ fmt = synth_field_fmt(event->fields[i]->type);
+ pos += snprintf(buf + pos, LEN_OR_ZERO, "%s=%s%s",
+ event->fields[i]->name, fmt,
+ i == event->n_fields - 1 ? "" : ", ");
+ }
+ pos += snprintf(buf + pos, LEN_OR_ZERO, "\"");
+
+ for (i = 0; i < event->n_fields; i++) {
+ pos += snprintf(buf + pos, LEN_OR_ZERO,
+ ", REC->%s", event->fields[i]->name);
+ }
+
+#undef LEN_OR_ZERO
+
+ /* return the length of print_fmt */
+ return pos;
+}
+
+static int set_synth_event_print_fmt(struct trace_event_call *call)
+{
+ struct synth_event *event = call->data;
+ char *print_fmt;
+ int len;
+
+ /* First: called with 0 length to calculate the needed length */
+ len = __set_synth_event_print_fmt(event, NULL, 0);
+
+ print_fmt = kmalloc(len + 1, GFP_KERNEL);
+ if (!print_fmt)
+ return -ENOMEM;
+
+ /* Second: actually write the @print_fmt */
+ __set_synth_event_print_fmt(event, print_fmt, len + 1);
+ call->print_fmt = print_fmt;
+
+ return 0;
+}
+
+static void free_synth_field(struct synth_field *field)
+{
+ kfree(field->type);
+ kfree(field->name);
+ kfree(field);
+}
+
+static struct synth_field *parse_synth_field(char *field_type,
+ char *field_name)
+{
+ struct synth_field *field;
+ int len, ret = 0;
+ char *array;
+
+ if (field_type[0] == ';')
+ field_type++;
+
+ len = strlen(field_name);
+ if (field_name[len - 1] == ';')
+ field_name[len - 1] = '\0';
+
+ field = kzalloc(sizeof(*field), GFP_KERNEL);
+ if (!field)
+ return ERR_PTR(-ENOMEM);
+
+ len = strlen(field_type) + 1;
+ array = strchr(field_name, '[');
+ if (array)
+ len += strlen(array);
+ field->type = kzalloc(len, GFP_KERNEL);
+ if (!field->type) {
+ ret = -ENOMEM;
+ goto free;
+ }
+ strcat(field->type, field_type);
+ if (array) {
+ strcat(field->type, array);
+ *array = '\0';
+ }
+
+ field->size = synth_field_size(field->type);
+ if (!field->size) {
+ ret = -EINVAL;
+ goto free;
+ }
+
+ if (synth_field_is_string(field->type))
+ field->is_string = true;
+
+ field->is_signed = synth_field_signed(field->type);
+
+ field->name = kstrdup(field_name, GFP_KERNEL);
+ if (!field->name) {
+ ret = -ENOMEM;
+ goto free;
+ }
+ out:
+ return field;
+ free:
+ free_synth_field(field);
+ field = ERR_PTR(ret);
+ goto out;
+}
+
+static void free_synth_tracepoint(struct tracepoint *tp)
+{
+ if (!tp)
+ return;
+
+ kfree(tp->name);
+ kfree(tp);
+}
+
+static struct tracepoint *alloc_synth_tracepoint(char *name)
+{
+ struct tracepoint *tp;
+
+ tp = kzalloc(sizeof(*tp), GFP_KERNEL);
+ if (!tp)
+ return ERR_PTR(-ENOMEM);
+
+ tp->name = kstrdup(name, GFP_KERNEL);
+ if (!tp->name) {
+ kfree(tp);
+ return ERR_PTR(-ENOMEM);
+ }
+
+ return tp;
+}
+
+typedef void (*synth_probe_func_t) (void *__data, u64 *var_ref_vals,
+ unsigned int var_ref_idx);
+
+static inline void trace_synth(struct synth_event *event, u64 *var_ref_vals,
+ unsigned int var_ref_idx)
+{
+ struct tracepoint *tp = event->tp;
+
+ if (unlikely(atomic_read(&tp->key.enabled) > 0)) {
+ struct tracepoint_func *probe_func_ptr;
+ synth_probe_func_t probe_func;
+ void *__data;
+
+ if (!(cpu_online(raw_smp_processor_id())))
+ return;
+
+ probe_func_ptr = rcu_dereference_sched((tp)->funcs);
+ if (probe_func_ptr) {
+ do {
+ probe_func = probe_func_ptr->func;
+ __data = probe_func_ptr->data;
+ probe_func(__data, var_ref_vals, var_ref_idx);
+ } while ((++probe_func_ptr)->func);
+ }
+ }
+}
+
+static struct synth_event *find_synth_event(const char *name)
+{
+ struct synth_event *event;
+
+ list_for_each_entry(event, &synth_event_list, list) {
+ if (strcmp(event->name, name) == 0)
+ return event;
+ }
+
+ return NULL;
+}
+
+static int register_synth_event(struct synth_event *event)
+{
+ struct trace_event_call *call = &event->call;
+ int ret = 0;
+
+ event->call.class = &event->class;
+ event->class.system = kstrdup(SYNTH_SYSTEM, GFP_KERNEL);
+ if (!event->class.system) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ event->tp = alloc_synth_tracepoint(event->name);
+ if (IS_ERR(event->tp)) {
+ ret = PTR_ERR(event->tp);
+ event->tp = NULL;
+ goto out;
+ }
+
+ INIT_LIST_HEAD(&call->class->fields);
+ call->event.funcs = &synth_event_funcs;
+ call->class->define_fields = synth_event_define_fields;
+
+ ret = register_trace_event(&call->event);
+ if (!ret) {
+ ret = -ENODEV;
+ goto out;
+ }
+ call->flags = TRACE_EVENT_FL_TRACEPOINT;
+ call->class->reg = trace_event_reg;
+ call->class->probe = trace_event_raw_event_synth;
+ call->data = event;
+ call->tp = event->tp;
+
+ ret = trace_add_event_call(call);
+ if (ret) {
+ pr_warn("Failed to register synthetic event: %s\n",
+ trace_event_name(call));
+ goto err;
+ }
+
+ ret = set_synth_event_print_fmt(call);
+ if (ret < 0) {
+ trace_remove_event_call(call);
+ goto err;
+ }
+ out:
+ return ret;
+ err:
+ unregister_trace_event(&call->event);
+ goto out;
+}
+
+static int unregister_synth_event(struct synth_event *event)
+{
+ struct trace_event_call *call = &event->call;
+ int ret;
+
+ ret = trace_remove_event_call(call);
+
+ return ret;
+}
+
+static void free_synth_event(struct synth_event *event)
+{
+ unsigned int i;
+
+ if (!event)
+ return;
+
+ for (i = 0; i < event->n_fields; i++)
+ free_synth_field(event->fields[i]);
+
+ kfree(event->fields);
+ kfree(event->name);
+ kfree(event->class.system);
+ free_synth_tracepoint(event->tp);
+ free_synth_event_print_fmt(&event->call);
+ kfree(event);
+}
+
+static struct synth_event *alloc_synth_event(char *event_name, int n_fields,
+ struct synth_field **fields)
+{
+ struct synth_event *event;
+ unsigned int i;
+
+ event = kzalloc(sizeof(*event), GFP_KERNEL);
+ if (!event) {
+ event = ERR_PTR(-ENOMEM);
+ goto out;
+ }
+
+ event->name = kstrdup(event_name, GFP_KERNEL);
+ if (!event->name) {
+ kfree(event);
+ event = ERR_PTR(-ENOMEM);
+ goto out;
+ }
+
+ event->fields = kcalloc(n_fields, sizeof(*event->fields), GFP_KERNEL);
+ if (!event->fields) {
+ free_synth_event(event);
+ event = ERR_PTR(-ENOMEM);
+ goto out;
+ }
+
+ for (i = 0; i < n_fields; i++)
+ event->fields[i] = fields[i];
+
+ event->n_fields = n_fields;
+ out:
+ return event;
+}
+
+static void action_trace(struct hist_trigger_data *hist_data,
+ struct tracing_map_elt *elt, void *rec,
+ struct ring_buffer_event *rbe,
+ struct action_data *data, u64 *var_ref_vals)
+{
+ struct synth_event *event = data->onmatch.synth_event;
+
+ trace_synth(event, var_ref_vals, data->onmatch.var_ref_idx);
+}
+
+struct hist_var_data {
+ struct list_head list;
+ struct hist_trigger_data *hist_data;
+};
+
+static void add_or_delete_synth_event(struct synth_event *event, int delete)
+{
+ if (delete)
+ free_synth_event(event);
+ else {
+ mutex_lock(&synth_event_mutex);
+ if (!find_synth_event(event->name))
+ list_add(&event->list, &synth_event_list);
+ else
+ free_synth_event(event);
+ mutex_unlock(&synth_event_mutex);
+ }
+}
+
+static int create_synth_event(int argc, char **argv)
+{
+ struct synth_field *field, *fields[SYNTH_FIELDS_MAX];
+ struct synth_event *event = NULL;
+ bool delete_event = false;
+ int i, n_fields = 0, ret = 0;
+ char *name;
+
+ mutex_lock(&synth_event_mutex);
+
+ /*
+ * Argument syntax:
+ * - Add synthetic event: <event_name> field[;field] ...
+ * - Remove synthetic event: !<event_name> field[;field] ...
+ * where 'field' = type field_name
+ */
+ if (argc < 1) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ name = argv[0];
+ if (name[0] == '!') {
+ delete_event = true;
+ name++;
+ }
+
+ event = find_synth_event(name);
+ if (event) {
+ if (delete_event) {
+ if (event->ref) {
+ event = NULL;
+ ret = -EBUSY;
+ goto out;
+ }
+ list_del(&event->list);
+ goto out;
+ }
+ event = NULL;
+ ret = -EEXIST;
+ goto out;
+ } else if (delete_event)
+ goto out;
+
+ if (argc < 2) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ for (i = 1; i < argc - 1; i++) {
+ if (strcmp(argv[i], ";") == 0)
+ continue;
+ if (n_fields == SYNTH_FIELDS_MAX) {
+ ret = -EINVAL;
+ goto err;
+ }
+
+ field = parse_synth_field(argv[i], argv[i + 1]);
+ if (IS_ERR(field)) {
+ ret = PTR_ERR(field);
+ goto err;
+ }
+ fields[n_fields] = field;
+ i++; n_fields++;
+ }
+
+ if (i < argc) {
+ ret = -EINVAL;
+ goto err;
+ }
+
+ event = alloc_synth_event(name, n_fields, fields);
+ if (IS_ERR(event)) {
+ ret = PTR_ERR(event);
+ event = NULL;
+ goto err;
+ }
+ out:
+ mutex_unlock(&synth_event_mutex);
+
+ if (event) {
+ if (delete_event) {
+ ret = unregister_synth_event(event);
+ add_or_delete_synth_event(event, !ret);
+ } else {
+ ret = register_synth_event(event);
+ add_or_delete_synth_event(event, ret);
+ }
+ }
+
+ return ret;
+ err:
+ mutex_unlock(&synth_event_mutex);
+
+ for (i = 0; i < n_fields; i++)
+ free_synth_field(fields[i]);
+ free_synth_event(event);
+
+ return ret;
+}
+
+static int release_all_synth_events(void)
+{
+ struct list_head release_events;
+ struct synth_event *event, *e;
+ int ret = 0;
+
+ INIT_LIST_HEAD(&release_events);
+
+ mutex_lock(&synth_event_mutex);
+
+ list_for_each_entry(event, &synth_event_list, list) {
+ if (event->ref) {
+ mutex_unlock(&synth_event_mutex);
+ return -EBUSY;
+ }
+ }
+
+ list_splice_init(&event->list, &release_events);
+
+ mutex_unlock(&synth_event_mutex);
+
+ list_for_each_entry_safe(event, e, &release_events, list) {
+ list_del(&event->list);
+
+ ret = unregister_synth_event(event);
+ add_or_delete_synth_event(event, !ret);
+ }
+
+ return ret;
+}
+
+
+static void *synth_events_seq_start(struct seq_file *m, loff_t *pos)
+{
+ mutex_lock(&synth_event_mutex);
+
+ return seq_list_start(&synth_event_list, *pos);
+}
+
+static void *synth_events_seq_next(struct seq_file *m, void *v, loff_t *pos)
+{
+ return seq_list_next(v, &synth_event_list, pos);
+}
+
+static void synth_events_seq_stop(struct seq_file *m, void *v)
+{
+ mutex_unlock(&synth_event_mutex);
+}
+
+static int synth_events_seq_show(struct seq_file *m, void *v)
+{
+ struct synth_field *field;
+ struct synth_event *event = v;
+ unsigned int i;
+
+ seq_printf(m, "%s\t", event->name);
+
+ for (i = 0; i < event->n_fields; i++) {
+ field = event->fields[i];
+
+ /* parameter values */
+ seq_printf(m, "%s %s%s", field->type, field->name,
+ i == event->n_fields - 1 ? "" : "; ");
+ }
+
+ seq_putc(m, '\n');
+
+ return 0;
+}
+
+static const struct seq_operations synth_events_seq_op = {
+ .start = synth_events_seq_start,
+ .next = synth_events_seq_next,
+ .stop = synth_events_seq_stop,
+ .show = synth_events_seq_show
+};
+
+static int synth_events_open(struct inode *inode, struct file *file)
+{
+ int ret;
+
+ if ((file->f_mode & FMODE_WRITE) && (file->f_flags & O_TRUNC)) {
+ ret = release_all_synth_events();
+ if (ret < 0)
+ return ret;
+ }
+
+ return seq_open(file, &synth_events_seq_op);
+}
+
+static ssize_t synth_events_write(struct file *file,
+ const char __user *buffer,
+ size_t count, loff_t *ppos)
+{
+ return trace_parse_run_command(file, buffer, count, ppos,
+ create_synth_event);
+}
+
+static const struct file_operations synth_events_fops = {
+ .open = synth_events_open,
+ .write = synth_events_write,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release,
+};
+
+static u64 hist_field_timestamp(struct hist_field *hist_field,
+ struct tracing_map_elt *elt,
+ struct ring_buffer_event *rbe,
+ void *event)
+{
+ struct hist_trigger_data *hist_data = hist_field->hist_data;
+ struct trace_array *tr = hist_data->event_file->tr;
+
+ u64 ts = ring_buffer_event_time_stamp(rbe);
+
+ if (hist_data->attrs->ts_in_usecs && trace_clock_in_ns(tr))
+ ts = ns2usecs(ts);
+
+ return ts;
+}
+
+static u64 hist_field_cpu(struct hist_field *hist_field,
+ struct tracing_map_elt *elt,
+ struct ring_buffer_event *rbe,
+ void *event)
+{
+ int cpu = smp_processor_id();
+
+ return cpu;
+}
+
+static struct hist_field *
+check_field_for_var_ref(struct hist_field *hist_field,
+ struct hist_trigger_data *var_data,
+ unsigned int var_idx)
+{
+ struct hist_field *found = NULL;
+
+ if (hist_field && hist_field->flags & HIST_FIELD_FL_VAR_REF) {
+ if (hist_field->var.idx == var_idx &&
+ hist_field->var.hist_data == var_data) {
+ found = hist_field;
+ }
+ }
+
+ return found;
+}
+
+static struct hist_field *
+check_field_for_var_refs(struct hist_trigger_data *hist_data,
+ struct hist_field *hist_field,
+ struct hist_trigger_data *var_data,
+ unsigned int var_idx,
+ unsigned int level)
+{
+ struct hist_field *found = NULL;
+ unsigned int i;
+
+ if (level > 3)
+ return found;
+
+ if (!hist_field)
+ return found;
+
+ found = check_field_for_var_ref(hist_field, var_data, var_idx);
+ if (found)
+ return found;
+
+ for (i = 0; i < HIST_FIELD_OPERANDS_MAX; i++) {
+ struct hist_field *operand;
+
+ operand = hist_field->operands[i];
+ found = check_field_for_var_refs(hist_data, operand, var_data,
+ var_idx, level + 1);
+ if (found)
+ return found;
+ }
+
+ return found;
+}
+
+static struct hist_field *find_var_ref(struct hist_trigger_data *hist_data,
+ struct hist_trigger_data *var_data,
+ unsigned int var_idx)
+{
+ struct hist_field *hist_field, *found = NULL;
+ unsigned int i;
+
+ for_each_hist_field(i, hist_data) {
+ hist_field = hist_data->fields[i];
+ found = check_field_for_var_refs(hist_data, hist_field,
+ var_data, var_idx, 0);
+ if (found)
+ return found;
+ }
+
+ for (i = 0; i < hist_data->n_synth_var_refs; i++) {
+ hist_field = hist_data->synth_var_refs[i];
+ found = check_field_for_var_refs(hist_data, hist_field,
+ var_data, var_idx, 0);
+ if (found)
+ return found;
+ }
+
+ return found;
+}
+
+static struct hist_field *find_any_var_ref(struct hist_trigger_data *hist_data,
+ unsigned int var_idx)
+{
+ struct trace_array *tr = hist_data->event_file->tr;
+ struct hist_field *found = NULL;
+ struct hist_var_data *var_data;
+
+ list_for_each_entry(var_data, &tr->hist_vars, list) {
+ if (var_data->hist_data == hist_data)
+ continue;
+ found = find_var_ref(var_data->hist_data, hist_data, var_idx);
+ if (found)
+ break;
+ }
+
+ return found;
+}
+
+static bool check_var_refs(struct hist_trigger_data *hist_data)
+{
+ struct hist_field *field;
+ bool found = false;
+ int i;
+
+ for_each_hist_field(i, hist_data) {
+ field = hist_data->fields[i];
+ if (field && field->flags & HIST_FIELD_FL_VAR) {
+ if (find_any_var_ref(hist_data, field->var.idx)) {
+ found = true;
+ break;
+ }
+ }
+ }
+
+ return found;
+}
+
+static struct hist_var_data *find_hist_vars(struct hist_trigger_data *hist_data)
+{
+ struct trace_array *tr = hist_data->event_file->tr;
+ struct hist_var_data *var_data, *found = NULL;
+
+ list_for_each_entry(var_data, &tr->hist_vars, list) {
+ if (var_data->hist_data == hist_data) {
+ found = var_data;
+ break;
+ }
+ }
+
+ return found;
+}
+
+static bool field_has_hist_vars(struct hist_field *hist_field,
+ unsigned int level)
+{
+ int i;
+
+ if (level > 3)
+ return false;
+
+ if (!hist_field)
+ return false;
+
+ if (hist_field->flags & HIST_FIELD_FL_VAR ||
+ hist_field->flags & HIST_FIELD_FL_VAR_REF)
+ return true;
+
+ for (i = 0; i < HIST_FIELD_OPERANDS_MAX; i++) {
+ struct hist_field *operand;
+
+ operand = hist_field->operands[i];
+ if (field_has_hist_vars(operand, level + 1))
+ return true;
+ }
+
+ return false;
+}
+
+static bool has_hist_vars(struct hist_trigger_data *hist_data)
+{
+ struct hist_field *hist_field;
+ int i;
+
+ for_each_hist_field(i, hist_data) {
+ hist_field = hist_data->fields[i];
+ if (field_has_hist_vars(hist_field, 0))
+ return true;
+ }
+
+ return false;
+}
+
+static int save_hist_vars(struct hist_trigger_data *hist_data)
+{
+ struct trace_array *tr = hist_data->event_file->tr;
+ struct hist_var_data *var_data;
+
+ var_data = find_hist_vars(hist_data);
+ if (var_data)
+ return 0;
+
+ if (trace_array_get(tr) < 0)
+ return -ENODEV;
+
+ var_data = kzalloc(sizeof(*var_data), GFP_KERNEL);
+ if (!var_data) {
+ trace_array_put(tr);
+ return -ENOMEM;
+ }
+
+ var_data->hist_data = hist_data;
+ list_add(&var_data->list, &tr->hist_vars);
+
+ return 0;
+}
+
+static void remove_hist_vars(struct hist_trigger_data *hist_data)
+{
+ struct trace_array *tr = hist_data->event_file->tr;
+ struct hist_var_data *var_data;
+
+ var_data = find_hist_vars(hist_data);
+ if (!var_data)
+ return;
+
+ if (WARN_ON(check_var_refs(hist_data)))
+ return;
+
+ list_del(&var_data->list);
+
+ kfree(var_data);
+
+ trace_array_put(tr);
+}
+
+static struct hist_field *find_var_field(struct hist_trigger_data *hist_data,
+ const char *var_name)
+{
+ struct hist_field *hist_field, *found = NULL;
+ int i;
+
+ for_each_hist_field(i, hist_data) {
+ hist_field = hist_data->fields[i];
+ if (hist_field && hist_field->flags & HIST_FIELD_FL_VAR &&
+ strcmp(hist_field->var.name, var_name) == 0) {
+ found = hist_field;
+ break;
+ }
+ }
+
+ return found;
+}
+
+static struct hist_field *find_var(struct hist_trigger_data *hist_data,
+ struct trace_event_file *file,
+ const char *var_name)
+{
+ struct hist_trigger_data *test_data;
+ struct event_trigger_data *test;
+ struct hist_field *hist_field;
+
+ hist_field = find_var_field(hist_data, var_name);
+ if (hist_field)
+ return hist_field;
+
+ list_for_each_entry_rcu(test, &file->triggers, list) {
+ if (test->cmd_ops->trigger_type == ETT_EVENT_HIST) {
+ test_data = test->private_data;
+ hist_field = find_var_field(test_data, var_name);
+ if (hist_field)
+ return hist_field;
+ }
+ }
+
+ return NULL;
+}
+
+static struct trace_event_file *find_var_file(struct trace_array *tr,
+ char *system,
+ char *event_name,
+ char *var_name)
+{
+ struct hist_trigger_data *var_hist_data;
+ struct hist_var_data *var_data;
+ struct trace_event_file *file, *found = NULL;
+
+ if (system)
+ return find_event_file(tr, system, event_name);
+
+ list_for_each_entry(var_data, &tr->hist_vars, list) {
+ var_hist_data = var_data->hist_data;
+ file = var_hist_data->event_file;
+ if (file == found)
+ continue;
+
+ if (find_var_field(var_hist_data, var_name)) {
+ if (found) {
+ hist_err_event("Variable name not unique, need to use fully qualified name (subsys.event.var) for variable: ", system, event_name, var_name);
+ return NULL;
+ }
+
+ found = file;
+ }
+ }
+
+ return found;
+}
+
+static struct hist_field *find_file_var(struct trace_event_file *file,
+ const char *var_name)
+{
+ struct hist_trigger_data *test_data;
+ struct event_trigger_data *test;
+ struct hist_field *hist_field;
+
+ list_for_each_entry_rcu(test, &file->triggers, list) {
+ if (test->cmd_ops->trigger_type == ETT_EVENT_HIST) {
+ test_data = test->private_data;
+ hist_field = find_var_field(test_data, var_name);
+ if (hist_field)
+ return hist_field;
+ }
+ }
+
+ return NULL;
+}
+
+static struct hist_field *
+find_match_var(struct hist_trigger_data *hist_data, char *var_name)
+{
+ struct trace_array *tr = hist_data->event_file->tr;
+ struct hist_field *hist_field, *found = NULL;
+ struct trace_event_file *file;
+ unsigned int i;
+
+ for (i = 0; i < hist_data->n_actions; i++) {
+ struct action_data *data = hist_data->actions[i];
+
+ if (data->fn == action_trace) {
+ char *system = data->onmatch.match_event_system;
+ char *event_name = data->onmatch.match_event;
+
+ file = find_var_file(tr, system, event_name, var_name);
+ if (!file)
+ continue;
+ hist_field = find_file_var(file, var_name);
+ if (hist_field) {
+ if (found) {
+ hist_err_event("Variable name not unique, need to use fully qualified name (subsys.event.var) for variable: ", system, event_name, var_name);
+ return ERR_PTR(-EINVAL);
+ }
+
+ found = hist_field;
+ }
+ }
+ }
+ return found;
+}
+
+static struct hist_field *find_event_var(struct hist_trigger_data *hist_data,
+ char *system,
+ char *event_name,
+ char *var_name)
+{
+ struct trace_array *tr = hist_data->event_file->tr;
+ struct hist_field *hist_field = NULL;
+ struct trace_event_file *file;
+
+ if (!system || !event_name) {
+ hist_field = find_match_var(hist_data, var_name);
+ if (IS_ERR(hist_field))
+ return NULL;
+ if (hist_field)
+ return hist_field;
+ }
+
+ file = find_var_file(tr, system, event_name, var_name);
+ if (!file)
+ return NULL;
+
+ hist_field = find_file_var(file, var_name);
+
+ return hist_field;
+}
+
+struct hist_elt_data {
+ char *comm;
+ u64 *var_ref_vals;
+ char *field_var_str[SYNTH_FIELDS_MAX];
};
+static u64 hist_field_var_ref(struct hist_field *hist_field,
+ struct tracing_map_elt *elt,
+ struct ring_buffer_event *rbe,
+ void *event)
+{
+ struct hist_elt_data *elt_data;
+ u64 var_val = 0;
+
+ elt_data = elt->private_data;
+ var_val = elt_data->var_ref_vals[hist_field->var_ref_idx];
+
+ return var_val;
+}
+
+static bool resolve_var_refs(struct hist_trigger_data *hist_data, void *key,
+ u64 *var_ref_vals, bool self)
+{
+ struct hist_trigger_data *var_data;
+ struct tracing_map_elt *var_elt;
+ struct hist_field *hist_field;
+ unsigned int i, var_idx;
+ bool resolved = true;
+ u64 var_val = 0;
+
+ for (i = 0; i < hist_data->n_var_refs; i++) {
+ hist_field = hist_data->var_refs[i];
+ var_idx = hist_field->var.idx;
+ var_data = hist_field->var.hist_data;
+
+ if (var_data == NULL) {
+ resolved = false;
+ break;
+ }
+
+ if ((self && var_data != hist_data) ||
+ (!self && var_data == hist_data))
+ continue;
+
+ var_elt = tracing_map_lookup(var_data->map, key);
+ if (!var_elt) {
+ resolved = false;
+ break;
+ }
+
+ if (!tracing_map_var_set(var_elt, var_idx)) {
+ resolved = false;
+ break;
+ }
+
+ if (self || !hist_field->read_once)
+ var_val = tracing_map_read_var(var_elt, var_idx);
+ else
+ var_val = tracing_map_read_var_once(var_elt, var_idx);
+
+ var_ref_vals[i] = var_val;
+ }
+
+ return resolved;
+}
+
static const char *hist_field_name(struct hist_field *field,
unsigned int level)
{
@@ -162,8 +1683,26 @@ static const char *hist_field_name(struct hist_field *field,
if (field->field)
field_name = field->field->name;
- else if (field->flags & HIST_FIELD_FL_LOG2)
+ else if (field->flags & HIST_FIELD_FL_LOG2 ||
+ field->flags & HIST_FIELD_FL_ALIAS)
field_name = hist_field_name(field->operands[0], ++level);
+ else if (field->flags & HIST_FIELD_FL_CPU)
+ field_name = "cpu";
+ else if (field->flags & HIST_FIELD_FL_EXPR ||
+ field->flags & HIST_FIELD_FL_VAR_REF) {
+ if (field->system) {
+ static char full_name[MAX_FILTER_STR_VAL];
+
+ strcat(full_name, field->system);
+ strcat(full_name, ".");
+ strcat(full_name, field->event_name);
+ strcat(full_name, ".");
+ strcat(full_name, field->name);
+ field_name = full_name;
+ } else
+ field_name = field->name;
+ } else if (field->flags & HIST_FIELD_FL_TIMESTAMP)
+ field_name = "common_timestamp";
if (field_name == NULL)
field_name = "";
@@ -232,16 +1771,119 @@ static int parse_map_size(char *str)
static void destroy_hist_trigger_attrs(struct hist_trigger_attrs *attrs)
{
+ unsigned int i;
+
if (!attrs)
return;
+ for (i = 0; i < attrs->n_assignments; i++)
+ kfree(attrs->assignment_str[i]);
+
+ for (i = 0; i < attrs->n_actions; i++)
+ kfree(attrs->action_str[i]);
+
kfree(attrs->name);
kfree(attrs->sort_key_str);
kfree(attrs->keys_str);
kfree(attrs->vals_str);
+ kfree(attrs->clock);
kfree(attrs);
}
+static int parse_action(char *str, struct hist_trigger_attrs *attrs)
+{
+ int ret = -EINVAL;
+
+ if (attrs->n_actions >= HIST_ACTIONS_MAX)
+ return ret;
+
+ if ((strncmp(str, "onmatch(", strlen("onmatch(")) == 0) ||
+ (strncmp(str, "onmax(", strlen("onmax(")) == 0)) {
+ attrs->action_str[attrs->n_actions] = kstrdup(str, GFP_KERNEL);
+ if (!attrs->action_str[attrs->n_actions]) {
+ ret = -ENOMEM;
+ return ret;
+ }
+ attrs->n_actions++;
+ ret = 0;
+ }
+
+ return ret;
+}
+
+static int parse_assignment(char *str, struct hist_trigger_attrs *attrs)
+{
+ int ret = 0;
+
+ if ((strncmp(str, "key=", strlen("key=")) == 0) ||
+ (strncmp(str, "keys=", strlen("keys=")) == 0)) {
+ attrs->keys_str = kstrdup(str, GFP_KERNEL);
+ if (!attrs->keys_str) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ } else if ((strncmp(str, "val=", strlen("val=")) == 0) ||
+ (strncmp(str, "vals=", strlen("vals=")) == 0) ||
+ (strncmp(str, "values=", strlen("values=")) == 0)) {
+ attrs->vals_str = kstrdup(str, GFP_KERNEL);
+ if (!attrs->vals_str) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ } else if (strncmp(str, "sort=", strlen("sort=")) == 0) {
+ attrs->sort_key_str = kstrdup(str, GFP_KERNEL);
+ if (!attrs->sort_key_str) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ } else if (strncmp(str, "name=", strlen("name=")) == 0) {
+ attrs->name = kstrdup(str, GFP_KERNEL);
+ if (!attrs->name) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ } else if (strncmp(str, "clock=", strlen("clock=")) == 0) {
+ strsep(&str, "=");
+ if (!str) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ str = strstrip(str);
+ attrs->clock = kstrdup(str, GFP_KERNEL);
+ if (!attrs->clock) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ } else if (strncmp(str, "size=", strlen("size=")) == 0) {
+ int map_bits = parse_map_size(str);
+
+ if (map_bits < 0) {
+ ret = map_bits;
+ goto out;
+ }
+ attrs->map_bits = map_bits;
+ } else {
+ char *assignment;
+
+ if (attrs->n_assignments == TRACING_MAP_VARS_MAX) {
+ hist_err("Too many variables defined: ", str);
+ ret = -EINVAL;
+ goto out;
+ }
+
+ assignment = kstrdup(str, GFP_KERNEL);
+ if (!assignment) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ attrs->assignment_str[attrs->n_assignments++] = assignment;
+ }
+ out:
+ return ret;
+}
+
static struct hist_trigger_attrs *parse_hist_trigger_attrs(char *trigger_str)
{
struct hist_trigger_attrs *attrs;
@@ -254,35 +1896,21 @@ static struct hist_trigger_attrs *parse_hist_trigger_attrs(char *trigger_str)
while (trigger_str) {
char *str = strsep(&trigger_str, ":");
- if ((strncmp(str, "key=", strlen("key=")) == 0) ||
- (strncmp(str, "keys=", strlen("keys=")) == 0))
- attrs->keys_str = kstrdup(str, GFP_KERNEL);
- else if ((strncmp(str, "val=", strlen("val=")) == 0) ||
- (strncmp(str, "vals=", strlen("vals=")) == 0) ||
- (strncmp(str, "values=", strlen("values=")) == 0))
- attrs->vals_str = kstrdup(str, GFP_KERNEL);
- else if (strncmp(str, "sort=", strlen("sort=")) == 0)
- attrs->sort_key_str = kstrdup(str, GFP_KERNEL);
- else if (strncmp(str, "name=", strlen("name=")) == 0)
- attrs->name = kstrdup(str, GFP_KERNEL);
- else if (strcmp(str, "pause") == 0)
+ if (strchr(str, '=')) {
+ ret = parse_assignment(str, attrs);
+ if (ret)
+ goto free;
+ } else if (strcmp(str, "pause") == 0)
attrs->pause = true;
else if ((strcmp(str, "cont") == 0) ||
(strcmp(str, "continue") == 0))
attrs->cont = true;
else if (strcmp(str, "clear") == 0)
attrs->clear = true;
- else if (strncmp(str, "size=", strlen("size=")) == 0) {
- int map_bits = parse_map_size(str);
-
- if (map_bits < 0) {
- ret = map_bits;
+ else {
+ ret = parse_action(str, attrs);
+ if (ret)
goto free;
- }
- attrs->map_bits = map_bits;
- } else {
- ret = -EINVAL;
- goto free;
}
}
@@ -291,6 +1919,14 @@ static struct hist_trigger_attrs *parse_hist_trigger_attrs(char *trigger_str)
goto free;
}
+ if (!attrs->clock) {
+ attrs->clock = kstrdup("global", GFP_KERNEL);
+ if (!attrs->clock) {
+ ret = -ENOMEM;
+ goto free;
+ }
+ }
+
return attrs;
free:
destroy_hist_trigger_attrs(attrs);
@@ -313,64 +1949,203 @@ static inline void save_comm(char *comm, struct task_struct *task)
memcpy(comm, task->comm, TASK_COMM_LEN);
}
-static void hist_trigger_elt_comm_free(struct tracing_map_elt *elt)
+static void hist_elt_data_free(struct hist_elt_data *elt_data)
{
- kfree((char *)elt->private_data);
+ unsigned int i;
+
+ for (i = 0; i < SYNTH_FIELDS_MAX; i++)
+ kfree(elt_data->field_var_str[i]);
+
+ kfree(elt_data->comm);
+ kfree(elt_data);
}
-static int hist_trigger_elt_comm_alloc(struct tracing_map_elt *elt)
+static void hist_trigger_elt_data_free(struct tracing_map_elt *elt)
+{
+ struct hist_elt_data *elt_data = elt->private_data;
+
+ hist_elt_data_free(elt_data);
+}
+
+static int hist_trigger_elt_data_alloc(struct tracing_map_elt *elt)
{
struct hist_trigger_data *hist_data = elt->map->private_data;
+ unsigned int size = TASK_COMM_LEN;
+ struct hist_elt_data *elt_data;
struct hist_field *key_field;
- unsigned int i;
+ unsigned int i, n_str;
+
+ elt_data = kzalloc(sizeof(*elt_data), GFP_KERNEL);
+ if (!elt_data)
+ return -ENOMEM;
for_each_hist_key_field(i, hist_data) {
key_field = hist_data->fields[i];
if (key_field->flags & HIST_FIELD_FL_EXECNAME) {
- unsigned int size = TASK_COMM_LEN + 1;
-
- elt->private_data = kzalloc(size, GFP_KERNEL);
- if (!elt->private_data)
+ elt_data->comm = kzalloc(size, GFP_KERNEL);
+ if (!elt_data->comm) {
+ kfree(elt_data);
return -ENOMEM;
+ }
break;
}
}
+ n_str = hist_data->n_field_var_str + hist_data->n_max_var_str;
+
+ size = STR_VAR_LEN_MAX;
+
+ for (i = 0; i < n_str; i++) {
+ elt_data->field_var_str[i] = kzalloc(size, GFP_KERNEL);
+ if (!elt_data->field_var_str[i]) {
+ hist_elt_data_free(elt_data);
+ return -ENOMEM;
+ }
+ }
+
+ elt->private_data = elt_data;
+
return 0;
}
-static void hist_trigger_elt_comm_copy(struct tracing_map_elt *to,
- struct tracing_map_elt *from)
+static void hist_trigger_elt_data_init(struct tracing_map_elt *elt)
+{
+ struct hist_elt_data *elt_data = elt->private_data;
+
+ if (elt_data->comm)
+ save_comm(elt_data->comm, current);
+}
+
+static const struct tracing_map_ops hist_trigger_elt_data_ops = {
+ .elt_alloc = hist_trigger_elt_data_alloc,
+ .elt_free = hist_trigger_elt_data_free,
+ .elt_init = hist_trigger_elt_data_init,
+};
+
+static const char *get_hist_field_flags(struct hist_field *hist_field)
+{
+ const char *flags_str = NULL;
+
+ if (hist_field->flags & HIST_FIELD_FL_HEX)
+ flags_str = "hex";
+ else if (hist_field->flags & HIST_FIELD_FL_SYM)
+ flags_str = "sym";
+ else if (hist_field->flags & HIST_FIELD_FL_SYM_OFFSET)
+ flags_str = "sym-offset";
+ else if (hist_field->flags & HIST_FIELD_FL_EXECNAME)
+ flags_str = "execname";
+ else if (hist_field->flags & HIST_FIELD_FL_SYSCALL)
+ flags_str = "syscall";
+ else if (hist_field->flags & HIST_FIELD_FL_LOG2)
+ flags_str = "log2";
+ else if (hist_field->flags & HIST_FIELD_FL_TIMESTAMP_USECS)
+ flags_str = "usecs";
+
+ return flags_str;
+}
+
+static void expr_field_str(struct hist_field *field, char *expr)
{
- char *comm_from = from->private_data;
- char *comm_to = to->private_data;
+ if (field->flags & HIST_FIELD_FL_VAR_REF)
+ strcat(expr, "$");
+
+ strcat(expr, hist_field_name(field, 0));
- if (comm_from)
- memcpy(comm_to, comm_from, TASK_COMM_LEN + 1);
+ if (field->flags && !(field->flags & HIST_FIELD_FL_VAR_REF)) {
+ const char *flags_str = get_hist_field_flags(field);
+
+ if (flags_str) {
+ strcat(expr, ".");
+ strcat(expr, flags_str);
+ }
+ }
}
-static void hist_trigger_elt_comm_init(struct tracing_map_elt *elt)
+static char *expr_str(struct hist_field *field, unsigned int level)
{
- char *comm = elt->private_data;
+ char *expr;
+
+ if (level > 1)
+ return NULL;
+
+ expr = kzalloc(MAX_FILTER_STR_VAL, GFP_KERNEL);
+ if (!expr)
+ return NULL;
+
+ if (!field->operands[0]) {
+ expr_field_str(field, expr);
+ return expr;
+ }
+
+ if (field->operator == FIELD_OP_UNARY_MINUS) {
+ char *subexpr;
- if (comm)
- save_comm(comm, current);
+ strcat(expr, "-(");
+ subexpr = expr_str(field->operands[0], ++level);
+ if (!subexpr) {
+ kfree(expr);
+ return NULL;
+ }
+ strcat(expr, subexpr);
+ strcat(expr, ")");
+
+ kfree(subexpr);
+
+ return expr;
+ }
+
+ expr_field_str(field->operands[0], expr);
+
+ switch (field->operator) {
+ case FIELD_OP_MINUS:
+ strcat(expr, "-");
+ break;
+ case FIELD_OP_PLUS:
+ strcat(expr, "+");
+ break;
+ default:
+ kfree(expr);
+ return NULL;
+ }
+
+ expr_field_str(field->operands[1], expr);
+
+ return expr;
}
-static const struct tracing_map_ops hist_trigger_elt_comm_ops = {
- .elt_alloc = hist_trigger_elt_comm_alloc,
- .elt_copy = hist_trigger_elt_comm_copy,
- .elt_free = hist_trigger_elt_comm_free,
- .elt_init = hist_trigger_elt_comm_init,
-};
+static int contains_operator(char *str)
+{
+ enum field_op_id field_op = FIELD_OP_NONE;
+ char *op;
+
+ op = strpbrk(str, "+-");
+ if (!op)
+ return FIELD_OP_NONE;
+
+ switch (*op) {
+ case '-':
+ if (*str == '-')
+ field_op = FIELD_OP_UNARY_MINUS;
+ else
+ field_op = FIELD_OP_MINUS;
+ break;
+ case '+':
+ field_op = FIELD_OP_PLUS;
+ break;
+ default:
+ break;
+ }
+
+ return field_op;
+}
static void destroy_hist_field(struct hist_field *hist_field,
unsigned int level)
{
unsigned int i;
- if (level > 2)
+ if (level > 3)
return;
if (!hist_field)
@@ -379,11 +2154,17 @@ static void destroy_hist_field(struct hist_field *hist_field,
for (i = 0; i < HIST_FIELD_OPERANDS_MAX; i++)
destroy_hist_field(hist_field->operands[i], level + 1);
+ kfree(hist_field->var.name);
+ kfree(hist_field->name);
+ kfree(hist_field->type);
+
kfree(hist_field);
}
-static struct hist_field *create_hist_field(struct ftrace_event_field *field,
- unsigned long flags)
+static struct hist_field *create_hist_field(struct hist_trigger_data *hist_data,
+ struct ftrace_event_field *field,
+ unsigned long flags,
+ char *var_name)
{
struct hist_field *hist_field;
@@ -394,8 +2175,22 @@ static struct hist_field *create_hist_field(struct ftrace_event_field *field,
if (!hist_field)
return NULL;
+ hist_field->hist_data = hist_data;
+
+ if (flags & HIST_FIELD_FL_EXPR || flags & HIST_FIELD_FL_ALIAS)
+ goto out; /* caller will populate */
+
+ if (flags & HIST_FIELD_FL_VAR_REF) {
+ hist_field->fn = hist_field_var_ref;
+ goto out;
+ }
+
if (flags & HIST_FIELD_FL_HITCOUNT) {
hist_field->fn = hist_field_counter;
+ hist_field->size = sizeof(u64);
+ hist_field->type = kstrdup("u64", GFP_KERNEL);
+ if (!hist_field->type)
+ goto free;
goto out;
}
@@ -407,8 +2202,29 @@ static struct hist_field *create_hist_field(struct ftrace_event_field *field,
if (flags & HIST_FIELD_FL_LOG2) {
unsigned long fl = flags & ~HIST_FIELD_FL_LOG2;
hist_field->fn = hist_field_log2;
- hist_field->operands[0] = create_hist_field(field, fl);
+ hist_field->operands[0] = create_hist_field(hist_data, field, fl, NULL);
hist_field->size = hist_field->operands[0]->size;
+ hist_field->type = kstrdup(hist_field->operands[0]->type, GFP_KERNEL);
+ if (!hist_field->type)
+ goto free;
+ goto out;
+ }
+
+ if (flags & HIST_FIELD_FL_TIMESTAMP) {
+ hist_field->fn = hist_field_timestamp;
+ hist_field->size = sizeof(u64);
+ hist_field->type = kstrdup("u64", GFP_KERNEL);
+ if (!hist_field->type)
+ goto free;
+ goto out;
+ }
+
+ if (flags & HIST_FIELD_FL_CPU) {
+ hist_field->fn = hist_field_cpu;
+ hist_field->size = sizeof(int);
+ hist_field->type = kstrdup("unsigned int", GFP_KERNEL);
+ if (!hist_field->type)
+ goto free;
goto out;
}
@@ -418,6 +2234,11 @@ static struct hist_field *create_hist_field(struct ftrace_event_field *field,
if (is_string_field(field)) {
flags |= HIST_FIELD_FL_STRING;
+ hist_field->size = MAX_FILTER_STR_VAL;
+ hist_field->type = kstrdup(field->type, GFP_KERNEL);
+ if (!hist_field->type)
+ goto free;
+
if (field->filter_type == FILTER_STATIC_STRING)
hist_field->fn = hist_field_string;
else if (field->filter_type == FILTER_DYN_STRING)
@@ -425,6 +2246,12 @@ static struct hist_field *create_hist_field(struct ftrace_event_field *field,
else
hist_field->fn = hist_field_pstring;
} else {
+ hist_field->size = field->size;
+ hist_field->is_signed = field->is_signed;
+ hist_field->type = kstrdup(field->type, GFP_KERNEL);
+ if (!hist_field->type)
+ goto free;
+
hist_field->fn = select_value_fn(field->size,
field->is_signed);
if (!hist_field->fn) {
@@ -436,14 +2263,23 @@ static struct hist_field *create_hist_field(struct ftrace_event_field *field,
hist_field->field = field;
hist_field->flags = flags;
+ if (var_name) {
+ hist_field->var.name = kstrdup(var_name, GFP_KERNEL);
+ if (!hist_field->var.name)
+ goto free;
+ }
+
return hist_field;
+ free:
+ destroy_hist_field(hist_field, 0);
+ return NULL;
}
static void destroy_hist_fields(struct hist_trigger_data *hist_data)
{
unsigned int i;
- for (i = 0; i < TRACING_MAP_FIELDS_MAX; i++) {
+ for (i = 0; i < HIST_FIELDS_MAX; i++) {
if (hist_data->fields[i]) {
destroy_hist_field(hist_data->fields[i], 0);
hist_data->fields[i] = NULL;
@@ -451,69 +2287,1612 @@ static void destroy_hist_fields(struct hist_trigger_data *hist_data)
}
}
-static int create_hitcount_val(struct hist_trigger_data *hist_data)
+static int init_var_ref(struct hist_field *ref_field,
+ struct hist_field *var_field,
+ char *system, char *event_name)
{
- hist_data->fields[HITCOUNT_IDX] =
- create_hist_field(NULL, HIST_FIELD_FL_HITCOUNT);
- if (!hist_data->fields[HITCOUNT_IDX])
- return -ENOMEM;
+ int err = 0;
+
+ ref_field->var.idx = var_field->var.idx;
+ ref_field->var.hist_data = var_field->hist_data;
+ ref_field->size = var_field->size;
+ ref_field->is_signed = var_field->is_signed;
+ ref_field->flags |= var_field->flags &
+ (HIST_FIELD_FL_TIMESTAMP | HIST_FIELD_FL_TIMESTAMP_USECS);
+
+ if (system) {
+ ref_field->system = kstrdup(system, GFP_KERNEL);
+ if (!ref_field->system)
+ return -ENOMEM;
+ }
- hist_data->n_vals++;
+ if (event_name) {
+ ref_field->event_name = kstrdup(event_name, GFP_KERNEL);
+ if (!ref_field->event_name) {
+ err = -ENOMEM;
+ goto free;
+ }
+ }
- if (WARN_ON(hist_data->n_vals > TRACING_MAP_VALS_MAX))
+ if (var_field->var.name) {
+ ref_field->name = kstrdup(var_field->var.name, GFP_KERNEL);
+ if (!ref_field->name) {
+ err = -ENOMEM;
+ goto free;
+ }
+ } else if (var_field->name) {
+ ref_field->name = kstrdup(var_field->name, GFP_KERNEL);
+ if (!ref_field->name) {
+ err = -ENOMEM;
+ goto free;
+ }
+ }
+
+ ref_field->type = kstrdup(var_field->type, GFP_KERNEL);
+ if (!ref_field->type) {
+ err = -ENOMEM;
+ goto free;
+ }
+ out:
+ return err;
+ free:
+ kfree(ref_field->system);
+ kfree(ref_field->event_name);
+ kfree(ref_field->name);
+
+ goto out;
+}
+
+static struct hist_field *create_var_ref(struct hist_field *var_field,
+ char *system, char *event_name)
+{
+ unsigned long flags = HIST_FIELD_FL_VAR_REF;
+ struct hist_field *ref_field;
+
+ ref_field = create_hist_field(var_field->hist_data, NULL, flags, NULL);
+ if (ref_field) {
+ if (init_var_ref(ref_field, var_field, system, event_name)) {
+ destroy_hist_field(ref_field, 0);
+ return NULL;
+ }
+ }
+
+ return ref_field;
+}
+
+static bool is_var_ref(char *var_name)
+{
+ if (!var_name || strlen(var_name) < 2 || var_name[0] != '$')
+ return false;
+
+ return true;
+}
+
+static char *field_name_from_var(struct hist_trigger_data *hist_data,
+ char *var_name)
+{
+ char *name, *field;
+ unsigned int i;
+
+ for (i = 0; i < hist_data->attrs->var_defs.n_vars; i++) {
+ name = hist_data->attrs->var_defs.name[i];
+
+ if (strcmp(var_name, name) == 0) {
+ field = hist_data->attrs->var_defs.expr[i];
+ if (contains_operator(field) || is_var_ref(field))
+ continue;
+ return field;
+ }
+ }
+
+ return NULL;
+}
+
+static char *local_field_var_ref(struct hist_trigger_data *hist_data,
+ char *system, char *event_name,
+ char *var_name)
+{
+ struct trace_event_call *call;
+
+ if (system && event_name) {
+ call = hist_data->event_file->event_call;
+
+ if (strcmp(system, call->class->system) != 0)
+ return NULL;
+
+ if (strcmp(event_name, trace_event_name(call)) != 0)
+ return NULL;
+ }
+
+ if (!!system != !!event_name)
+ return NULL;
+
+ if (!is_var_ref(var_name))
+ return NULL;
+
+ var_name++;
+
+ return field_name_from_var(hist_data, var_name);
+}
+
+static struct hist_field *parse_var_ref(struct hist_trigger_data *hist_data,
+ char *system, char *event_name,
+ char *var_name)
+{
+ struct hist_field *var_field = NULL, *ref_field = NULL;
+
+ if (!is_var_ref(var_name))
+ return NULL;
+
+ var_name++;
+
+ var_field = find_event_var(hist_data, system, event_name, var_name);
+ if (var_field)
+ ref_field = create_var_ref(var_field, system, event_name);
+
+ if (!ref_field)
+ hist_err_event("Couldn't find variable: $",
+ system, event_name, var_name);
+
+ return ref_field;
+}
+
+static struct ftrace_event_field *
+parse_field(struct hist_trigger_data *hist_data, struct trace_event_file *file,
+ char *field_str, unsigned long *flags)
+{
+ struct ftrace_event_field *field = NULL;
+ char *field_name, *modifier, *str;
+
+ modifier = str = kstrdup(field_str, GFP_KERNEL);
+ if (!modifier)
+ return ERR_PTR(-ENOMEM);
+
+ field_name = strsep(&modifier, ".");
+ if (modifier) {
+ if (strcmp(modifier, "hex") == 0)
+ *flags |= HIST_FIELD_FL_HEX;
+ else if (strcmp(modifier, "sym") == 0)
+ *flags |= HIST_FIELD_FL_SYM;
+ else if (strcmp(modifier, "sym-offset") == 0)
+ *flags |= HIST_FIELD_FL_SYM_OFFSET;
+ else if ((strcmp(modifier, "execname") == 0) &&
+ (strcmp(field_name, "common_pid") == 0))
+ *flags |= HIST_FIELD_FL_EXECNAME;
+ else if (strcmp(modifier, "syscall") == 0)
+ *flags |= HIST_FIELD_FL_SYSCALL;
+ else if (strcmp(modifier, "log2") == 0)
+ *flags |= HIST_FIELD_FL_LOG2;
+ else if (strcmp(modifier, "usecs") == 0)
+ *flags |= HIST_FIELD_FL_TIMESTAMP_USECS;
+ else {
+ hist_err("Invalid field modifier: ", modifier);
+ field = ERR_PTR(-EINVAL);
+ goto out;
+ }
+ }
+
+ if (strcmp(field_name, "common_timestamp") == 0) {
+ *flags |= HIST_FIELD_FL_TIMESTAMP;
+ hist_data->enable_timestamps = true;
+ if (*flags & HIST_FIELD_FL_TIMESTAMP_USECS)
+ hist_data->attrs->ts_in_usecs = true;
+ } else if (strcmp(field_name, "cpu") == 0)
+ *flags |= HIST_FIELD_FL_CPU;
+ else {
+ field = trace_find_event_field(file->event_call, field_name);
+ if (!field || !field->size) {
+ hist_err("Couldn't find field: ", field_name);
+ field = ERR_PTR(-EINVAL);
+ goto out;
+ }
+ }
+ out:
+ kfree(str);
+
+ return field;
+}
+
+static struct hist_field *create_alias(struct hist_trigger_data *hist_data,
+ struct hist_field *var_ref,
+ char *var_name)
+{
+ struct hist_field *alias = NULL;
+ unsigned long flags = HIST_FIELD_FL_ALIAS | HIST_FIELD_FL_VAR;
+
+ alias = create_hist_field(hist_data, NULL, flags, var_name);
+ if (!alias)
+ return NULL;
+
+ alias->fn = var_ref->fn;
+ alias->operands[0] = var_ref;
+
+ if (init_var_ref(alias, var_ref, var_ref->system, var_ref->event_name)) {
+ destroy_hist_field(alias, 0);
+ return NULL;
+ }
+
+ return alias;
+}
+
+static struct hist_field *parse_atom(struct hist_trigger_data *hist_data,
+ struct trace_event_file *file, char *str,
+ unsigned long *flags, char *var_name)
+{
+ char *s, *ref_system = NULL, *ref_event = NULL, *ref_var = str;
+ struct ftrace_event_field *field = NULL;
+ struct hist_field *hist_field = NULL;
+ int ret = 0;
+
+ s = strchr(str, '.');
+ if (s) {
+ s = strchr(++s, '.');
+ if (s) {
+ ref_system = strsep(&str, ".");
+ if (!str) {
+ ret = -EINVAL;
+ goto out;
+ }
+ ref_event = strsep(&str, ".");
+ if (!str) {
+ ret = -EINVAL;
+ goto out;
+ }
+ ref_var = str;
+ }
+ }
+
+ s = local_field_var_ref(hist_data, ref_system, ref_event, ref_var);
+ if (!s) {
+ hist_field = parse_var_ref(hist_data, ref_system, ref_event, ref_var);
+ if (hist_field) {
+ hist_data->var_refs[hist_data->n_var_refs] = hist_field;
+ hist_field->var_ref_idx = hist_data->n_var_refs++;
+ if (var_name) {
+ hist_field = create_alias(hist_data, hist_field, var_name);
+ if (!hist_field) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ }
+ return hist_field;
+ }
+ } else
+ str = s;
+
+ field = parse_field(hist_data, file, str, flags);
+ if (IS_ERR(field)) {
+ ret = PTR_ERR(field);
+ goto out;
+ }
+
+ hist_field = create_hist_field(hist_data, field, *flags, var_name);
+ if (!hist_field) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ return hist_field;
+ out:
+ return ERR_PTR(ret);
+}
+
+static struct hist_field *parse_expr(struct hist_trigger_data *hist_data,
+ struct trace_event_file *file,
+ char *str, unsigned long flags,
+ char *var_name, unsigned int level);
+
+static struct hist_field *parse_unary(struct hist_trigger_data *hist_data,
+ struct trace_event_file *file,
+ char *str, unsigned long flags,
+ char *var_name, unsigned int level)
+{
+ struct hist_field *operand1, *expr = NULL;
+ unsigned long operand_flags;
+ int ret = 0;
+ char *s;
+
+ /* we support only -(xxx) i.e. explicit parens required */
+
+ if (level > 3) {
+ hist_err("Too many subexpressions (3 max): ", str);
+ ret = -EINVAL;
+ goto free;
+ }
+
+ str++; /* skip leading '-' */
+
+ s = strchr(str, '(');
+ if (s)
+ str++;
+ else {
+ ret = -EINVAL;
+ goto free;
+ }
+
+ s = strrchr(str, ')');
+ if (s)
+ *s = '\0';
+ else {
+ ret = -EINVAL; /* no closing ')' */
+ goto free;
+ }
+
+ flags |= HIST_FIELD_FL_EXPR;
+ expr = create_hist_field(hist_data, NULL, flags, var_name);
+ if (!expr) {
+ ret = -ENOMEM;
+ goto free;
+ }
+
+ operand_flags = 0;
+ operand1 = parse_expr(hist_data, file, str, operand_flags, NULL, ++level);
+ if (IS_ERR(operand1)) {
+ ret = PTR_ERR(operand1);
+ goto free;
+ }
+
+ expr->flags |= operand1->flags &
+ (HIST_FIELD_FL_TIMESTAMP | HIST_FIELD_FL_TIMESTAMP_USECS);
+ expr->fn = hist_field_unary_minus;
+ expr->operands[0] = operand1;
+ expr->operator = FIELD_OP_UNARY_MINUS;
+ expr->name = expr_str(expr, 0);
+ expr->type = kstrdup(operand1->type, GFP_KERNEL);
+ if (!expr->type) {
+ ret = -ENOMEM;
+ goto free;
+ }
+
+ return expr;
+ free:
+ destroy_hist_field(expr, 0);
+ return ERR_PTR(ret);
+}
+
+static int check_expr_operands(struct hist_field *operand1,
+ struct hist_field *operand2)
+{
+ unsigned long operand1_flags = operand1->flags;
+ unsigned long operand2_flags = operand2->flags;
+
+ if ((operand1_flags & HIST_FIELD_FL_VAR_REF) ||
+ (operand1_flags & HIST_FIELD_FL_ALIAS)) {
+ struct hist_field *var;
+
+ var = find_var_field(operand1->var.hist_data, operand1->name);
+ if (!var)
+ return -EINVAL;
+ operand1_flags = var->flags;
+ }
+
+ if ((operand2_flags & HIST_FIELD_FL_VAR_REF) ||
+ (operand2_flags & HIST_FIELD_FL_ALIAS)) {
+ struct hist_field *var;
+
+ var = find_var_field(operand2->var.hist_data, operand2->name);
+ if (!var)
+ return -EINVAL;
+ operand2_flags = var->flags;
+ }
+
+ if ((operand1_flags & HIST_FIELD_FL_TIMESTAMP_USECS) !=
+ (operand2_flags & HIST_FIELD_FL_TIMESTAMP_USECS)) {
+ hist_err("Timestamp units in expression don't match", NULL);
return -EINVAL;
+ }
return 0;
}
-static int create_val_field(struct hist_trigger_data *hist_data,
- unsigned int val_idx,
- struct trace_event_file *file,
- char *field_str)
+static struct hist_field *parse_expr(struct hist_trigger_data *hist_data,
+ struct trace_event_file *file,
+ char *str, unsigned long flags,
+ char *var_name, unsigned int level)
{
- struct ftrace_event_field *field = NULL;
- unsigned long flags = 0;
- char *field_name;
+ struct hist_field *operand1 = NULL, *operand2 = NULL, *expr = NULL;
+ unsigned long operand_flags;
+ int field_op, ret = -EINVAL;
+ char *sep, *operand1_str;
+
+ if (level > 3) {
+ hist_err("Too many subexpressions (3 max): ", str);
+ return ERR_PTR(-EINVAL);
+ }
+
+ field_op = contains_operator(str);
+
+ if (field_op == FIELD_OP_NONE)
+ return parse_atom(hist_data, file, str, &flags, var_name);
+
+ if (field_op == FIELD_OP_UNARY_MINUS)
+ return parse_unary(hist_data, file, str, flags, var_name, ++level);
+
+ switch (field_op) {
+ case FIELD_OP_MINUS:
+ sep = "-";
+ break;
+ case FIELD_OP_PLUS:
+ sep = "+";
+ break;
+ default:
+ goto free;
+ }
+
+ operand1_str = strsep(&str, sep);
+ if (!operand1_str || !str)
+ goto free;
+
+ operand_flags = 0;
+ operand1 = parse_atom(hist_data, file, operand1_str,
+ &operand_flags, NULL);
+ if (IS_ERR(operand1)) {
+ ret = PTR_ERR(operand1);
+ operand1 = NULL;
+ goto free;
+ }
+
+ /* rest of string could be another expression e.g. b+c in a+b+c */
+ operand_flags = 0;
+ operand2 = parse_expr(hist_data, file, str, operand_flags, NULL, ++level);
+ if (IS_ERR(operand2)) {
+ ret = PTR_ERR(operand2);
+ operand2 = NULL;
+ goto free;
+ }
+
+ ret = check_expr_operands(operand1, operand2);
+ if (ret)
+ goto free;
+
+ flags |= HIST_FIELD_FL_EXPR;
+
+ flags |= operand1->flags &
+ (HIST_FIELD_FL_TIMESTAMP | HIST_FIELD_FL_TIMESTAMP_USECS);
+
+ expr = create_hist_field(hist_data, NULL, flags, var_name);
+ if (!expr) {
+ ret = -ENOMEM;
+ goto free;
+ }
+
+ operand1->read_once = true;
+ operand2->read_once = true;
+
+ expr->operands[0] = operand1;
+ expr->operands[1] = operand2;
+ expr->operator = field_op;
+ expr->name = expr_str(expr, 0);
+ expr->type = kstrdup(operand1->type, GFP_KERNEL);
+ if (!expr->type) {
+ ret = -ENOMEM;
+ goto free;
+ }
+
+ switch (field_op) {
+ case FIELD_OP_MINUS:
+ expr->fn = hist_field_minus;
+ break;
+ case FIELD_OP_PLUS:
+ expr->fn = hist_field_plus;
+ break;
+ default:
+ ret = -EINVAL;
+ goto free;
+ }
+
+ return expr;
+ free:
+ destroy_hist_field(operand1, 0);
+ destroy_hist_field(operand2, 0);
+ destroy_hist_field(expr, 0);
+
+ return ERR_PTR(ret);
+}
+
+static char *find_trigger_filter(struct hist_trigger_data *hist_data,
+ struct trace_event_file *file)
+{
+ struct event_trigger_data *test;
+
+ list_for_each_entry_rcu(test, &file->triggers, list) {
+ if (test->cmd_ops->trigger_type == ETT_EVENT_HIST) {
+ if (test->private_data == hist_data)
+ return test->filter_str;
+ }
+ }
+
+ return NULL;
+}
+
+static struct event_command trigger_hist_cmd;
+static int event_hist_trigger_func(struct event_command *cmd_ops,
+ struct trace_event_file *file,
+ char *glob, char *cmd, char *param);
+
+static bool compatible_keys(struct hist_trigger_data *target_hist_data,
+ struct hist_trigger_data *hist_data,
+ unsigned int n_keys)
+{
+ struct hist_field *target_hist_field, *hist_field;
+ unsigned int n, i, j;
+
+ if (hist_data->n_fields - hist_data->n_vals != n_keys)
+ return false;
+
+ i = hist_data->n_vals;
+ j = target_hist_data->n_vals;
+
+ for (n = 0; n < n_keys; n++) {
+ hist_field = hist_data->fields[i + n];
+ target_hist_field = target_hist_data->fields[j + n];
+
+ if (strcmp(hist_field->type, target_hist_field->type) != 0)
+ return false;
+ if (hist_field->size != target_hist_field->size)
+ return false;
+ if (hist_field->is_signed != target_hist_field->is_signed)
+ return false;
+ }
+
+ return true;
+}
+
+static struct hist_trigger_data *
+find_compatible_hist(struct hist_trigger_data *target_hist_data,
+ struct trace_event_file *file)
+{
+ struct hist_trigger_data *hist_data;
+ struct event_trigger_data *test;
+ unsigned int n_keys;
+
+ n_keys = target_hist_data->n_fields - target_hist_data->n_vals;
+
+ list_for_each_entry_rcu(test, &file->triggers, list) {
+ if (test->cmd_ops->trigger_type == ETT_EVENT_HIST) {
+ hist_data = test->private_data;
+
+ if (compatible_keys(target_hist_data, hist_data, n_keys))
+ return hist_data;
+ }
+ }
+
+ return NULL;
+}
+
+static struct trace_event_file *event_file(struct trace_array *tr,
+ char *system, char *event_name)
+{
+ struct trace_event_file *file;
+
+ file = __find_event_file(tr, system, event_name);
+ if (!file)
+ return ERR_PTR(-EINVAL);
+
+ return file;
+}
+
+static struct hist_field *
+find_synthetic_field_var(struct hist_trigger_data *target_hist_data,
+ char *system, char *event_name, char *field_name)
+{
+ struct hist_field *event_var;
+ char *synthetic_name;
+
+ synthetic_name = kzalloc(MAX_FILTER_STR_VAL, GFP_KERNEL);
+ if (!synthetic_name)
+ return ERR_PTR(-ENOMEM);
+
+ strcpy(synthetic_name, "synthetic_");
+ strcat(synthetic_name, field_name);
+
+ event_var = find_event_var(target_hist_data, system, event_name, synthetic_name);
+
+ kfree(synthetic_name);
+
+ return event_var;
+}
+
+/**
+ * create_field_var_hist - Automatically create a histogram and var for a field
+ * @target_hist_data: The target hist trigger
+ * @subsys_name: Optional subsystem name
+ * @event_name: Optional event name
+ * @field_name: The name of the field (and the resulting variable)
+ *
+ * Hist trigger actions fetch data from variables, not directly from
+ * events. However, for convenience, users are allowed to directly
+ * specify an event field in an action, which will be automatically
+ * converted into a variable on their behalf.
+
+ * If a user specifies a field on an event that isn't the event the
+ * histogram currently being defined (the target event histogram), the
+ * only way that can be accomplished is if a new hist trigger is
+ * created and the field variable defined on that.
+ *
+ * This function creates a new histogram compatible with the target
+ * event (meaning a histogram with the same key as the target
+ * histogram), and creates a variable for the specified field, but
+ * with 'synthetic_' prepended to the variable name in order to avoid
+ * collision with normal field variables.
+ *
+ * Return: The variable created for the field.
+ */
+static struct hist_field *
+create_field_var_hist(struct hist_trigger_data *target_hist_data,
+ char *subsys_name, char *event_name, char *field_name)
+{
+ struct trace_array *tr = target_hist_data->event_file->tr;
+ struct hist_field *event_var = ERR_PTR(-EINVAL);
+ struct hist_trigger_data *hist_data;
+ unsigned int i, n, first = true;
+ struct field_var_hist *var_hist;
+ struct trace_event_file *file;
+ struct hist_field *key_field;
+ char *saved_filter;
+ char *cmd;
+ int ret;
+
+ if (target_hist_data->n_field_var_hists >= SYNTH_FIELDS_MAX) {
+ hist_err_event("onmatch: Too many field variables defined: ",
+ subsys_name, event_name, field_name);
+ return ERR_PTR(-EINVAL);
+ }
+
+ file = event_file(tr, subsys_name, event_name);
+
+ if (IS_ERR(file)) {
+ hist_err_event("onmatch: Event file not found: ",
+ subsys_name, event_name, field_name);
+ ret = PTR_ERR(file);
+ return ERR_PTR(ret);
+ }
+
+ /*
+ * Look for a histogram compatible with target. We'll use the
+ * found histogram specification to create a new matching
+ * histogram with our variable on it. target_hist_data is not
+ * yet a registered histogram so we can't use that.
+ */
+ hist_data = find_compatible_hist(target_hist_data, file);
+ if (!hist_data) {
+ hist_err_event("onmatch: Matching event histogram not found: ",
+ subsys_name, event_name, field_name);
+ return ERR_PTR(-EINVAL);
+ }
+
+ /* See if a synthetic field variable has already been created */
+ event_var = find_synthetic_field_var(target_hist_data, subsys_name,
+ event_name, field_name);
+ if (!IS_ERR_OR_NULL(event_var))
+ return event_var;
+
+ var_hist = kzalloc(sizeof(*var_hist), GFP_KERNEL);
+ if (!var_hist)
+ return ERR_PTR(-ENOMEM);
+
+ cmd = kzalloc(MAX_FILTER_STR_VAL, GFP_KERNEL);
+ if (!cmd) {
+ kfree(var_hist);
+ return ERR_PTR(-ENOMEM);
+ }
+
+ /* Use the same keys as the compatible histogram */
+ strcat(cmd, "keys=");
+
+ for_each_hist_key_field(i, hist_data) {
+ key_field = hist_data->fields[i];
+ if (!first)
+ strcat(cmd, ",");
+ strcat(cmd, key_field->field->name);
+ first = false;
+ }
+
+ /* Create the synthetic field variable specification */
+ strcat(cmd, ":synthetic_");
+ strcat(cmd, field_name);
+ strcat(cmd, "=");
+ strcat(cmd, field_name);
+
+ /* Use the same filter as the compatible histogram */
+ saved_filter = find_trigger_filter(hist_data, file);
+ if (saved_filter) {
+ strcat(cmd, " if ");
+ strcat(cmd, saved_filter);
+ }
+
+ var_hist->cmd = kstrdup(cmd, GFP_KERNEL);
+ if (!var_hist->cmd) {
+ kfree(cmd);
+ kfree(var_hist);
+ return ERR_PTR(-ENOMEM);
+ }
+
+ /* Save the compatible histogram information */
+ var_hist->hist_data = hist_data;
+
+ /* Create the new histogram with our variable */
+ ret = event_hist_trigger_func(&trigger_hist_cmd, file,
+ "", "hist", cmd);
+ if (ret) {
+ kfree(cmd);
+ kfree(var_hist->cmd);
+ kfree(var_hist);
+ hist_err_event("onmatch: Couldn't create histogram for field: ",
+ subsys_name, event_name, field_name);
+ return ERR_PTR(ret);
+ }
+
+ kfree(cmd);
+
+ /* If we can't find the variable, something went wrong */
+ event_var = find_synthetic_field_var(target_hist_data, subsys_name,
+ event_name, field_name);
+ if (IS_ERR_OR_NULL(event_var)) {
+ kfree(var_hist->cmd);
+ kfree(var_hist);
+ hist_err_event("onmatch: Couldn't find synthetic variable: ",
+ subsys_name, event_name, field_name);
+ return ERR_PTR(-EINVAL);
+ }
+
+ n = target_hist_data->n_field_var_hists;
+ target_hist_data->field_var_hists[n] = var_hist;
+ target_hist_data->n_field_var_hists++;
+
+ return event_var;
+}
+
+static struct hist_field *
+find_target_event_var(struct hist_trigger_data *hist_data,
+ char *subsys_name, char *event_name, char *var_name)
+{
+ struct trace_event_file *file = hist_data->event_file;
+ struct hist_field *hist_field = NULL;
+
+ if (subsys_name) {
+ struct trace_event_call *call;
+
+ if (!event_name)
+ return NULL;
+
+ call = file->event_call;
+
+ if (strcmp(subsys_name, call->class->system) != 0)
+ return NULL;
+
+ if (strcmp(event_name, trace_event_name(call)) != 0)
+ return NULL;
+ }
+
+ hist_field = find_var_field(hist_data, var_name);
+
+ return hist_field;
+}
+
+static inline void __update_field_vars(struct tracing_map_elt *elt,
+ struct ring_buffer_event *rbe,
+ void *rec,
+ struct field_var **field_vars,
+ unsigned int n_field_vars,
+ unsigned int field_var_str_start)
+{
+ struct hist_elt_data *elt_data = elt->private_data;
+ unsigned int i, j, var_idx;
+ u64 var_val;
+
+ for (i = 0, j = field_var_str_start; i < n_field_vars; i++) {
+ struct field_var *field_var = field_vars[i];
+ struct hist_field *var = field_var->var;
+ struct hist_field *val = field_var->val;
+
+ var_val = val->fn(val, elt, rbe, rec);
+ var_idx = var->var.idx;
+
+ if (val->flags & HIST_FIELD_FL_STRING) {
+ char *str = elt_data->field_var_str[j++];
+ char *val_str = (char *)(uintptr_t)var_val;
+
+ strscpy(str, val_str, STR_VAR_LEN_MAX);
+ var_val = (u64)(uintptr_t)str;
+ }
+ tracing_map_set_var(elt, var_idx, var_val);
+ }
+}
+
+static void update_field_vars(struct hist_trigger_data *hist_data,
+ struct tracing_map_elt *elt,
+ struct ring_buffer_event *rbe,
+ void *rec)
+{
+ __update_field_vars(elt, rbe, rec, hist_data->field_vars,
+ hist_data->n_field_vars, 0);
+}
+
+static void update_max_vars(struct hist_trigger_data *hist_data,
+ struct tracing_map_elt *elt,
+ struct ring_buffer_event *rbe,
+ void *rec)
+{
+ __update_field_vars(elt, rbe, rec, hist_data->max_vars,
+ hist_data->n_max_vars, hist_data->n_field_var_str);
+}
+
+static struct hist_field *create_var(struct hist_trigger_data *hist_data,
+ struct trace_event_file *file,
+ char *name, int size, const char *type)
+{
+ struct hist_field *var;
+ int idx;
+
+ if (find_var(hist_data, file, name) && !hist_data->remove) {
+ var = ERR_PTR(-EINVAL);
+ goto out;
+ }
+
+ var = kzalloc(sizeof(struct hist_field), GFP_KERNEL);
+ if (!var) {
+ var = ERR_PTR(-ENOMEM);
+ goto out;
+ }
+
+ idx = tracing_map_add_var(hist_data->map);
+ if (idx < 0) {
+ kfree(var);
+ var = ERR_PTR(-EINVAL);
+ goto out;
+ }
+
+ var->flags = HIST_FIELD_FL_VAR;
+ var->var.idx = idx;
+ var->var.hist_data = var->hist_data = hist_data;
+ var->size = size;
+ var->var.name = kstrdup(name, GFP_KERNEL);
+ var->type = kstrdup(type, GFP_KERNEL);
+ if (!var->var.name || !var->type) {
+ kfree(var->var.name);
+ kfree(var->type);
+ kfree(var);
+ var = ERR_PTR(-ENOMEM);
+ }
+ out:
+ return var;
+}
+
+static struct field_var *create_field_var(struct hist_trigger_data *hist_data,
+ struct trace_event_file *file,
+ char *field_name)
+{
+ struct hist_field *val = NULL, *var = NULL;
+ unsigned long flags = HIST_FIELD_FL_VAR;
+ struct field_var *field_var;
int ret = 0;
- if (WARN_ON(val_idx >= TRACING_MAP_VALS_MAX))
+ if (hist_data->n_field_vars >= SYNTH_FIELDS_MAX) {
+ hist_err("Too many field variables defined: ", field_name);
+ ret = -EINVAL;
+ goto err;
+ }
+
+ val = parse_atom(hist_data, file, field_name, &flags, NULL);
+ if (IS_ERR(val)) {
+ hist_err("Couldn't parse field variable: ", field_name);
+ ret = PTR_ERR(val);
+ goto err;
+ }
+
+ var = create_var(hist_data, file, field_name, val->size, val->type);
+ if (IS_ERR(var)) {
+ hist_err("Couldn't create or find variable: ", field_name);
+ kfree(val);
+ ret = PTR_ERR(var);
+ goto err;
+ }
+
+ field_var = kzalloc(sizeof(struct field_var), GFP_KERNEL);
+ if (!field_var) {
+ kfree(val);
+ kfree(var);
+ ret = -ENOMEM;
+ goto err;
+ }
+
+ field_var->var = var;
+ field_var->val = val;
+ out:
+ return field_var;
+ err:
+ field_var = ERR_PTR(ret);
+ goto out;
+}
+
+/**
+ * create_target_field_var - Automatically create a variable for a field
+ * @target_hist_data: The target hist trigger
+ * @subsys_name: Optional subsystem name
+ * @event_name: Optional event name
+ * @var_name: The name of the field (and the resulting variable)
+ *
+ * Hist trigger actions fetch data from variables, not directly from
+ * events. However, for convenience, users are allowed to directly
+ * specify an event field in an action, which will be automatically
+ * converted into a variable on their behalf.
+
+ * This function creates a field variable with the name var_name on
+ * the hist trigger currently being defined on the target event. If
+ * subsys_name and event_name are specified, this function simply
+ * verifies that they do in fact match the target event subsystem and
+ * event name.
+ *
+ * Return: The variable created for the field.
+ */
+static struct field_var *
+create_target_field_var(struct hist_trigger_data *target_hist_data,
+ char *subsys_name, char *event_name, char *var_name)
+{
+ struct trace_event_file *file = target_hist_data->event_file;
+
+ if (subsys_name) {
+ struct trace_event_call *call;
+
+ if (!event_name)
+ return NULL;
+
+ call = file->event_call;
+
+ if (strcmp(subsys_name, call->class->system) != 0)
+ return NULL;
+
+ if (strcmp(event_name, trace_event_name(call)) != 0)
+ return NULL;
+ }
+
+ return create_field_var(target_hist_data, file, var_name);
+}
+
+static void onmax_print(struct seq_file *m,
+ struct hist_trigger_data *hist_data,
+ struct tracing_map_elt *elt,
+ struct action_data *data)
+{
+ unsigned int i, save_var_idx, max_idx = data->onmax.max_var->var.idx;
+
+ seq_printf(m, "\n\tmax: %10llu", tracing_map_read_var(elt, max_idx));
+
+ for (i = 0; i < hist_data->n_max_vars; i++) {
+ struct hist_field *save_val = hist_data->max_vars[i]->val;
+ struct hist_field *save_var = hist_data->max_vars[i]->var;
+ u64 val;
+
+ save_var_idx = save_var->var.idx;
+
+ val = tracing_map_read_var(elt, save_var_idx);
+
+ if (save_val->flags & HIST_FIELD_FL_STRING) {
+ seq_printf(m, " %s: %-32s", save_var->var.name,
+ (char *)(uintptr_t)(val));
+ } else
+ seq_printf(m, " %s: %10llu", save_var->var.name, val);
+ }
+}
+
+static void onmax_save(struct hist_trigger_data *hist_data,
+ struct tracing_map_elt *elt, void *rec,
+ struct ring_buffer_event *rbe,
+ struct action_data *data, u64 *var_ref_vals)
+{
+ unsigned int max_idx = data->onmax.max_var->var.idx;
+ unsigned int max_var_ref_idx = data->onmax.max_var_ref_idx;
+
+ u64 var_val, max_val;
+
+ var_val = var_ref_vals[max_var_ref_idx];
+ max_val = tracing_map_read_var(elt, max_idx);
+
+ if (var_val <= max_val)
+ return;
+
+ tracing_map_set_var(elt, max_idx, var_val);
+
+ update_max_vars(hist_data, elt, rbe, rec);
+}
+
+static void onmax_destroy(struct action_data *data)
+{
+ unsigned int i;
+
+ destroy_hist_field(data->onmax.max_var, 0);
+ destroy_hist_field(data->onmax.var, 0);
+
+ kfree(data->onmax.var_str);
+ kfree(data->onmax.fn_name);
+
+ for (i = 0; i < data->n_params; i++)
+ kfree(data->params[i]);
+
+ kfree(data);
+}
+
+static int onmax_create(struct hist_trigger_data *hist_data,
+ struct action_data *data)
+{
+ struct trace_event_file *file = hist_data->event_file;
+ struct hist_field *var_field, *ref_field, *max_var;
+ unsigned int var_ref_idx = hist_data->n_var_refs;
+ struct field_var *field_var;
+ char *onmax_var_str, *param;
+ unsigned long flags;
+ unsigned int i;
+ int ret = 0;
+
+ onmax_var_str = data->onmax.var_str;
+ if (onmax_var_str[0] != '$') {
+ hist_err("onmax: For onmax(x), x must be a variable: ", onmax_var_str);
return -EINVAL;
+ }
+ onmax_var_str++;
- field_name = strsep(&field_str, ".");
- if (field_str) {
- if (strcmp(field_str, "hex") == 0)
- flags |= HIST_FIELD_FL_HEX;
- else {
+ var_field = find_target_event_var(hist_data, NULL, NULL, onmax_var_str);
+ if (!var_field) {
+ hist_err("onmax: Couldn't find onmax variable: ", onmax_var_str);
+ return -EINVAL;
+ }
+
+ flags = HIST_FIELD_FL_VAR_REF;
+ ref_field = create_hist_field(hist_data, NULL, flags, NULL);
+ if (!ref_field)
+ return -ENOMEM;
+
+ if (init_var_ref(ref_field, var_field, NULL, NULL)) {
+ destroy_hist_field(ref_field, 0);
+ ret = -ENOMEM;
+ goto out;
+ }
+ hist_data->var_refs[hist_data->n_var_refs] = ref_field;
+ ref_field->var_ref_idx = hist_data->n_var_refs++;
+ data->onmax.var = ref_field;
+
+ data->fn = onmax_save;
+ data->onmax.max_var_ref_idx = var_ref_idx;
+ max_var = create_var(hist_data, file, "max", sizeof(u64), "u64");
+ if (IS_ERR(max_var)) {
+ hist_err("onmax: Couldn't create onmax variable: ", "max");
+ ret = PTR_ERR(max_var);
+ goto out;
+ }
+ data->onmax.max_var = max_var;
+
+ for (i = 0; i < data->n_params; i++) {
+ param = kstrdup(data->params[i], GFP_KERNEL);
+ if (!param) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ field_var = create_target_field_var(hist_data, NULL, NULL, param);
+ if (IS_ERR(field_var)) {
+ hist_err("onmax: Couldn't create field variable: ", param);
+ ret = PTR_ERR(field_var);
+ kfree(param);
+ goto out;
+ }
+
+ hist_data->max_vars[hist_data->n_max_vars++] = field_var;
+ if (field_var->val->flags & HIST_FIELD_FL_STRING)
+ hist_data->n_max_var_str++;
+
+ kfree(param);
+ }
+ out:
+ return ret;
+}
+
+static int parse_action_params(char *params, struct action_data *data)
+{
+ char *param, *saved_param;
+ int ret = 0;
+
+ while (params) {
+ if (data->n_params >= SYNTH_FIELDS_MAX)
+ goto out;
+
+ param = strsep(&params, ",");
+ if (!param) {
ret = -EINVAL;
goto out;
}
+
+ param = strstrip(param);
+ if (strlen(param) < 2) {
+ hist_err("Invalid action param: ", param);
+ ret = -EINVAL;
+ goto out;
+ }
+
+ saved_param = kstrdup(param, GFP_KERNEL);
+ if (!saved_param) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ data->params[data->n_params++] = saved_param;
}
+ out:
+ return ret;
+}
- field = trace_find_event_field(file->event_call, field_name);
- if (!field || !field->size) {
+static struct action_data *onmax_parse(char *str)
+{
+ char *onmax_fn_name, *onmax_var_str;
+ struct action_data *data;
+ int ret = -EINVAL;
+
+ data = kzalloc(sizeof(*data), GFP_KERNEL);
+ if (!data)
+ return ERR_PTR(-ENOMEM);
+
+ onmax_var_str = strsep(&str, ")");
+ if (!onmax_var_str || !str) {
ret = -EINVAL;
- goto out;
+ goto free;
+ }
+
+ data->onmax.var_str = kstrdup(onmax_var_str, GFP_KERNEL);
+ if (!data->onmax.var_str) {
+ ret = -ENOMEM;
+ goto free;
+ }
+
+ strsep(&str, ".");
+ if (!str)
+ goto free;
+
+ onmax_fn_name = strsep(&str, "(");
+ if (!onmax_fn_name || !str)
+ goto free;
+
+ if (strncmp(onmax_fn_name, "save", strlen("save")) == 0) {
+ char *params = strsep(&str, ")");
+
+ if (!params) {
+ ret = -EINVAL;
+ goto free;
+ }
+
+ ret = parse_action_params(params, data);
+ if (ret)
+ goto free;
+ } else
+ goto free;
+
+ data->onmax.fn_name = kstrdup(onmax_fn_name, GFP_KERNEL);
+ if (!data->onmax.fn_name) {
+ ret = -ENOMEM;
+ goto free;
+ }
+ out:
+ return data;
+ free:
+ onmax_destroy(data);
+ data = ERR_PTR(ret);
+ goto out;
+}
+
+static void onmatch_destroy(struct action_data *data)
+{
+ unsigned int i;
+
+ mutex_lock(&synth_event_mutex);
+
+ kfree(data->onmatch.match_event);
+ kfree(data->onmatch.match_event_system);
+ kfree(data->onmatch.synth_event_name);
+
+ for (i = 0; i < data->n_params; i++)
+ kfree(data->params[i]);
+
+ if (data->onmatch.synth_event)
+ data->onmatch.synth_event->ref--;
+
+ kfree(data);
+
+ mutex_unlock(&synth_event_mutex);
+}
+
+static void destroy_field_var(struct field_var *field_var)
+{
+ if (!field_var)
+ return;
+
+ destroy_hist_field(field_var->var, 0);
+ destroy_hist_field(field_var->val, 0);
+
+ kfree(field_var);
+}
+
+static void destroy_field_vars(struct hist_trigger_data *hist_data)
+{
+ unsigned int i;
+
+ for (i = 0; i < hist_data->n_field_vars; i++)
+ destroy_field_var(hist_data->field_vars[i]);
+}
+
+static void save_field_var(struct hist_trigger_data *hist_data,
+ struct field_var *field_var)
+{
+ hist_data->field_vars[hist_data->n_field_vars++] = field_var;
+
+ if (field_var->val->flags & HIST_FIELD_FL_STRING)
+ hist_data->n_field_var_str++;
+}
+
+
+static void destroy_synth_var_refs(struct hist_trigger_data *hist_data)
+{
+ unsigned int i;
+
+ for (i = 0; i < hist_data->n_synth_var_refs; i++)
+ destroy_hist_field(hist_data->synth_var_refs[i], 0);
+}
+
+static void save_synth_var_ref(struct hist_trigger_data *hist_data,
+ struct hist_field *var_ref)
+{
+ hist_data->synth_var_refs[hist_data->n_synth_var_refs++] = var_ref;
+
+ hist_data->var_refs[hist_data->n_var_refs] = var_ref;
+ var_ref->var_ref_idx = hist_data->n_var_refs++;
+}
+
+static int check_synth_field(struct synth_event *event,
+ struct hist_field *hist_field,
+ unsigned int field_pos)
+{
+ struct synth_field *field;
+
+ if (field_pos >= event->n_fields)
+ return -EINVAL;
+
+ field = event->fields[field_pos];
+
+ if (strcmp(field->type, hist_field->type) != 0)
+ return -EINVAL;
+
+ return 0;
+}
+
+static struct hist_field *
+onmatch_find_var(struct hist_trigger_data *hist_data, struct action_data *data,
+ char *system, char *event, char *var)
+{
+ struct hist_field *hist_field;
+
+ var++; /* skip '$' */
+
+ hist_field = find_target_event_var(hist_data, system, event, var);
+ if (!hist_field) {
+ if (!system) {
+ system = data->onmatch.match_event_system;
+ event = data->onmatch.match_event;
+ }
+
+ hist_field = find_event_var(hist_data, system, event, var);
+ }
+
+ if (!hist_field)
+ hist_err_event("onmatch: Couldn't find onmatch param: $", system, event, var);
+
+ return hist_field;
+}
+
+static struct hist_field *
+onmatch_create_field_var(struct hist_trigger_data *hist_data,
+ struct action_data *data, char *system,
+ char *event, char *var)
+{
+ struct hist_field *hist_field = NULL;
+ struct field_var *field_var;
+
+ /*
+ * First try to create a field var on the target event (the
+ * currently being defined). This will create a variable for
+ * unqualified fields on the target event, or if qualified,
+ * target fields that have qualified names matching the target.
+ */
+ field_var = create_target_field_var(hist_data, system, event, var);
+
+ if (field_var && !IS_ERR(field_var)) {
+ save_field_var(hist_data, field_var);
+ hist_field = field_var->var;
+ } else {
+ field_var = NULL;
+ /*
+ * If no explicit system.event is specfied, default to
+ * looking for fields on the onmatch(system.event.xxx)
+ * event.
+ */
+ if (!system) {
+ system = data->onmatch.match_event_system;
+ event = data->onmatch.match_event;
+ }
+
+ /*
+ * At this point, we're looking at a field on another
+ * event. Because we can't modify a hist trigger on
+ * another event to add a variable for a field, we need
+ * to create a new trigger on that event and create the
+ * variable at the same time.
+ */
+ hist_field = create_field_var_hist(hist_data, system, event, var);
+ if (IS_ERR(hist_field))
+ goto free;
+ }
+ out:
+ return hist_field;
+ free:
+ destroy_field_var(field_var);
+ hist_field = NULL;
+ goto out;
+}
+
+static int onmatch_create(struct hist_trigger_data *hist_data,
+ struct trace_event_file *file,
+ struct action_data *data)
+{
+ char *event_name, *param, *system = NULL;
+ struct hist_field *hist_field, *var_ref;
+ unsigned int i, var_ref_idx;
+ unsigned int field_pos = 0;
+ struct synth_event *event;
+ int ret = 0;
+
+ mutex_lock(&synth_event_mutex);
+ event = find_synth_event(data->onmatch.synth_event_name);
+ if (!event) {
+ hist_err("onmatch: Couldn't find synthetic event: ", data->onmatch.synth_event_name);
+ mutex_unlock(&synth_event_mutex);
+ return -EINVAL;
+ }
+ event->ref++;
+ mutex_unlock(&synth_event_mutex);
+
+ var_ref_idx = hist_data->n_var_refs;
+
+ for (i = 0; i < data->n_params; i++) {
+ char *p;
+
+ p = param = kstrdup(data->params[i], GFP_KERNEL);
+ if (!param) {
+ ret = -ENOMEM;
+ goto err;
+ }
+
+ system = strsep(&param, ".");
+ if (!param) {
+ param = (char *)system;
+ system = event_name = NULL;
+ } else {
+ event_name = strsep(&param, ".");
+ if (!param) {
+ kfree(p);
+ ret = -EINVAL;
+ goto err;
+ }
+ }
+
+ if (param[0] == '$')
+ hist_field = onmatch_find_var(hist_data, data, system,
+ event_name, param);
+ else
+ hist_field = onmatch_create_field_var(hist_data, data,
+ system,
+ event_name,
+ param);
+
+ if (!hist_field) {
+ kfree(p);
+ ret = -EINVAL;
+ goto err;
+ }
+
+ if (check_synth_field(event, hist_field, field_pos) == 0) {
+ var_ref = create_var_ref(hist_field, system, event_name);
+ if (!var_ref) {
+ kfree(p);
+ ret = -ENOMEM;
+ goto err;
+ }
+
+ save_synth_var_ref(hist_data, var_ref);
+ field_pos++;
+ kfree(p);
+ continue;
+ }
+
+ hist_err_event("onmatch: Param type doesn't match synthetic event field type: ",
+ system, event_name, param);
+ kfree(p);
+ ret = -EINVAL;
+ goto err;
+ }
+
+ if (field_pos != event->n_fields) {
+ hist_err("onmatch: Param count doesn't match synthetic event field count: ", event->name);
+ ret = -EINVAL;
+ goto err;
+ }
+
+ data->fn = action_trace;
+ data->onmatch.synth_event = event;
+ data->onmatch.var_ref_idx = var_ref_idx;
+ out:
+ return ret;
+ err:
+ mutex_lock(&synth_event_mutex);
+ event->ref--;
+ mutex_unlock(&synth_event_mutex);
+
+ goto out;
+}
+
+static struct action_data *onmatch_parse(struct trace_array *tr, char *str)
+{
+ char *match_event, *match_event_system;
+ char *synth_event_name, *params;
+ struct action_data *data;
+ int ret = -EINVAL;
+
+ data = kzalloc(sizeof(*data), GFP_KERNEL);
+ if (!data)
+ return ERR_PTR(-ENOMEM);
+
+ match_event = strsep(&str, ")");
+ if (!match_event || !str) {
+ hist_err("onmatch: Missing closing paren: ", match_event);
+ goto free;
+ }
+
+ match_event_system = strsep(&match_event, ".");
+ if (!match_event) {
+ hist_err("onmatch: Missing subsystem for match event: ", match_event_system);
+ goto free;
+ }
+
+ if (IS_ERR(event_file(tr, match_event_system, match_event))) {
+ hist_err_event("onmatch: Invalid subsystem or event name: ",
+ match_event_system, match_event, NULL);
+ goto free;
+ }
+
+ data->onmatch.match_event = kstrdup(match_event, GFP_KERNEL);
+ if (!data->onmatch.match_event) {
+ ret = -ENOMEM;
+ goto free;
+ }
+
+ data->onmatch.match_event_system = kstrdup(match_event_system, GFP_KERNEL);
+ if (!data->onmatch.match_event_system) {
+ ret = -ENOMEM;
+ goto free;
+ }
+
+ strsep(&str, ".");
+ if (!str) {
+ hist_err("onmatch: Missing . after onmatch(): ", str);
+ goto free;
+ }
+
+ synth_event_name = strsep(&str, "(");
+ if (!synth_event_name || !str) {
+ hist_err("onmatch: Missing opening paramlist paren: ", synth_event_name);
+ goto free;
}
- hist_data->fields[val_idx] = create_hist_field(field, flags);
- if (!hist_data->fields[val_idx]) {
+ data->onmatch.synth_event_name = kstrdup(synth_event_name, GFP_KERNEL);
+ if (!data->onmatch.synth_event_name) {
ret = -ENOMEM;
+ goto free;
+ }
+
+ params = strsep(&str, ")");
+ if (!params || !str || (str && strlen(str))) {
+ hist_err("onmatch: Missing closing paramlist paren: ", params);
+ goto free;
+ }
+
+ ret = parse_action_params(params, data);
+ if (ret)
+ goto free;
+ out:
+ return data;
+ free:
+ onmatch_destroy(data);
+ data = ERR_PTR(ret);
+ goto out;
+}
+
+static int create_hitcount_val(struct hist_trigger_data *hist_data)
+{
+ hist_data->fields[HITCOUNT_IDX] =
+ create_hist_field(hist_data, NULL, HIST_FIELD_FL_HITCOUNT, NULL);
+ if (!hist_data->fields[HITCOUNT_IDX])
+ return -ENOMEM;
+
+ hist_data->n_vals++;
+ hist_data->n_fields++;
+
+ if (WARN_ON(hist_data->n_vals > TRACING_MAP_VALS_MAX))
+ return -EINVAL;
+
+ return 0;
+}
+
+static int __create_val_field(struct hist_trigger_data *hist_data,
+ unsigned int val_idx,
+ struct trace_event_file *file,
+ char *var_name, char *field_str,
+ unsigned long flags)
+{
+ struct hist_field *hist_field;
+ int ret = 0;
+
+ hist_field = parse_expr(hist_data, file, field_str, flags, var_name, 0);
+ if (IS_ERR(hist_field)) {
+ ret = PTR_ERR(hist_field);
goto out;
}
+ hist_data->fields[val_idx] = hist_field;
+
++hist_data->n_vals;
+ ++hist_data->n_fields;
- if (WARN_ON(hist_data->n_vals > TRACING_MAP_VALS_MAX))
+ if (WARN_ON(hist_data->n_vals > TRACING_MAP_VALS_MAX + TRACING_MAP_VARS_MAX))
ret = -EINVAL;
out:
return ret;
}
+static int create_val_field(struct hist_trigger_data *hist_data,
+ unsigned int val_idx,
+ struct trace_event_file *file,
+ char *field_str)
+{
+ if (WARN_ON(val_idx >= TRACING_MAP_VALS_MAX))
+ return -EINVAL;
+
+ return __create_val_field(hist_data, val_idx, file, NULL, field_str, 0);
+}
+
+static int create_var_field(struct hist_trigger_data *hist_data,
+ unsigned int val_idx,
+ struct trace_event_file *file,
+ char *var_name, char *expr_str)
+{
+ unsigned long flags = 0;
+
+ if (WARN_ON(val_idx >= TRACING_MAP_VALS_MAX + TRACING_MAP_VARS_MAX))
+ return -EINVAL;
+
+ if (find_var(hist_data, file, var_name) && !hist_data->remove) {
+ hist_err("Variable already defined: ", var_name);
+ return -EINVAL;
+ }
+
+ flags |= HIST_FIELD_FL_VAR;
+ hist_data->n_vars++;
+ if (WARN_ON(hist_data->n_vars > TRACING_MAP_VARS_MAX))
+ return -EINVAL;
+
+ return __create_val_field(hist_data, val_idx, file, var_name, expr_str, flags);
+}
+
static int create_val_fields(struct hist_trigger_data *hist_data,
struct trace_event_file *file)
{
char *fields_str, *field_str;
- unsigned int i, j;
+ unsigned int i, j = 1;
int ret;
ret = create_hitcount_val(hist_data);
@@ -533,12 +3912,15 @@ static int create_val_fields(struct hist_trigger_data *hist_data,
field_str = strsep(&fields_str, ",");
if (!field_str)
break;
+
if (strcmp(field_str, "hitcount") == 0)
continue;
+
ret = create_val_field(hist_data, j++, file, field_str);
if (ret)
goto out;
}
+
if (fields_str && (strcmp(fields_str, "hitcount") != 0))
ret = -EINVAL;
out:
@@ -551,12 +3933,13 @@ static int create_key_field(struct hist_trigger_data *hist_data,
struct trace_event_file *file,
char *field_str)
{
- struct ftrace_event_field *field = NULL;
+ struct hist_field *hist_field = NULL;
+
unsigned long flags = 0;
unsigned int key_size;
int ret = 0;
- if (WARN_ON(key_idx >= TRACING_MAP_FIELDS_MAX))
+ if (WARN_ON(key_idx >= HIST_FIELDS_MAX))
return -EINVAL;
flags |= HIST_FIELD_FL_KEY;
@@ -564,57 +3947,40 @@ static int create_key_field(struct hist_trigger_data *hist_data,
if (strcmp(field_str, "stacktrace") == 0) {
flags |= HIST_FIELD_FL_STACKTRACE;
key_size = sizeof(unsigned long) * HIST_STACKTRACE_DEPTH;
+ hist_field = create_hist_field(hist_data, NULL, flags, NULL);
} else {
- char *field_name = strsep(&field_str, ".");
-
- if (field_str) {
- if (strcmp(field_str, "hex") == 0)
- flags |= HIST_FIELD_FL_HEX;
- else if (strcmp(field_str, "sym") == 0)
- flags |= HIST_FIELD_FL_SYM;
- else if (strcmp(field_str, "sym-offset") == 0)
- flags |= HIST_FIELD_FL_SYM_OFFSET;
- else if ((strcmp(field_str, "execname") == 0) &&
- (strcmp(field_name, "common_pid") == 0))
- flags |= HIST_FIELD_FL_EXECNAME;
- else if (strcmp(field_str, "syscall") == 0)
- flags |= HIST_FIELD_FL_SYSCALL;
- else if (strcmp(field_str, "log2") == 0)
- flags |= HIST_FIELD_FL_LOG2;
- else {
- ret = -EINVAL;
- goto out;
- }
+ hist_field = parse_expr(hist_data, file, field_str, flags,
+ NULL, 0);
+ if (IS_ERR(hist_field)) {
+ ret = PTR_ERR(hist_field);
+ goto out;
}
- field = trace_find_event_field(file->event_call, field_name);
- if (!field || !field->size) {
+ if (hist_field->flags & HIST_FIELD_FL_VAR_REF) {
+ hist_err("Using variable references as keys not supported: ", field_str);
+ destroy_hist_field(hist_field, 0);
ret = -EINVAL;
goto out;
}
- if (is_string_field(field))
- key_size = MAX_FILTER_STR_VAL;
- else
- key_size = field->size;
+ key_size = hist_field->size;
}
- hist_data->fields[key_idx] = create_hist_field(field, flags);
- if (!hist_data->fields[key_idx]) {
- ret = -ENOMEM;
- goto out;
- }
+ hist_data->fields[key_idx] = hist_field;
key_size = ALIGN(key_size, sizeof(u64));
hist_data->fields[key_idx]->size = key_size;
hist_data->fields[key_idx]->offset = key_offset;
+
hist_data->key_size += key_size;
+
if (hist_data->key_size > HIST_KEY_SIZE_MAX) {
ret = -EINVAL;
goto out;
}
hist_data->n_keys++;
+ hist_data->n_fields++;
if (WARN_ON(hist_data->n_keys > TRACING_MAP_KEYS_MAX))
return -EINVAL;
@@ -658,21 +4024,113 @@ static int create_key_fields(struct hist_trigger_data *hist_data,
return ret;
}
+static int create_var_fields(struct hist_trigger_data *hist_data,
+ struct trace_event_file *file)
+{
+ unsigned int i, j = hist_data->n_vals;
+ int ret = 0;
+
+ unsigned int n_vars = hist_data->attrs->var_defs.n_vars;
+
+ for (i = 0; i < n_vars; i++) {
+ char *var_name = hist_data->attrs->var_defs.name[i];
+ char *expr = hist_data->attrs->var_defs.expr[i];
+
+ ret = create_var_field(hist_data, j++, file, var_name, expr);
+ if (ret)
+ goto out;
+ }
+ out:
+ return ret;
+}
+
+static void free_var_defs(struct hist_trigger_data *hist_data)
+{
+ unsigned int i;
+
+ for (i = 0; i < hist_data->attrs->var_defs.n_vars; i++) {
+ kfree(hist_data->attrs->var_defs.name[i]);
+ kfree(hist_data->attrs->var_defs.expr[i]);
+ }
+
+ hist_data->attrs->var_defs.n_vars = 0;
+}
+
+static int parse_var_defs(struct hist_trigger_data *hist_data)
+{
+ char *s, *str, *var_name, *field_str;
+ unsigned int i, j, n_vars = 0;
+ int ret = 0;
+
+ for (i = 0; i < hist_data->attrs->n_assignments; i++) {
+ str = hist_data->attrs->assignment_str[i];
+ for (j = 0; j < TRACING_MAP_VARS_MAX; j++) {
+ field_str = strsep(&str, ",");
+ if (!field_str)
+ break;
+
+ var_name = strsep(&field_str, "=");
+ if (!var_name || !field_str) {
+ hist_err("Malformed assignment: ", var_name);
+ ret = -EINVAL;
+ goto free;
+ }
+
+ if (n_vars == TRACING_MAP_VARS_MAX) {
+ hist_err("Too many variables defined: ", var_name);
+ ret = -EINVAL;
+ goto free;
+ }
+
+ s = kstrdup(var_name, GFP_KERNEL);
+ if (!s) {
+ ret = -ENOMEM;
+ goto free;
+ }
+ hist_data->attrs->var_defs.name[n_vars] = s;
+
+ s = kstrdup(field_str, GFP_KERNEL);
+ if (!s) {
+ kfree(hist_data->attrs->var_defs.name[n_vars]);
+ ret = -ENOMEM;
+ goto free;
+ }
+ hist_data->attrs->var_defs.expr[n_vars++] = s;
+
+ hist_data->attrs->var_defs.n_vars = n_vars;
+ }
+ }
+
+ return ret;
+ free:
+ free_var_defs(hist_data);
+
+ return ret;
+}
+
static int create_hist_fields(struct hist_trigger_data *hist_data,
struct trace_event_file *file)
{
int ret;
+ ret = parse_var_defs(hist_data);
+ if (ret)
+ goto out;
+
ret = create_val_fields(hist_data, file);
if (ret)
goto out;
- ret = create_key_fields(hist_data, file);
+ ret = create_var_fields(hist_data, file);
if (ret)
goto out;
- hist_data->n_fields = hist_data->n_vals + hist_data->n_keys;
+ ret = create_key_fields(hist_data, file);
+ if (ret)
+ goto out;
out:
+ free_var_defs(hist_data);
+
return ret;
}
@@ -695,7 +4153,7 @@ static int create_sort_keys(struct hist_trigger_data *hist_data)
char *fields_str = hist_data->attrs->sort_key_str;
struct tracing_map_sort_key *sort_key;
int descending, ret = 0;
- unsigned int i, j;
+ unsigned int i, j, k;
hist_data->n_sort_keys = 1; /* we always have at least one, hitcount */
@@ -743,12 +4201,19 @@ static int create_sort_keys(struct hist_trigger_data *hist_data)
continue;
}
- for (j = 1; j < hist_data->n_fields; j++) {
+ for (j = 1, k = 1; j < hist_data->n_fields; j++) {
+ unsigned int idx;
+
hist_field = hist_data->fields[j];
+ if (hist_field->flags & HIST_FIELD_FL_VAR)
+ continue;
+
+ idx = k++;
+
test_name = hist_field_name(hist_field, 0);
if (strcmp(field_name, test_name) == 0) {
- sort_key->field_idx = j;
+ sort_key->field_idx = idx;
descending = is_descending(field_str);
if (descending < 0) {
ret = descending;
@@ -763,16 +4228,230 @@ static int create_sort_keys(struct hist_trigger_data *hist_data)
break;
}
}
+
hist_data->n_sort_keys = i;
out:
return ret;
}
+static void destroy_actions(struct hist_trigger_data *hist_data)
+{
+ unsigned int i;
+
+ for (i = 0; i < hist_data->n_actions; i++) {
+ struct action_data *data = hist_data->actions[i];
+
+ if (data->fn == action_trace)
+ onmatch_destroy(data);
+ else if (data->fn == onmax_save)
+ onmax_destroy(data);
+ else
+ kfree(data);
+ }
+}
+
+static int parse_actions(struct hist_trigger_data *hist_data)
+{
+ struct trace_array *tr = hist_data->event_file->tr;
+ struct action_data *data;
+ unsigned int i;
+ int ret = 0;
+ char *str;
+
+ for (i = 0; i < hist_data->attrs->n_actions; i++) {
+ str = hist_data->attrs->action_str[i];
+
+ if (strncmp(str, "onmatch(", strlen("onmatch(")) == 0) {
+ char *action_str = str + strlen("onmatch(");
+
+ data = onmatch_parse(tr, action_str);
+ if (IS_ERR(data)) {
+ ret = PTR_ERR(data);
+ break;
+ }
+ data->fn = action_trace;
+ } else if (strncmp(str, "onmax(", strlen("onmax(")) == 0) {
+ char *action_str = str + strlen("onmax(");
+
+ data = onmax_parse(action_str);
+ if (IS_ERR(data)) {
+ ret = PTR_ERR(data);
+ break;
+ }
+ data->fn = onmax_save;
+ } else {
+ ret = -EINVAL;
+ break;
+ }
+
+ hist_data->actions[hist_data->n_actions++] = data;
+ }
+
+ return ret;
+}
+
+static int create_actions(struct hist_trigger_data *hist_data,
+ struct trace_event_file *file)
+{
+ struct action_data *data;
+ unsigned int i;
+ int ret = 0;
+
+ for (i = 0; i < hist_data->attrs->n_actions; i++) {
+ data = hist_data->actions[i];
+
+ if (data->fn == action_trace) {
+ ret = onmatch_create(hist_data, file, data);
+ if (ret)
+ return ret;
+ } else if (data->fn == onmax_save) {
+ ret = onmax_create(hist_data, data);
+ if (ret)
+ return ret;
+ }
+ }
+
+ return ret;
+}
+
+static void print_actions(struct seq_file *m,
+ struct hist_trigger_data *hist_data,
+ struct tracing_map_elt *elt)
+{
+ unsigned int i;
+
+ for (i = 0; i < hist_data->n_actions; i++) {
+ struct action_data *data = hist_data->actions[i];
+
+ if (data->fn == onmax_save)
+ onmax_print(m, hist_data, elt, data);
+ }
+}
+
+static void print_onmax_spec(struct seq_file *m,
+ struct hist_trigger_data *hist_data,
+ struct action_data *data)
+{
+ unsigned int i;
+
+ seq_puts(m, ":onmax(");
+ seq_printf(m, "%s", data->onmax.var_str);
+ seq_printf(m, ").%s(", data->onmax.fn_name);
+
+ for (i = 0; i < hist_data->n_max_vars; i++) {
+ seq_printf(m, "%s", hist_data->max_vars[i]->var->var.name);
+ if (i < hist_data->n_max_vars - 1)
+ seq_puts(m, ",");
+ }
+ seq_puts(m, ")");
+}
+
+static void print_onmatch_spec(struct seq_file *m,
+ struct hist_trigger_data *hist_data,
+ struct action_data *data)
+{
+ unsigned int i;
+
+ seq_printf(m, ":onmatch(%s.%s).", data->onmatch.match_event_system,
+ data->onmatch.match_event);
+
+ seq_printf(m, "%s(", data->onmatch.synth_event->name);
+
+ for (i = 0; i < data->n_params; i++) {
+ if (i)
+ seq_puts(m, ",");
+ seq_printf(m, "%s", data->params[i]);
+ }
+
+ seq_puts(m, ")");
+}
+
+static bool actions_match(struct hist_trigger_data *hist_data,
+ struct hist_trigger_data *hist_data_test)
+{
+ unsigned int i, j;
+
+ if (hist_data->n_actions != hist_data_test->n_actions)
+ return false;
+
+ for (i = 0; i < hist_data->n_actions; i++) {
+ struct action_data *data = hist_data->actions[i];
+ struct action_data *data_test = hist_data_test->actions[i];
+
+ if (data->fn != data_test->fn)
+ return false;
+
+ if (data->n_params != data_test->n_params)
+ return false;
+
+ for (j = 0; j < data->n_params; j++) {
+ if (strcmp(data->params[j], data_test->params[j]) != 0)
+ return false;
+ }
+
+ if (data->fn == action_trace) {
+ if (strcmp(data->onmatch.synth_event_name,
+ data_test->onmatch.synth_event_name) != 0)
+ return false;
+ if (strcmp(data->onmatch.match_event_system,
+ data_test->onmatch.match_event_system) != 0)
+ return false;
+ if (strcmp(data->onmatch.match_event,
+ data_test->onmatch.match_event) != 0)
+ return false;
+ } else if (data->fn == onmax_save) {
+ if (strcmp(data->onmax.var_str,
+ data_test->onmax.var_str) != 0)
+ return false;
+ if (strcmp(data->onmax.fn_name,
+ data_test->onmax.fn_name) != 0)
+ return false;
+ }
+ }
+
+ return true;
+}
+
+
+static void print_actions_spec(struct seq_file *m,
+ struct hist_trigger_data *hist_data)
+{
+ unsigned int i;
+
+ for (i = 0; i < hist_data->n_actions; i++) {
+ struct action_data *data = hist_data->actions[i];
+
+ if (data->fn == action_trace)
+ print_onmatch_spec(m, hist_data, data);
+ else if (data->fn == onmax_save)
+ print_onmax_spec(m, hist_data, data);
+ }
+}
+
+static void destroy_field_var_hists(struct hist_trigger_data *hist_data)
+{
+ unsigned int i;
+
+ for (i = 0; i < hist_data->n_field_var_hists; i++) {
+ kfree(hist_data->field_var_hists[i]->cmd);
+ kfree(hist_data->field_var_hists[i]);
+ }
+}
+
static void destroy_hist_data(struct hist_trigger_data *hist_data)
{
+ if (!hist_data)
+ return;
+
destroy_hist_trigger_attrs(hist_data->attrs);
destroy_hist_fields(hist_data);
tracing_map_destroy(hist_data->map);
+
+ destroy_actions(hist_data);
+ destroy_field_vars(hist_data);
+ destroy_field_var_hists(hist_data);
+ destroy_synth_var_refs(hist_data);
+
kfree(hist_data);
}
@@ -781,7 +4460,7 @@ static int create_tracing_map_fields(struct hist_trigger_data *hist_data)
struct tracing_map *map = hist_data->map;
struct ftrace_event_field *field;
struct hist_field *hist_field;
- int i, idx;
+ int i, idx = 0;
for_each_hist_field(i, hist_data) {
hist_field = hist_data->fields[i];
@@ -792,6 +4471,9 @@ static int create_tracing_map_fields(struct hist_trigger_data *hist_data)
if (hist_field->flags & HIST_FIELD_FL_STACKTRACE)
cmp_fn = tracing_map_cmp_none;
+ else if (!field)
+ cmp_fn = tracing_map_cmp_num(hist_field->size,
+ hist_field->is_signed);
else if (is_string_field(field))
cmp_fn = tracing_map_cmp_string;
else
@@ -800,36 +4482,29 @@ static int create_tracing_map_fields(struct hist_trigger_data *hist_data)
idx = tracing_map_add_key_field(map,
hist_field->offset,
cmp_fn);
-
- } else
+ } else if (!(hist_field->flags & HIST_FIELD_FL_VAR))
idx = tracing_map_add_sum_field(map);
if (idx < 0)
return idx;
- }
-
- return 0;
-}
-
-static bool need_tracing_map_ops(struct hist_trigger_data *hist_data)
-{
- struct hist_field *key_field;
- unsigned int i;
-
- for_each_hist_key_field(i, hist_data) {
- key_field = hist_data->fields[i];
- if (key_field->flags & HIST_FIELD_FL_EXECNAME)
- return true;
+ if (hist_field->flags & HIST_FIELD_FL_VAR) {
+ idx = tracing_map_add_var(map);
+ if (idx < 0)
+ return idx;
+ hist_field->var.idx = idx;
+ hist_field->var.hist_data = hist_data;
+ }
}
- return false;
+ return 0;
}
static struct hist_trigger_data *
create_hist_data(unsigned int map_bits,
struct hist_trigger_attrs *attrs,
- struct trace_event_file *file)
+ struct trace_event_file *file,
+ bool remove)
{
const struct tracing_map_ops *map_ops = NULL;
struct hist_trigger_data *hist_data;
@@ -840,6 +4515,12 @@ create_hist_data(unsigned int map_bits,
return ERR_PTR(-ENOMEM);
hist_data->attrs = attrs;
+ hist_data->remove = remove;
+ hist_data->event_file = file;
+
+ ret = parse_actions(hist_data);
+ if (ret)
+ goto free;
ret = create_hist_fields(hist_data, file);
if (ret)
@@ -849,8 +4530,7 @@ create_hist_data(unsigned int map_bits,
if (ret)
goto free;
- if (need_tracing_map_ops(hist_data))
- map_ops = &hist_trigger_elt_comm_ops;
+ map_ops = &hist_trigger_elt_data_ops;
hist_data->map = tracing_map_create(map_bits, hist_data->key_size,
map_ops, hist_data);
@@ -863,12 +4543,6 @@ create_hist_data(unsigned int map_bits,
ret = create_tracing_map_fields(hist_data);
if (ret)
goto free;
-
- ret = tracing_map_init(hist_data->map);
- if (ret)
- goto free;
-
- hist_data->event_file = file;
out:
return hist_data;
free:
@@ -882,18 +4556,39 @@ create_hist_data(unsigned int map_bits,
}
static void hist_trigger_elt_update(struct hist_trigger_data *hist_data,
- struct tracing_map_elt *elt,
- void *rec)
+ struct tracing_map_elt *elt, void *rec,
+ struct ring_buffer_event *rbe,
+ u64 *var_ref_vals)
{
+ struct hist_elt_data *elt_data;
struct hist_field *hist_field;
- unsigned int i;
+ unsigned int i, var_idx;
u64 hist_val;
+ elt_data = elt->private_data;
+ elt_data->var_ref_vals = var_ref_vals;
+
for_each_hist_val_field(i, hist_data) {
hist_field = hist_data->fields[i];
- hist_val = hist_field->fn(hist_field, rec);
+ hist_val = hist_field->fn(hist_field, elt, rbe, rec);
+ if (hist_field->flags & HIST_FIELD_FL_VAR) {
+ var_idx = hist_field->var.idx;
+ tracing_map_set_var(elt, var_idx, hist_val);
+ continue;
+ }
tracing_map_update_sum(elt, i, hist_val);
}
+
+ for_each_hist_key_field(i, hist_data) {
+ hist_field = hist_data->fields[i];
+ if (hist_field->flags & HIST_FIELD_FL_VAR) {
+ hist_val = hist_field->fn(hist_field, elt, rbe, rec);
+ var_idx = hist_field->var.idx;
+ tracing_map_set_var(elt, var_idx, hist_val);
+ }
+ }
+
+ update_field_vars(hist_data, elt, rbe, rec);
}
static inline void add_to_key(char *compound_key, void *key,
@@ -920,15 +4615,31 @@ static inline void add_to_key(char *compound_key, void *key,
memcpy(compound_key + key_field->offset, key, size);
}
-static void event_hist_trigger(struct event_trigger_data *data, void *rec)
+static void
+hist_trigger_actions(struct hist_trigger_data *hist_data,
+ struct tracing_map_elt *elt, void *rec,
+ struct ring_buffer_event *rbe, u64 *var_ref_vals)
+{
+ struct action_data *data;
+ unsigned int i;
+
+ for (i = 0; i < hist_data->n_actions; i++) {
+ data = hist_data->actions[i];
+ data->fn(hist_data, elt, rec, rbe, data, var_ref_vals);
+ }
+}
+
+static void event_hist_trigger(struct event_trigger_data *data, void *rec,
+ struct ring_buffer_event *rbe)
{
struct hist_trigger_data *hist_data = data->private_data;
bool use_compound_key = (hist_data->n_keys > 1);
unsigned long entries[HIST_STACKTRACE_DEPTH];
+ u64 var_ref_vals[TRACING_MAP_VARS_MAX];
char compound_key[HIST_KEY_SIZE_MAX];
+ struct tracing_map_elt *elt = NULL;
struct stack_trace stacktrace;
struct hist_field *key_field;
- struct tracing_map_elt *elt;
u64 field_contents;
void *key = NULL;
unsigned int i;
@@ -949,7 +4660,7 @@ static void event_hist_trigger(struct event_trigger_data *data, void *rec)
key = entries;
} else {
- field_contents = key_field->fn(key_field, rec);
+ field_contents = key_field->fn(key_field, elt, rbe, rec);
if (key_field->flags & HIST_FIELD_FL_STRING) {
key = (void *)(unsigned long)field_contents;
use_compound_key = true;
@@ -964,9 +4675,18 @@ static void event_hist_trigger(struct event_trigger_data *data, void *rec)
if (use_compound_key)
key = compound_key;
+ if (hist_data->n_var_refs &&
+ !resolve_var_refs(hist_data, key, var_ref_vals, false))
+ return;
+
elt = tracing_map_insert(hist_data->map, key);
- if (elt)
- hist_trigger_elt_update(hist_data, elt, rec);
+ if (!elt)
+ return;
+
+ hist_trigger_elt_update(hist_data, elt, rec, rbe, var_ref_vals);
+
+ if (resolve_var_refs(hist_data, key, var_ref_vals, true))
+ hist_trigger_actions(hist_data, elt, rec, rbe, var_ref_vals);
}
static void hist_trigger_stacktrace_print(struct seq_file *m,
@@ -1023,7 +4743,13 @@ hist_trigger_entry_print(struct seq_file *m,
seq_printf(m, "%s: [%llx] %-55s", field_name,
uval, str);
} else if (key_field->flags & HIST_FIELD_FL_EXECNAME) {
- char *comm = elt->private_data;
+ struct hist_elt_data *elt_data = elt->private_data;
+ char *comm;
+
+ if (WARN_ON_ONCE(!elt_data))
+ return;
+
+ comm = elt_data->comm;
uval = *(u64 *)(key + key_field->offset);
seq_printf(m, "%s: %-16s[%10llu]", field_name,
@@ -1067,6 +4793,10 @@ hist_trigger_entry_print(struct seq_file *m,
for (i = 1; i < hist_data->n_vals; i++) {
field_name = hist_field_name(hist_data->fields[i], 0);
+ if (hist_data->fields[i]->flags & HIST_FIELD_FL_VAR ||
+ hist_data->fields[i]->flags & HIST_FIELD_FL_EXPR)
+ continue;
+
if (hist_data->fields[i]->flags & HIST_FIELD_FL_HEX) {
seq_printf(m, " %s: %10llx", field_name,
tracing_map_read_sum(elt, i));
@@ -1076,6 +4806,8 @@ hist_trigger_entry_print(struct seq_file *m,
}
}
+ print_actions(m, hist_data, elt);
+
seq_puts(m, "\n");
}
@@ -1144,6 +4876,11 @@ static int hist_show(struct seq_file *m, void *v)
hist_trigger_show(m, data, n++);
}
+ if (have_hist_err()) {
+ seq_printf(m, "\nERROR: %s\n", hist_err_str);
+ seq_printf(m, " Last command: %s\n", last_hist_cmd);
+ }
+
out_unlock:
mutex_unlock(&event_mutex);
@@ -1162,36 +4899,31 @@ const struct file_operations event_hist_fops = {
.release = single_release,
};
-static const char *get_hist_field_flags(struct hist_field *hist_field)
-{
- const char *flags_str = NULL;
-
- if (hist_field->flags & HIST_FIELD_FL_HEX)
- flags_str = "hex";
- else if (hist_field->flags & HIST_FIELD_FL_SYM)
- flags_str = "sym";
- else if (hist_field->flags & HIST_FIELD_FL_SYM_OFFSET)
- flags_str = "sym-offset";
- else if (hist_field->flags & HIST_FIELD_FL_EXECNAME)
- flags_str = "execname";
- else if (hist_field->flags & HIST_FIELD_FL_SYSCALL)
- flags_str = "syscall";
- else if (hist_field->flags & HIST_FIELD_FL_LOG2)
- flags_str = "log2";
-
- return flags_str;
-}
-
static void hist_field_print(struct seq_file *m, struct hist_field *hist_field)
{
const char *field_name = hist_field_name(hist_field, 0);
- seq_printf(m, "%s", field_name);
+ if (hist_field->var.name)
+ seq_printf(m, "%s=", hist_field->var.name);
+
+ if (hist_field->flags & HIST_FIELD_FL_CPU)
+ seq_puts(m, "cpu");
+ else if (field_name) {
+ if (hist_field->flags & HIST_FIELD_FL_VAR_REF ||
+ hist_field->flags & HIST_FIELD_FL_ALIAS)
+ seq_putc(m, '$');
+ seq_printf(m, "%s", field_name);
+ } else if (hist_field->flags & HIST_FIELD_FL_TIMESTAMP)
+ seq_puts(m, "common_timestamp");
+
if (hist_field->flags) {
- const char *flags_str = get_hist_field_flags(hist_field);
+ if (!(hist_field->flags & HIST_FIELD_FL_VAR_REF) &&
+ !(hist_field->flags & HIST_FIELD_FL_EXPR)) {
+ const char *flags = get_hist_field_flags(hist_field);
- if (flags_str)
- seq_printf(m, ".%s", flags_str);
+ if (flags)
+ seq_printf(m, ".%s", flags);
+ }
}
}
@@ -1200,7 +4932,8 @@ static int event_hist_trigger_print(struct seq_file *m,
struct event_trigger_data *data)
{
struct hist_trigger_data *hist_data = data->private_data;
- struct hist_field *key_field;
+ struct hist_field *field;
+ bool have_var = false;
unsigned int i;
seq_puts(m, "hist:");
@@ -1211,25 +4944,47 @@ static int event_hist_trigger_print(struct seq_file *m,
seq_puts(m, "keys=");
for_each_hist_key_field(i, hist_data) {
- key_field = hist_data->fields[i];
+ field = hist_data->fields[i];
if (i > hist_data->n_vals)
seq_puts(m, ",");
- if (key_field->flags & HIST_FIELD_FL_STACKTRACE)
+ if (field->flags & HIST_FIELD_FL_STACKTRACE)
seq_puts(m, "stacktrace");
else
- hist_field_print(m, key_field);
+ hist_field_print(m, field);
}
seq_puts(m, ":vals=");
for_each_hist_val_field(i, hist_data) {
+ field = hist_data->fields[i];
+ if (field->flags & HIST_FIELD_FL_VAR) {
+ have_var = true;
+ continue;
+ }
+
if (i == HITCOUNT_IDX)
seq_puts(m, "hitcount");
else {
seq_puts(m, ",");
- hist_field_print(m, hist_data->fields[i]);
+ hist_field_print(m, field);
+ }
+ }
+
+ if (have_var) {
+ unsigned int n = 0;
+
+ seq_puts(m, ":");
+
+ for_each_hist_val_field(i, hist_data) {
+ field = hist_data->fields[i];
+
+ if (field->flags & HIST_FIELD_FL_VAR) {
+ if (n++)
+ seq_puts(m, ",");
+ hist_field_print(m, field);
+ }
}
}
@@ -1237,28 +4992,36 @@ static int event_hist_trigger_print(struct seq_file *m,
for (i = 0; i < hist_data->n_sort_keys; i++) {
struct tracing_map_sort_key *sort_key;
+ unsigned int idx, first_key_idx;
+
+ /* skip VAR vals */
+ first_key_idx = hist_data->n_vals - hist_data->n_vars;
sort_key = &hist_data->sort_keys[i];
+ idx = sort_key->field_idx;
+
+ if (WARN_ON(idx >= HIST_FIELDS_MAX))
+ return -EINVAL;
if (i > 0)
seq_puts(m, ",");
- if (sort_key->field_idx == HITCOUNT_IDX)
+ if (idx == HITCOUNT_IDX)
seq_puts(m, "hitcount");
else {
- unsigned int idx = sort_key->field_idx;
-
- if (WARN_ON(idx >= TRACING_MAP_FIELDS_MAX))
- return -EINVAL;
-
+ if (idx >= first_key_idx)
+ idx += hist_data->n_vars;
hist_field_print(m, hist_data->fields[idx]);
}
if (sort_key->descending)
seq_puts(m, ".descending");
}
-
seq_printf(m, ":size=%u", (1 << hist_data->map->map_bits));
+ if (hist_data->enable_timestamps)
+ seq_printf(m, ":clock=%s", hist_data->attrs->clock);
+
+ print_actions_spec(m, hist_data);
if (data->filter_str)
seq_printf(m, " if %s", data->filter_str);
@@ -1286,6 +5049,21 @@ static int event_hist_trigger_init(struct event_trigger_ops *ops,
return 0;
}
+static void unregister_field_var_hists(struct hist_trigger_data *hist_data)
+{
+ struct trace_event_file *file;
+ unsigned int i;
+ char *cmd;
+ int ret;
+
+ for (i = 0; i < hist_data->n_field_var_hists; i++) {
+ file = hist_data->field_var_hists[i]->hist_data->event_file;
+ cmd = hist_data->field_var_hists[i]->cmd;
+ ret = event_hist_trigger_func(&trigger_hist_cmd, file,
+ "!hist", "hist", cmd);
+ }
+}
+
static void event_hist_trigger_free(struct event_trigger_ops *ops,
struct event_trigger_data *data)
{
@@ -1298,7 +5076,13 @@ static void event_hist_trigger_free(struct event_trigger_ops *ops,
if (!data->ref) {
if (data->name)
del_named_trigger(data);
+
trigger_data_free(data);
+
+ remove_hist_vars(hist_data);
+
+ unregister_field_var_hists(hist_data);
+
destroy_hist_data(hist_data);
}
}
@@ -1425,6 +5209,15 @@ static bool hist_trigger_match(struct event_trigger_data *data,
return false;
if (key_field->offset != key_field_test->offset)
return false;
+ if (key_field->size != key_field_test->size)
+ return false;
+ if (key_field->is_signed != key_field_test->is_signed)
+ return false;
+ if (!!key_field->var.name != !!key_field_test->var.name)
+ return false;
+ if (key_field->var.name &&
+ strcmp(key_field->var.name, key_field_test->var.name) != 0)
+ return false;
}
for (i = 0; i < hist_data->n_sort_keys; i++) {
@@ -1440,6 +5233,9 @@ static bool hist_trigger_match(struct event_trigger_data *data,
(strcmp(data->filter_str, data_test->filter_str) != 0))
return false;
+ if (!actions_match(hist_data, hist_data_test))
+ return false;
+
return true;
}
@@ -1456,6 +5252,7 @@ static int hist_register_trigger(char *glob, struct event_trigger_ops *ops,
if (named_data) {
if (!hist_trigger_match(data, named_data, named_data,
true)) {
+ hist_err("Named hist trigger doesn't match existing named trigger (includes variables): ", hist_data->attrs->name);
ret = -EINVAL;
goto out;
}
@@ -1475,13 +5272,16 @@ static int hist_register_trigger(char *glob, struct event_trigger_ops *ops,
test->paused = false;
else if (hist_data->attrs->clear)
hist_clear(test);
- else
+ else {
+ hist_err("Hist trigger already exists", NULL);
ret = -EEXIST;
+ }
goto out;
}
}
new:
if (hist_data->attrs->cont || hist_data->attrs->clear) {
+ hist_err("Can't clear or continue a nonexistent hist trigger", NULL);
ret = -ENOENT;
goto out;
}
@@ -1490,7 +5290,6 @@ static int hist_register_trigger(char *glob, struct event_trigger_ops *ops,
data->paused = true;
if (named_data) {
- destroy_hist_data(data->private_data);
data->private_data = named_data->private_data;
set_named_trigger_data(data, named_data);
data->ops = &event_hist_trigger_named_ops;
@@ -1502,8 +5301,32 @@ static int hist_register_trigger(char *glob, struct event_trigger_ops *ops,
goto out;
}
- list_add_rcu(&data->list, &file->triggers);
+ if (hist_data->enable_timestamps) {
+ char *clock = hist_data->attrs->clock;
+
+ ret = tracing_set_clock(file->tr, hist_data->attrs->clock);
+ if (ret) {
+ hist_err("Couldn't set trace_clock: ", clock);
+ goto out;
+ }
+
+ tracing_set_time_stamp_abs(file->tr, true);
+ }
+
+ if (named_data)
+ destroy_hist_data(hist_data);
+
ret++;
+ out:
+ return ret;
+}
+
+static int hist_trigger_enable(struct event_trigger_data *data,
+ struct trace_event_file *file)
+{
+ int ret = 0;
+
+ list_add_tail_rcu(&data->list, &file->triggers);
update_cond_flag(file);
@@ -1512,10 +5335,55 @@ static int hist_register_trigger(char *glob, struct event_trigger_ops *ops,
update_cond_flag(file);
ret--;
}
- out:
+
return ret;
}
+static bool have_hist_trigger_match(struct event_trigger_data *data,
+ struct trace_event_file *file)
+{
+ struct hist_trigger_data *hist_data = data->private_data;
+ struct event_trigger_data *test, *named_data = NULL;
+ bool match = false;
+
+ if (hist_data->attrs->name)
+ named_data = find_named_trigger(hist_data->attrs->name);
+
+ list_for_each_entry_rcu(test, &file->triggers, list) {
+ if (test->cmd_ops->trigger_type == ETT_EVENT_HIST) {
+ if (hist_trigger_match(data, test, named_data, false)) {
+ match = true;
+ break;
+ }
+ }
+ }
+
+ return match;
+}
+
+static bool hist_trigger_check_refs(struct event_trigger_data *data,
+ struct trace_event_file *file)
+{
+ struct hist_trigger_data *hist_data = data->private_data;
+ struct event_trigger_data *test, *named_data = NULL;
+
+ if (hist_data->attrs->name)
+ named_data = find_named_trigger(hist_data->attrs->name);
+
+ list_for_each_entry_rcu(test, &file->triggers, list) {
+ if (test->cmd_ops->trigger_type == ETT_EVENT_HIST) {
+ if (!hist_trigger_match(data, test, named_data, false))
+ continue;
+ hist_data = test->private_data;
+ if (check_var_refs(hist_data))
+ return true;
+ break;
+ }
+ }
+
+ return false;
+}
+
static void hist_unregister_trigger(char *glob, struct event_trigger_ops *ops,
struct event_trigger_data *data,
struct trace_event_file *file)
@@ -1541,17 +5409,55 @@ static void hist_unregister_trigger(char *glob, struct event_trigger_ops *ops,
if (unregistered && test->ops->free)
test->ops->free(test->ops, test);
+
+ if (hist_data->enable_timestamps) {
+ if (!hist_data->remove || unregistered)
+ tracing_set_time_stamp_abs(file->tr, false);
+ }
+}
+
+static bool hist_file_check_refs(struct trace_event_file *file)
+{
+ struct hist_trigger_data *hist_data;
+ struct event_trigger_data *test;
+
+ list_for_each_entry_rcu(test, &file->triggers, list) {
+ if (test->cmd_ops->trigger_type == ETT_EVENT_HIST) {
+ hist_data = test->private_data;
+ if (check_var_refs(hist_data))
+ return true;
+ }
+ }
+
+ return false;
}
static void hist_unreg_all(struct trace_event_file *file)
{
struct event_trigger_data *test, *n;
+ struct hist_trigger_data *hist_data;
+ struct synth_event *se;
+ const char *se_name;
+
+ if (hist_file_check_refs(file))
+ return;
list_for_each_entry_safe(test, n, &file->triggers, list) {
if (test->cmd_ops->trigger_type == ETT_EVENT_HIST) {
+ hist_data = test->private_data;
list_del_rcu(&test->list);
trace_event_trigger_enable_disable(file, 0);
+
+ mutex_lock(&synth_event_mutex);
+ se_name = trace_event_name(file->event_call);
+ se = find_synth_event(se_name);
+ if (se)
+ se->ref--;
+ mutex_unlock(&synth_event_mutex);
+
update_cond_flag(file);
+ if (hist_data->enable_timestamps)
+ tracing_set_time_stamp_abs(file->tr, false);
if (test->ops->free)
test->ops->free(test->ops, test);
}
@@ -1567,16 +5473,54 @@ static int event_hist_trigger_func(struct event_command *cmd_ops,
struct hist_trigger_attrs *attrs;
struct event_trigger_ops *trigger_ops;
struct hist_trigger_data *hist_data;
- char *trigger;
+ struct synth_event *se;
+ const char *se_name;
+ bool remove = false;
+ char *trigger, *p;
int ret = 0;
+ if (glob && strlen(glob)) {
+ last_cmd_set(param);
+ hist_err_clear();
+ }
+
if (!param)
return -EINVAL;
- /* separate the trigger from the filter (k:v [if filter]) */
- trigger = strsep(&param, " \t");
- if (!trigger)
- return -EINVAL;
+ if (glob[0] == '!')
+ remove = true;
+
+ /*
+ * separate the trigger from the filter (k:v [if filter])
+ * allowing for whitespace in the trigger
+ */
+ p = trigger = param;
+ do {
+ p = strstr(p, "if");
+ if (!p)
+ break;
+ if (p == param)
+ return -EINVAL;
+ if (*(p - 1) != ' ' && *(p - 1) != '\t') {
+ p++;
+ continue;
+ }
+ if (p >= param + strlen(param) - strlen("if") - 1)
+ return -EINVAL;
+ if (*(p + strlen("if")) != ' ' && *(p + strlen("if")) != '\t') {
+ p++;
+ continue;
+ }
+ break;
+ } while (p);
+
+ if (!p)
+ param = NULL;
+ else {
+ *(p - 1) = '\0';
+ param = strstrip(p);
+ trigger = strstrip(trigger);
+ }
attrs = parse_hist_trigger_attrs(trigger);
if (IS_ERR(attrs))
@@ -1585,7 +5529,7 @@ static int event_hist_trigger_func(struct event_command *cmd_ops,
if (attrs->map_bits)
hist_trigger_bits = attrs->map_bits;
- hist_data = create_hist_data(hist_trigger_bits, attrs, file);
+ hist_data = create_hist_data(hist_trigger_bits, attrs, file, remove);
if (IS_ERR(hist_data)) {
destroy_hist_trigger_attrs(attrs);
return PTR_ERR(hist_data);
@@ -1593,10 +5537,11 @@ static int event_hist_trigger_func(struct event_command *cmd_ops,
trigger_ops = cmd_ops->get_trigger_ops(cmd, trigger);
- ret = -ENOMEM;
trigger_data = kzalloc(sizeof(*trigger_data), GFP_KERNEL);
- if (!trigger_data)
+ if (!trigger_data) {
+ ret = -ENOMEM;
goto out_free;
+ }
trigger_data->count = -1;
trigger_data->ops = trigger_ops;
@@ -1614,8 +5559,24 @@ static int event_hist_trigger_func(struct event_command *cmd_ops,
goto out_free;
}
- if (glob[0] == '!') {
+ if (remove) {
+ if (!have_hist_trigger_match(trigger_data, file))
+ goto out_free;
+
+ if (hist_trigger_check_refs(trigger_data, file)) {
+ ret = -EBUSY;
+ goto out_free;
+ }
+
cmd_ops->unreg(glob+1, trigger_ops, trigger_data, file);
+
+ mutex_lock(&synth_event_mutex);
+ se_name = trace_event_name(file->event_call);
+ se = find_synth_event(se_name);
+ if (se)
+ se->ref--;
+ mutex_unlock(&synth_event_mutex);
+
ret = 0;
goto out_free;
}
@@ -1632,14 +5593,47 @@ static int event_hist_trigger_func(struct event_command *cmd_ops,
goto out_free;
} else if (ret < 0)
goto out_free;
+
+ if (get_named_trigger_data(trigger_data))
+ goto enable;
+
+ if (has_hist_vars(hist_data))
+ save_hist_vars(hist_data);
+
+ ret = create_actions(hist_data, file);
+ if (ret)
+ goto out_unreg;
+
+ ret = tracing_map_init(hist_data->map);
+ if (ret)
+ goto out_unreg;
+enable:
+ ret = hist_trigger_enable(trigger_data, file);
+ if (ret)
+ goto out_unreg;
+
+ mutex_lock(&synth_event_mutex);
+ se_name = trace_event_name(file->event_call);
+ se = find_synth_event(se_name);
+ if (se)
+ se->ref++;
+ mutex_unlock(&synth_event_mutex);
+
/* Just return zero, not the number of registered triggers */
ret = 0;
out:
+ if (ret == 0)
+ hist_err_clear();
+
return ret;
+ out_unreg:
+ cmd_ops->unreg(glob+1, trigger_ops, trigger_data, file);
out_free:
if (cmd_ops->set_filter)
cmd_ops->set_filter(NULL, trigger_data, NULL);
+ remove_hist_vars(hist_data);
+
kfree(trigger_data);
destroy_hist_data(hist_data);
@@ -1669,7 +5663,8 @@ __init int register_trigger_hist_cmd(void)
}
static void
-hist_enable_trigger(struct event_trigger_data *data, void *rec)
+hist_enable_trigger(struct event_trigger_data *data, void *rec,
+ struct ring_buffer_event *event)
{
struct enable_trigger_data *enable_data = data->private_data;
struct event_trigger_data *test;
@@ -1685,7 +5680,8 @@ hist_enable_trigger(struct event_trigger_data *data, void *rec)
}
static void
-hist_enable_count_trigger(struct event_trigger_data *data, void *rec)
+hist_enable_count_trigger(struct event_trigger_data *data, void *rec,
+ struct ring_buffer_event *event)
{
if (!data->count)
return;
@@ -1693,7 +5689,7 @@ hist_enable_count_trigger(struct event_trigger_data *data, void *rec)
if (data->count != -1)
(data->count)--;
- hist_enable_trigger(data, rec);
+ hist_enable_trigger(data, rec, event);
}
static struct event_trigger_ops hist_enable_trigger_ops = {
@@ -1798,3 +5794,31 @@ __init int register_trigger_hist_enable_disable_cmds(void)
return ret;
}
+
+static __init int trace_events_hist_init(void)
+{
+ struct dentry *entry = NULL;
+ struct dentry *d_tracer;
+ int err = 0;
+
+ d_tracer = tracing_init_dentry();
+ if (IS_ERR(d_tracer)) {
+ err = PTR_ERR(d_tracer);
+ goto err;
+ }
+
+ entry = tracefs_create_file("synthetic_events", 0644, d_tracer,
+ NULL, &synth_events_fops);
+ if (!entry) {
+ err = -ENODEV;
+ goto err;
+ }
+
+ return err;
+ err:
+ pr_warn("Could not create tracefs 'synthetic_events' entry\n");
+
+ return err;
+}
+
+fs_initcall(trace_events_hist_init);
diff --git a/kernel/trace/trace_events_trigger.c b/kernel/trace/trace_events_trigger.c
index 87411482a46f..d18249683682 100644
--- a/kernel/trace/trace_events_trigger.c
+++ b/kernel/trace/trace_events_trigger.c
@@ -63,7 +63,8 @@ void trigger_data_free(struct event_trigger_data *data)
* any trigger that should be deferred, ETT_NONE if nothing to defer.
*/
enum event_trigger_type
-event_triggers_call(struct trace_event_file *file, void *rec)
+event_triggers_call(struct trace_event_file *file, void *rec,
+ struct ring_buffer_event *event)
{
struct event_trigger_data *data;
enum event_trigger_type tt = ETT_NONE;
@@ -76,7 +77,7 @@ event_triggers_call(struct trace_event_file *file, void *rec)
if (data->paused)
continue;
if (!rec) {
- data->ops->func(data, rec);
+ data->ops->func(data, rec, event);
continue;
}
filter = rcu_dereference_sched(data->filter);
@@ -86,7 +87,7 @@ event_triggers_call(struct trace_event_file *file, void *rec)
tt |= data->cmd_ops->trigger_type;
continue;
}
- data->ops->func(data, rec);
+ data->ops->func(data, rec, event);
}
return tt;
}
@@ -96,7 +97,6 @@ EXPORT_SYMBOL_GPL(event_triggers_call);
* event_triggers_post_call - Call 'post_triggers' for a trace event
* @file: The trace_event_file associated with the event
* @tt: enum event_trigger_type containing a set bit for each trigger to invoke
- * @rec: The trace entry for the event
*
* For each trigger associated with an event, invoke the trigger
* function registered with the associated trigger command, if the
@@ -107,8 +107,7 @@ EXPORT_SYMBOL_GPL(event_triggers_call);
*/
void
event_triggers_post_call(struct trace_event_file *file,
- enum event_trigger_type tt,
- void *rec)
+ enum event_trigger_type tt)
{
struct event_trigger_data *data;
@@ -116,7 +115,7 @@ event_triggers_post_call(struct trace_event_file *file,
if (data->paused)
continue;
if (data->cmd_ops->trigger_type & tt)
- data->ops->func(data, rec);
+ data->ops->func(data, NULL, NULL);
}
}
EXPORT_SYMBOL_GPL(event_triggers_post_call);
@@ -482,9 +481,10 @@ clear_event_triggers(struct trace_array *tr)
struct trace_event_file *file;
list_for_each_entry(file, &tr->events, list) {
- struct event_trigger_data *data;
- list_for_each_entry_rcu(data, &file->triggers, list) {
+ struct event_trigger_data *data, *n;
+ list_for_each_entry_safe(data, n, &file->triggers, list) {
trace_event_trigger_enable_disable(file, 0);
+ list_del_rcu(&data->list);
if (data->ops->free)
data->ops->free(data->ops, data);
}
@@ -641,6 +641,7 @@ event_trigger_callback(struct event_command *cmd_ops,
trigger_data->count = -1;
trigger_data->ops = trigger_ops;
trigger_data->cmd_ops = cmd_ops;
+ trigger_data->private_data = file;
INIT_LIST_HEAD(&trigger_data->list);
INIT_LIST_HEAD(&trigger_data->named_list);
@@ -908,8 +909,15 @@ void set_named_trigger_data(struct event_trigger_data *data,
data->named_data = named_data;
}
+struct event_trigger_data *
+get_named_trigger_data(struct event_trigger_data *data)
+{
+ return data->named_data;
+}
+
static void
-traceon_trigger(struct event_trigger_data *data, void *rec)
+traceon_trigger(struct event_trigger_data *data, void *rec,
+ struct ring_buffer_event *event)
{
if (tracing_is_on())
return;
@@ -918,7 +926,8 @@ traceon_trigger(struct event_trigger_data *data, void *rec)
}
static void
-traceon_count_trigger(struct event_trigger_data *data, void *rec)
+traceon_count_trigger(struct event_trigger_data *data, void *rec,
+ struct ring_buffer_event *event)
{
if (tracing_is_on())
return;
@@ -933,7 +942,8 @@ traceon_count_trigger(struct event_trigger_data *data, void *rec)
}
static void
-traceoff_trigger(struct event_trigger_data *data, void *rec)
+traceoff_trigger(struct event_trigger_data *data, void *rec,
+ struct ring_buffer_event *event)
{
if (!tracing_is_on())
return;
@@ -942,7 +952,8 @@ traceoff_trigger(struct event_trigger_data *data, void *rec)
}
static void
-traceoff_count_trigger(struct event_trigger_data *data, void *rec)
+traceoff_count_trigger(struct event_trigger_data *data, void *rec,
+ struct ring_buffer_event *event)
{
if (!tracing_is_on())
return;
@@ -1039,13 +1050,20 @@ static struct event_command trigger_traceoff_cmd = {
#ifdef CONFIG_TRACER_SNAPSHOT
static void
-snapshot_trigger(struct event_trigger_data *data, void *rec)
+snapshot_trigger(struct event_trigger_data *data, void *rec,
+ struct ring_buffer_event *event)
{
- tracing_snapshot();
+ struct trace_event_file *file = data->private_data;
+
+ if (file)
+ tracing_snapshot_instance(file->tr);
+ else
+ tracing_snapshot();
}
static void
-snapshot_count_trigger(struct event_trigger_data *data, void *rec)
+snapshot_count_trigger(struct event_trigger_data *data, void *rec,
+ struct ring_buffer_event *event)
{
if (!data->count)
return;
@@ -1053,7 +1071,7 @@ snapshot_count_trigger(struct event_trigger_data *data, void *rec)
if (data->count != -1)
(data->count)--;
- snapshot_trigger(data, rec);
+ snapshot_trigger(data, rec, event);
}
static int
@@ -1063,7 +1081,7 @@ register_snapshot_trigger(char *glob, struct event_trigger_ops *ops,
{
int ret = register_trigger(glob, ops, data, file);
- if (ret > 0 && tracing_alloc_snapshot() != 0) {
+ if (ret > 0 && tracing_alloc_snapshot_instance(file->tr) != 0) {
unregister_trigger(glob, ops, data, file);
ret = 0;
}
@@ -1141,13 +1159,15 @@ static __init int register_trigger_snapshot_cmd(void) { return 0; }
#endif
static void
-stacktrace_trigger(struct event_trigger_data *data, void *rec)
+stacktrace_trigger(struct event_trigger_data *data, void *rec,
+ struct ring_buffer_event *event)
{
trace_dump_stack(STACK_SKIP);
}
static void
-stacktrace_count_trigger(struct event_trigger_data *data, void *rec)
+stacktrace_count_trigger(struct event_trigger_data *data, void *rec,
+ struct ring_buffer_event *event)
{
if (!data->count)
return;
@@ -1155,7 +1175,7 @@ stacktrace_count_trigger(struct event_trigger_data *data, void *rec)
if (data->count != -1)
(data->count)--;
- stacktrace_trigger(data, rec);
+ stacktrace_trigger(data, rec, event);
}
static int
@@ -1217,7 +1237,8 @@ static __init void unregister_trigger_traceon_traceoff_cmds(void)
}
static void
-event_enable_trigger(struct event_trigger_data *data, void *rec)
+event_enable_trigger(struct event_trigger_data *data, void *rec,
+ struct ring_buffer_event *event)
{
struct enable_trigger_data *enable_data = data->private_data;
@@ -1228,7 +1249,8 @@ event_enable_trigger(struct event_trigger_data *data, void *rec)
}
static void
-event_enable_count_trigger(struct event_trigger_data *data, void *rec)
+event_enable_count_trigger(struct event_trigger_data *data, void *rec,
+ struct ring_buffer_event *event)
{
struct enable_trigger_data *enable_data = data->private_data;
@@ -1242,7 +1264,7 @@ event_enable_count_trigger(struct event_trigger_data *data, void *rec)
if (data->count != -1)
(data->count)--;
- event_enable_trigger(data, rec);
+ event_enable_trigger(data, rec, event);
}
int event_enable_trigger_print(struct seq_file *m,
diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c
index 548e62eb5c46..45630a76ed3a 100644
--- a/kernel/trace/trace_export.c
+++ b/kernel/trace/trace_export.c
@@ -14,6 +14,13 @@
#include "trace_output.h"
+/* Stub function for events with triggers */
+static int ftrace_event_register(struct trace_event_call *call,
+ enum trace_reg type, void *data)
+{
+ return 0;
+}
+
#undef TRACE_SYSTEM
#define TRACE_SYSTEM ftrace
@@ -117,7 +124,7 @@ static void __always_unused ____ftrace_check_##name(void) \
#undef __dynamic_array
#define __dynamic_array(type, item) \
- ret = trace_define_field(event_call, #type, #item, \
+ ret = trace_define_field(event_call, #type "[]", #item, \
offsetof(typeof(field), item), \
0, is_signed_type(type), filter_type);\
if (ret) \
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index 1fad24acd444..daa81571b22a 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -462,6 +462,14 @@ disable_trace_kprobe(struct trace_kprobe *tk, struct trace_event_file *file)
disable_kprobe(&tk->rp.kp);
wait = 1;
}
+
+ /*
+ * if tk is not added to any list, it must be a local trace_kprobe
+ * created with perf_event_open. We don't need to wait for these
+ * trace_kprobes
+ */
+ if (list_empty(&tk->list))
+ wait = 0;
out:
if (wait) {
/*
@@ -504,8 +512,6 @@ static int __register_trace_kprobe(struct trace_kprobe *tk)
if (ret == 0)
tk->tp.flags |= TP_FLAG_REGISTERED;
else {
- pr_warn("Could not insert probe at %s+%lu: %d\n",
- trace_kprobe_symbol(tk), trace_kprobe_offset(tk), ret);
if (ret == -ENOENT && trace_kprobe_is_on_module(tk)) {
pr_warn("This probe might be able to register after target module is loaded. Continue.\n");
ret = 0;
@@ -659,7 +665,7 @@ static int create_trace_kprobe(int argc, char **argv)
char *symbol = NULL, *event = NULL, *group = NULL;
int maxactive = 0;
char *arg;
- unsigned long offset = 0;
+ long offset = 0;
void *addr = NULL;
char buf[MAX_EVENT_NAME_LEN];
@@ -747,7 +753,7 @@ static int create_trace_kprobe(int argc, char **argv)
symbol = argv[1];
/* TODO: support .init module functions */
ret = traceprobe_split_symbol_offset(symbol, &offset);
- if (ret) {
+ if (ret || offset < 0 || offset > UINT_MAX) {
pr_info("Failed to parse either an address or a symbol.\n");
return ret;
}
@@ -1281,6 +1287,35 @@ kretprobe_perf_func(struct trace_kprobe *tk, struct kretprobe_instance *ri,
head, NULL);
}
NOKPROBE_SYMBOL(kretprobe_perf_func);
+
+int bpf_get_kprobe_info(const struct perf_event *event, u32 *fd_type,
+ const char **symbol, u64 *probe_offset,
+ u64 *probe_addr, bool perf_type_tracepoint)
+{
+ const char *pevent = trace_event_name(event->tp_event);
+ const char *group = event->tp_event->class->system;
+ struct trace_kprobe *tk;
+
+ if (perf_type_tracepoint)
+ tk = find_trace_kprobe(pevent, group);
+ else
+ tk = event->tp_event->data;
+ if (!tk)
+ return -EINVAL;
+
+ *fd_type = trace_kprobe_is_return(tk) ? BPF_FD_TYPE_KRETPROBE
+ : BPF_FD_TYPE_KPROBE;
+ if (tk->symbol) {
+ *symbol = tk->symbol;
+ *probe_offset = tk->rp.kp.offset;
+ *probe_addr = 0;
+ } else {
+ *symbol = NULL;
+ *probe_offset = 0;
+ *probe_addr = (unsigned long)tk->rp.kp.addr;
+ }
+ return 0;
+}
#endif /* CONFIG_PERF_EVENTS */
/*
@@ -1358,12 +1393,9 @@ static struct trace_event_functions kprobe_funcs = {
.trace = print_kprobe_event
};
-static int register_kprobe_event(struct trace_kprobe *tk)
+static inline void init_trace_event_call(struct trace_kprobe *tk,
+ struct trace_event_call *call)
{
- struct trace_event_call *call = &tk->tp.call;
- int ret;
-
- /* Initialize trace_event_call */
INIT_LIST_HEAD(&call->class->fields);
if (trace_kprobe_is_return(tk)) {
call->event.funcs = &kretprobe_funcs;
@@ -1372,6 +1404,19 @@ static int register_kprobe_event(struct trace_kprobe *tk)
call->event.funcs = &kprobe_funcs;
call->class->define_fields = kprobe_event_define_fields;
}
+
+ call->flags = TRACE_EVENT_FL_KPROBE;
+ call->class->reg = kprobe_register;
+ call->data = tk;
+}
+
+static int register_kprobe_event(struct trace_kprobe *tk)
+{
+ struct trace_event_call *call = &tk->tp.call;
+ int ret = 0;
+
+ init_trace_event_call(tk, call);
+
if (set_print_fmt(&tk->tp, trace_kprobe_is_return(tk)) < 0)
return -ENOMEM;
ret = register_trace_event(&call->event);
@@ -1379,9 +1424,6 @@ static int register_kprobe_event(struct trace_kprobe *tk)
kfree(call->print_fmt);
return -ENODEV;
}
- call->flags = TRACE_EVENT_FL_KPROBE;
- call->class->reg = kprobe_register;
- call->data = tk;
ret = trace_add_event_call(call);
if (ret) {
pr_info("Failed to register kprobe event: %s\n",
@@ -1403,6 +1445,66 @@ static int unregister_kprobe_event(struct trace_kprobe *tk)
return ret;
}
+#ifdef CONFIG_PERF_EVENTS
+/* create a trace_kprobe, but don't add it to global lists */
+struct trace_event_call *
+create_local_trace_kprobe(char *func, void *addr, unsigned long offs,
+ bool is_return)
+{
+ struct trace_kprobe *tk;
+ int ret;
+ char *event;
+
+ /*
+ * local trace_kprobes are not added to probe_list, so they are never
+ * searched in find_trace_kprobe(). Therefore, there is no concern of
+ * duplicated name here.
+ */
+ event = func ? func : "DUMMY_EVENT";
+
+ tk = alloc_trace_kprobe(KPROBE_EVENT_SYSTEM, event, (void *)addr, func,
+ offs, 0 /* maxactive */, 0 /* nargs */,
+ is_return);
+
+ if (IS_ERR(tk)) {
+ pr_info("Failed to allocate trace_probe.(%d)\n",
+ (int)PTR_ERR(tk));
+ return ERR_CAST(tk);
+ }
+
+ init_trace_event_call(tk, &tk->tp.call);
+
+ if (set_print_fmt(&tk->tp, trace_kprobe_is_return(tk)) < 0) {
+ ret = -ENOMEM;
+ goto error;
+ }
+
+ ret = __register_trace_kprobe(tk);
+ if (ret < 0)
+ goto error;
+
+ return &tk->tp.call;
+error:
+ free_trace_kprobe(tk);
+ return ERR_PTR(ret);
+}
+
+void destroy_local_trace_kprobe(struct trace_event_call *event_call)
+{
+ struct trace_kprobe *tk;
+
+ tk = container_of(event_call, struct trace_kprobe, tp.call);
+
+ if (trace_probe_is_enabled(&tk->tp)) {
+ WARN_ON(1);
+ return;
+ }
+
+ __unregister_trace_kprobe(tk);
+ free_trace_kprobe(tk);
+}
+#endif /* CONFIG_PERF_EVENTS */
+
/* Make a tracefs interface for controlling probe points */
static __init int init_kprobe_trace(void)
{
diff --git a/kernel/trace/trace_printk.c b/kernel/trace/trace_printk.c
index ad1d6164e946..50f44b7b2b32 100644
--- a/kernel/trace/trace_printk.c
+++ b/kernel/trace/trace_printk.c
@@ -196,7 +196,7 @@ struct notifier_block module_trace_bprintk_format_nb = {
};
int __trace_bprintk(unsigned long ip, const char *fmt, ...)
- {
+{
int ret;
va_list ap;
@@ -214,7 +214,7 @@ int __trace_bprintk(unsigned long ip, const char *fmt, ...)
EXPORT_SYMBOL_GPL(__trace_bprintk);
int __ftrace_vbprintk(unsigned long ip, const char *fmt, va_list ap)
- {
+{
if (unlikely(!fmt))
return 0;
diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c
index d59357308677..daf54bda4dc8 100644
--- a/kernel/trace/trace_probe.c
+++ b/kernel/trace/trace_probe.c
@@ -320,7 +320,7 @@ static fetch_func_t get_fetch_size_function(const struct fetch_type *type,
}
/* Split symbol and offset. */
-int traceprobe_split_symbol_offset(char *symbol, unsigned long *offset)
+int traceprobe_split_symbol_offset(char *symbol, long *offset)
{
char *tmp;
int ret;
@@ -328,13 +328,11 @@ int traceprobe_split_symbol_offset(char *symbol, unsigned long *offset)
if (!offset)
return -EINVAL;
- tmp = strchr(symbol, '+');
+ tmp = strpbrk(symbol, "+-");
if (tmp) {
- /* skip sign because kstrtoul doesn't accept '+' */
- ret = kstrtoul(tmp + 1, 0, offset);
+ ret = kstrtol(tmp, 0, offset);
if (ret)
return ret;
-
*tmp = '\0';
} else
*offset = 0;
diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h
index e101c5bb9eda..75daff22ccea 100644
--- a/kernel/trace/trace_probe.h
+++ b/kernel/trace/trace_probe.h
@@ -365,7 +365,7 @@ extern int traceprobe_conflict_field_name(const char *name,
extern void traceprobe_update_arg(struct probe_arg *arg);
extern void traceprobe_free_probe_arg(struct probe_arg *arg);
-extern int traceprobe_split_symbol_offset(char *symbol, unsigned long *offset);
+extern int traceprobe_split_symbol_offset(char *symbol, long *offset);
/* Sum up total data length for dynamic arraies (strings) */
static nokprobe_inline int
@@ -416,3 +416,14 @@ store_trace_args(int ent_size, struct trace_probe *tp, struct pt_regs *regs,
}
extern int set_print_fmt(struct trace_probe *tp, bool is_return);
+
+#ifdef CONFIG_PERF_EVENTS
+extern struct trace_event_call *
+create_local_trace_kprobe(char *func, void *addr, unsigned long offs,
+ bool is_return);
+extern void destroy_local_trace_kprobe(struct trace_event_call *event_call);
+
+extern struct trace_event_call *
+create_local_trace_uprobe(char *name, unsigned long offs, bool is_return);
+extern void destroy_local_trace_uprobe(struct trace_event_call *event_call);
+#endif
diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c
index 3c7bfc4bf5e9..4237eba4ef20 100644
--- a/kernel/trace/trace_stack.c
+++ b/kernel/trace/trace_stack.c
@@ -472,7 +472,7 @@ static __init int stack_trace_init(void)
NULL, &stack_trace_fops);
#ifdef CONFIG_DYNAMIC_FTRACE
- trace_create_file("stack_trace_filter", 0444, d_tracer,
+ trace_create_file("stack_trace_filter", 0644, d_tracer,
&trace_ops, &stack_trace_filter_fops);
#endif
diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c
index 268029ae1be6..bf89a51e740d 100644
--- a/kernel/trace/trace_uprobe.c
+++ b/kernel/trace/trace_uprobe.c
@@ -55,6 +55,7 @@ struct trace_uprobe {
struct list_head list;
struct trace_uprobe_filter filter;
struct uprobe_consumer consumer;
+ struct path path;
struct inode *inode;
char *filename;
unsigned long offset;
@@ -151,6 +152,8 @@ static void FETCH_FUNC_NAME(memory, string)(struct pt_regs *regs,
return;
ret = strncpy_from_user(dst, src, maxlen);
+ if (ret == maxlen)
+ dst[--ret] = '\0';
if (ret < 0) { /* Failed to fetch string */
((u8 *)get_rloc_data(dest))[0] = '\0';
@@ -287,7 +290,7 @@ static void free_trace_uprobe(struct trace_uprobe *tu)
for (i = 0; i < tu->tp.nr_args; i++)
traceprobe_free_probe_arg(&tu->tp.args[i]);
- iput(tu->inode);
+ path_put(&tu->path);
kfree(tu->tp.call.class->system);
kfree(tu->tp.call.name);
kfree(tu->filename);
@@ -361,7 +364,6 @@ end:
static int create_trace_uprobe(int argc, char **argv)
{
struct trace_uprobe *tu;
- struct inode *inode;
char *arg, *event, *group, *filename;
char buf[MAX_EVENT_NAME_LEN];
struct path path;
@@ -369,7 +371,6 @@ static int create_trace_uprobe(int argc, char **argv)
bool is_delete, is_return;
int i, ret;
- inode = NULL;
ret = 0;
is_delete = false;
is_return = false;
@@ -435,21 +436,16 @@ static int create_trace_uprobe(int argc, char **argv)
}
/* Find the last occurrence, in case the path contains ':' too. */
arg = strrchr(argv[1], ':');
- if (!arg) {
- ret = -EINVAL;
- goto fail_address_parse;
- }
+ if (!arg)
+ return -EINVAL;
*arg++ = '\0';
filename = argv[1];
ret = kern_path(filename, LOOKUP_FOLLOW, &path);
if (ret)
- goto fail_address_parse;
-
- inode = igrab(d_inode(path.dentry));
- path_put(&path);
+ return ret;
- if (!inode || !S_ISREG(inode->i_mode)) {
+ if (!d_is_reg(path.dentry)) {
ret = -EINVAL;
goto fail_address_parse;
}
@@ -488,7 +484,7 @@ static int create_trace_uprobe(int argc, char **argv)
goto fail_address_parse;
}
tu->offset = offset;
- tu->inode = inode;
+ tu->path = path;
tu->filename = kstrdup(filename, GFP_KERNEL);
if (!tu->filename) {
@@ -556,7 +552,7 @@ error:
return ret;
fail_address_parse:
- iput(inode);
+ path_put(&path);
pr_info("Failed to parse address or file.\n");
@@ -602,24 +598,9 @@ static int probes_seq_show(struct seq_file *m, void *v)
char c = is_ret_probe(tu) ? 'r' : 'p';
int i;
- seq_printf(m, "%c:%s/%s", c, tu->tp.call.class->system,
- trace_event_name(&tu->tp.call));
- seq_printf(m, " %s:", tu->filename);
-
- /* Don't print "0x (null)" when offset is 0 */
- if (tu->offset) {
- seq_printf(m, "0x%px", (void *)tu->offset);
- } else {
- switch (sizeof(void *)) {
- case 4:
- seq_printf(m, "0x00000000");
- break;
- case 8:
- default:
- seq_printf(m, "0x0000000000000000");
- break;
- }
- }
+ seq_printf(m, "%c:%s/%s %s:0x%0*lx", c, tu->tp.call.class->system,
+ trace_event_name(&tu->tp.call), tu->filename,
+ (int)(sizeof(void *) * 2), tu->offset);
for (i = 0; i < tu->tp.nr_args; i++)
seq_printf(m, " %s=%s", tu->tp.args[i].name, tu->tp.args[i].comm);
@@ -935,6 +916,7 @@ probe_event_enable(struct trace_uprobe *tu, struct trace_event_file *file,
goto err_flags;
tu->consumer.filter = filter;
+ tu->inode = d_real_inode(tu->path.dentry);
ret = uprobe_register(tu->inode, tu->offset, &tu->consumer);
if (ret)
goto err_buffer;
@@ -980,6 +962,7 @@ probe_event_disable(struct trace_uprobe *tu, struct trace_event_file *file)
WARN_ON(!uprobe_filter_is_empty(&tu->filter));
uprobe_unregister(tu->inode, tu->offset, &tu->consumer);
+ tu->inode = NULL;
tu->tp.flags &= file ? ~TP_FLAG_TRACE : ~TP_FLAG_PROFILE;
uprobe_buffer_disable();
@@ -1178,6 +1161,28 @@ static void uretprobe_perf_func(struct trace_uprobe *tu, unsigned long func,
{
__uprobe_perf_func(tu, func, regs, ucb, dsize);
}
+
+int bpf_get_uprobe_info(const struct perf_event *event, u32 *fd_type,
+ const char **filename, u64 *probe_offset,
+ bool perf_type_tracepoint)
+{
+ const char *pevent = trace_event_name(event->tp_event);
+ const char *group = event->tp_event->class->system;
+ struct trace_uprobe *tu;
+
+ if (perf_type_tracepoint)
+ tu = find_probe_event(pevent, group);
+ else
+ tu = event->tp_event->data;
+ if (!tu)
+ return -EINVAL;
+
+ *fd_type = is_ret_probe(tu) ? BPF_FD_TYPE_URETPROBE
+ : BPF_FD_TYPE_UPROBE;
+ *filename = tu->filename;
+ *probe_offset = tu->offset;
+ return 0;
+}
#endif /* CONFIG_PERF_EVENTS */
static int
@@ -1292,16 +1297,25 @@ static struct trace_event_functions uprobe_funcs = {
.trace = print_uprobe_event
};
-static int register_uprobe_event(struct trace_uprobe *tu)
+static inline void init_trace_event_call(struct trace_uprobe *tu,
+ struct trace_event_call *call)
{
- struct trace_event_call *call = &tu->tp.call;
- int ret;
-
- /* Initialize trace_event_call */
INIT_LIST_HEAD(&call->class->fields);
call->event.funcs = &uprobe_funcs;
call->class->define_fields = uprobe_event_define_fields;
+ call->flags = TRACE_EVENT_FL_UPROBE;
+ call->class->reg = trace_uprobe_register;
+ call->data = tu;
+}
+
+static int register_uprobe_event(struct trace_uprobe *tu)
+{
+ struct trace_event_call *call = &tu->tp.call;
+ int ret = 0;
+
+ init_trace_event_call(tu, call);
+
if (set_print_fmt(&tu->tp, is_ret_probe(tu)) < 0)
return -ENOMEM;
@@ -1311,9 +1325,6 @@ static int register_uprobe_event(struct trace_uprobe *tu)
return -ENODEV;
}
- call->flags = TRACE_EVENT_FL_UPROBE;
- call->class->reg = trace_uprobe_register;
- call->data = tu;
ret = trace_add_event_call(call);
if (ret) {
@@ -1339,6 +1350,67 @@ static int unregister_uprobe_event(struct trace_uprobe *tu)
return 0;
}
+#ifdef CONFIG_PERF_EVENTS
+struct trace_event_call *
+create_local_trace_uprobe(char *name, unsigned long offs, bool is_return)
+{
+ struct trace_uprobe *tu;
+ struct path path;
+ int ret;
+
+ ret = kern_path(name, LOOKUP_FOLLOW, &path);
+ if (ret)
+ return ERR_PTR(ret);
+
+ if (!d_is_reg(path.dentry)) {
+ path_put(&path);
+ return ERR_PTR(-EINVAL);
+ }
+
+ /*
+ * local trace_kprobes are not added to probe_list, so they are never
+ * searched in find_trace_kprobe(). Therefore, there is no concern of
+ * duplicated name "DUMMY_EVENT" here.
+ */
+ tu = alloc_trace_uprobe(UPROBE_EVENT_SYSTEM, "DUMMY_EVENT", 0,
+ is_return);
+
+ if (IS_ERR(tu)) {
+ pr_info("Failed to allocate trace_uprobe.(%d)\n",
+ (int)PTR_ERR(tu));
+ path_put(&path);
+ return ERR_CAST(tu);
+ }
+
+ tu->offset = offs;
+ tu->path = path;
+ tu->filename = kstrdup(name, GFP_KERNEL);
+ init_trace_event_call(tu, &tu->tp.call);
+
+ if (set_print_fmt(&tu->tp, is_ret_probe(tu)) < 0) {
+ ret = -ENOMEM;
+ goto error;
+ }
+
+ return &tu->tp.call;
+error:
+ free_trace_uprobe(tu);
+ return ERR_PTR(ret);
+}
+
+void destroy_local_trace_uprobe(struct trace_event_call *event_call)
+{
+ struct trace_uprobe *tu;
+
+ tu = container_of(event_call, struct trace_uprobe, tp.call);
+
+ kfree(tu->tp.call.print_fmt);
+ tu->tp.call.print_fmt = NULL;
+
+ free_trace_uprobe(tu);
+}
+#endif /* CONFIG_PERF_EVENTS */
+
/* Make a trace interface for controling probe points */
static __init int init_uprobe_trace(void)
{
diff --git a/kernel/trace/tracing_map.c b/kernel/trace/tracing_map.c
index 07e75344725b..5cadb1b8b5fe 100644
--- a/kernel/trace/tracing_map.c
+++ b/kernel/trace/tracing_map.c
@@ -66,6 +66,73 @@ u64 tracing_map_read_sum(struct tracing_map_elt *elt, unsigned int i)
return (u64)atomic64_read(&elt->fields[i].sum);
}
+/**
+ * tracing_map_set_var - Assign a tracing_map_elt's variable field
+ * @elt: The tracing_map_elt
+ * @i: The index of the given variable associated with the tracing_map_elt
+ * @n: The value to assign
+ *
+ * Assign n to variable i associated with the specified tracing_map_elt
+ * instance. The index i is the index returned by the call to
+ * tracing_map_add_var() when the tracing map was set up.
+ */
+void tracing_map_set_var(struct tracing_map_elt *elt, unsigned int i, u64 n)
+{
+ atomic64_set(&elt->vars[i], n);
+ elt->var_set[i] = true;
+}
+
+/**
+ * tracing_map_var_set - Return whether or not a variable has been set
+ * @elt: The tracing_map_elt
+ * @i: The index of the given variable associated with the tracing_map_elt
+ *
+ * Return true if the variable has been set, false otherwise. The
+ * index i is the index returned by the call to tracing_map_add_var()
+ * when the tracing map was set up.
+ */
+bool tracing_map_var_set(struct tracing_map_elt *elt, unsigned int i)
+{
+ return elt->var_set[i];
+}
+
+/**
+ * tracing_map_read_var - Return the value of a tracing_map_elt's variable field
+ * @elt: The tracing_map_elt
+ * @i: The index of the given variable associated with the tracing_map_elt
+ *
+ * Retrieve the value of the variable i associated with the specified
+ * tracing_map_elt instance. The index i is the index returned by the
+ * call to tracing_map_add_var() when the tracing map was set
+ * up.
+ *
+ * Return: The variable value associated with field i for elt.
+ */
+u64 tracing_map_read_var(struct tracing_map_elt *elt, unsigned int i)
+{
+ return (u64)atomic64_read(&elt->vars[i]);
+}
+
+/**
+ * tracing_map_read_var_once - Return and reset a tracing_map_elt's variable field
+ * @elt: The tracing_map_elt
+ * @i: The index of the given variable associated with the tracing_map_elt
+ *
+ * Retrieve the value of the variable i associated with the specified
+ * tracing_map_elt instance, and reset the variable to the 'not set'
+ * state. The index i is the index returned by the call to
+ * tracing_map_add_var() when the tracing map was set up. The reset
+ * essentially makes the variable a read-once variable if it's only
+ * accessed using this function.
+ *
+ * Return: The variable value associated with field i for elt.
+ */
+u64 tracing_map_read_var_once(struct tracing_map_elt *elt, unsigned int i)
+{
+ elt->var_set[i] = false;
+ return (u64)atomic64_read(&elt->vars[i]);
+}
+
int tracing_map_cmp_string(void *val_a, void *val_b)
{
char *a = val_a;
@@ -171,6 +238,28 @@ int tracing_map_add_sum_field(struct tracing_map *map)
}
/**
+ * tracing_map_add_var - Add a field describing a tracing_map var
+ * @map: The tracing_map
+ *
+ * Add a var to the map and return the index identifying it in the map
+ * and associated tracing_map_elts. This is the index used for
+ * instance to update a var for a particular tracing_map_elt using
+ * tracing_map_update_var() or reading it via tracing_map_read_var().
+ *
+ * Return: The index identifying the var in the map and associated
+ * tracing_map_elts, or -EINVAL on error.
+ */
+int tracing_map_add_var(struct tracing_map *map)
+{
+ int ret = -EINVAL;
+
+ if (map->n_vars < TRACING_MAP_VARS_MAX)
+ ret = map->n_vars++;
+
+ return ret;
+}
+
+/**
* tracing_map_add_key_field - Add a field describing a tracing_map key
* @map: The tracing_map
* @offset: The offset within the key
@@ -280,6 +369,11 @@ static void tracing_map_elt_clear(struct tracing_map_elt *elt)
if (elt->fields[i].cmp_fn == tracing_map_cmp_atomic64)
atomic64_set(&elt->fields[i].sum, 0);
+ for (i = 0; i < elt->map->n_vars; i++) {
+ atomic64_set(&elt->vars[i], 0);
+ elt->var_set[i] = false;
+ }
+
if (elt->map->ops && elt->map->ops->elt_clear)
elt->map->ops->elt_clear(elt);
}
@@ -306,6 +400,8 @@ static void tracing_map_elt_free(struct tracing_map_elt *elt)
if (elt->map->ops && elt->map->ops->elt_free)
elt->map->ops->elt_free(elt);
kfree(elt->fields);
+ kfree(elt->vars);
+ kfree(elt->var_set);
kfree(elt->key);
kfree(elt);
}
@@ -333,6 +429,18 @@ static struct tracing_map_elt *tracing_map_elt_alloc(struct tracing_map *map)
goto free;
}
+ elt->vars = kcalloc(map->n_vars, sizeof(*elt->vars), GFP_KERNEL);
+ if (!elt->vars) {
+ err = -ENOMEM;
+ goto free;
+ }
+
+ elt->var_set = kcalloc(map->n_vars, sizeof(*elt->var_set), GFP_KERNEL);
+ if (!elt->var_set) {
+ err = -ENOMEM;
+ goto free;
+ }
+
tracing_map_elt_init_fields(elt);
if (map->ops && map->ops->elt_alloc) {
@@ -414,7 +522,9 @@ static inline struct tracing_map_elt *
__tracing_map_insert(struct tracing_map *map, void *key, bool lookup_only)
{
u32 idx, key_hash, test_key;
+ int dup_try = 0;
struct tracing_map_entry *entry;
+ struct tracing_map_elt *val;
key_hash = jhash(key, map->key_size, 0);
if (key_hash == 0)
@@ -426,11 +536,33 @@ __tracing_map_insert(struct tracing_map *map, void *key, bool lookup_only)
entry = TRACING_MAP_ENTRY(map->map, idx);
test_key = entry->key;
- if (test_key && test_key == key_hash && entry->val &&
- keys_match(key, entry->val->key, map->key_size)) {
- if (!lookup_only)
- atomic64_inc(&map->hits);
- return entry->val;
+ if (test_key && test_key == key_hash) {
+ val = READ_ONCE(entry->val);
+ if (val &&
+ keys_match(key, val->key, map->key_size)) {
+ if (!lookup_only)
+ atomic64_inc(&map->hits);
+ return val;
+ } else if (unlikely(!val)) {
+ /*
+ * The key is present. But, val (pointer to elt
+ * struct) is still NULL. which means some other
+ * thread is in the process of inserting an
+ * element.
+ *
+ * On top of that, it's key_hash is same as the
+ * one being inserted right now. So, it's
+ * possible that the element has the same
+ * key as well.
+ */
+
+ dup_try++;
+ if (dup_try > map->map_size) {
+ atomic64_inc(&map->drops);
+ break;
+ }
+ continue;
+ }
}
if (!test_key) {
@@ -452,6 +584,13 @@ __tracing_map_insert(struct tracing_map *map, void *key, bool lookup_only)
atomic64_inc(&map->hits);
return entry->val;
+ } else {
+ /*
+ * cmpxchg() failed. Loop around once
+ * more to check what key was inserted.
+ */
+ dup_try++;
+ continue;
}
}
@@ -816,67 +955,15 @@ create_sort_entry(void *key, struct tracing_map_elt *elt)
return sort_entry;
}
-static struct tracing_map_elt *copy_elt(struct tracing_map_elt *elt)
-{
- struct tracing_map_elt *dup_elt;
- unsigned int i;
-
- dup_elt = tracing_map_elt_alloc(elt->map);
- if (IS_ERR(dup_elt))
- return NULL;
-
- if (elt->map->ops && elt->map->ops->elt_copy)
- elt->map->ops->elt_copy(dup_elt, elt);
-
- dup_elt->private_data = elt->private_data;
- memcpy(dup_elt->key, elt->key, elt->map->key_size);
-
- for (i = 0; i < elt->map->n_fields; i++) {
- atomic64_set(&dup_elt->fields[i].sum,
- atomic64_read(&elt->fields[i].sum));
- dup_elt->fields[i].cmp_fn = elt->fields[i].cmp_fn;
- }
-
- return dup_elt;
-}
-
-static int merge_dup(struct tracing_map_sort_entry **sort_entries,
- unsigned int target, unsigned int dup)
-{
- struct tracing_map_elt *target_elt, *elt;
- bool first_dup = (target - dup) == 1;
- int i;
-
- if (first_dup) {
- elt = sort_entries[target]->elt;
- target_elt = copy_elt(elt);
- if (!target_elt)
- return -ENOMEM;
- sort_entries[target]->elt = target_elt;
- sort_entries[target]->elt_copied = true;
- } else
- target_elt = sort_entries[target]->elt;
-
- elt = sort_entries[dup]->elt;
-
- for (i = 0; i < elt->map->n_fields; i++)
- atomic64_add(atomic64_read(&elt->fields[i].sum),
- &target_elt->fields[i].sum);
-
- sort_entries[dup]->dup = true;
-
- return 0;
-}
-
-static int merge_dups(struct tracing_map_sort_entry **sort_entries,
+static void detect_dups(struct tracing_map_sort_entry **sort_entries,
int n_entries, unsigned int key_size)
{
unsigned int dups = 0, total_dups = 0;
- int err, i, j;
+ int i;
void *key;
if (n_entries < 2)
- return total_dups;
+ return;
sort(sort_entries, n_entries, sizeof(struct tracing_map_sort_entry *),
(int (*)(const void *, const void *))cmp_entries_dup, NULL);
@@ -885,30 +972,14 @@ static int merge_dups(struct tracing_map_sort_entry **sort_entries,
for (i = 1; i < n_entries; i++) {
if (!memcmp(sort_entries[i]->key, key, key_size)) {
dups++; total_dups++;
- err = merge_dup(sort_entries, i - dups, i);
- if (err)
- return err;
continue;
}
key = sort_entries[i]->key;
dups = 0;
}
- if (!total_dups)
- return total_dups;
-
- for (i = 0, j = 0; i < n_entries; i++) {
- if (!sort_entries[i]->dup) {
- sort_entries[j] = sort_entries[i];
- if (j++ != i)
- sort_entries[i] = NULL;
- } else {
- destroy_sort_entry(sort_entries[i]);
- sort_entries[i] = NULL;
- }
- }
-
- return total_dups;
+ WARN_ONCE(total_dups > 0,
+ "Duplicates detected: %d\n", total_dups);
}
static bool is_key(struct tracing_map *map, unsigned int field_idx)
@@ -1034,10 +1105,7 @@ int tracing_map_sort_entries(struct tracing_map *map,
return 1;
}
- ret = merge_dups(entries, n_entries, map->key_size);
- if (ret < 0)
- goto free;
- n_entries -= ret;
+ detect_dups(entries, n_entries, map->key_size);
if (is_key(map, sort_keys[0].field_idx))
cmp_entries_fn = cmp_entries_key;
diff --git a/kernel/trace/tracing_map.h b/kernel/trace/tracing_map.h
index 5b5bbf8ae550..053eb92b2d31 100644
--- a/kernel/trace/tracing_map.h
+++ b/kernel/trace/tracing_map.h
@@ -10,6 +10,7 @@
#define TRACING_MAP_VALS_MAX 3
#define TRACING_MAP_FIELDS_MAX (TRACING_MAP_KEYS_MAX + \
TRACING_MAP_VALS_MAX)
+#define TRACING_MAP_VARS_MAX 16
#define TRACING_MAP_SORT_KEYS_MAX 2
typedef int (*tracing_map_cmp_fn_t) (void *val_a, void *val_b);
@@ -137,6 +138,8 @@ struct tracing_map_field {
struct tracing_map_elt {
struct tracing_map *map;
struct tracing_map_field *fields;
+ atomic64_t *vars;
+ bool *var_set;
void *key;
void *private_data;
};
@@ -192,6 +195,7 @@ struct tracing_map {
int key_idx[TRACING_MAP_KEYS_MAX];
unsigned int n_keys;
struct tracing_map_sort_key sort_key;
+ unsigned int n_vars;
atomic64_t hits;
atomic64_t drops;
};
@@ -215,11 +219,6 @@ struct tracing_map {
* Element allocation occurs before tracing begins, when the
* tracing_map_init() call is made by client code.
*
- * @elt_copy: At certain points in the lifetime of an element, it may
- * need to be copied. The copy should include a copy of the
- * client-allocated data, which can be copied into the 'to'
- * element from the 'from' element.
- *
* @elt_free: When a tracing_map_elt is freed, this function is called
* and allows client-allocated per-element data to be freed.
*
@@ -233,8 +232,6 @@ struct tracing_map {
*/
struct tracing_map_ops {
int (*elt_alloc)(struct tracing_map_elt *elt);
- void (*elt_copy)(struct tracing_map_elt *to,
- struct tracing_map_elt *from);
void (*elt_free)(struct tracing_map_elt *elt);
void (*elt_clear)(struct tracing_map_elt *elt);
void (*elt_init)(struct tracing_map_elt *elt);
@@ -248,6 +245,7 @@ tracing_map_create(unsigned int map_bits,
extern int tracing_map_init(struct tracing_map *map);
extern int tracing_map_add_sum_field(struct tracing_map *map);
+extern int tracing_map_add_var(struct tracing_map *map);
extern int tracing_map_add_key_field(struct tracing_map *map,
unsigned int offset,
tracing_map_cmp_fn_t cmp_fn);
@@ -267,7 +265,13 @@ extern int tracing_map_cmp_none(void *val_a, void *val_b);
extern void tracing_map_update_sum(struct tracing_map_elt *elt,
unsigned int i, u64 n);
+extern void tracing_map_set_var(struct tracing_map_elt *elt,
+ unsigned int i, u64 n);
+extern bool tracing_map_var_set(struct tracing_map_elt *elt, unsigned int i);
extern u64 tracing_map_read_sum(struct tracing_map_elt *elt, unsigned int i);
+extern u64 tracing_map_read_var(struct tracing_map_elt *elt, unsigned int i);
+extern u64 tracing_map_read_var_once(struct tracing_map_elt *elt, unsigned int i);
+
extern void tracing_map_set_field_descr(struct tracing_map *map,
unsigned int i,
unsigned int key_offset,
diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c
index 671b13457387..6dc6356c3327 100644
--- a/kernel/tracepoint.c
+++ b/kernel/tracepoint.c
@@ -207,7 +207,7 @@ static int tracepoint_add_func(struct tracepoint *tp,
lockdep_is_held(&tracepoints_mutex));
old = func_add(&tp_funcs, func, prio);
if (IS_ERR(old)) {
- WARN_ON_ONCE(1);
+ WARN_ON_ONCE(PTR_ERR(old) != -ENOMEM);
return PTR_ERR(old);
}
@@ -239,7 +239,7 @@ static int tracepoint_remove_func(struct tracepoint *tp,
lockdep_is_held(&tracepoints_mutex));
old = func_remove(&tp_funcs, func);
if (IS_ERR(old)) {
- WARN_ON_ONCE(1);
+ WARN_ON_ONCE(PTR_ERR(old) != -ENOMEM);
return PTR_ERR(old);
}
@@ -257,7 +257,7 @@ static int tracepoint_remove_func(struct tracepoint *tp,
}
/**
- * tracepoint_probe_register - Connect a probe to a tracepoint
+ * tracepoint_probe_register_prio - Connect a probe to a tracepoint with priority
* @tp: tracepoint
* @probe: probe handler
* @data: tracepoint data
@@ -290,7 +290,6 @@ EXPORT_SYMBOL_GPL(tracepoint_probe_register_prio);
* @tp: tracepoint
* @probe: probe handler
* @data: tracepoint data
- * @prio: priority of this function over other registered functions
*
* Returns 0 if ok, error value on error.
* Note: if @tp is within a module, the caller is responsible for
diff --git a/kernel/ucount.c b/kernel/ucount.c
index b4eeee03934f..f48d1b6376a4 100644
--- a/kernel/ucount.c
+++ b/kernel/ucount.c
@@ -10,6 +10,7 @@
#include <linux/slab.h>
#include <linux/cred.h>
#include <linux/hash.h>
+#include <linux/kmemleak.h>
#include <linux/user_namespace.h>
#define UCOUNTS_HASHTABLE_BITS 10
diff --git a/kernel/uid16.c b/kernel/uid16.c
index ef1da2a5f9bd..af6925d8599b 100644
--- a/kernel/uid16.c
+++ b/kernel/uid16.c
@@ -18,44 +18,46 @@
#include <linux/uaccess.h>
+#include "uid16.h"
+
SYSCALL_DEFINE3(chown16, const char __user *, filename, old_uid_t, user, old_gid_t, group)
{
- return sys_chown(filename, low2highuid(user), low2highgid(group));
+ return ksys_chown(filename, low2highuid(user), low2highgid(group));
}
SYSCALL_DEFINE3(lchown16, const char __user *, filename, old_uid_t, user, old_gid_t, group)
{
- return sys_lchown(filename, low2highuid(user), low2highgid(group));
+ return ksys_lchown(filename, low2highuid(user), low2highgid(group));
}
SYSCALL_DEFINE3(fchown16, unsigned int, fd, old_uid_t, user, old_gid_t, group)
{
- return sys_fchown(fd, low2highuid(user), low2highgid(group));
+ return ksys_fchown(fd, low2highuid(user), low2highgid(group));
}
SYSCALL_DEFINE2(setregid16, old_gid_t, rgid, old_gid_t, egid)
{
- return sys_setregid(low2highgid(rgid), low2highgid(egid));
+ return __sys_setregid(low2highgid(rgid), low2highgid(egid));
}
SYSCALL_DEFINE1(setgid16, old_gid_t, gid)
{
- return sys_setgid(low2highgid(gid));
+ return __sys_setgid(low2highgid(gid));
}
SYSCALL_DEFINE2(setreuid16, old_uid_t, ruid, old_uid_t, euid)
{
- return sys_setreuid(low2highuid(ruid), low2highuid(euid));
+ return __sys_setreuid(low2highuid(ruid), low2highuid(euid));
}
SYSCALL_DEFINE1(setuid16, old_uid_t, uid)
{
- return sys_setuid(low2highuid(uid));
+ return __sys_setuid(low2highuid(uid));
}
SYSCALL_DEFINE3(setresuid16, old_uid_t, ruid, old_uid_t, euid, old_uid_t, suid)
{
- return sys_setresuid(low2highuid(ruid), low2highuid(euid),
+ return __sys_setresuid(low2highuid(ruid), low2highuid(euid),
low2highuid(suid));
}
@@ -78,11 +80,10 @@ SYSCALL_DEFINE3(getresuid16, old_uid_t __user *, ruidp, old_uid_t __user *, euid
SYSCALL_DEFINE3(setresgid16, old_gid_t, rgid, old_gid_t, egid, old_gid_t, sgid)
{
- return sys_setresgid(low2highgid(rgid), low2highgid(egid),
+ return __sys_setresgid(low2highgid(rgid), low2highgid(egid),
low2highgid(sgid));
}
-
SYSCALL_DEFINE3(getresgid16, old_gid_t __user *, rgidp, old_gid_t __user *, egidp, old_gid_t __user *, sgidp)
{
const struct cred *cred = current_cred();
@@ -102,12 +103,12 @@ SYSCALL_DEFINE3(getresgid16, old_gid_t __user *, rgidp, old_gid_t __user *, egid
SYSCALL_DEFINE1(setfsuid16, old_uid_t, uid)
{
- return sys_setfsuid(low2highuid(uid));
+ return __sys_setfsuid(low2highuid(uid));
}
SYSCALL_DEFINE1(setfsgid16, old_gid_t, gid)
{
- return sys_setfsgid(low2highgid(gid));
+ return __sys_setfsgid(low2highgid(gid));
}
static int groups16_to_user(old_gid_t __user *grouplist,
diff --git a/kernel/uid16.h b/kernel/uid16.h
new file mode 100644
index 000000000000..cdca040f7602
--- /dev/null
+++ b/kernel/uid16.h
@@ -0,0 +1,14 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef LINUX_UID16_H
+#define LINUX_UID16_H
+
+long __sys_setuid(uid_t uid);
+long __sys_setgid(gid_t gid);
+long __sys_setreuid(uid_t ruid, uid_t euid);
+long __sys_setregid(gid_t rgid, gid_t egid);
+long __sys_setresuid(uid_t ruid, uid_t euid, uid_t suid);
+long __sys_setresgid(gid_t rgid, gid_t egid, gid_t sgid);
+long __sys_setfsuid(uid_t uid);
+long __sys_setfsgid(gid_t gid);
+
+#endif /* LINUX_UID16_H */
diff --git a/kernel/umh.c b/kernel/umh.c
index 18e5fa4b0e71..30db93fd7e39 100644
--- a/kernel/umh.c
+++ b/kernel/umh.c
@@ -25,6 +25,8 @@
#include <linux/ptrace.h>
#include <linux/async.h>
#include <linux/uaccess.h>
+#include <linux/shmem_fs.h>
+#include <linux/pipe_fs_i.h>
#include <trace/events/module.h>
@@ -97,9 +99,13 @@ static int call_usermodehelper_exec_async(void *data)
commit_creds(new);
- retval = do_execve(getname_kernel(sub_info->path),
- (const char __user *const __user *)sub_info->argv,
- (const char __user *const __user *)sub_info->envp);
+ if (sub_info->file)
+ retval = do_execve_file(sub_info->file,
+ sub_info->argv, sub_info->envp);
+ else
+ retval = do_execve(getname_kernel(sub_info->path),
+ (const char __user *const __user *)sub_info->argv,
+ (const char __user *const __user *)sub_info->envp);
out:
sub_info->retval = retval;
/*
@@ -118,7 +124,7 @@ static void call_usermodehelper_exec_sync(struct subprocess_info *sub_info)
{
pid_t pid;
- /* If SIGCLD is ignored sys_wait4 won't populate the status. */
+ /* If SIGCLD is ignored kernel_wait4 won't populate the status. */
kernel_sigaction(SIGCHLD, SIG_DFL);
pid = kernel_thread(call_usermodehelper_exec_async, sub_info, SIGCHLD);
if (pid < 0) {
@@ -135,7 +141,7 @@ static void call_usermodehelper_exec_sync(struct subprocess_info *sub_info)
*
* Thus the __user pointer cast is valid here.
*/
- sys_wait4(pid, (int __user *)&ret, 0, NULL);
+ kernel_wait4(pid, (int __user *)&ret, 0, NULL);
/*
* If ret is 0, either call_usermodehelper_exec_async failed and
@@ -185,6 +191,8 @@ static void call_usermodehelper_exec_work(struct work_struct *work)
if (pid < 0) {
sub_info->retval = pid;
umh_complete(sub_info);
+ } else {
+ sub_info->pid = pid;
}
}
}
@@ -393,6 +401,117 @@ struct subprocess_info *call_usermodehelper_setup(const char *path, char **argv,
}
EXPORT_SYMBOL(call_usermodehelper_setup);
+struct subprocess_info *call_usermodehelper_setup_file(struct file *file,
+ int (*init)(struct subprocess_info *info, struct cred *new),
+ void (*cleanup)(struct subprocess_info *info), void *data)
+{
+ struct subprocess_info *sub_info;
+
+ sub_info = kzalloc(sizeof(struct subprocess_info), GFP_KERNEL);
+ if (!sub_info)
+ return NULL;
+
+ INIT_WORK(&sub_info->work, call_usermodehelper_exec_work);
+ sub_info->path = "none";
+ sub_info->file = file;
+ sub_info->init = init;
+ sub_info->cleanup = cleanup;
+ sub_info->data = data;
+ return sub_info;
+}
+
+static int umh_pipe_setup(struct subprocess_info *info, struct cred *new)
+{
+ struct umh_info *umh_info = info->data;
+ struct file *from_umh[2];
+ struct file *to_umh[2];
+ int err;
+
+ /* create pipe to send data to umh */
+ err = create_pipe_files(to_umh, 0);
+ if (err)
+ return err;
+ err = replace_fd(0, to_umh[0], 0);
+ fput(to_umh[0]);
+ if (err < 0) {
+ fput(to_umh[1]);
+ return err;
+ }
+
+ /* create pipe to receive data from umh */
+ err = create_pipe_files(from_umh, 0);
+ if (err) {
+ fput(to_umh[1]);
+ replace_fd(0, NULL, 0);
+ return err;
+ }
+ err = replace_fd(1, from_umh[1], 0);
+ fput(from_umh[1]);
+ if (err < 0) {
+ fput(to_umh[1]);
+ replace_fd(0, NULL, 0);
+ fput(from_umh[0]);
+ return err;
+ }
+
+ umh_info->pipe_to_umh = to_umh[1];
+ umh_info->pipe_from_umh = from_umh[0];
+ return 0;
+}
+
+static void umh_save_pid(struct subprocess_info *info)
+{
+ struct umh_info *umh_info = info->data;
+
+ umh_info->pid = info->pid;
+}
+
+/**
+ * fork_usermode_blob - fork a blob of bytes as a usermode process
+ * @data: a blob of bytes that can be do_execv-ed as a file
+ * @len: length of the blob
+ * @info: information about usermode process (shouldn't be NULL)
+ *
+ * Returns either negative error or zero which indicates success
+ * in executing a blob of bytes as a usermode process. In such
+ * case 'struct umh_info *info' is populated with two pipes
+ * and a pid of the process. The caller is responsible for health
+ * check of the user process, killing it via pid, and closing the
+ * pipes when user process is no longer needed.
+ */
+int fork_usermode_blob(void *data, size_t len, struct umh_info *info)
+{
+ struct subprocess_info *sub_info;
+ struct file *file;
+ ssize_t written;
+ loff_t pos = 0;
+ int err;
+
+ file = shmem_kernel_file_setup("", len, 0);
+ if (IS_ERR(file))
+ return PTR_ERR(file);
+
+ written = kernel_write(file, data, len, &pos);
+ if (written != len) {
+ err = written;
+ if (err >= 0)
+ err = -ENOMEM;
+ goto out;
+ }
+
+ err = -ENOMEM;
+ sub_info = call_usermodehelper_setup_file(file, umh_pipe_setup,
+ umh_save_pid, info);
+ if (!sub_info)
+ goto out;
+
+ err = call_usermodehelper_exec(sub_info, UMH_WAIT_EXEC);
+out:
+ fput(file);
+ return err;
+}
+EXPORT_SYMBOL_GPL(fork_usermode_blob);
+
/**
* call_usermodehelper_exec - start a usermode application
* @sub_info: information about the subprocessa
diff --git a/kernel/utsname.c b/kernel/utsname.c
index 913fe4336d2b..dcd6be1996fe 100644
--- a/kernel/utsname.c
+++ b/kernel/utsname.c
@@ -19,6 +19,8 @@
#include <linux/proc_ns.h>
#include <linux/sched/task.h>
+static struct kmem_cache *uts_ns_cache __ro_after_init;
+
static struct ucounts *inc_uts_namespaces(struct user_namespace *ns)
{
return inc_ucount(ns, current_euid(), UCOUNT_UTS_NAMESPACES);
@@ -33,7 +35,7 @@ static struct uts_namespace *create_uts_ns(void)
{
struct uts_namespace *uts_ns;
- uts_ns = kmalloc(sizeof(struct uts_namespace), GFP_KERNEL);
+ uts_ns = kmem_cache_alloc(uts_ns_cache, GFP_KERNEL);
if (uts_ns)
kref_init(&uts_ns->kref);
return uts_ns;
@@ -42,7 +44,7 @@ static struct uts_namespace *create_uts_ns(void)
/*
* Clone a new ns copying an original utsname, setting refcount to 1
* @old_ns: namespace to clone
- * Return ERR_PTR(-ENOMEM) on error (failure to kmalloc), new ns otherwise
+ * Return ERR_PTR(-ENOMEM) on error (failure to allocate), new ns otherwise
*/
static struct uts_namespace *clone_uts_ns(struct user_namespace *user_ns,
struct uts_namespace *old_ns)
@@ -75,7 +77,7 @@ static struct uts_namespace *clone_uts_ns(struct user_namespace *user_ns,
return ns;
fail_free:
- kfree(ns);
+ kmem_cache_free(uts_ns_cache, ns);
fail_dec:
dec_uts_namespaces(ucounts);
fail:
@@ -113,7 +115,7 @@ void free_uts_ns(struct kref *kref)
dec_uts_namespaces(ns->ucounts);
put_user_ns(ns->user_ns);
ns_free_inum(&ns->ns);
- kfree(ns);
+ kmem_cache_free(uts_ns_cache, ns);
}
static inline struct uts_namespace *to_uts_ns(struct ns_common *ns)
@@ -169,3 +171,13 @@ const struct proc_ns_operations utsns_operations = {
.install = utsns_install,
.owner = utsns_owner,
};
+
+void __init uts_ns_init(void)
+{
+ uts_ns_cache = kmem_cache_create_usercopy(
+ "uts_namespace", sizeof(struct uts_namespace), 0,
+ SLAB_PANIC|SLAB_ACCOUNT,
+ offsetof(struct uts_namespace, name),
+ sizeof_field(struct uts_namespace, name),
+ NULL);
+}
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index bb9a519cbf50..9f9983b0a27d 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -66,7 +66,7 @@ enum {
* be executing on any CPU. The pool behaves as an unbound one.
*
* Note that DISASSOCIATED should be flipped only while holding
- * attach_mutex to avoid changing binding state while
+ * wq_pool_attach_mutex to avoid changing binding state while
* worker_attach_to_pool() is in progress.
*/
POOL_MANAGER_ACTIVE = 1 << 0, /* being managed */
@@ -123,7 +123,7 @@ enum {
* cpu or grabbing pool->lock is enough for read access. If
* POOL_DISASSOCIATED is set, it's identical to L.
*
- * A: pool->attach_mutex protected.
+ * A: wq_pool_attach_mutex protected.
*
* PL: wq_pool_mutex protected.
*
@@ -153,10 +153,9 @@ struct worker_pool {
unsigned long watchdog_ts; /* L: watchdog timestamp */
struct list_head worklist; /* L: list of pending works */
- int nr_workers; /* L: total number of workers */
- /* nr_idle includes the ones off idle_list for rebinding */
- int nr_idle; /* L: currently idle ones */
+ int nr_workers; /* L: total number of workers */
+ int nr_idle; /* L: currently idle workers */
struct list_head idle_list; /* X: list of idle workers */
struct timer_list idle_timer; /* L: worker idle timeout */
@@ -166,9 +165,7 @@ struct worker_pool {
DECLARE_HASHTABLE(busy_hash, BUSY_WORKER_HASH_ORDER);
/* L: hash of busy workers */
- /* see manage_workers() for details on the two manager mutexes */
struct worker *manager; /* L: purely informational */
- struct mutex attach_mutex; /* attach/detach exclusion */
struct list_head workers; /* A: attached workers */
struct completion *detach_completion; /* all workers detached */
@@ -299,6 +296,7 @@ static bool wq_numa_enabled; /* unbound NUMA affinity enabled */
static struct workqueue_attrs *wq_update_unbound_numa_attrs_buf;
static DEFINE_MUTEX(wq_pool_mutex); /* protects pools and workqueues list */
+static DEFINE_MUTEX(wq_pool_attach_mutex); /* protects worker attach/detach */
static DEFINE_SPINLOCK(wq_mayday_lock); /* protects wq->maydays list */
static DECLARE_WAIT_QUEUE_HEAD(wq_manager_wait); /* wait for manager to go away */
@@ -401,14 +399,14 @@ static void workqueue_sysfs_unregister(struct workqueue_struct *wq);
* @worker: iteration cursor
* @pool: worker_pool to iterate workers of
*
- * This must be called with @pool->attach_mutex.
+ * This must be called with wq_pool_attach_mutex.
*
* The if/else clause exists only for the lockdep assertion and can be
* ignored.
*/
#define for_each_pool_worker(worker, pool) \
list_for_each_entry((worker), &(pool)->workers, node) \
- if (({ lockdep_assert_held(&pool->attach_mutex); false; })) { } \
+ if (({ lockdep_assert_held(&wq_pool_attach_mutex); false; })) { } \
else
/**
@@ -1604,6 +1602,40 @@ bool mod_delayed_work_on(int cpu, struct workqueue_struct *wq,
}
EXPORT_SYMBOL_GPL(mod_delayed_work_on);
+static void rcu_work_rcufn(struct rcu_head *rcu)
+{
+ struct rcu_work *rwork = container_of(rcu, struct rcu_work, rcu);
+
+ /* read the comment in __queue_work() */
+ local_irq_disable();
+ __queue_work(WORK_CPU_UNBOUND, rwork->wq, &rwork->work);
+ local_irq_enable();
+}
+
+/**
+ * queue_rcu_work - queue work after a RCU grace period
+ * @wq: workqueue to use
+ * @rwork: work to queue
+ *
+ * Return: %false if @rwork was already pending, %true otherwise. Note
+ * that a full RCU grace period is guaranteed only after a %true return.
+ * While @rwork is guarnateed to be executed after a %false return, the
+ * execution may happen before a full RCU grace period has passed.
+ */
+bool queue_rcu_work(struct workqueue_struct *wq, struct rcu_work *rwork)
+{
+ struct work_struct *work = &rwork->work;
+
+ if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work))) {
+ rwork->wq = wq;
+ call_rcu(&rwork->rcu, rcu_work_rcufn);
+ return true;
+ }
+
+ return false;
+}
+EXPORT_SYMBOL(queue_rcu_work);
+
/**
* worker_enter_idle - enter idle state
* @worker: worker which is entering idle state
@@ -1692,7 +1724,7 @@ static struct worker *alloc_worker(int node)
static void worker_attach_to_pool(struct worker *worker,
struct worker_pool *pool)
{
- mutex_lock(&pool->attach_mutex);
+ mutex_lock(&wq_pool_attach_mutex);
/*
* set_cpus_allowed_ptr() will fail if the cpumask doesn't have any
@@ -1701,37 +1733,40 @@ static void worker_attach_to_pool(struct worker *worker,
set_cpus_allowed_ptr(worker->task, pool->attrs->cpumask);
/*
- * The pool->attach_mutex ensures %POOL_DISASSOCIATED remains
- * stable across this function. See the comments above the
- * flag definition for details.
+ * The wq_pool_attach_mutex ensures %POOL_DISASSOCIATED remains
+ * stable across this function. See the comments above the flag
+ * definition for details.
*/
if (pool->flags & POOL_DISASSOCIATED)
worker->flags |= WORKER_UNBOUND;
list_add_tail(&worker->node, &pool->workers);
+ worker->pool = pool;
- mutex_unlock(&pool->attach_mutex);
+ mutex_unlock(&wq_pool_attach_mutex);
}
/**
* worker_detach_from_pool() - detach a worker from its pool
* @worker: worker which is attached to its pool
- * @pool: the pool @worker is attached to
*
* Undo the attaching which had been done in worker_attach_to_pool(). The
* caller worker shouldn't access to the pool after detached except it has
* other reference to the pool.
*/
-static void worker_detach_from_pool(struct worker *worker,
- struct worker_pool *pool)
+static void worker_detach_from_pool(struct worker *worker)
{
+ struct worker_pool *pool = worker->pool;
struct completion *detach_completion = NULL;
- mutex_lock(&pool->attach_mutex);
+ mutex_lock(&wq_pool_attach_mutex);
+
list_del(&worker->node);
+ worker->pool = NULL;
+
if (list_empty(&pool->workers))
detach_completion = pool->detach_completion;
- mutex_unlock(&pool->attach_mutex);
+ mutex_unlock(&wq_pool_attach_mutex);
/* clear leftover flags without pool->lock after it is detached */
worker->flags &= ~(WORKER_UNBOUND | WORKER_REBOUND);
@@ -1767,7 +1802,6 @@ static struct worker *create_worker(struct worker_pool *pool)
if (!worker)
goto fail;
- worker->pool = pool;
worker->id = id;
if (pool->cpu >= 0)
@@ -2054,6 +2088,12 @@ __acquires(&pool->lock)
worker->current_pwq = pwq;
work_color = get_work_color(work);
+ /*
+ * Record wq name for cmdline and debug reporting, may get
+ * overridden through set_worker_desc().
+ */
+ strscpy(worker->desc, pwq->wq->name, WORKER_DESC_LEN);
+
list_del_init(&work->entry);
/*
@@ -2149,7 +2189,6 @@ __acquires(&pool->lock)
worker->current_work = NULL;
worker->current_func = NULL;
worker->current_pwq = NULL;
- worker->desc_valid = false;
pwq_dec_nr_in_flight(pwq, work_color);
}
@@ -2174,6 +2213,16 @@ static void process_scheduled_works(struct worker *worker)
}
}
+static void set_pf_worker(bool val)
+{
+ mutex_lock(&wq_pool_attach_mutex);
+ if (val)
+ current->flags |= PF_WQ_WORKER;
+ else
+ current->flags &= ~PF_WQ_WORKER;
+ mutex_unlock(&wq_pool_attach_mutex);
+}
+
/**
* worker_thread - the worker thread function
* @__worker: self
@@ -2192,7 +2241,7 @@ static int worker_thread(void *__worker)
struct worker_pool *pool = worker->pool;
/* tell the scheduler that this is a workqueue worker */
- worker->task->flags |= PF_WQ_WORKER;
+ set_pf_worker(true);
woke_up:
spin_lock_irq(&pool->lock);
@@ -2200,11 +2249,11 @@ woke_up:
if (unlikely(worker->flags & WORKER_DIE)) {
spin_unlock_irq(&pool->lock);
WARN_ON_ONCE(!list_empty(&worker->entry));
- worker->task->flags &= ~PF_WQ_WORKER;
+ set_pf_worker(false);
set_task_comm(worker->task, "kworker/dying");
ida_simple_remove(&pool->worker_ida, worker->id);
- worker_detach_from_pool(worker, pool);
+ worker_detach_from_pool(worker);
kfree(worker);
return 0;
}
@@ -2303,7 +2352,7 @@ static int rescuer_thread(void *__rescuer)
* Mark rescuer as worker too. As WORKER_PREP is never cleared, it
* doesn't participate in concurrency management.
*/
- rescuer->task->flags |= PF_WQ_WORKER;
+ set_pf_worker(true);
repeat:
set_current_state(TASK_IDLE);
@@ -2335,7 +2384,6 @@ repeat:
worker_attach_to_pool(rescuer, pool);
spin_lock_irq(&pool->lock);
- rescuer->pool = pool;
/*
* Slurp in all works issued via this workqueue and
@@ -2385,10 +2433,9 @@ repeat:
if (need_more_worker(pool))
wake_up_worker(pool);
- rescuer->pool = NULL;
spin_unlock_irq(&pool->lock);
- worker_detach_from_pool(rescuer, pool);
+ worker_detach_from_pool(rescuer);
spin_lock_irq(&wq_mayday_lock);
}
@@ -2397,7 +2444,7 @@ repeat:
if (should_stop) {
__set_current_state(TASK_RUNNING);
- rescuer->task->flags &= ~PF_WQ_WORKER;
+ set_pf_worker(false);
return 0;
}
@@ -3001,6 +3048,26 @@ bool flush_delayed_work(struct delayed_work *dwork)
}
EXPORT_SYMBOL(flush_delayed_work);
+/**
+ * flush_rcu_work - wait for a rwork to finish executing the last queueing
+ * @rwork: the rcu work to flush
+ *
+ * Return:
+ * %true if flush_rcu_work() waited for the work to finish execution,
+ * %false if it was already idle.
+ */
+bool flush_rcu_work(struct rcu_work *rwork)
+{
+ if (test_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(&rwork->work))) {
+ rcu_barrier();
+ flush_work(&rwork->work);
+ return true;
+ } else {
+ return flush_work(&rwork->work);
+ }
+}
+EXPORT_SYMBOL(flush_rcu_work);
+
static bool __cancel_work(struct work_struct *work, bool is_dwork)
{
unsigned long flags;
@@ -3018,14 +3085,6 @@ static bool __cancel_work(struct work_struct *work, bool is_dwork)
return ret;
}
-/*
- * See cancel_delayed_work()
- */
-bool cancel_work(struct work_struct *work)
-{
- return __cancel_work(work, false);
-}
-
/**
* cancel_delayed_work - cancel a delayed work
* @dwork: delayed_work to cancel
@@ -3227,7 +3286,6 @@ static int init_worker_pool(struct worker_pool *pool)
timer_setup(&pool->mayday_timer, pool_mayday_timeout, 0);
- mutex_init(&pool->attach_mutex);
INIT_LIST_HEAD(&pool->workers);
ida_init(&pool->worker_ida);
@@ -3310,10 +3368,10 @@ static void put_unbound_pool(struct worker_pool *pool)
WARN_ON(pool->nr_workers || pool->nr_idle);
spin_unlock_irq(&pool->lock);
- mutex_lock(&pool->attach_mutex);
+ mutex_lock(&wq_pool_attach_mutex);
if (!list_empty(&pool->workers))
pool->detach_completion = &detach_completion;
- mutex_unlock(&pool->attach_mutex);
+ mutex_unlock(&wq_pool_attach_mutex);
if (pool->detach_completion)
wait_for_completion(pool->detach_completion);
@@ -3656,8 +3714,7 @@ apply_wqattrs_prepare(struct workqueue_struct *wq,
lockdep_assert_held(&wq_pool_mutex);
- ctx = kzalloc(sizeof(*ctx) + nr_node_ids * sizeof(ctx->pwq_tbl[0]),
- GFP_KERNEL);
+ ctx = kzalloc(struct_size(ctx, pwq_tbl, nr_node_ids), GFP_KERNEL);
new_attrs = alloc_workqueue_attrs(GFP_KERNEL);
tmp_attrs = alloc_workqueue_attrs(GFP_KERNEL);
@@ -4303,7 +4360,6 @@ void set_worker_desc(const char *fmt, ...)
va_start(args, fmt);
vsnprintf(worker->desc, sizeof(worker->desc), fmt, args);
va_end(args);
- worker->desc_valid = true;
}
}
@@ -4327,7 +4383,6 @@ void print_worker_info(const char *log_lvl, struct task_struct *task)
char desc[WORKER_DESC_LEN] = { };
struct pool_workqueue *pwq = NULL;
struct workqueue_struct *wq = NULL;
- bool desc_valid = false;
struct worker *worker;
if (!(task->flags & PF_WQ_WORKER))
@@ -4340,22 +4395,18 @@ void print_worker_info(const char *log_lvl, struct task_struct *task)
worker = kthread_probe_data(task);
/*
- * Carefully copy the associated workqueue's workfn and name. Keep
- * the original last '\0' in case the original contains garbage.
+ * Carefully copy the associated workqueue's workfn, name and desc.
+ * Keep the original last '\0' in case the original is garbage.
*/
probe_kernel_read(&fn, &worker->current_func, sizeof(fn));
probe_kernel_read(&pwq, &worker->current_pwq, sizeof(pwq));
probe_kernel_read(&wq, &pwq->wq, sizeof(wq));
probe_kernel_read(name, wq->name, sizeof(name) - 1);
-
- /* copy worker description */
- probe_kernel_read(&desc_valid, &worker->desc_valid, sizeof(desc_valid));
- if (desc_valid)
- probe_kernel_read(desc, worker->desc, sizeof(desc) - 1);
+ probe_kernel_read(desc, worker->desc, sizeof(desc) - 1);
if (fn || name[0] || desc[0]) {
printk("%sWorkqueue: %s %pf", log_lvl, name, fn);
- if (desc[0])
+ if (strcmp(name, desc))
pr_cont(" (%s)", desc);
pr_cont("\n");
}
@@ -4535,6 +4586,47 @@ void show_workqueue_state(void)
rcu_read_unlock_sched();
}
+/* used to show worker information through /proc/PID/{comm,stat,status} */
+void wq_worker_comm(char *buf, size_t size, struct task_struct *task)
+{
+ int off;
+
+ /* always show the actual comm */
+ off = strscpy(buf, task->comm, size);
+ if (off < 0)
+ return;
+
+ /* stabilize PF_WQ_WORKER and worker pool association */
+ mutex_lock(&wq_pool_attach_mutex);
+
+ if (task->flags & PF_WQ_WORKER) {
+ struct worker *worker = kthread_data(task);
+ struct worker_pool *pool = worker->pool;
+
+ if (pool) {
+ spin_lock_irq(&pool->lock);
+ /*
+ * ->desc tracks information (wq name or
+ * set_worker_desc()) for the latest execution. If
+ * current, prepend '+', otherwise '-'.
+ */
+ if (worker->desc[0] != '\0') {
+ if (worker->current_work)
+ scnprintf(buf + off, size - off, "+%s",
+ worker->desc);
+ else
+ scnprintf(buf + off, size - off, "-%s",
+ worker->desc);
+ }
+ spin_unlock_irq(&pool->lock);
+ }
+ }
+
+ mutex_unlock(&wq_pool_attach_mutex);
+}
+
+#ifdef CONFIG_SMP
+
/*
* CPU hotplug.
*
@@ -4556,7 +4648,7 @@ static void unbind_workers(int cpu)
struct worker *worker;
for_each_cpu_worker_pool(pool, cpu) {
- mutex_lock(&pool->attach_mutex);
+ mutex_lock(&wq_pool_attach_mutex);
spin_lock_irq(&pool->lock);
/*
@@ -4572,7 +4664,7 @@ static void unbind_workers(int cpu)
pool->flags |= POOL_DISASSOCIATED;
spin_unlock_irq(&pool->lock);
- mutex_unlock(&pool->attach_mutex);
+ mutex_unlock(&wq_pool_attach_mutex);
/*
* Call schedule() so that we cross rq->lock and thus can
@@ -4613,7 +4705,7 @@ static void rebind_workers(struct worker_pool *pool)
{
struct worker *worker;
- lockdep_assert_held(&pool->attach_mutex);
+ lockdep_assert_held(&wq_pool_attach_mutex);
/*
* Restore CPU affinity of all workers. As all idle workers should
@@ -4683,7 +4775,7 @@ static void restore_unbound_workers_cpumask(struct worker_pool *pool, int cpu)
static cpumask_t cpumask;
struct worker *worker;
- lockdep_assert_held(&pool->attach_mutex);
+ lockdep_assert_held(&wq_pool_attach_mutex);
/* is @cpu allowed for @pool? */
if (!cpumask_test_cpu(cpu, pool->attrs->cpumask))
@@ -4718,14 +4810,14 @@ int workqueue_online_cpu(unsigned int cpu)
mutex_lock(&wq_pool_mutex);
for_each_pool(pool, pi) {
- mutex_lock(&pool->attach_mutex);
+ mutex_lock(&wq_pool_attach_mutex);
if (pool->cpu == cpu)
rebind_workers(pool);
else if (pool->cpu < 0)
restore_unbound_workers_cpumask(pool, cpu);
- mutex_unlock(&pool->attach_mutex);
+ mutex_unlock(&wq_pool_attach_mutex);
}
/* update NUMA affinity of unbound workqueues */
@@ -4755,8 +4847,6 @@ int workqueue_offline_cpu(unsigned int cpu)
return 0;
}
-#ifdef CONFIG_SMP
-
struct work_for_cpu {
struct work_struct work;
long (*fn)(void *);
@@ -5337,7 +5427,7 @@ int workqueue_sysfs_register(struct workqueue_struct *wq)
ret = device_register(&wq_dev->dev);
if (ret) {
- kfree(wq_dev);
+ put_device(&wq_dev->dev);
wq->wq_dev = NULL;
return ret;
}
@@ -5581,12 +5671,13 @@ static void __init wq_numa_init(void)
int __init workqueue_init_early(void)
{
int std_nice[NR_STD_WORKER_POOLS] = { 0, HIGHPRI_NICE_LEVEL };
+ int hk_flags = HK_FLAG_DOMAIN | HK_FLAG_WQ;
int i, cpu;
WARN_ON(__alignof__(struct pool_workqueue) < __alignof__(long long));
BUG_ON(!alloc_cpumask_var(&wq_unbound_cpumask, GFP_KERNEL));
- cpumask_copy(wq_unbound_cpumask, housekeeping_cpumask(HK_FLAG_DOMAIN));
+ cpumask_copy(wq_unbound_cpumask, housekeeping_cpumask(hk_flags));
pwq_cache = KMEM_CACHE(pool_workqueue, SLAB_PANIC);
diff --git a/kernel/workqueue_internal.h b/kernel/workqueue_internal.h
index d390d1be3748..66fbb5a9e633 100644
--- a/kernel/workqueue_internal.h
+++ b/kernel/workqueue_internal.h
@@ -31,13 +31,12 @@ struct worker {
struct work_struct *current_work; /* L: work being processed */
work_func_t current_func; /* L: current_work's fn */
struct pool_workqueue *current_pwq; /* L: current_work's pwq */
- bool desc_valid; /* ->desc is valid */
struct list_head scheduled; /* L: scheduled works */
/* 64 bytes boundary on 64bit, 32 on 32bit */
struct task_struct *task; /* I: worker task */
- struct worker_pool *pool; /* I: the associated pool */
+ struct worker_pool *pool; /* A: the associated pool */
/* L: for rescuers */
struct list_head node; /* A: anchored at pool->workers */
/* A: runs through worker->node */