summary refs log tree commit diff stats
path: root/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'kernel')
-rw-r--r--kernel/Makefile4
-rw-r--r--kernel/acct.c6
-rw-r--r--kernel/auditsc.c97
-rw-r--r--kernel/bpf/Makefile3
-rw-r--r--kernel/bpf/arraymap.c29
-rw-r--r--kernel/bpf/bpf_cgrp_storage.c246
-rw-r--r--kernel/bpf/bpf_inode_storage.c42
-rw-r--r--kernel/bpf/bpf_local_storage.c214
-rw-r--r--kernel/bpf/bpf_lsm.c25
-rw-r--r--kernel/bpf/bpf_struct_ops.c3
-rw-r--r--kernel/bpf/bpf_task_storage.c161
-rw-r--r--kernel/bpf/btf.c1578
-rw-r--r--kernel/bpf/cgroup_iter.c16
-rw-r--r--kernel/bpf/core.c67
-rw-r--r--kernel/bpf/cpumap.c35
-rw-r--r--kernel/bpf/cpumask.c479
-rw-r--r--kernel/bpf/devmap.c20
-rw-r--r--kernel/bpf/dispatcher.c5
-rw-r--r--kernel/bpf/hashtab.c45
-rw-r--r--kernel/bpf/helpers.c600
-rw-r--r--kernel/bpf/inode.c8
-rw-r--r--kernel/bpf/local_storage.c2
-rw-r--r--kernel/bpf/map_in_map.c61
-rw-r--r--kernel/bpf/memalloc.c53
-rw-r--r--kernel/bpf/offload.c422
-rw-r--r--kernel/bpf/preload/bpf_preload_kern.c6
-rw-r--r--kernel/bpf/preload/iterators/Makefile12
-rw-r--r--kernel/bpf/preload/iterators/README5
-rw-r--r--kernel/bpf/preload/iterators/iterators.lskel-big-endian.h419
-rw-r--r--kernel/bpf/preload/iterators/iterators.lskel-little-endian.h (renamed from kernel/bpf/preload/iterators/iterators.lskel.h)0
-rw-r--r--kernel/bpf/ringbuf.c6
-rw-r--r--kernel/bpf/syscall.c587
-rw-r--r--kernel/bpf/task_iter.c39
-rw-r--r--kernel/bpf/trampoline.c87
-rw-r--r--kernel/bpf/verifier.c3720
-rw-r--r--kernel/capability.c10
-rw-r--r--kernel/cgroup/cgroup.c11
-rw-r--r--kernel/cgroup/cpuset.c118
-rw-r--r--kernel/cgroup/rstat.c4
-rw-r--r--kernel/configs/tiny.config5
-rw-r--r--kernel/context_tracking.c12
-rw-r--r--kernel/cpu.c61
-rw-r--r--kernel/cpu_pm.c9
-rw-r--r--kernel/crash_core.c3
-rw-r--r--kernel/cred.c15
-rw-r--r--kernel/dma/mapping.c12
-rw-r--r--kernel/dma/swiotlb.c63
-rw-r--r--kernel/events/core.c2336
-rw-r--r--kernel/exit.c78
-rw-r--r--kernel/fork.c68
-rw-r--r--kernel/futex/core.c26
-rw-r--r--kernel/futex/syscalls.c11
-rw-r--r--kernel/gcov/gcc_4_7.c5
-rwxr-xr-xkernel/gen_kheaders.sh2
-rw-r--r--kernel/irq/Kconfig8
-rw-r--r--kernel/irq/Makefile1
-rw-r--r--kernel/irq/affinity.c405
-rw-r--r--kernel/irq/chip.c8
-rw-r--r--kernel/irq/internals.h2
-rw-r--r--kernel/irq/ipi-mux.c206
-rw-r--r--kernel/irq/irqdesc.c15
-rw-r--r--kernel/irq/irqdomain.c412
-rw-r--r--kernel/irq/manage.c9
-rw-r--r--kernel/irq/msi.c919
-rw-r--r--kernel/jump_label.c58
-rw-r--r--kernel/kallsyms.c116
-rw-r--r--kernel/kallsyms_internal.h1
-rw-r--r--kernel/kallsyms_selftest.c476
-rw-r--r--kernel/kallsyms_selftest.h13
-rw-r--r--kernel/kcsan/Makefile1
-rw-r--r--kernel/kcsan/kcsan_test.c7
-rw-r--r--kernel/kcsan/report.c3
-rw-r--r--kernel/kcsan/selftest.c12
-rw-r--r--kernel/kexec_core.c13
-rw-r--r--kernel/kexec_file.c2
-rw-r--r--kernel/kprobes.c51
-rw-r--r--kernel/ksysfs.c18
-rw-r--r--kernel/livepatch/core.c29
-rw-r--r--kernel/livepatch/patch.c2
-rw-r--r--kernel/livepatch/transition.c54
-rw-r--r--kernel/locking/Makefile1
-rw-r--r--kernel/locking/lockdep.c3
-rw-r--r--kernel/locking/locktorture.c101
-rw-r--r--kernel/locking/qspinlock.c4
-rw-r--r--kernel/locking/rtmutex.c60
-rw-r--r--kernel/locking/rtmutex_api.c6
-rw-r--r--kernel/locking/rwsem.c87
-rw-r--r--kernel/locking/test-ww_mutex.c4
-rw-r--r--kernel/module/Kconfig3
-rw-r--r--kernel/module/decompress.c100
-rw-r--r--kernel/module/kallsyms.c15
-rw-r--r--kernel/module/main.c39
-rw-r--r--kernel/module/sysfs.c2
-rw-r--r--kernel/notifier.c9
-rw-r--r--kernel/padata.c27
-rw-r--r--kernel/panic.c58
-rw-r--r--kernel/params.c25
-rw-r--r--kernel/pid_namespace.c17
-rw-r--r--kernel/power/Kconfig1
-rw-r--r--kernel/power/energy_model.c5
-rw-r--r--kernel/power/process.c25
-rw-r--r--kernel/power/snapshot.c12
-rw-r--r--kernel/power/swap.c16
-rw-r--r--kernel/printk/printk.c4
-rw-r--r--kernel/ptrace.c2
-rw-r--r--kernel/rcu/Kconfig.debug15
-rw-r--r--kernel/rcu/rcu.h8
-rw-r--r--kernel/rcu/rcu_segcblist.c2
-rw-r--r--kernel/rcu/rcu_segcblist.h2
-rw-r--r--kernel/rcu/rcutorture.c12
-rw-r--r--kernel/rcu/refscale.c250
-rw-r--r--kernel/rcu/srcutree.c98
-rw-r--r--kernel/rcu/tasks.h87
-rw-r--r--kernel/rcu/tiny.c9
-rw-r--r--kernel/rcu/tree.c667
-rw-r--r--kernel/rcu/tree.h19
-rw-r--r--kernel/rcu/tree_exp.h43
-rw-r--r--kernel/rcu/tree_stall.h37
-rw-r--r--kernel/rcu/update.c49
-rw-r--r--kernel/relay.c8
-rw-r--r--kernel/resource.c30
-rw-r--r--kernel/rseq.c65
-rw-r--r--kernel/sched/clock.c27
-rw-r--r--kernel/sched/core.c484
-rw-r--r--kernel/sched/cpufreq_schedutil.c43
-rw-r--r--kernel/sched/cputime.c4
-rw-r--r--kernel/sched/deadline.c49
-rw-r--r--kernel/sched/fair.c542
-rw-r--r--kernel/sched/idle.c47
-rw-r--r--kernel/sched/membarrier.c39
-rw-r--r--kernel/sched/psi.c107
-rw-r--r--kernel/sched/rt.c5
-rw-r--r--kernel/sched/sched.h161
-rw-r--r--kernel/sched/stats.h22
-rw-r--r--kernel/sched/topology.c99
-rw-r--r--kernel/sched/wait.c18
-rw-r--r--kernel/scs.c14
-rw-r--r--kernel/seccomp.c2
-rw-r--r--kernel/signal.c5
-rw-r--r--kernel/static_call_inline.c23
-rw-r--r--kernel/sys.c2
-rw-r--r--kernel/sysctl.c20
-rw-r--r--kernel/time/Kconfig6
-rw-r--r--kernel/time/alarmtimer.c33
-rw-r--r--kernel/time/clockevents.c2
-rw-r--r--kernel/time/clocksource.c74
-rw-r--r--kernel/time/hrtimer.c18
-rw-r--r--kernel/time/namespace.c18
-rw-r--r--kernel/time/posix-cpu-timers.c13
-rw-r--r--kernel/time/posix-stubs.c2
-rw-r--r--kernel/time/posix-timers.c2
-rw-r--r--kernel/time/test_udelay.c2
-rw-r--r--kernel/time/tick-broadcast-hrtimer.c29
-rw-r--r--kernel/time/tick-broadcast.c6
-rw-r--r--kernel/time/tick-oneshot.c4
-rw-r--r--kernel/time/time.c8
-rw-r--r--kernel/time/timekeeping.c8
-rw-r--r--kernel/time/timer.c427
-rw-r--r--kernel/torture.c4
-rw-r--r--kernel/trace/Kconfig50
-rw-r--r--kernel/trace/blktrace.c11
-rw-r--r--kernel/trace/bpf_trace.c190
-rw-r--r--kernel/trace/ftrace.c179
-rw-r--r--kernel/trace/kprobe_event_gen_test.c2
-rw-r--r--kernel/trace/ring_buffer.c95
-rw-r--r--kernel/trace/ring_buffer_benchmark.c2
-rw-r--r--kernel/trace/rv/monitors/wip/wip.h2
-rw-r--r--kernel/trace/rv/monitors/wwnr/wwnr.h2
-rw-r--r--kernel/trace/rv/rv.c2
-rw-r--r--kernel/trace/synth_event_gen_test.c2
-rw-r--r--kernel/trace/trace.c282
-rw-r--r--kernel/trace/trace.h39
-rw-r--r--kernel/trace/trace_eprobe.c12
-rw-r--r--kernel/trace/trace_event_perf.c16
-rw-r--r--kernel/trace/trace_events.c116
-rw-r--r--kernel/trace/trace_events_filter.c101
-rw-r--r--kernel/trace/trace_events_hist.c316
-rw-r--r--kernel/trace/trace_events_synth.c92
-rw-r--r--kernel/trace/trace_events_trigger.c19
-rw-r--r--kernel/trace/trace_events_user.c3
-rw-r--r--kernel/trace/trace_export.c3
-rw-r--r--kernel/trace/trace_kprobe.c2
-rw-r--r--kernel/trace/trace_osnoise.c251
-rw-r--r--kernel/trace/trace_output.c74
-rw-r--r--kernel/trace/trace_preemptirq.c61
-rw-r--r--kernel/trace/trace_probe.c69
-rw-r--r--kernel/trace/trace_probe.h20
-rw-r--r--kernel/trace/trace_probe_tmpl.h47
-rw-r--r--kernel/trace/trace_selftest.c9
-rw-r--r--kernel/trace/trace_seq.c23
-rw-r--r--kernel/trace/trace_synth.h1
-rw-r--r--kernel/trace/trace_uprobe.c3
-rw-r--r--kernel/tracepoint.c4
-rw-r--r--kernel/umh.c20
-rw-r--r--kernel/user.c1
-rw-r--r--kernel/workqueue.c280
196 files changed, 15704 insertions, 5885 deletions
diff --git a/kernel/Makefile b/kernel/Makefile
index d754e0be1176..10ef068f598d 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -41,9 +41,6 @@ UBSAN_SANITIZE_kcov.o := n
KMSAN_SANITIZE_kcov.o := n
CFLAGS_kcov.o := $(call cc-option, -fno-conserve-stack) -fno-stack-protector
-# Don't instrument error handlers
-CFLAGS_REMOVE_cfi.o := $(CC_FLAGS_CFI)
-
obj-y += sched/
obj-y += locking/
obj-y += power/
@@ -69,6 +66,7 @@ endif
obj-$(CONFIG_UID16) += uid16.o
obj-$(CONFIG_MODULE_SIG_FORMAT) += module_signature.o
obj-$(CONFIG_KALLSYMS) += kallsyms.o
+obj-$(CONFIG_KALLSYMS_SELFTEST) += kallsyms_selftest.o
obj-$(CONFIG_BSD_PROCESS_ACCT) += acct.o
obj-$(CONFIG_CRASH_CORE) += crash_core.o
obj-$(CONFIG_KEXEC_CORE) += kexec_core.o
diff --git a/kernel/acct.c b/kernel/acct.c
index 62200d799b9b..010667ce6080 100644
--- a/kernel/acct.c
+++ b/kernel/acct.c
@@ -320,7 +320,7 @@ void acct_exit_ns(struct pid_namespace *ns)
}
/*
- * encode an unsigned long into a comp_t
+ * encode a u64 into a comp_t
*
* This routine has been adopted from the encode_comp_t() function in
* the kern_acct.c file of the FreeBSD operating system. The encoding
@@ -331,7 +331,7 @@ void acct_exit_ns(struct pid_namespace *ns)
#define EXPSIZE 3 /* Base 8 (3 bit) exponent. */
#define MAXFRACT ((1 << MANTSIZE) - 1) /* Maximum fractional value. */
-static comp_t encode_comp_t(unsigned long value)
+static comp_t encode_comp_t(u64 value)
{
int exp, rnd;
@@ -350,6 +350,8 @@ static comp_t encode_comp_t(unsigned long value)
exp++;
}
+ if (exp > (((comp_t) ~0U) >> MANTSIZE))
+ return (comp_t) ~0U;
/*
* Clean it up and polish it off.
*/
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 9f8c05228d6d..93d0b87f3283 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -64,6 +64,7 @@
#include <uapi/linux/limits.h>
#include <uapi/linux/netfilter/nf_tables.h>
#include <uapi/linux/openat2.h> // struct open_how
+#include <uapi/linux/fanotify.h>
#include "audit.h"
@@ -806,30 +807,53 @@ static int audit_in_mask(const struct audit_krule *rule, unsigned long val)
}
/**
- * audit_filter_uring - apply filters to an io_uring operation
+ * __audit_filter_op - common filter helper for operations (syscall/uring/etc)
* @tsk: associated task
* @ctx: audit context
+ * @list: audit filter list
+ * @name: audit_name (can be NULL)
+ * @op: current syscall/uring_op
+ *
+ * Run the audit filters specified in @list against @tsk using @ctx,
+ * @name, and @op, as necessary; the caller is responsible for ensuring
+ * that the call is made while the RCU read lock is held. The @name
+ * parameter can be NULL, but all others must be specified.
+ * Returns 1/true if the filter finds a match, 0/false if none are found.
*/
-static void audit_filter_uring(struct task_struct *tsk,
- struct audit_context *ctx)
+static int __audit_filter_op(struct task_struct *tsk,
+ struct audit_context *ctx,
+ struct list_head *list,
+ struct audit_names *name,
+ unsigned long op)
{
struct audit_entry *e;
enum audit_state state;
+ list_for_each_entry_rcu(e, list, list) {
+ if (audit_in_mask(&e->rule, op) &&
+ audit_filter_rules(tsk, &e->rule, ctx, name,
+ &state, false)) {
+ ctx->current_state = state;
+ return 1;
+ }
+ }
+ return 0;
+}
+
+/**
+ * audit_filter_uring - apply filters to an io_uring operation
+ * @tsk: associated task
+ * @ctx: audit context
+ */
+static void audit_filter_uring(struct task_struct *tsk,
+ struct audit_context *ctx)
+{
if (auditd_test_task(tsk))
return;
rcu_read_lock();
- list_for_each_entry_rcu(e, &audit_filter_list[AUDIT_FILTER_URING_EXIT],
- list) {
- if (audit_in_mask(&e->rule, ctx->uring_op) &&
- audit_filter_rules(tsk, &e->rule, ctx, NULL, &state,
- false)) {
- rcu_read_unlock();
- ctx->current_state = state;
- return;
- }
- }
+ __audit_filter_op(tsk, ctx, &audit_filter_list[AUDIT_FILTER_URING_EXIT],
+ NULL, ctx->uring_op);
rcu_read_unlock();
}
@@ -841,24 +865,13 @@ static void audit_filter_uring(struct task_struct *tsk,
static void audit_filter_syscall(struct task_struct *tsk,
struct audit_context *ctx)
{
- struct audit_entry *e;
- enum audit_state state;
-
if (auditd_test_task(tsk))
return;
rcu_read_lock();
- list_for_each_entry_rcu(e, &audit_filter_list[AUDIT_FILTER_EXIT], list) {
- if (audit_in_mask(&e->rule, ctx->major) &&
- audit_filter_rules(tsk, &e->rule, ctx, NULL,
- &state, false)) {
- rcu_read_unlock();
- ctx->current_state = state;
- return;
- }
- }
+ __audit_filter_op(tsk, ctx, &audit_filter_list[AUDIT_FILTER_EXIT],
+ NULL, ctx->major);
rcu_read_unlock();
- return;
}
/*
@@ -870,17 +883,8 @@ static int audit_filter_inode_name(struct task_struct *tsk,
struct audit_context *ctx) {
int h = audit_hash_ino((u32)n->ino);
struct list_head *list = &audit_inode_hash[h];
- struct audit_entry *e;
- enum audit_state state;
- list_for_each_entry_rcu(e, list, list) {
- if (audit_in_mask(&e->rule, ctx->major) &&
- audit_filter_rules(tsk, &e->rule, ctx, n, &state, false)) {
- ctx->current_state = state;
- return 1;
- }
- }
- return 0;
+ return __audit_filter_op(tsk, ctx, list, n, ctx->major);
}
/* At syscall exit time, this filter is called if any audit_names have been
@@ -2249,7 +2253,7 @@ static inline int audit_copy_fcaps(struct audit_names *name,
if (!dentry)
return 0;
- rc = get_vfs_caps_from_disk(&init_user_ns, dentry, &caps);
+ rc = get_vfs_caps_from_disk(&nop_mnt_idmap, dentry, &caps);
if (rc)
return rc;
@@ -2804,7 +2808,7 @@ int __audit_log_bprm_fcaps(struct linux_binprm *bprm,
ax->d.next = context->aux;
context->aux = (void *)ax;
- get_vfs_caps_from_disk(&init_user_ns,
+ get_vfs_caps_from_disk(&nop_mnt_idmap,
bprm->file->f_path.dentry, &vcaps);
ax->fcap.permitted = vcaps.permitted;
@@ -2874,10 +2878,21 @@ void __audit_log_kern_module(char *name)
context->type = AUDIT_KERN_MODULE;
}
-void __audit_fanotify(unsigned int response)
+void __audit_fanotify(u32 response, struct fanotify_response_info_audit_rule *friar)
{
- audit_log(audit_context(), GFP_KERNEL,
- AUDIT_FANOTIFY, "resp=%u", response);
+ /* {subj,obj}_trust values are {0,1,2}: no,yes,unknown */
+ switch (friar->hdr.type) {
+ case FAN_RESPONSE_INFO_NONE:
+ audit_log(audit_context(), GFP_KERNEL, AUDIT_FANOTIFY,
+ "resp=%u fan_type=%u fan_info=0 subj_trust=2 obj_trust=2",
+ response, FAN_RESPONSE_INFO_NONE);
+ break;
+ case FAN_RESPONSE_INFO_AUDIT_RULE:
+ audit_log(audit_context(), GFP_KERNEL, AUDIT_FANOTIFY,
+ "resp=%u fan_type=%u fan_info=%X subj_trust=%u obj_trust=%u",
+ response, friar->hdr.type, friar->rule_number,
+ friar->subj_trust, friar->obj_trust);
+ }
}
void __audit_tk_injoffset(struct timespec64 offset)
diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile
index 341c94f208f4..02242614dcc7 100644
--- a/kernel/bpf/Makefile
+++ b/kernel/bpf/Makefile
@@ -25,7 +25,7 @@ ifeq ($(CONFIG_PERF_EVENTS),y)
obj-$(CONFIG_BPF_SYSCALL) += stackmap.o
endif
ifeq ($(CONFIG_CGROUPS),y)
-obj-$(CONFIG_BPF_SYSCALL) += cgroup_iter.o
+obj-$(CONFIG_BPF_SYSCALL) += cgroup_iter.o bpf_cgrp_storage.o
endif
obj-$(CONFIG_CGROUP_BPF) += cgroup.o
ifeq ($(CONFIG_INET),y)
@@ -36,6 +36,7 @@ obj-$(CONFIG_DEBUG_INFO_BTF) += sysfs_btf.o
endif
ifeq ($(CONFIG_BPF_JIT),y)
obj-$(CONFIG_BPF_SYSCALL) += bpf_struct_ops.o
+obj-$(CONFIG_BPF_SYSCALL) += cpumask.o
obj-${CONFIG_BPF_LSM} += bpf_lsm.o
endif
obj-$(CONFIG_BPF_PRELOAD) += preload/
diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
index 832b2659e96e..484706959556 100644
--- a/kernel/bpf/arraymap.c
+++ b/kernel/bpf/arraymap.c
@@ -306,14 +306,6 @@ static int array_map_get_next_key(struct bpf_map *map, void *key, void *next_key
return 0;
}
-static void check_and_free_fields(struct bpf_array *arr, void *val)
-{
- if (map_value_has_timer(&arr->map))
- bpf_timer_cancel_and_free(val + arr->map.timer_off);
- if (map_value_has_kptrs(&arr->map))
- bpf_map_free_kptrs(&arr->map, val);
-}
-
/* Called from syscall or from eBPF program */
static int array_map_update_elem(struct bpf_map *map, void *key, void *value,
u64 map_flags)
@@ -335,13 +327,13 @@ static int array_map_update_elem(struct bpf_map *map, void *key, void *value,
return -EEXIST;
if (unlikely((map_flags & BPF_F_LOCK) &&
- !map_value_has_spin_lock(map)))
+ !btf_record_has_field(map->record, BPF_SPIN_LOCK)))
return -EINVAL;
if (array->map.map_type == BPF_MAP_TYPE_PERCPU_ARRAY) {
val = this_cpu_ptr(array->pptrs[index & array->index_mask]);
copy_map_value(map, val, value);
- check_and_free_fields(array, val);
+ bpf_obj_free_fields(array->map.record, val);
} else {
val = array->value +
(u64)array->elem_size * (index & array->index_mask);
@@ -349,7 +341,7 @@ static int array_map_update_elem(struct bpf_map *map, void *key, void *value,
copy_map_value_locked(map, val, value, false);
else
copy_map_value(map, val, value);
- check_and_free_fields(array, val);
+ bpf_obj_free_fields(array->map.record, val);
}
return 0;
}
@@ -386,7 +378,7 @@ int bpf_percpu_array_update(struct bpf_map *map, void *key, void *value,
pptr = array->pptrs[index & array->index_mask];
for_each_possible_cpu(cpu) {
copy_map_value_long(map, per_cpu_ptr(pptr, cpu), value + off);
- check_and_free_fields(array, per_cpu_ptr(pptr, cpu));
+ bpf_obj_free_fields(array->map.record, per_cpu_ptr(pptr, cpu));
off += size;
}
rcu_read_unlock();
@@ -409,12 +401,12 @@ static void array_map_free_timers(struct bpf_map *map)
struct bpf_array *array = container_of(map, struct bpf_array, map);
int i;
- /* We don't reset or free kptr on uref dropping to zero. */
- if (!map_value_has_timer(map))
+ /* We don't reset or free fields other than timer on uref dropping to zero. */
+ if (!btf_record_has_field(map->record, BPF_TIMER))
return;
for (i = 0; i < array->map.max_entries; i++)
- bpf_timer_cancel_and_free(array_map_elem_ptr(array, i) + map->timer_off);
+ bpf_obj_free_timer(map->record, array_map_elem_ptr(array, i));
}
/* Called when map->refcnt goes to zero, either from workqueue or from syscall */
@@ -423,22 +415,21 @@ static void array_map_free(struct bpf_map *map)
struct bpf_array *array = container_of(map, struct bpf_array, map);
int i;
- if (map_value_has_kptrs(map)) {
+ if (!IS_ERR_OR_NULL(map->record)) {
if (array->map.map_type == BPF_MAP_TYPE_PERCPU_ARRAY) {
for (i = 0; i < array->map.max_entries; i++) {
void __percpu *pptr = array->pptrs[i & array->index_mask];
int cpu;
for_each_possible_cpu(cpu) {
- bpf_map_free_kptrs(map, per_cpu_ptr(pptr, cpu));
+ bpf_obj_free_fields(map->record, per_cpu_ptr(pptr, cpu));
cond_resched();
}
}
} else {
for (i = 0; i < array->map.max_entries; i++)
- bpf_map_free_kptrs(map, array_map_elem_ptr(array, i));
+ bpf_obj_free_fields(map->record, array_map_elem_ptr(array, i));
}
- bpf_map_free_kptr_off_tab(map);
}
if (array->map.map_type == BPF_MAP_TYPE_PERCPU_ARRAY)
diff --git a/kernel/bpf/bpf_cgrp_storage.c b/kernel/bpf/bpf_cgrp_storage.c
new file mode 100644
index 000000000000..6cdf6d9ed91d
--- /dev/null
+++ b/kernel/bpf/bpf_cgrp_storage.c
@@ -0,0 +1,246 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2022 Meta Platforms, Inc. and affiliates.
+ */
+
+#include <linux/types.h>
+#include <linux/bpf.h>
+#include <linux/bpf_local_storage.h>
+#include <uapi/linux/btf.h>
+#include <linux/btf_ids.h>
+
+DEFINE_BPF_STORAGE_CACHE(cgroup_cache);
+
+static DEFINE_PER_CPU(int, bpf_cgrp_storage_busy);
+
+static void bpf_cgrp_storage_lock(void)
+{
+ migrate_disable();
+ this_cpu_inc(bpf_cgrp_storage_busy);
+}
+
+static void bpf_cgrp_storage_unlock(void)
+{
+ this_cpu_dec(bpf_cgrp_storage_busy);
+ migrate_enable();
+}
+
+static bool bpf_cgrp_storage_trylock(void)
+{
+ migrate_disable();
+ if (unlikely(this_cpu_inc_return(bpf_cgrp_storage_busy) != 1)) {
+ this_cpu_dec(bpf_cgrp_storage_busy);
+ migrate_enable();
+ return false;
+ }
+ return true;
+}
+
+static struct bpf_local_storage __rcu **cgroup_storage_ptr(void *owner)
+{
+ struct cgroup *cg = owner;
+
+ return &cg->bpf_cgrp_storage;
+}
+
+void bpf_cgrp_storage_free(struct cgroup *cgroup)
+{
+ struct bpf_local_storage *local_storage;
+ bool free_cgroup_storage = false;
+ unsigned long flags;
+
+ rcu_read_lock();
+ local_storage = rcu_dereference(cgroup->bpf_cgrp_storage);
+ if (!local_storage) {
+ rcu_read_unlock();
+ return;
+ }
+
+ bpf_cgrp_storage_lock();
+ raw_spin_lock_irqsave(&local_storage->lock, flags);
+ free_cgroup_storage = bpf_local_storage_unlink_nolock(local_storage);
+ raw_spin_unlock_irqrestore(&local_storage->lock, flags);
+ bpf_cgrp_storage_unlock();
+ rcu_read_unlock();
+
+ if (free_cgroup_storage)
+ kfree_rcu(local_storage, rcu);
+}
+
+static struct bpf_local_storage_data *
+cgroup_storage_lookup(struct cgroup *cgroup, struct bpf_map *map, bool cacheit_lockit)
+{
+ struct bpf_local_storage *cgroup_storage;
+ struct bpf_local_storage_map *smap;
+
+ cgroup_storage = rcu_dereference_check(cgroup->bpf_cgrp_storage,
+ bpf_rcu_lock_held());
+ if (!cgroup_storage)
+ return NULL;
+
+ smap = (struct bpf_local_storage_map *)map;
+ return bpf_local_storage_lookup(cgroup_storage, smap, cacheit_lockit);
+}
+
+static void *bpf_cgrp_storage_lookup_elem(struct bpf_map *map, void *key)
+{
+ struct bpf_local_storage_data *sdata;
+ struct cgroup *cgroup;
+ int fd;
+
+ fd = *(int *)key;
+ cgroup = cgroup_get_from_fd(fd);
+ if (IS_ERR(cgroup))
+ return ERR_CAST(cgroup);
+
+ bpf_cgrp_storage_lock();
+ sdata = cgroup_storage_lookup(cgroup, map, true);
+ bpf_cgrp_storage_unlock();
+ cgroup_put(cgroup);
+ return sdata ? sdata->data : NULL;
+}
+
+static int bpf_cgrp_storage_update_elem(struct bpf_map *map, void *key,
+ void *value, u64 map_flags)
+{
+ struct bpf_local_storage_data *sdata;
+ struct cgroup *cgroup;
+ int fd;
+
+ fd = *(int *)key;
+ cgroup = cgroup_get_from_fd(fd);
+ if (IS_ERR(cgroup))
+ return PTR_ERR(cgroup);
+
+ bpf_cgrp_storage_lock();
+ sdata = bpf_local_storage_update(cgroup, (struct bpf_local_storage_map *)map,
+ value, map_flags, GFP_ATOMIC);
+ bpf_cgrp_storage_unlock();
+ cgroup_put(cgroup);
+ return PTR_ERR_OR_ZERO(sdata);
+}
+
+static int cgroup_storage_delete(struct cgroup *cgroup, struct bpf_map *map)
+{
+ struct bpf_local_storage_data *sdata;
+
+ sdata = cgroup_storage_lookup(cgroup, map, false);
+ if (!sdata)
+ return -ENOENT;
+
+ bpf_selem_unlink(SELEM(sdata), true);
+ return 0;
+}
+
+static int bpf_cgrp_storage_delete_elem(struct bpf_map *map, void *key)
+{
+ struct cgroup *cgroup;
+ int err, fd;
+
+ fd = *(int *)key;
+ cgroup = cgroup_get_from_fd(fd);
+ if (IS_ERR(cgroup))
+ return PTR_ERR(cgroup);
+
+ bpf_cgrp_storage_lock();
+ err = cgroup_storage_delete(cgroup, map);
+ bpf_cgrp_storage_unlock();
+ cgroup_put(cgroup);
+ return err;
+}
+
+static int notsupp_get_next_key(struct bpf_map *map, void *key, void *next_key)
+{
+ return -ENOTSUPP;
+}
+
+static struct bpf_map *cgroup_storage_map_alloc(union bpf_attr *attr)
+{
+ return bpf_local_storage_map_alloc(attr, &cgroup_cache);
+}
+
+static void cgroup_storage_map_free(struct bpf_map *map)
+{
+ bpf_local_storage_map_free(map, &cgroup_cache, NULL);
+}
+
+/* *gfp_flags* is a hidden argument provided by the verifier */
+BPF_CALL_5(bpf_cgrp_storage_get, struct bpf_map *, map, struct cgroup *, cgroup,
+ void *, value, u64, flags, gfp_t, gfp_flags)
+{
+ struct bpf_local_storage_data *sdata;
+
+ WARN_ON_ONCE(!bpf_rcu_lock_held());
+ if (flags & ~(BPF_LOCAL_STORAGE_GET_F_CREATE))
+ return (unsigned long)NULL;
+
+ if (!cgroup)
+ return (unsigned long)NULL;
+
+ if (!bpf_cgrp_storage_trylock())
+ return (unsigned long)NULL;
+
+ sdata = cgroup_storage_lookup(cgroup, map, true);
+ if (sdata)
+ goto unlock;
+
+ /* only allocate new storage when the cgroup is still refcounted */
+ if (!percpu_ref_is_dying(&cgroup->self.refcnt) &&
+ (flags & BPF_LOCAL_STORAGE_GET_F_CREATE))
+ sdata = bpf_local_storage_update(cgroup, (struct bpf_local_storage_map *)map,
+ value, BPF_NOEXIST, gfp_flags);
+
+unlock:
+ bpf_cgrp_storage_unlock();
+ return IS_ERR_OR_NULL(sdata) ? (unsigned long)NULL : (unsigned long)sdata->data;
+}
+
+BPF_CALL_2(bpf_cgrp_storage_delete, struct bpf_map *, map, struct cgroup *, cgroup)
+{
+ int ret;
+
+ WARN_ON_ONCE(!bpf_rcu_lock_held());
+ if (!cgroup)
+ return -EINVAL;
+
+ if (!bpf_cgrp_storage_trylock())
+ return -EBUSY;
+
+ ret = cgroup_storage_delete(cgroup, map);
+ bpf_cgrp_storage_unlock();
+ return ret;
+}
+
+const struct bpf_map_ops cgrp_storage_map_ops = {
+ .map_meta_equal = bpf_map_meta_equal,
+ .map_alloc_check = bpf_local_storage_map_alloc_check,
+ .map_alloc = cgroup_storage_map_alloc,
+ .map_free = cgroup_storage_map_free,
+ .map_get_next_key = notsupp_get_next_key,
+ .map_lookup_elem = bpf_cgrp_storage_lookup_elem,
+ .map_update_elem = bpf_cgrp_storage_update_elem,
+ .map_delete_elem = bpf_cgrp_storage_delete_elem,
+ .map_check_btf = bpf_local_storage_map_check_btf,
+ .map_btf_id = &bpf_local_storage_map_btf_id[0],
+ .map_owner_storage_ptr = cgroup_storage_ptr,
+};
+
+const struct bpf_func_proto bpf_cgrp_storage_get_proto = {
+ .func = bpf_cgrp_storage_get,
+ .gpl_only = false,
+ .ret_type = RET_PTR_TO_MAP_VALUE_OR_NULL,
+ .arg1_type = ARG_CONST_MAP_PTR,
+ .arg2_type = ARG_PTR_TO_BTF_ID,
+ .arg2_btf_id = &bpf_cgroup_btf_id[0],
+ .arg3_type = ARG_PTR_TO_MAP_VALUE_OR_NULL,
+ .arg4_type = ARG_ANYTHING,
+};
+
+const struct bpf_func_proto bpf_cgrp_storage_delete_proto = {
+ .func = bpf_cgrp_storage_delete,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_CONST_MAP_PTR,
+ .arg2_type = ARG_PTR_TO_BTF_ID,
+ .arg2_btf_id = &bpf_cgroup_btf_id[0],
+};
diff --git a/kernel/bpf/bpf_inode_storage.c b/kernel/bpf/bpf_inode_storage.c
index 5f7683b19199..05f4c66c9089 100644
--- a/kernel/bpf/bpf_inode_storage.c
+++ b/kernel/bpf/bpf_inode_storage.c
@@ -56,11 +56,9 @@ static struct bpf_local_storage_data *inode_storage_lookup(struct inode *inode,
void bpf_inode_storage_free(struct inode *inode)
{
- struct bpf_local_storage_elem *selem;
struct bpf_local_storage *local_storage;
bool free_inode_storage = false;
struct bpf_storage_blob *bsb;
- struct hlist_node *n;
bsb = bpf_inode(inode);
if (!bsb)
@@ -74,30 +72,11 @@ void bpf_inode_storage_free(struct inode *inode)
return;
}
- /* Neither the bpf_prog nor the bpf-map's syscall
- * could be modifying the local_storage->list now.
- * Thus, no elem can be added-to or deleted-from the
- * local_storage->list by the bpf_prog or by the bpf-map's syscall.
- *
- * It is racing with bpf_local_storage_map_free() alone
- * when unlinking elem from the local_storage->list and
- * the map's bucket->list.
- */
raw_spin_lock_bh(&local_storage->lock);
- hlist_for_each_entry_safe(selem, n, &local_storage->list, snode) {
- /* Always unlink from map before unlinking from
- * local_storage.
- */
- bpf_selem_unlink_map(selem);
- free_inode_storage = bpf_selem_unlink_storage_nolock(
- local_storage, selem, false, false);
- }
+ free_inode_storage = bpf_local_storage_unlink_nolock(local_storage);
raw_spin_unlock_bh(&local_storage->lock);
rcu_read_unlock();
- /* free_inoode_storage should always be true as long as
- * local_storage->list was non-empty.
- */
if (free_inode_storage)
kfree_rcu(local_storage, rcu);
}
@@ -226,27 +205,14 @@ static int notsupp_get_next_key(struct bpf_map *map, void *key,
static struct bpf_map *inode_storage_map_alloc(union bpf_attr *attr)
{
- struct bpf_local_storage_map *smap;
-
- smap = bpf_local_storage_map_alloc(attr);
- if (IS_ERR(smap))
- return ERR_CAST(smap);
-
- smap->cache_idx = bpf_local_storage_cache_idx_get(&inode_cache);
- return &smap->map;
+ return bpf_local_storage_map_alloc(attr, &inode_cache);
}
static void inode_storage_map_free(struct bpf_map *map)
{
- struct bpf_local_storage_map *smap;
-
- smap = (struct bpf_local_storage_map *)map;
- bpf_local_storage_cache_idx_free(&inode_cache, smap->cache_idx);
- bpf_local_storage_map_free(smap, NULL);
+ bpf_local_storage_map_free(map, &inode_cache, NULL);
}
-BTF_ID_LIST_SINGLE(inode_storage_map_btf_ids, struct,
- bpf_local_storage_map)
const struct bpf_map_ops inode_storage_map_ops = {
.map_meta_equal = bpf_map_meta_equal,
.map_alloc_check = bpf_local_storage_map_alloc_check,
@@ -257,7 +223,7 @@ const struct bpf_map_ops inode_storage_map_ops = {
.map_update_elem = bpf_fd_inode_storage_update_elem,
.map_delete_elem = bpf_fd_inode_storage_delete_elem,
.map_check_btf = bpf_local_storage_map_check_btf,
- .map_btf_id = &inode_storage_map_btf_ids[0],
+ .map_btf_id = &bpf_local_storage_map_btf_id[0],
.map_owner_storage_ptr = inode_storage_ptr,
};
diff --git a/kernel/bpf/bpf_local_storage.c b/kernel/bpf/bpf_local_storage.c
index f27fa5ba7d72..35f4138a54dc 100644
--- a/kernel/bpf/bpf_local_storage.c
+++ b/kernel/bpf/bpf_local_storage.c
@@ -88,8 +88,14 @@ void bpf_local_storage_free_rcu(struct rcu_head *rcu)
{
struct bpf_local_storage *local_storage;
+ /* If RCU Tasks Trace grace period implies RCU grace period, do
+ * kfree(), else do kfree_rcu().
+ */
local_storage = container_of(rcu, struct bpf_local_storage, rcu);
- kfree_rcu(local_storage, rcu);
+ if (rcu_trace_implies_rcu_gp())
+ kfree(local_storage);
+ else
+ kfree_rcu(local_storage, rcu);
}
static void bpf_selem_free_rcu(struct rcu_head *rcu)
@@ -97,16 +103,19 @@ static void bpf_selem_free_rcu(struct rcu_head *rcu)
struct bpf_local_storage_elem *selem;
selem = container_of(rcu, struct bpf_local_storage_elem, rcu);
- kfree_rcu(selem, rcu);
+ if (rcu_trace_implies_rcu_gp())
+ kfree(selem);
+ else
+ kfree_rcu(selem, rcu);
}
/* local_storage->lock must be held and selem->local_storage == local_storage.
* The caller must ensure selem->smap is still valid to be
* dereferenced for its smap->elem_size and smap->cache_idx.
*/
-bool bpf_selem_unlink_storage_nolock(struct bpf_local_storage *local_storage,
- struct bpf_local_storage_elem *selem,
- bool uncharge_mem, bool use_trace_rcu)
+static bool bpf_selem_unlink_storage_nolock(struct bpf_local_storage *local_storage,
+ struct bpf_local_storage_elem *selem,
+ bool uncharge_mem, bool use_trace_rcu)
{
struct bpf_local_storage_map *smap;
bool free_local_storage;
@@ -233,6 +242,7 @@ void bpf_selem_unlink(struct bpf_local_storage_elem *selem, bool use_trace_rcu)
__bpf_selem_unlink_storage(selem, use_trace_rcu);
}
+/* If cacheit_lockit is false, this lookup function is lockless */
struct bpf_local_storage_data *
bpf_local_storage_lookup(struct bpf_local_storage *local_storage,
struct bpf_local_storage_map *smap,
@@ -372,7 +382,7 @@ bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap,
if (unlikely((map_flags & ~BPF_F_LOCK) > BPF_EXIST) ||
/* BPF_F_LOCK can only be used in a value with spin_lock */
unlikely((map_flags & BPF_F_LOCK) &&
- !map_value_has_spin_lock(&smap->map)))
+ !btf_record_has_field(smap->map.record, BPF_SPIN_LOCK)))
return ERR_PTR(-EINVAL);
if (gfp_flags == GFP_KERNEL && (map_flags & ~BPF_F_LOCK) != BPF_NOEXIST)
@@ -491,7 +501,7 @@ unlock_err:
return ERR_PTR(err);
}
-u16 bpf_local_storage_cache_idx_get(struct bpf_local_storage_cache *cache)
+static u16 bpf_local_storage_cache_idx_get(struct bpf_local_storage_cache *cache)
{
u64 min_usage = U64_MAX;
u16 i, res = 0;
@@ -515,76 +525,14 @@ u16 bpf_local_storage_cache_idx_get(struct bpf_local_storage_cache *cache)
return res;
}
-void bpf_local_storage_cache_idx_free(struct bpf_local_storage_cache *cache,
- u16 idx)
+static void bpf_local_storage_cache_idx_free(struct bpf_local_storage_cache *cache,
+ u16 idx)
{
spin_lock(&cache->idx_lock);
cache->idx_usage_counts[idx]--;
spin_unlock(&cache->idx_lock);
}
-void bpf_local_storage_map_free(struct bpf_local_storage_map *smap,
- int __percpu *busy_counter)
-{
- struct bpf_local_storage_elem *selem;
- struct bpf_local_storage_map_bucket *b;
- unsigned int i;
-
- /* Note that this map might be concurrently cloned from
- * bpf_sk_storage_clone. Wait for any existing bpf_sk_storage_clone
- * RCU read section to finish before proceeding. New RCU
- * read sections should be prevented via bpf_map_inc_not_zero.
- */
- synchronize_rcu();
-
- /* bpf prog and the userspace can no longer access this map
- * now. No new selem (of this map) can be added
- * to the owner->storage or to the map bucket's list.
- *
- * The elem of this map can be cleaned up here
- * or when the storage is freed e.g.
- * by bpf_sk_storage_free() during __sk_destruct().
- */
- for (i = 0; i < (1U << smap->bucket_log); i++) {
- b = &smap->buckets[i];
-
- rcu_read_lock();
- /* No one is adding to b->list now */
- while ((selem = hlist_entry_safe(
- rcu_dereference_raw(hlist_first_rcu(&b->list)),
- struct bpf_local_storage_elem, map_node))) {
- if (busy_counter) {
- migrate_disable();
- this_cpu_inc(*busy_counter);
- }
- bpf_selem_unlink(selem, false);
- if (busy_counter) {
- this_cpu_dec(*busy_counter);
- migrate_enable();
- }
- cond_resched_rcu();
- }
- rcu_read_unlock();
- }
-
- /* While freeing the storage we may still need to access the map.
- *
- * e.g. when bpf_sk_storage_free() has unlinked selem from the map
- * which then made the above while((selem = ...)) loop
- * exit immediately.
- *
- * However, while freeing the storage one still needs to access the
- * smap->elem_size to do the uncharging in
- * bpf_selem_unlink_storage_nolock().
- *
- * Hence, wait another rcu grace period for the storage to be freed.
- */
- synchronize_rcu();
-
- kvfree(smap->buckets);
- bpf_map_area_free(smap);
-}
-
int bpf_local_storage_map_alloc_check(union bpf_attr *attr)
{
if (attr->map_flags & ~BPF_LOCAL_STORAGE_CREATE_FLAG_MASK ||
@@ -604,7 +552,7 @@ int bpf_local_storage_map_alloc_check(union bpf_attr *attr)
return 0;
}
-struct bpf_local_storage_map *bpf_local_storage_map_alloc(union bpf_attr *attr)
+static struct bpf_local_storage_map *__bpf_local_storage_map_alloc(union bpf_attr *attr)
{
struct bpf_local_storage_map *smap;
unsigned int i;
@@ -620,8 +568,8 @@ struct bpf_local_storage_map *bpf_local_storage_map_alloc(union bpf_attr *attr)
nbuckets = max_t(u32, 2, nbuckets);
smap->bucket_log = ilog2(nbuckets);
- smap->buckets = kvcalloc(sizeof(*smap->buckets), nbuckets,
- GFP_USER | __GFP_NOWARN | __GFP_ACCOUNT);
+ smap->buckets = bpf_map_kvcalloc(&smap->map, sizeof(*smap->buckets),
+ nbuckets, GFP_USER | __GFP_NOWARN);
if (!smap->buckets) {
bpf_map_area_free(smap);
return ERR_PTR(-ENOMEM);
@@ -632,8 +580,8 @@ struct bpf_local_storage_map *bpf_local_storage_map_alloc(union bpf_attr *attr)
raw_spin_lock_init(&smap->buckets[i].lock);
}
- smap->elem_size =
- sizeof(struct bpf_local_storage_elem) + attr->value_size;
+ smap->elem_size = offsetof(struct bpf_local_storage_elem,
+ sdata.data[attr->value_size]);
return smap;
}
@@ -654,3 +602,117 @@ int bpf_local_storage_map_check_btf(const struct bpf_map *map,
return 0;
}
+
+bool bpf_local_storage_unlink_nolock(struct bpf_local_storage *local_storage)
+{
+ struct bpf_local_storage_elem *selem;
+ bool free_storage = false;
+ struct hlist_node *n;
+
+ /* Neither the bpf_prog nor the bpf_map's syscall
+ * could be modifying the local_storage->list now.
+ * Thus, no elem can be added to or deleted from the
+ * local_storage->list by the bpf_prog or by the bpf_map's syscall.
+ *
+ * It is racing with bpf_local_storage_map_free() alone
+ * when unlinking elem from the local_storage->list and
+ * the map's bucket->list.
+ */
+ hlist_for_each_entry_safe(selem, n, &local_storage->list, snode) {
+ /* Always unlink from map before unlinking from
+ * local_storage.
+ */
+ bpf_selem_unlink_map(selem);
+ /* If local_storage list has only one element, the
+ * bpf_selem_unlink_storage_nolock() will return true.
+ * Otherwise, it will return false. The current loop iteration
+ * intends to remove all local storage. So the last iteration
+ * of the loop will set free_storage to true.
+ */
+ free_storage = bpf_selem_unlink_storage_nolock(
+ local_storage, selem, false, false);
+ }
+
+ return free_storage;
+}
+
+struct bpf_map *
+bpf_local_storage_map_alloc(union bpf_attr *attr,
+ struct bpf_local_storage_cache *cache)
+{
+ struct bpf_local_storage_map *smap;
+
+ smap = __bpf_local_storage_map_alloc(attr);
+ if (IS_ERR(smap))
+ return ERR_CAST(smap);
+
+ smap->cache_idx = bpf_local_storage_cache_idx_get(cache);
+ return &smap->map;
+}
+
+void bpf_local_storage_map_free(struct bpf_map *map,
+ struct bpf_local_storage_cache *cache,
+ int __percpu *busy_counter)
+{
+ struct bpf_local_storage_map_bucket *b;
+ struct bpf_local_storage_elem *selem;
+ struct bpf_local_storage_map *smap;
+ unsigned int i;
+
+ smap = (struct bpf_local_storage_map *)map;
+ bpf_local_storage_cache_idx_free(cache, smap->cache_idx);
+
+ /* Note that this map might be concurrently cloned from
+ * bpf_sk_storage_clone. Wait for any existing bpf_sk_storage_clone
+ * RCU read section to finish before proceeding. New RCU
+ * read sections should be prevented via bpf_map_inc_not_zero.
+ */
+ synchronize_rcu();
+
+ /* bpf prog and the userspace can no longer access this map
+ * now. No new selem (of this map) can be added
+ * to the owner->storage or to the map bucket's list.
+ *
+ * The elem of this map can be cleaned up here
+ * or when the storage is freed e.g.
+ * by bpf_sk_storage_free() during __sk_destruct().
+ */
+ for (i = 0; i < (1U << smap->bucket_log); i++) {
+ b = &smap->buckets[i];
+
+ rcu_read_lock();
+ /* No one is adding to b->list now */
+ while ((selem = hlist_entry_safe(
+ rcu_dereference_raw(hlist_first_rcu(&b->list)),
+ struct bpf_local_storage_elem, map_node))) {
+ if (busy_counter) {
+ migrate_disable();
+ this_cpu_inc(*busy_counter);
+ }
+ bpf_selem_unlink(selem, false);
+ if (busy_counter) {
+ this_cpu_dec(*busy_counter);
+ migrate_enable();
+ }
+ cond_resched_rcu();
+ }
+ rcu_read_unlock();
+ }
+
+ /* While freeing the storage we may still need to access the map.
+ *
+ * e.g. when bpf_sk_storage_free() has unlinked selem from the map
+ * which then made the above while((selem = ...)) loop
+ * exit immediately.
+ *
+ * However, while freeing the storage one still needs to access the
+ * smap->elem_size to do the uncharging in
+ * bpf_selem_unlink_storage_nolock().
+ *
+ * Hence, wait another rcu grace period for the storage to be freed.
+ */
+ synchronize_rcu();
+
+ kvfree(smap->buckets);
+ bpf_map_area_free(smap);
+}
diff --git a/kernel/bpf/bpf_lsm.c b/kernel/bpf/bpf_lsm.c
index d6c9b3705f24..e14c822f8911 100644
--- a/kernel/bpf/bpf_lsm.c
+++ b/kernel/bpf/bpf_lsm.c
@@ -51,7 +51,6 @@ BTF_SET_END(bpf_lsm_current_hooks)
*/
BTF_SET_START(bpf_lsm_locked_sockopt_hooks)
#ifdef CONFIG_SECURITY_NETWORK
-BTF_ID(func, bpf_lsm_socket_sock_rcv_skb)
BTF_ID(func, bpf_lsm_sock_graft)
BTF_ID(func, bpf_lsm_inet_csk_clone)
BTF_ID(func, bpf_lsm_inet_conn_established)
@@ -151,6 +150,7 @@ BTF_ID_LIST_SINGLE(bpf_ima_inode_hash_btf_ids, struct, inode)
static const struct bpf_func_proto bpf_ima_inode_hash_proto = {
.func = bpf_ima_inode_hash,
.gpl_only = false,
+ .might_sleep = true,
.ret_type = RET_INTEGER,
.arg1_type = ARG_PTR_TO_BTF_ID,
.arg1_btf_id = &bpf_ima_inode_hash_btf_ids[0],
@@ -169,6 +169,7 @@ BTF_ID_LIST_SINGLE(bpf_ima_file_hash_btf_ids, struct, file)
static const struct bpf_func_proto bpf_ima_file_hash_proto = {
.func = bpf_ima_file_hash,
.gpl_only = false,
+ .might_sleep = true,
.ret_type = RET_INTEGER,
.arg1_type = ARG_PTR_TO_BTF_ID,
.arg1_btf_id = &bpf_ima_file_hash_btf_ids[0],
@@ -221,9 +222,9 @@ bpf_lsm_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
case BPF_FUNC_bprm_opts_set:
return &bpf_bprm_opts_set_proto;
case BPF_FUNC_ima_inode_hash:
- return prog->aux->sleepable ? &bpf_ima_inode_hash_proto : NULL;
+ return &bpf_ima_inode_hash_proto;
case BPF_FUNC_ima_file_hash:
- return prog->aux->sleepable ? &bpf_ima_file_hash_proto : NULL;
+ return &bpf_ima_file_hash_proto;
case BPF_FUNC_get_attach_cookie:
return bpf_prog_has_trampoline(prog) ? &bpf_get_attach_cookie_proto : NULL;
#ifdef CONFIG_NET
@@ -343,11 +344,29 @@ BTF_ID(func, bpf_lsm_task_to_inode)
BTF_ID(func, bpf_lsm_userns_create)
BTF_SET_END(sleepable_lsm_hooks)
+BTF_SET_START(untrusted_lsm_hooks)
+BTF_ID(func, bpf_lsm_bpf_map_free_security)
+BTF_ID(func, bpf_lsm_bpf_prog_alloc_security)
+BTF_ID(func, bpf_lsm_bpf_prog_free_security)
+BTF_ID(func, bpf_lsm_file_alloc_security)
+BTF_ID(func, bpf_lsm_file_free_security)
+#ifdef CONFIG_SECURITY_NETWORK
+BTF_ID(func, bpf_lsm_sk_alloc_security)
+BTF_ID(func, bpf_lsm_sk_free_security)
+#endif /* CONFIG_SECURITY_NETWORK */
+BTF_ID(func, bpf_lsm_task_free)
+BTF_SET_END(untrusted_lsm_hooks)
+
bool bpf_lsm_is_sleepable_hook(u32 btf_id)
{
return btf_id_set_contains(&sleepable_lsm_hooks, btf_id);
}
+bool bpf_lsm_is_trusted(const struct bpf_prog *prog)
+{
+ return !btf_id_set_contains(&untrusted_lsm_hooks, prog->aux->attach_btf_id);
+}
+
const struct bpf_prog_ops lsm_prog_ops = {
};
diff --git a/kernel/bpf/bpf_struct_ops.c b/kernel/bpf/bpf_struct_ops.c
index 84b2d9dba79a..ece9870cab68 100644
--- a/kernel/bpf/bpf_struct_ops.c
+++ b/kernel/bpf/bpf_struct_ops.c
@@ -494,8 +494,7 @@ static int bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key,
refcount_set(&kvalue->refcnt, 1);
bpf_map_inc(map);
- set_memory_ro((long)st_map->image, 1);
- set_memory_x((long)st_map->image, 1);
+ set_memory_rox((long)st_map->image, 1);
err = st_ops->reg(kdata);
if (likely(!err)) {
/* Pair with smp_load_acquire() during lookup_elem().
diff --git a/kernel/bpf/bpf_task_storage.c b/kernel/bpf/bpf_task_storage.c
index 6f290623347e..1e486055a523 100644
--- a/kernel/bpf/bpf_task_storage.c
+++ b/kernel/bpf/bpf_task_storage.c
@@ -71,10 +71,8 @@ task_storage_lookup(struct task_struct *task, struct bpf_map *map,
void bpf_task_storage_free(struct task_struct *task)
{
- struct bpf_local_storage_elem *selem;
struct bpf_local_storage *local_storage;
bool free_task_storage = false;
- struct hlist_node *n;
unsigned long flags;
rcu_read_lock();
@@ -85,32 +83,13 @@ void bpf_task_storage_free(struct task_struct *task)
return;
}
- /* Neither the bpf_prog nor the bpf-map's syscall
- * could be modifying the local_storage->list now.
- * Thus, no elem can be added-to or deleted-from the
- * local_storage->list by the bpf_prog or by the bpf-map's syscall.
- *
- * It is racing with bpf_local_storage_map_free() alone
- * when unlinking elem from the local_storage->list and
- * the map's bucket->list.
- */
bpf_task_storage_lock();
raw_spin_lock_irqsave(&local_storage->lock, flags);
- hlist_for_each_entry_safe(selem, n, &local_storage->list, snode) {
- /* Always unlink from map before unlinking from
- * local_storage.
- */
- bpf_selem_unlink_map(selem);
- free_task_storage = bpf_selem_unlink_storage_nolock(
- local_storage, selem, false, false);
- }
+ free_task_storage = bpf_local_storage_unlink_nolock(local_storage);
raw_spin_unlock_irqrestore(&local_storage->lock, flags);
bpf_task_storage_unlock();
rcu_read_unlock();
- /* free_task_storage should always be true as long as
- * local_storage->list was non-empty.
- */
if (free_task_storage)
kfree_rcu(local_storage, rcu);
}
@@ -184,7 +163,8 @@ out:
return err;
}
-static int task_storage_delete(struct task_struct *task, struct bpf_map *map)
+static int task_storage_delete(struct task_struct *task, struct bpf_map *map,
+ bool nobusy)
{
struct bpf_local_storage_data *sdata;
@@ -192,6 +172,9 @@ static int task_storage_delete(struct task_struct *task, struct bpf_map *map)
if (!sdata)
return -ENOENT;
+ if (!nobusy)
+ return -EBUSY;
+
bpf_selem_unlink(SELEM(sdata), true);
return 0;
@@ -220,63 +203,108 @@ static int bpf_pid_task_storage_delete_elem(struct bpf_map *map, void *key)
}
bpf_task_storage_lock();
- err = task_storage_delete(task, map);
+ err = task_storage_delete(task, map, true);
bpf_task_storage_unlock();
out:
put_pid(pid);
return err;
}
-/* *gfp_flags* is a hidden argument provided by the verifier */
-BPF_CALL_5(bpf_task_storage_get, struct bpf_map *, map, struct task_struct *,
- task, void *, value, u64, flags, gfp_t, gfp_flags)
+/* Called by bpf_task_storage_get*() helpers */
+static void *__bpf_task_storage_get(struct bpf_map *map,
+ struct task_struct *task, void *value,
+ u64 flags, gfp_t gfp_flags, bool nobusy)
{
struct bpf_local_storage_data *sdata;
- WARN_ON_ONCE(!bpf_rcu_lock_held());
- if (flags & ~(BPF_LOCAL_STORAGE_GET_F_CREATE))
- return (unsigned long)NULL;
-
- if (!task)
- return (unsigned long)NULL;
-
- if (!bpf_task_storage_trylock())
- return (unsigned long)NULL;
-
- sdata = task_storage_lookup(task, map, true);
+ sdata = task_storage_lookup(task, map, nobusy);
if (sdata)
- goto unlock;
+ return sdata->data;
/* only allocate new storage, when the task is refcounted */
if (refcount_read(&task->usage) &&
- (flags & BPF_LOCAL_STORAGE_GET_F_CREATE))
+ (flags & BPF_LOCAL_STORAGE_GET_F_CREATE) && nobusy) {
sdata = bpf_local_storage_update(
task, (struct bpf_local_storage_map *)map, value,
BPF_NOEXIST, gfp_flags);
+ return IS_ERR(sdata) ? NULL : sdata->data;
+ }
+
+ return NULL;
+}
-unlock:
+/* *gfp_flags* is a hidden argument provided by the verifier */
+BPF_CALL_5(bpf_task_storage_get_recur, struct bpf_map *, map, struct task_struct *,
+ task, void *, value, u64, flags, gfp_t, gfp_flags)
+{
+ bool nobusy;
+ void *data;
+
+ WARN_ON_ONCE(!bpf_rcu_lock_held());
+ if (flags & ~BPF_LOCAL_STORAGE_GET_F_CREATE || !task)
+ return (unsigned long)NULL;
+
+ nobusy = bpf_task_storage_trylock();
+ data = __bpf_task_storage_get(map, task, value, flags,
+ gfp_flags, nobusy);
+ if (nobusy)
+ bpf_task_storage_unlock();
+ return (unsigned long)data;
+}
+
+/* *gfp_flags* is a hidden argument provided by the verifier */
+BPF_CALL_5(bpf_task_storage_get, struct bpf_map *, map, struct task_struct *,
+ task, void *, value, u64, flags, gfp_t, gfp_flags)
+{
+ void *data;
+
+ WARN_ON_ONCE(!bpf_rcu_lock_held());
+ if (flags & ~BPF_LOCAL_STORAGE_GET_F_CREATE || !task)
+ return (unsigned long)NULL;
+
+ bpf_task_storage_lock();
+ data = __bpf_task_storage_get(map, task, value, flags,
+ gfp_flags, true);
bpf_task_storage_unlock();
- return IS_ERR_OR_NULL(sdata) ? (unsigned long)NULL :
- (unsigned long)sdata->data;
+ return (unsigned long)data;
}
-BPF_CALL_2(bpf_task_storage_delete, struct bpf_map *, map, struct task_struct *,
+BPF_CALL_2(bpf_task_storage_delete_recur, struct bpf_map *, map, struct task_struct *,
task)
{
+ bool nobusy;
int ret;
WARN_ON_ONCE(!bpf_rcu_lock_held());
if (!task)
return -EINVAL;
- if (!bpf_task_storage_trylock())
- return -EBUSY;
+ nobusy = bpf_task_storage_trylock();
+ /* This helper must only be called from places where the lifetime of the task
+ * is guaranteed. Either by being refcounted or by being protected
+ * by an RCU read-side critical section.
+ */
+ ret = task_storage_delete(task, map, nobusy);
+ if (nobusy)
+ bpf_task_storage_unlock();
+ return ret;
+}
+
+BPF_CALL_2(bpf_task_storage_delete, struct bpf_map *, map, struct task_struct *,
+ task)
+{
+ int ret;
+
+ WARN_ON_ONCE(!bpf_rcu_lock_held());
+ if (!task)
+ return -EINVAL;
+ bpf_task_storage_lock();
/* This helper must only be called from places where the lifetime of the task
* is guaranteed. Either by being refcounted or by being protected
* by an RCU read-side critical section.
*/
- ret = task_storage_delete(task, map);
+ ret = task_storage_delete(task, map, true);
bpf_task_storage_unlock();
return ret;
}
@@ -288,26 +316,15 @@ static int notsupp_get_next_key(struct bpf_map *map, void *key, void *next_key)
static struct bpf_map *task_storage_map_alloc(union bpf_attr *attr)
{
- struct bpf_local_storage_map *smap;
-
- smap = bpf_local_storage_map_alloc(attr);
- if (IS_ERR(smap))
- return ERR_CAST(smap);
-
- smap->cache_idx = bpf_local_storage_cache_idx_get(&task_cache);
- return &smap->map;
+ return bpf_local_storage_map_alloc(attr, &task_cache);
}
static void task_storage_map_free(struct bpf_map *map)
{
- struct bpf_local_storage_map *smap;
-
- smap = (struct bpf_local_storage_map *)map;
- bpf_local_storage_cache_idx_free(&task_cache, smap->cache_idx);
- bpf_local_storage_map_free(smap, &bpf_task_storage_busy);
+ bpf_local_storage_map_free(map, &task_cache, &bpf_task_storage_busy);
}
-BTF_ID_LIST_SINGLE(task_storage_map_btf_ids, struct, bpf_local_storage_map)
+BTF_ID_LIST_GLOBAL_SINGLE(bpf_local_storage_map_btf_id, struct, bpf_local_storage_map)
const struct bpf_map_ops task_storage_map_ops = {
.map_meta_equal = bpf_map_meta_equal,
.map_alloc_check = bpf_local_storage_map_alloc_check,
@@ -318,10 +335,21 @@ const struct bpf_map_ops task_storage_map_ops = {
.map_update_elem = bpf_pid_task_storage_update_elem,
.map_delete_elem = bpf_pid_task_storage_delete_elem,
.map_check_btf = bpf_local_storage_map_check_btf,
- .map_btf_id = &task_storage_map_btf_ids[0],
+ .map_btf_id = &bpf_local_storage_map_btf_id[0],
.map_owner_storage_ptr = task_storage_ptr,
};
+const struct bpf_func_proto bpf_task_storage_get_recur_proto = {
+ .func = bpf_task_storage_get_recur,
+ .gpl_only = false,
+ .ret_type = RET_PTR_TO_MAP_VALUE_OR_NULL,
+ .arg1_type = ARG_CONST_MAP_PTR,
+ .arg2_type = ARG_PTR_TO_BTF_ID,
+ .arg2_btf_id = &btf_tracing_ids[BTF_TRACING_TYPE_TASK],
+ .arg3_type = ARG_PTR_TO_MAP_VALUE_OR_NULL,
+ .arg4_type = ARG_ANYTHING,
+};
+
const struct bpf_func_proto bpf_task_storage_get_proto = {
.func = bpf_task_storage_get,
.gpl_only = false,
@@ -333,6 +361,15 @@ const struct bpf_func_proto bpf_task_storage_get_proto = {
.arg4_type = ARG_ANYTHING,
};
+const struct bpf_func_proto bpf_task_storage_delete_recur_proto = {
+ .func = bpf_task_storage_delete_recur,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_CONST_MAP_PTR,
+ .arg2_type = ARG_PTR_TO_BTF_ID,
+ .arg2_btf_id = &btf_tracing_ids[BTF_TRACING_TYPE_TASK],
+};
+
const struct bpf_func_proto bpf_task_storage_delete_proto = {
.func = bpf_task_storage_delete,
.gpl_only = false,
diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index 35c07afac924..fa22ec79ac0e 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -19,6 +19,7 @@
#include <linux/bpf_verifier.h>
#include <linux/btf.h>
#include <linux/btf_ids.h>
+#include <linux/bpf_lsm.h>
#include <linux/skmsg.h>
#include <linux/perf_event.h>
#include <linux/bsearch.h>
@@ -199,11 +200,13 @@ DEFINE_IDR(btf_idr);
DEFINE_SPINLOCK(btf_idr_lock);
enum btf_kfunc_hook {
+ BTF_KFUNC_HOOK_COMMON,
BTF_KFUNC_HOOK_XDP,
BTF_KFUNC_HOOK_TC,
BTF_KFUNC_HOOK_STRUCT_OPS,
BTF_KFUNC_HOOK_TRACING,
BTF_KFUNC_HOOK_SYSCALL,
+ BTF_KFUNC_HOOK_FMODRET,
BTF_KFUNC_HOOK_MAX,
};
@@ -237,6 +240,7 @@ struct btf {
struct rcu_head rcu;
struct btf_kfunc_set_tab *kfunc_set_tab;
struct btf_id_dtor_kfunc_tab *dtor_kfunc_tab;
+ struct btf_struct_metas *struct_meta_tab;
/* split BTF support */
struct btf *base_btf;
@@ -333,6 +337,12 @@ const char *btf_type_str(const struct btf_type *t)
#define BTF_SHOW_NAME_SIZE 80
/*
+ * The suffix of a type that indicates it cannot alias another type when
+ * comparing BTF IDs for kfunc invocations.
+ */
+#define NOCAST_ALIAS_SUFFIX "___init"
+
+/*
* Common data to all BTF show operations. Private show functions can add
* their own data to a structure containing a struct btf_show and consult it
* in the show callback. See btf_type_show() below.
@@ -477,16 +487,6 @@ static bool btf_type_nosize_or_null(const struct btf_type *t)
return !t || btf_type_nosize(t);
}
-static bool __btf_type_is_struct(const struct btf_type *t)
-{
- return BTF_INFO_KIND(t->info) == BTF_KIND_STRUCT;
-}
-
-static bool btf_type_is_array(const struct btf_type *t)
-{
- return BTF_INFO_KIND(t->info) == BTF_KIND_ARRAY;
-}
-
static bool btf_type_is_datasec(const struct btf_type *t)
{
return BTF_INFO_KIND(t->info) == BTF_KIND_DATASEC;
@@ -1403,12 +1403,18 @@ __printf(4, 5) static void __btf_verifier_log_type(struct btf_verifier_env *env,
if (!bpf_verifier_log_needed(log))
return;
- /* btf verifier prints all types it is processing via
- * btf_verifier_log_type(..., fmt = NULL).
- * Skip those prints for in-kernel BTF verification.
- */
- if (log->level == BPF_LOG_KERNEL && !fmt)
- return;
+ if (log->level == BPF_LOG_KERNEL) {
+ /* btf verifier prints all types it is processing via
+ * btf_verifier_log_type(..., fmt = NULL).
+ * Skip those prints for in-kernel BTF verification.
+ */
+ if (!fmt)
+ return;
+
+ /* Skip logging when loading module BTF with mismatches permitted */
+ if (env->btf->base_btf && IS_ENABLED(CONFIG_MODULE_ALLOW_BTF_MISMATCH))
+ return;
+ }
__btf_verifier_log(log, "[%u] %s %s%s",
env->log_type_id,
@@ -1447,8 +1453,15 @@ static void btf_verifier_log_member(struct btf_verifier_env *env,
if (!bpf_verifier_log_needed(log))
return;
- if (log->level == BPF_LOG_KERNEL && !fmt)
- return;
+ if (log->level == BPF_LOG_KERNEL) {
+ if (!fmt)
+ return;
+
+ /* Skip logging when loading module BTF with mismatches permitted */
+ if (env->btf->base_btf && IS_ENABLED(CONFIG_MODULE_ALLOW_BTF_MISMATCH))
+ return;
+ }
+
/* The CHECK_META phase already did a btf dump.
*
* If member is logged again, it must hit an error in
@@ -1642,8 +1655,30 @@ static void btf_free_dtor_kfunc_tab(struct btf *btf)
btf->dtor_kfunc_tab = NULL;
}
+static void btf_struct_metas_free(struct btf_struct_metas *tab)
+{
+ int i;
+
+ if (!tab)
+ return;
+ for (i = 0; i < tab->cnt; i++) {
+ btf_record_free(tab->types[i].record);
+ kfree(tab->types[i].field_offs);
+ }
+ kfree(tab);
+}
+
+static void btf_free_struct_meta_tab(struct btf *btf)
+{
+ struct btf_struct_metas *tab = btf->struct_meta_tab;
+
+ btf_struct_metas_free(tab);
+ btf->struct_meta_tab = NULL;
+}
+
static void btf_free(struct btf *btf)
{
+ btf_free_struct_meta_tab(btf);
btf_free_dtor_kfunc_tab(btf);
btf_free_kfunc_set_tab(btf);
kvfree(btf->types);
@@ -3191,7 +3226,7 @@ static void btf_struct_log(struct btf_verifier_env *env,
btf_verifier_log(env, "size=%u vlen=%u", t->size, btf_type_vlen(t));
}
-enum btf_field_type {
+enum btf_field_info_type {
BTF_FIELD_SPIN_LOCK,
BTF_FIELD_TIMER,
BTF_FIELD_KPTR,
@@ -3203,18 +3238,28 @@ enum {
};
struct btf_field_info {
- u32 type_id;
+ enum btf_field_type type;
u32 off;
- enum bpf_kptr_type type;
+ union {
+ struct {
+ u32 type_id;
+ } kptr;
+ struct {
+ const char *node_name;
+ u32 value_btf_id;
+ } graph_root;
+ };
};
static int btf_find_struct(const struct btf *btf, const struct btf_type *t,
- u32 off, int sz, struct btf_field_info *info)
+ u32 off, int sz, enum btf_field_type field_type,
+ struct btf_field_info *info)
{
if (!__btf_type_is_struct(t))
return BTF_FIELD_IGNORE;
if (t->size != sz)
return BTF_FIELD_IGNORE;
+ info->type = field_type;
info->off = off;
return BTF_FIELD_FOUND;
}
@@ -3222,9 +3267,12 @@ static int btf_find_struct(const struct btf *btf, const struct btf_type *t,
static int btf_find_kptr(const struct btf *btf, const struct btf_type *t,
u32 off, int sz, struct btf_field_info *info)
{
- enum bpf_kptr_type type;
+ enum btf_field_type type;
u32 res_id;
+ /* Permit modifiers on the pointer itself */
+ if (btf_type_is_volatile(t))
+ t = btf_type_by_id(btf, t->type);
/* For PTR, sz is always == 8 */
if (!btf_type_is_ptr(t))
return BTF_FIELD_IGNORE;
@@ -3248,28 +3296,138 @@ static int btf_find_kptr(const struct btf *btf, const struct btf_type *t,
if (!__btf_type_is_struct(t))
return -EINVAL;
- info->type_id = res_id;
- info->off = off;
info->type = type;
+ info->off = off;
+ info->kptr.type_id = res_id;
return BTF_FIELD_FOUND;
}
-static int btf_find_struct_field(const struct btf *btf, const struct btf_type *t,
- const char *name, int sz, int align,
- enum btf_field_type field_type,
+static const char *btf_find_decl_tag_value(const struct btf *btf,
+ const struct btf_type *pt,
+ int comp_idx, const char *tag_key)
+{
+ int i;
+
+ for (i = 1; i < btf_nr_types(btf); i++) {
+ const struct btf_type *t = btf_type_by_id(btf, i);
+ int len = strlen(tag_key);
+
+ if (!btf_type_is_decl_tag(t))
+ continue;
+ if (pt != btf_type_by_id(btf, t->type) ||
+ btf_type_decl_tag(t)->component_idx != comp_idx)
+ continue;
+ if (strncmp(__btf_name_by_offset(btf, t->name_off), tag_key, len))
+ continue;
+ return __btf_name_by_offset(btf, t->name_off) + len;
+ }
+ return NULL;
+}
+
+static int
+btf_find_graph_root(const struct btf *btf, const struct btf_type *pt,
+ const struct btf_type *t, int comp_idx, u32 off,
+ int sz, struct btf_field_info *info,
+ enum btf_field_type head_type)
+{
+ const char *node_field_name;
+ const char *value_type;
+ s32 id;
+
+ if (!__btf_type_is_struct(t))
+ return BTF_FIELD_IGNORE;
+ if (t->size != sz)
+ return BTF_FIELD_IGNORE;
+ value_type = btf_find_decl_tag_value(btf, pt, comp_idx, "contains:");
+ if (!value_type)
+ return -EINVAL;
+ node_field_name = strstr(value_type, ":");
+ if (!node_field_name)
+ return -EINVAL;
+ value_type = kstrndup(value_type, node_field_name - value_type, GFP_KERNEL | __GFP_NOWARN);
+ if (!value_type)
+ return -ENOMEM;
+ id = btf_find_by_name_kind(btf, value_type, BTF_KIND_STRUCT);
+ kfree(value_type);
+ if (id < 0)
+ return id;
+ node_field_name++;
+ if (str_is_empty(node_field_name))
+ return -EINVAL;
+ info->type = head_type;
+ info->off = off;
+ info->graph_root.value_btf_id = id;
+ info->graph_root.node_name = node_field_name;
+ return BTF_FIELD_FOUND;
+}
+
+#define field_mask_test_name(field_type, field_type_str) \
+ if (field_mask & field_type && !strcmp(name, field_type_str)) { \
+ type = field_type; \
+ goto end; \
+ }
+
+static int btf_get_field_type(const char *name, u32 field_mask, u32 *seen_mask,
+ int *align, int *sz)
+{
+ int type = 0;
+
+ if (field_mask & BPF_SPIN_LOCK) {
+ if (!strcmp(name, "bpf_spin_lock")) {
+ if (*seen_mask & BPF_SPIN_LOCK)
+ return -E2BIG;
+ *seen_mask |= BPF_SPIN_LOCK;
+ type = BPF_SPIN_LOCK;
+ goto end;
+ }
+ }
+ if (field_mask & BPF_TIMER) {
+ if (!strcmp(name, "bpf_timer")) {
+ if (*seen_mask & BPF_TIMER)
+ return -E2BIG;
+ *seen_mask |= BPF_TIMER;
+ type = BPF_TIMER;
+ goto end;
+ }
+ }
+ field_mask_test_name(BPF_LIST_HEAD, "bpf_list_head");
+ field_mask_test_name(BPF_LIST_NODE, "bpf_list_node");
+ field_mask_test_name(BPF_RB_ROOT, "bpf_rb_root");
+ field_mask_test_name(BPF_RB_NODE, "bpf_rb_node");
+
+ /* Only return BPF_KPTR when all other types with matchable names fail */
+ if (field_mask & BPF_KPTR) {
+ type = BPF_KPTR_REF;
+ goto end;
+ }
+ return 0;
+end:
+ *sz = btf_field_type_size(type);
+ *align = btf_field_type_align(type);
+ return type;
+}
+
+#undef field_mask_test_name
+
+static int btf_find_struct_field(const struct btf *btf,
+ const struct btf_type *t, u32 field_mask,
struct btf_field_info *info, int info_cnt)
{
+ int ret, idx = 0, align, sz, field_type;
const struct btf_member *member;
struct btf_field_info tmp;
- int ret, idx = 0;
- u32 i, off;
+ u32 i, off, seen_mask = 0;
for_each_member(i, t, member) {
const struct btf_type *member_type = btf_type_by_id(btf,
member->type);
- if (name && strcmp(__btf_name_by_offset(btf, member_type->name_off), name))
+ field_type = btf_get_field_type(__btf_name_by_offset(btf, member_type->name_off),
+ field_mask, &seen_mask, &align, &sz);
+ if (field_type == 0)
continue;
+ if (field_type < 0)
+ return field_type;
off = __btf_member_bit_offset(t, member);
if (off % 8)
@@ -3277,22 +3435,34 @@ static int btf_find_struct_field(const struct btf *btf, const struct btf_type *t
return -EINVAL;
off /= 8;
if (off % align)
- return -EINVAL;
+ continue;
switch (field_type) {
- case BTF_FIELD_SPIN_LOCK:
- case BTF_FIELD_TIMER:
- ret = btf_find_struct(btf, member_type, off, sz,
+ case BPF_SPIN_LOCK:
+ case BPF_TIMER:
+ case BPF_LIST_NODE:
+ case BPF_RB_NODE:
+ ret = btf_find_struct(btf, member_type, off, sz, field_type,
idx < info_cnt ? &info[idx] : &tmp);
if (ret < 0)
return ret;
break;
- case BTF_FIELD_KPTR:
+ case BPF_KPTR_UNREF:
+ case BPF_KPTR_REF:
ret = btf_find_kptr(btf, member_type, off, sz,
idx < info_cnt ? &info[idx] : &tmp);
if (ret < 0)
return ret;
break;
+ case BPF_LIST_HEAD:
+ case BPF_RB_ROOT:
+ ret = btf_find_graph_root(btf, t, member_type,
+ i, off, sz,
+ idx < info_cnt ? &info[idx] : &tmp,
+ field_type);
+ if (ret < 0)
+ return ret;
+ break;
default:
return -EFAULT;
}
@@ -3307,42 +3477,57 @@ static int btf_find_struct_field(const struct btf *btf, const struct btf_type *t
}
static int btf_find_datasec_var(const struct btf *btf, const struct btf_type *t,
- const char *name, int sz, int align,
- enum btf_field_type field_type,
- struct btf_field_info *info, int info_cnt)
+ u32 field_mask, struct btf_field_info *info,
+ int info_cnt)
{
+ int ret, idx = 0, align, sz, field_type;
const struct btf_var_secinfo *vsi;
struct btf_field_info tmp;
- int ret, idx = 0;
- u32 i, off;
+ u32 i, off, seen_mask = 0;
for_each_vsi(i, t, vsi) {
const struct btf_type *var = btf_type_by_id(btf, vsi->type);
const struct btf_type *var_type = btf_type_by_id(btf, var->type);
- off = vsi->offset;
-
- if (name && strcmp(__btf_name_by_offset(btf, var_type->name_off), name))
+ field_type = btf_get_field_type(__btf_name_by_offset(btf, var_type->name_off),
+ field_mask, &seen_mask, &align, &sz);
+ if (field_type == 0)
continue;
+ if (field_type < 0)
+ return field_type;
+
+ off = vsi->offset;
if (vsi->size != sz)
continue;
if (off % align)
- return -EINVAL;
+ continue;
switch (field_type) {
- case BTF_FIELD_SPIN_LOCK:
- case BTF_FIELD_TIMER:
- ret = btf_find_struct(btf, var_type, off, sz,
+ case BPF_SPIN_LOCK:
+ case BPF_TIMER:
+ case BPF_LIST_NODE:
+ case BPF_RB_NODE:
+ ret = btf_find_struct(btf, var_type, off, sz, field_type,
idx < info_cnt ? &info[idx] : &tmp);
if (ret < 0)
return ret;
break;
- case BTF_FIELD_KPTR:
+ case BPF_KPTR_UNREF:
+ case BPF_KPTR_REF:
ret = btf_find_kptr(btf, var_type, off, sz,
idx < info_cnt ? &info[idx] : &tmp);
if (ret < 0)
return ret;
break;
+ case BPF_LIST_HEAD:
+ case BPF_RB_ROOT:
+ ret = btf_find_graph_root(btf, var, var_type,
+ -1, off, sz,
+ idx < info_cnt ? &info[idx] : &tmp,
+ field_type);
+ if (ret < 0)
+ return ret;
+ break;
default:
return -EFAULT;
}
@@ -3357,169 +3542,390 @@ static int btf_find_datasec_var(const struct btf *btf, const struct btf_type *t,
}
static int btf_find_field(const struct btf *btf, const struct btf_type *t,
- enum btf_field_type field_type,
- struct btf_field_info *info, int info_cnt)
+ u32 field_mask, struct btf_field_info *info,
+ int info_cnt)
{
- const char *name;
- int sz, align;
-
- switch (field_type) {
- case BTF_FIELD_SPIN_LOCK:
- name = "bpf_spin_lock";
- sz = sizeof(struct bpf_spin_lock);
- align = __alignof__(struct bpf_spin_lock);
- break;
- case BTF_FIELD_TIMER:
- name = "bpf_timer";
- sz = sizeof(struct bpf_timer);
- align = __alignof__(struct bpf_timer);
- break;
- case BTF_FIELD_KPTR:
- name = NULL;
- sz = sizeof(u64);
- align = 8;
- break;
- default:
- return -EFAULT;
- }
-
if (__btf_type_is_struct(t))
- return btf_find_struct_field(btf, t, name, sz, align, field_type, info, info_cnt);
+ return btf_find_struct_field(btf, t, field_mask, info, info_cnt);
else if (btf_type_is_datasec(t))
- return btf_find_datasec_var(btf, t, name, sz, align, field_type, info, info_cnt);
+ return btf_find_datasec_var(btf, t, field_mask, info, info_cnt);
return -EINVAL;
}
-/* find 'struct bpf_spin_lock' in map value.
- * return >= 0 offset if found
- * and < 0 in case of error
- */
-int btf_find_spin_lock(const struct btf *btf, const struct btf_type *t)
+static int btf_parse_kptr(const struct btf *btf, struct btf_field *field,
+ struct btf_field_info *info)
{
- struct btf_field_info info;
+ struct module *mod = NULL;
+ const struct btf_type *t;
+ struct btf *kernel_btf;
int ret;
+ s32 id;
- ret = btf_find_field(btf, t, BTF_FIELD_SPIN_LOCK, &info, 1);
- if (ret < 0)
- return ret;
- if (!ret)
- return -ENOENT;
- return info.off;
+ /* Find type in map BTF, and use it to look up the matching type
+ * in vmlinux or module BTFs, by name and kind.
+ */
+ t = btf_type_by_id(btf, info->kptr.type_id);
+ id = bpf_find_btf_id(__btf_name_by_offset(btf, t->name_off), BTF_INFO_KIND(t->info),
+ &kernel_btf);
+ if (id < 0)
+ return id;
+
+ /* Find and stash the function pointer for the destruction function that
+ * needs to be eventually invoked from the map free path.
+ */
+ if (info->type == BPF_KPTR_REF) {
+ const struct btf_type *dtor_func;
+ const char *dtor_func_name;
+ unsigned long addr;
+ s32 dtor_btf_id;
+
+ /* This call also serves as a whitelist of allowed objects that
+ * can be used as a referenced pointer and be stored in a map at
+ * the same time.
+ */
+ dtor_btf_id = btf_find_dtor_kfunc(kernel_btf, id);
+ if (dtor_btf_id < 0) {
+ ret = dtor_btf_id;
+ goto end_btf;
+ }
+
+ dtor_func = btf_type_by_id(kernel_btf, dtor_btf_id);
+ if (!dtor_func) {
+ ret = -ENOENT;
+ goto end_btf;
+ }
+
+ if (btf_is_module(kernel_btf)) {
+ mod = btf_try_get_module(kernel_btf);
+ if (!mod) {
+ ret = -ENXIO;
+ goto end_btf;
+ }
+ }
+
+ /* We already verified dtor_func to be btf_type_is_func
+ * in register_btf_id_dtor_kfuncs.
+ */
+ dtor_func_name = __btf_name_by_offset(kernel_btf, dtor_func->name_off);
+ addr = kallsyms_lookup_name(dtor_func_name);
+ if (!addr) {
+ ret = -EINVAL;
+ goto end_mod;
+ }
+ field->kptr.dtor = (void *)addr;
+ }
+
+ field->kptr.btf_id = id;
+ field->kptr.btf = kernel_btf;
+ field->kptr.module = mod;
+ return 0;
+end_mod:
+ module_put(mod);
+end_btf:
+ btf_put(kernel_btf);
+ return ret;
}
-int btf_find_timer(const struct btf *btf, const struct btf_type *t)
+static int btf_parse_graph_root(const struct btf *btf,
+ struct btf_field *field,
+ struct btf_field_info *info,
+ const char *node_type_name,
+ size_t node_type_align)
{
- struct btf_field_info info;
- int ret;
+ const struct btf_type *t, *n = NULL;
+ const struct btf_member *member;
+ u32 offset;
+ int i;
- ret = btf_find_field(btf, t, BTF_FIELD_TIMER, &info, 1);
- if (ret < 0)
- return ret;
- if (!ret)
+ t = btf_type_by_id(btf, info->graph_root.value_btf_id);
+ /* We've already checked that value_btf_id is a struct type. We
+ * just need to figure out the offset of the graph node member
+ * (bpf_list_node or bpf_rb_node), and verify its type.
+ */
+ for_each_member(i, t, member) {
+ if (strcmp(info->graph_root.node_name,
+ __btf_name_by_offset(btf, member->name_off)))
+ continue;
+ /* Invalid BTF, two members with same name */
+ if (n)
+ return -EINVAL;
+ n = btf_type_by_id(btf, member->type);
+ if (!__btf_type_is_struct(n))
+ return -EINVAL;
+ if (strcmp(node_type_name, __btf_name_by_offset(btf, n->name_off)))
+ return -EINVAL;
+ offset = __btf_member_bit_offset(n, member);
+ if (offset % 8)
+ return -EINVAL;
+ offset /= 8;
+ if (offset % node_type_align)
+ return -EINVAL;
+
+ field->graph_root.btf = (struct btf *)btf;
+ field->graph_root.value_btf_id = info->graph_root.value_btf_id;
+ field->graph_root.node_offset = offset;
+ }
+ if (!n)
return -ENOENT;
- return info.off;
+ return 0;
}
-struct bpf_map_value_off *btf_parse_kptrs(const struct btf *btf,
- const struct btf_type *t)
+static int btf_parse_list_head(const struct btf *btf, struct btf_field *field,
+ struct btf_field_info *info)
{
- struct btf_field_info info_arr[BPF_MAP_VALUE_OFF_MAX];
- struct bpf_map_value_off *tab;
- struct btf *kernel_btf = NULL;
- struct module *mod = NULL;
- int ret, i, nr_off;
+ return btf_parse_graph_root(btf, field, info, "bpf_list_node",
+ __alignof__(struct bpf_list_node));
+}
- ret = btf_find_field(btf, t, BTF_FIELD_KPTR, info_arr, ARRAY_SIZE(info_arr));
+static int btf_parse_rb_root(const struct btf *btf, struct btf_field *field,
+ struct btf_field_info *info)
+{
+ return btf_parse_graph_root(btf, field, info, "bpf_rb_node",
+ __alignof__(struct bpf_rb_node));
+}
+
+struct btf_record *btf_parse_fields(const struct btf *btf, const struct btf_type *t,
+ u32 field_mask, u32 value_size)
+{
+ struct btf_field_info info_arr[BTF_FIELDS_MAX];
+ struct btf_record *rec;
+ u32 next_off = 0;
+ int ret, i, cnt;
+
+ ret = btf_find_field(btf, t, field_mask, info_arr, ARRAY_SIZE(info_arr));
if (ret < 0)
return ERR_PTR(ret);
if (!ret)
return NULL;
- nr_off = ret;
- tab = kzalloc(offsetof(struct bpf_map_value_off, off[nr_off]), GFP_KERNEL | __GFP_NOWARN);
- if (!tab)
+ cnt = ret;
+ /* This needs to be kzalloc to zero out padding and unused fields, see
+ * comment in btf_record_equal.
+ */
+ rec = kzalloc(offsetof(struct btf_record, fields[cnt]), GFP_KERNEL | __GFP_NOWARN);
+ if (!rec)
return ERR_PTR(-ENOMEM);
- for (i = 0; i < nr_off; i++) {
- const struct btf_type *t;
- s32 id;
+ rec->spin_lock_off = -EINVAL;
+ rec->timer_off = -EINVAL;
+ for (i = 0; i < cnt; i++) {
+ if (info_arr[i].off + btf_field_type_size(info_arr[i].type) > value_size) {
+ WARN_ONCE(1, "verifier bug off %d size %d", info_arr[i].off, value_size);
+ ret = -EFAULT;
+ goto end;
+ }
+ if (info_arr[i].off < next_off) {
+ ret = -EEXIST;
+ goto end;
+ }
+ next_off = info_arr[i].off + btf_field_type_size(info_arr[i].type);
- /* Find type in map BTF, and use it to look up the matching type
- * in vmlinux or module BTFs, by name and kind.
- */
- t = btf_type_by_id(btf, info_arr[i].type_id);
- id = bpf_find_btf_id(__btf_name_by_offset(btf, t->name_off), BTF_INFO_KIND(t->info),
- &kernel_btf);
- if (id < 0) {
- ret = id;
+ rec->field_mask |= info_arr[i].type;
+ rec->fields[i].offset = info_arr[i].off;
+ rec->fields[i].type = info_arr[i].type;
+
+ switch (info_arr[i].type) {
+ case BPF_SPIN_LOCK:
+ WARN_ON_ONCE(rec->spin_lock_off >= 0);
+ /* Cache offset for faster lookup at runtime */
+ rec->spin_lock_off = rec->fields[i].offset;
+ break;
+ case BPF_TIMER:
+ WARN_ON_ONCE(rec->timer_off >= 0);
+ /* Cache offset for faster lookup at runtime */
+ rec->timer_off = rec->fields[i].offset;
+ break;
+ case BPF_KPTR_UNREF:
+ case BPF_KPTR_REF:
+ ret = btf_parse_kptr(btf, &rec->fields[i], &info_arr[i]);
+ if (ret < 0)
+ goto end;
+ break;
+ case BPF_LIST_HEAD:
+ ret = btf_parse_list_head(btf, &rec->fields[i], &info_arr[i]);
+ if (ret < 0)
+ goto end;
+ break;
+ case BPF_RB_ROOT:
+ ret = btf_parse_rb_root(btf, &rec->fields[i], &info_arr[i]);
+ if (ret < 0)
+ goto end;
+ break;
+ case BPF_LIST_NODE:
+ case BPF_RB_NODE:
+ break;
+ default:
+ ret = -EFAULT;
goto end;
}
+ rec->cnt++;
+ }
- /* Find and stash the function pointer for the destruction function that
- * needs to be eventually invoked from the map free path.
- */
- if (info_arr[i].type == BPF_KPTR_REF) {
- const struct btf_type *dtor_func;
- const char *dtor_func_name;
- unsigned long addr;
- s32 dtor_btf_id;
-
- /* This call also serves as a whitelist of allowed objects that
- * can be used as a referenced pointer and be stored in a map at
- * the same time.
- */
- dtor_btf_id = btf_find_dtor_kfunc(kernel_btf, id);
- if (dtor_btf_id < 0) {
- ret = dtor_btf_id;
- goto end_btf;
- }
+ /* bpf_{list_head, rb_root} require bpf_spin_lock */
+ if ((btf_record_has_field(rec, BPF_LIST_HEAD) ||
+ btf_record_has_field(rec, BPF_RB_ROOT)) && rec->spin_lock_off < 0) {
+ ret = -EINVAL;
+ goto end;
+ }
- dtor_func = btf_type_by_id(kernel_btf, dtor_btf_id);
- if (!dtor_func) {
- ret = -ENOENT;
- goto end_btf;
- }
+ /* need collection identity for non-owning refs before allowing this
+ *
+ * Consider a node type w/ both list and rb_node fields:
+ * struct node {
+ * struct bpf_list_node l;
+ * struct bpf_rb_node r;
+ * }
+ *
+ * Used like so:
+ * struct node *n = bpf_obj_new(....);
+ * bpf_list_push_front(&list_head, &n->l);
+ * bpf_rbtree_remove(&rb_root, &n->r);
+ *
+ * It should not be possible to rbtree_remove the node since it hasn't
+ * been added to a tree. But push_front converts n to a non-owning
+ * reference, and rbtree_remove accepts the non-owning reference to
+ * a type w/ bpf_rb_node field.
+ */
+ if (btf_record_has_field(rec, BPF_LIST_NODE) &&
+ btf_record_has_field(rec, BPF_RB_NODE)) {
+ ret = -EINVAL;
+ goto end;
+ }
- if (btf_is_module(kernel_btf)) {
- mod = btf_try_get_module(kernel_btf);
- if (!mod) {
- ret = -ENXIO;
- goto end_btf;
- }
- }
+ return rec;
+end:
+ btf_record_free(rec);
+ return ERR_PTR(ret);
+}
- /* We already verified dtor_func to be btf_type_is_func
- * in register_btf_id_dtor_kfuncs.
- */
- dtor_func_name = __btf_name_by_offset(kernel_btf, dtor_func->name_off);
- addr = kallsyms_lookup_name(dtor_func_name);
- if (!addr) {
- ret = -EINVAL;
- goto end_mod;
- }
- tab->off[i].kptr.dtor = (void *)addr;
- }
+#define GRAPH_ROOT_MASK (BPF_LIST_HEAD | BPF_RB_ROOT)
+#define GRAPH_NODE_MASK (BPF_LIST_NODE | BPF_RB_NODE)
- tab->off[i].offset = info_arr[i].off;
- tab->off[i].type = info_arr[i].type;
- tab->off[i].kptr.btf_id = id;
- tab->off[i].kptr.btf = kernel_btf;
- tab->off[i].kptr.module = mod;
+int btf_check_and_fixup_fields(const struct btf *btf, struct btf_record *rec)
+{
+ int i;
+
+ /* There are three types that signify ownership of some other type:
+ * kptr_ref, bpf_list_head, bpf_rb_root.
+ * kptr_ref only supports storing kernel types, which can't store
+ * references to program allocated local types.
+ *
+ * Hence we only need to ensure that bpf_{list_head,rb_root} ownership
+ * does not form cycles.
+ */
+ if (IS_ERR_OR_NULL(rec) || !(rec->field_mask & GRAPH_ROOT_MASK))
+ return 0;
+ for (i = 0; i < rec->cnt; i++) {
+ struct btf_struct_meta *meta;
+ u32 btf_id;
+
+ if (!(rec->fields[i].type & GRAPH_ROOT_MASK))
+ continue;
+ btf_id = rec->fields[i].graph_root.value_btf_id;
+ meta = btf_find_struct_meta(btf, btf_id);
+ if (!meta)
+ return -EFAULT;
+ rec->fields[i].graph_root.value_rec = meta->record;
+
+ /* We need to set value_rec for all root types, but no need
+ * to check ownership cycle for a type unless it's also a
+ * node type.
+ */
+ if (!(rec->field_mask & GRAPH_NODE_MASK))
+ continue;
+
+ /* We need to ensure ownership acyclicity among all types. The
+ * proper way to do it would be to topologically sort all BTF
+ * IDs based on the ownership edges, since there can be multiple
+ * bpf_{list_head,rb_root} in a type. Instead, we use the
+ * following reasoning:
+ *
+ * - A type can only be owned by another type in user BTF if it
+ * has a bpf_{list,rb}_node. Let's call these node types.
+ * - A type can only _own_ another type in user BTF if it has a
+ * bpf_{list_head,rb_root}. Let's call these root types.
+ *
+ * We ensure that if a type is both a root and node, its
+ * element types cannot be root types.
+ *
+ * To ensure acyclicity:
+ *
+ * When A is a root type but not a node, its ownership
+ * chain can be:
+ * A -> B -> C
+ * Where:
+ * - A is a root, e.g. has bpf_rb_root.
+ * - B is both a root and node, e.g. has bpf_rb_node and
+ * bpf_list_head.
+ * - C is only a node, e.g. has bpf_list_node
+ *
+ * When A is both a root and node, some other type already
+ * owns it in the BTF domain, hence it can not own
+ * another root type through any of the ownership edges.
+ * A -> B
+ * Where:
+ * - A is both a root and node.
+ * - B is only a node.
+ */
+ if (meta->record->field_mask & GRAPH_ROOT_MASK)
+ return -ELOOP;
}
- tab->nr_off = nr_off;
- return tab;
-end_mod:
- module_put(mod);
-end_btf:
- btf_put(kernel_btf);
-end:
- while (i--) {
- btf_put(tab->off[i].kptr.btf);
- if (tab->off[i].kptr.module)
- module_put(tab->off[i].kptr.module);
+ return 0;
+}
+
+static int btf_field_offs_cmp(const void *_a, const void *_b, const void *priv)
+{
+ const u32 a = *(const u32 *)_a;
+ const u32 b = *(const u32 *)_b;
+
+ if (a < b)
+ return -1;
+ else if (a > b)
+ return 1;
+ return 0;
+}
+
+static void btf_field_offs_swap(void *_a, void *_b, int size, const void *priv)
+{
+ struct btf_field_offs *foffs = (void *)priv;
+ u32 *off_base = foffs->field_off;
+ u32 *a = _a, *b = _b;
+ u8 *sz_a, *sz_b;
+
+ sz_a = foffs->field_sz + (a - off_base);
+ sz_b = foffs->field_sz + (b - off_base);
+
+ swap(*a, *b);
+ swap(*sz_a, *sz_b);
+}
+
+struct btf_field_offs *btf_parse_field_offs(struct btf_record *rec)
+{
+ struct btf_field_offs *foffs;
+ u32 i, *off;
+ u8 *sz;
+
+ BUILD_BUG_ON(ARRAY_SIZE(foffs->field_off) != ARRAY_SIZE(foffs->field_sz));
+ if (IS_ERR_OR_NULL(rec))
+ return NULL;
+
+ foffs = kzalloc(sizeof(*foffs), GFP_KERNEL | __GFP_NOWARN);
+ if (!foffs)
+ return ERR_PTR(-ENOMEM);
+
+ off = foffs->field_off;
+ sz = foffs->field_sz;
+ for (i = 0; i < rec->cnt; i++) {
+ off[i] = rec->fields[i].offset;
+ sz[i] = btf_field_type_size(rec->fields[i].type);
}
- kfree(tab);
- return ERR_PTR(ret);
+ foffs->cnt = rec->cnt;
+
+ if (foffs->cnt == 1)
+ return foffs;
+ sort_r(foffs->field_off, foffs->cnt, sizeof(foffs->field_off[0]),
+ btf_field_offs_cmp, btf_field_offs_swap, foffs);
+ return foffs;
}
static void __btf_struct_show(const struct btf *btf, const struct btf_type *t,
@@ -4468,7 +4874,6 @@ static int btf_func_proto_check(struct btf_verifier_env *env,
nr_args--;
}
- err = 0;
for (i = 0; i < nr_args; i++) {
const struct btf_type *arg_type;
u32 arg_type_id;
@@ -4477,8 +4882,12 @@ static int btf_func_proto_check(struct btf_verifier_env *env,
arg_type = btf_type_by_id(btf, arg_type_id);
if (!arg_type) {
btf_verifier_log_type(env, t, "Invalid arg#%u", i + 1);
- err = -EINVAL;
- break;
+ return -EINVAL;
+ }
+
+ if (btf_type_is_resolve_source_only(arg_type)) {
+ btf_verifier_log_type(env, t, "Invalid arg#%u", i + 1);
+ return -EINVAL;
}
if (args[i].name_off &&
@@ -4486,25 +4895,23 @@ static int btf_func_proto_check(struct btf_verifier_env *env,
!btf_name_valid_identifier(btf, args[i].name_off))) {
btf_verifier_log_type(env, t,
"Invalid arg#%u", i + 1);
- err = -EINVAL;
- break;
+ return -EINVAL;
}
if (btf_type_needs_resolve(arg_type) &&
!env_type_is_resolved(env, arg_type_id)) {
err = btf_resolve(env, arg_type, arg_type_id);
if (err)
- break;
+ return err;
}
if (!btf_type_id_size(btf, &arg_type_id, NULL)) {
btf_verifier_log_type(env, t, "Invalid arg#%u", i + 1);
- err = -EINVAL;
- break;
+ return -EINVAL;
}
}
- return err;
+ return 0;
}
static int btf_func_check(struct btf_verifier_env *env,
@@ -4918,6 +5325,122 @@ static int btf_parse_hdr(struct btf_verifier_env *env)
return btf_check_sec_info(env, btf_data_size);
}
+static const char *alloc_obj_fields[] = {
+ "bpf_spin_lock",
+ "bpf_list_head",
+ "bpf_list_node",
+ "bpf_rb_root",
+ "bpf_rb_node",
+};
+
+static struct btf_struct_metas *
+btf_parse_struct_metas(struct bpf_verifier_log *log, struct btf *btf)
+{
+ union {
+ struct btf_id_set set;
+ struct {
+ u32 _cnt;
+ u32 _ids[ARRAY_SIZE(alloc_obj_fields)];
+ } _arr;
+ } aof;
+ struct btf_struct_metas *tab = NULL;
+ int i, n, id, ret;
+
+ BUILD_BUG_ON(offsetof(struct btf_id_set, cnt) != 0);
+ BUILD_BUG_ON(sizeof(struct btf_id_set) != sizeof(u32));
+
+ memset(&aof, 0, sizeof(aof));
+ for (i = 0; i < ARRAY_SIZE(alloc_obj_fields); i++) {
+ /* Try to find whether this special type exists in user BTF, and
+ * if so remember its ID so we can easily find it among members
+ * of structs that we iterate in the next loop.
+ */
+ id = btf_find_by_name_kind(btf, alloc_obj_fields[i], BTF_KIND_STRUCT);
+ if (id < 0)
+ continue;
+ aof.set.ids[aof.set.cnt++] = id;
+ }
+
+ if (!aof.set.cnt)
+ return NULL;
+ sort(&aof.set.ids, aof.set.cnt, sizeof(aof.set.ids[0]), btf_id_cmp_func, NULL);
+
+ n = btf_nr_types(btf);
+ for (i = 1; i < n; i++) {
+ struct btf_struct_metas *new_tab;
+ const struct btf_member *member;
+ struct btf_field_offs *foffs;
+ struct btf_struct_meta *type;
+ struct btf_record *record;
+ const struct btf_type *t;
+ int j, tab_cnt;
+
+ t = btf_type_by_id(btf, i);
+ if (!t) {
+ ret = -EINVAL;
+ goto free;
+ }
+ if (!__btf_type_is_struct(t))
+ continue;
+
+ cond_resched();
+
+ for_each_member(j, t, member) {
+ if (btf_id_set_contains(&aof.set, member->type))
+ goto parse;
+ }
+ continue;
+ parse:
+ tab_cnt = tab ? tab->cnt : 0;
+ new_tab = krealloc(tab, offsetof(struct btf_struct_metas, types[tab_cnt + 1]),
+ GFP_KERNEL | __GFP_NOWARN);
+ if (!new_tab) {
+ ret = -ENOMEM;
+ goto free;
+ }
+ if (!tab)
+ new_tab->cnt = 0;
+ tab = new_tab;
+
+ type = &tab->types[tab->cnt];
+ type->btf_id = i;
+ record = btf_parse_fields(btf, t, BPF_SPIN_LOCK | BPF_LIST_HEAD | BPF_LIST_NODE |
+ BPF_RB_ROOT | BPF_RB_NODE, t->size);
+ /* The record cannot be unset, treat it as an error if so */
+ if (IS_ERR_OR_NULL(record)) {
+ ret = PTR_ERR_OR_ZERO(record) ?: -EFAULT;
+ goto free;
+ }
+ foffs = btf_parse_field_offs(record);
+ /* We need the field_offs to be valid for a valid record,
+ * either both should be set or both should be unset.
+ */
+ if (IS_ERR_OR_NULL(foffs)) {
+ btf_record_free(record);
+ ret = -EFAULT;
+ goto free;
+ }
+ type->record = record;
+ type->field_offs = foffs;
+ tab->cnt++;
+ }
+ return tab;
+free:
+ btf_struct_metas_free(tab);
+ return ERR_PTR(ret);
+}
+
+struct btf_struct_meta *btf_find_struct_meta(const struct btf *btf, u32 btf_id)
+{
+ struct btf_struct_metas *tab;
+
+ BUILD_BUG_ON(offsetof(struct btf_struct_meta, btf_id) != 0);
+ tab = btf->struct_meta_tab;
+ if (!tab)
+ return NULL;
+ return bsearch(&btf_id, tab->types, tab->cnt, sizeof(tab->types[0]), btf_id_cmp_func);
+}
+
static int btf_check_type_tags(struct btf_verifier_env *env,
struct btf *btf, int start_id)
{
@@ -4968,6 +5491,7 @@ static int btf_check_type_tags(struct btf_verifier_env *env,
static struct btf *btf_parse(bpfptr_t btf_data, u32 btf_data_size,
u32 log_level, char __user *log_ubuf, u32 log_size)
{
+ struct btf_struct_metas *struct_meta_tab;
struct btf_verifier_env *env = NULL;
struct bpf_verifier_log *log;
struct btf *btf = NULL;
@@ -5036,15 +5560,34 @@ static struct btf *btf_parse(bpfptr_t btf_data, u32 btf_data_size,
if (err)
goto errout;
+ struct_meta_tab = btf_parse_struct_metas(log, btf);
+ if (IS_ERR(struct_meta_tab)) {
+ err = PTR_ERR(struct_meta_tab);
+ goto errout;
+ }
+ btf->struct_meta_tab = struct_meta_tab;
+
+ if (struct_meta_tab) {
+ int i;
+
+ for (i = 0; i < struct_meta_tab->cnt; i++) {
+ err = btf_check_and_fixup_fields(btf, struct_meta_tab->types[i].record);
+ if (err < 0)
+ goto errout_meta;
+ }
+ }
+
if (log->level && bpf_verifier_log_full(log)) {
err = -ENOSPC;
- goto errout;
+ goto errout_meta;
}
btf_verifier_env_free(env);
refcount_set(&btf->refcnt, 1);
return btf;
+errout_meta:
+ btf_free_struct_meta_tab(btf);
errout:
btf_verifier_env_free(env);
if (btf)
@@ -5086,7 +5629,7 @@ static u8 bpf_ctx_convert_map[] = {
#undef BPF_MAP_TYPE
#undef BPF_LINK_TYPE
-static const struct btf_member *
+const struct btf_member *
btf_get_prog_ctx_type(struct bpf_verifier_log *log, const struct btf *btf,
const struct btf_type *t, enum bpf_prog_type prog_type,
int arg)
@@ -5126,6 +5669,7 @@ btf_get_prog_ctx_type(struct bpf_verifier_log *log, const struct btf *btf,
if (!ctx_struct)
/* should not happen */
return NULL;
+again:
ctx_tname = btf_name_by_offset(btf_vmlinux, ctx_struct->name_off);
if (!ctx_tname) {
/* should not happen */
@@ -5139,8 +5683,16 @@ btf_get_prog_ctx_type(struct bpf_verifier_log *log, const struct btf *btf,
* int socket_filter_bpf_prog(struct __sk_buff *skb)
* { // no fields of skb are ever used }
*/
- if (strcmp(ctx_tname, tname))
- return NULL;
+ if (strcmp(ctx_tname, tname)) {
+ /* bpf_user_pt_regs_t is a typedef, so resolve it to
+ * underlying struct and check name again
+ */
+ if (!btf_type_is_modifier(ctx_struct))
+ return NULL;
+ while (btf_type_is_modifier(ctx_struct))
+ ctx_struct = btf_type_by_id(btf_vmlinux, ctx_struct->type);
+ goto again;
+ }
return ctx_type;
}
@@ -5159,6 +5711,26 @@ static int btf_translate_to_vmlinux(struct bpf_verifier_log *log,
return kern_ctx_type->type;
}
+int get_kern_ctx_btf_id(struct bpf_verifier_log *log, enum bpf_prog_type prog_type)
+{
+ const struct btf_member *kctx_member;
+ const struct btf_type *conv_struct;
+ const struct btf_type *kctx_type;
+ u32 kctx_type_id;
+
+ conv_struct = bpf_ctx_convert.t;
+ /* get member for kernel ctx type */
+ kctx_member = btf_type_member(conv_struct) + bpf_ctx_convert_map[prog_type] * 2 + 1;
+ kctx_type_id = kctx_member->type;
+ kctx_type = btf_type_by_id(btf_vmlinux, kctx_type_id);
+ if (!btf_type_is_struct(kctx_type)) {
+ bpf_log(log, "kern ctx type id %u is not a struct\n", kctx_type_id);
+ return -EINVAL;
+ }
+
+ return kctx_type_id;
+}
+
BTF_ID_LIST(bpf_ctx_convert_btf_id)
BTF_ID(struct, bpf_ctx_convert)
@@ -5356,6 +5928,22 @@ static u32 get_ctx_arg_idx(struct btf *btf, const struct btf_type *func_proto,
return nr_args + 1;
}
+static bool prog_args_trusted(const struct bpf_prog *prog)
+{
+ enum bpf_attach_type atype = prog->expected_attach_type;
+
+ switch (prog->type) {
+ case BPF_PROG_TYPE_TRACING:
+ return atype == BPF_TRACE_RAW_TP || atype == BPF_TRACE_ITER;
+ case BPF_PROG_TYPE_LSM:
+ return bpf_lsm_is_trusted(prog);
+ case BPF_PROG_TYPE_STRUCT_OPS:
+ return true;
+ default:
+ return false;
+ }
+}
+
bool btf_ctx_access(int off, int size, enum bpf_access_type type,
const struct bpf_prog *prog,
struct bpf_insn_access_aux *info)
@@ -5499,6 +6087,9 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type,
}
info->reg_type = PTR_TO_BTF_ID;
+ if (prog_args_trusted(prog))
+ info->reg_type |= PTR_TRUSTED;
+
if (tgt_prog) {
enum bpf_prog_type tgt_type;
@@ -5765,6 +6356,9 @@ error:
/* check __percpu tag */
if (strcmp(tag_value, "percpu") == 0)
tmp_flag = MEM_PERCPU;
+ /* check __rcu tag */
+ if (strcmp(tag_value, "rcu") == 0)
+ tmp_flag = MEM_RCU;
}
stype = btf_type_skip_modifiers(btf, mtype->type, &id);
@@ -5794,20 +6388,50 @@ error:
return -EINVAL;
}
-int btf_struct_access(struct bpf_verifier_log *log, const struct btf *btf,
- const struct btf_type *t, int off, int size,
- enum bpf_access_type atype __maybe_unused,
+int btf_struct_access(struct bpf_verifier_log *log,
+ const struct bpf_reg_state *reg,
+ int off, int size, enum bpf_access_type atype __maybe_unused,
u32 *next_btf_id, enum bpf_type_flag *flag)
{
+ const struct btf *btf = reg->btf;
enum bpf_type_flag tmp_flag = 0;
+ const struct btf_type *t;
+ u32 id = reg->btf_id;
int err;
- u32 id;
+ while (type_is_alloc(reg->type)) {
+ struct btf_struct_meta *meta;
+ struct btf_record *rec;
+ int i;
+
+ meta = btf_find_struct_meta(btf, id);
+ if (!meta)
+ break;
+ rec = meta->record;
+ for (i = 0; i < rec->cnt; i++) {
+ struct btf_field *field = &rec->fields[i];
+ u32 offset = field->offset;
+ if (off < offset + btf_field_type_size(field->type) && offset < off + size) {
+ bpf_log(log,
+ "direct access to %s is disallowed\n",
+ btf_field_type_name(field->type));
+ return -EACCES;
+ }
+ }
+ break;
+ }
+
+ t = btf_type_by_id(btf, id);
do {
err = btf_struct_walk(log, btf, t, off, size, &id, &tmp_flag);
switch (err) {
case WALK_PTR:
+ /* For local types, the destination register cannot
+ * become a pointer again.
+ */
+ if (type_is_alloc(reg->type))
+ return SCALAR_VALUE;
/* If we found the pointer or scalar on t+off,
* we're done.
*/
@@ -5842,8 +6466,8 @@ int btf_struct_access(struct bpf_verifier_log *log, const struct btf *btf,
* end up with two different module BTFs, but IDs point to the common type in
* vmlinux BTF.
*/
-static bool btf_types_are_same(const struct btf *btf1, u32 id1,
- const struct btf *btf2, u32 id2)
+bool btf_types_are_same(const struct btf *btf1, u32 id1,
+ const struct btf *btf2, u32 id2)
{
if (id1 != id2)
return false;
@@ -5914,6 +6538,18 @@ static int __get_type_size(struct btf *btf, u32 btf_id,
return -EINVAL;
}
+static u8 __get_type_fmodel_flags(const struct btf_type *t)
+{
+ u8 flags = 0;
+
+ if (__btf_type_is_struct(t))
+ flags |= BTF_FMODEL_STRUCT_ARG;
+ if (btf_type_is_signed_int(t))
+ flags |= BTF_FMODEL_SIGNED_ARG;
+
+ return flags;
+}
+
int btf_distill_func_proto(struct bpf_verifier_log *log,
struct btf *btf,
const struct btf_type *func,
@@ -5934,6 +6570,7 @@ int btf_distill_func_proto(struct bpf_verifier_log *log,
m->arg_flags[i] = 0;
}
m->ret_size = 8;
+ m->ret_flags = 0;
m->nr_args = MAX_BPF_FUNC_REG_ARGS;
return 0;
}
@@ -5953,6 +6590,7 @@ int btf_distill_func_proto(struct bpf_verifier_log *log,
return -EINVAL;
}
m->ret_size = ret;
+ m->ret_flags = __get_type_fmodel_flags(t);
for (i = 0; i < nargs; i++) {
if (i == nargs - 1 && args[i].type == 0) {
@@ -5977,7 +6615,7 @@ int btf_distill_func_proto(struct bpf_verifier_log *log,
return -EINVAL;
}
m->arg_size[i] = ret;
- m->arg_flags[i] = __btf_type_is_struct(t) ? BTF_FMODEL_STRUCT_ARG : 0;
+ m->arg_flags[i] = __get_type_fmodel_flags(t);
}
m->nr_args = nargs;
return 0;
@@ -6125,122 +6763,19 @@ int btf_check_type_match(struct bpf_verifier_log *log, const struct bpf_prog *pr
return btf_check_func_type_match(log, btf1, t1, btf2, t2);
}
-static u32 *reg2btf_ids[__BPF_REG_TYPE_MAX] = {
-#ifdef CONFIG_NET
- [PTR_TO_SOCKET] = &btf_sock_ids[BTF_SOCK_TYPE_SOCK],
- [PTR_TO_SOCK_COMMON] = &btf_sock_ids[BTF_SOCK_TYPE_SOCK_COMMON],
- [PTR_TO_TCP_SOCK] = &btf_sock_ids[BTF_SOCK_TYPE_TCP],
-#endif
-};
-
-/* Returns true if struct is composed of scalars, 4 levels of nesting allowed */
-static bool __btf_type_is_scalar_struct(struct bpf_verifier_log *log,
- const struct btf *btf,
- const struct btf_type *t, int rec)
-{
- const struct btf_type *member_type;
- const struct btf_member *member;
- u32 i;
-
- if (!btf_type_is_struct(t))
- return false;
-
- for_each_member(i, t, member) {
- const struct btf_array *array;
-
- member_type = btf_type_skip_modifiers(btf, member->type, NULL);
- if (btf_type_is_struct(member_type)) {
- if (rec >= 3) {
- bpf_log(log, "max struct nesting depth exceeded\n");
- return false;
- }
- if (!__btf_type_is_scalar_struct(log, btf, member_type, rec + 1))
- return false;
- continue;
- }
- if (btf_type_is_array(member_type)) {
- array = btf_type_array(member_type);
- if (!array->nelems)
- return false;
- member_type = btf_type_skip_modifiers(btf, array->type, NULL);
- if (!btf_type_is_scalar(member_type))
- return false;
- continue;
- }
- if (!btf_type_is_scalar(member_type))
- return false;
- }
- return true;
-}
-
-static bool is_kfunc_arg_mem_size(const struct btf *btf,
- const struct btf_param *arg,
- const struct bpf_reg_state *reg)
-{
- int len, sfx_len = sizeof("__sz") - 1;
- const struct btf_type *t;
- const char *param_name;
-
- t = btf_type_skip_modifiers(btf, arg->type, NULL);
- if (!btf_type_is_scalar(t) || reg->type != SCALAR_VALUE)
- return false;
-
- /* In the future, this can be ported to use BTF tagging */
- param_name = btf_name_by_offset(btf, arg->name_off);
- if (str_is_empty(param_name))
- return false;
- len = strlen(param_name);
- if (len < sfx_len)
- return false;
- param_name += len - sfx_len;
- if (strncmp(param_name, "__sz", sfx_len))
- return false;
-
- return true;
-}
-
-static bool btf_is_kfunc_arg_mem_size(const struct btf *btf,
- const struct btf_param *arg,
- const struct bpf_reg_state *reg,
- const char *name)
-{
- int len, target_len = strlen(name);
- const struct btf_type *t;
- const char *param_name;
-
- t = btf_type_skip_modifiers(btf, arg->type, NULL);
- if (!btf_type_is_scalar(t) || reg->type != SCALAR_VALUE)
- return false;
-
- param_name = btf_name_by_offset(btf, arg->name_off);
- if (str_is_empty(param_name))
- return false;
- len = strlen(param_name);
- if (len != target_len)
- return false;
- if (strcmp(param_name, name))
- return false;
-
- return true;
-}
-
static int btf_check_func_arg_match(struct bpf_verifier_env *env,
const struct btf *btf, u32 func_id,
struct bpf_reg_state *regs,
bool ptr_to_mem_ok,
- struct bpf_kfunc_arg_meta *kfunc_meta,
bool processing_call)
{
enum bpf_prog_type prog_type = resolve_prog_type(env->prog);
- bool rel = false, kptr_get = false, trusted_args = false;
- bool sleepable = false;
struct bpf_verifier_log *log = &env->log;
- u32 i, nargs, ref_id, ref_obj_id = 0;
- bool is_kfunc = btf_is_kernel(btf);
const char *func_name, *ref_tname;
const struct btf_type *t, *ref_t;
const struct btf_param *args;
- int ref_regno = 0, ret;
+ u32 i, nargs, ref_id;
+ int ret;
t = btf_type_by_id(btf, func_id);
if (!t || !btf_type_is_func(t)) {
@@ -6266,14 +6801,6 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env,
return -EINVAL;
}
- if (is_kfunc && kfunc_meta) {
- /* Only kfunc can be release func */
- rel = kfunc_meta->flags & KF_RELEASE;
- kptr_get = kfunc_meta->flags & KF_KPTR_GET;
- trusted_args = kfunc_meta->flags & KF_TRUSTED_ARGS;
- sleepable = kfunc_meta->flags & KF_SLEEPABLE;
- }
-
/* check that BTF function arguments match actual types that the
* verifier sees.
*/
@@ -6281,42 +6808,9 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env,
enum bpf_arg_type arg_type = ARG_DONTCARE;
u32 regno = i + 1;
struct bpf_reg_state *reg = &regs[regno];
- bool obj_ptr = false;
t = btf_type_skip_modifiers(btf, args[i].type, NULL);
if (btf_type_is_scalar(t)) {
- if (is_kfunc && kfunc_meta) {
- bool is_buf_size = false;
-
- /* check for any const scalar parameter of name "rdonly_buf_size"
- * or "rdwr_buf_size"
- */
- if (btf_is_kfunc_arg_mem_size(btf, &args[i], reg,
- "rdonly_buf_size")) {
- kfunc_meta->r0_rdonly = true;
- is_buf_size = true;
- } else if (btf_is_kfunc_arg_mem_size(btf, &args[i], reg,
- "rdwr_buf_size"))
- is_buf_size = true;
-
- if (is_buf_size) {
- if (kfunc_meta->r0_size) {
- bpf_log(log, "2 or more rdonly/rdwr_buf_size parameters for kfunc");
- return -EINVAL;
- }
-
- if (!tnum_is_const(reg->var_off)) {
- bpf_log(log, "R%d is not a const\n", regno);
- return -EINVAL;
- }
-
- kfunc_meta->r0_size = reg->var_off.value;
- ret = mark_chain_precision(env, regno);
- if (ret)
- return ret;
- }
- }
-
if (reg->type == SCALAR_VALUE)
continue;
bpf_log(log, "R%d is not a scalar\n", regno);
@@ -6329,88 +6823,14 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env,
return -EINVAL;
}
- /* These register types have special constraints wrt ref_obj_id
- * and offset checks. The rest of trusted args don't.
- */
- obj_ptr = reg->type == PTR_TO_CTX || reg->type == PTR_TO_BTF_ID ||
- reg2btf_ids[base_type(reg->type)];
-
- /* Check if argument must be a referenced pointer, args + i has
- * been verified to be a pointer (after skipping modifiers).
- * PTR_TO_CTX is ok without having non-zero ref_obj_id.
- */
- if (is_kfunc && trusted_args && (obj_ptr && reg->type != PTR_TO_CTX) && !reg->ref_obj_id) {
- bpf_log(log, "R%d must be referenced\n", regno);
- return -EINVAL;
- }
-
ref_t = btf_type_skip_modifiers(btf, t->type, &ref_id);
ref_tname = btf_name_by_offset(btf, ref_t->name_off);
- /* Trusted args have the same offset checks as release arguments */
- if ((trusted_args && obj_ptr) || (rel && reg->ref_obj_id))
- arg_type |= OBJ_RELEASE;
ret = check_func_arg_reg_off(env, reg, regno, arg_type);
if (ret < 0)
return ret;
- if (is_kfunc && reg->ref_obj_id) {
- /* Ensure only one argument is referenced PTR_TO_BTF_ID */
- if (ref_obj_id) {
- bpf_log(log, "verifier internal error: more than one arg with ref_obj_id R%d %u %u\n",
- regno, reg->ref_obj_id, ref_obj_id);
- return -EFAULT;
- }
- ref_regno = regno;
- ref_obj_id = reg->ref_obj_id;
- }
-
- /* kptr_get is only true for kfunc */
- if (i == 0 && kptr_get) {
- struct bpf_map_value_off_desc *off_desc;
-
- if (reg->type != PTR_TO_MAP_VALUE) {
- bpf_log(log, "arg#0 expected pointer to map value\n");
- return -EINVAL;
- }
-
- /* check_func_arg_reg_off allows var_off for
- * PTR_TO_MAP_VALUE, but we need fixed offset to find
- * off_desc.
- */
- if (!tnum_is_const(reg->var_off)) {
- bpf_log(log, "arg#0 must have constant offset\n");
- return -EINVAL;
- }
-
- off_desc = bpf_map_kptr_off_contains(reg->map_ptr, reg->off + reg->var_off.value);
- if (!off_desc || off_desc->type != BPF_KPTR_REF) {
- bpf_log(log, "arg#0 no referenced kptr at map value offset=%llu\n",
- reg->off + reg->var_off.value);
- return -EINVAL;
- }
-
- if (!btf_type_is_ptr(ref_t)) {
- bpf_log(log, "arg#0 BTF type must be a double pointer\n");
- return -EINVAL;
- }
-
- ref_t = btf_type_skip_modifiers(btf, ref_t->type, &ref_id);
- ref_tname = btf_name_by_offset(btf, ref_t->name_off);
-
- if (!btf_type_is_struct(ref_t)) {
- bpf_log(log, "kernel function %s args#%d pointer type %s %s is not supported\n",
- func_name, i, btf_type_str(ref_t), ref_tname);
- return -EINVAL;
- }
- if (!btf_struct_ids_match(log, btf, ref_id, 0, off_desc->kptr.btf,
- off_desc->kptr.btf_id, true)) {
- bpf_log(log, "kernel function %s args#%d expected pointer to %s %s\n",
- func_name, i, btf_type_str(ref_t), ref_tname);
- return -EINVAL;
- }
- /* rest of the arguments can be anything, like normal kfunc */
- } else if (btf_get_prog_ctx_type(log, btf, t, prog_type, i)) {
+ if (btf_get_prog_ctx_type(log, btf, t, prog_type, i)) {
/* If function expects ctx type in BTF check that caller
* is passing PTR_TO_CTX.
*/
@@ -6420,109 +6840,10 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env,
i, btf_type_str(t));
return -EINVAL;
}
- } else if (is_kfunc && (reg->type == PTR_TO_BTF_ID ||
- (reg2btf_ids[base_type(reg->type)] && !type_flag(reg->type)))) {
- const struct btf_type *reg_ref_t;
- const struct btf *reg_btf;
- const char *reg_ref_tname;
- u32 reg_ref_id;
-
- if (!btf_type_is_struct(ref_t)) {
- bpf_log(log, "kernel function %s args#%d pointer type %s %s is not supported\n",
- func_name, i, btf_type_str(ref_t),
- ref_tname);
- return -EINVAL;
- }
-
- if (reg->type == PTR_TO_BTF_ID) {
- reg_btf = reg->btf;
- reg_ref_id = reg->btf_id;
- } else {
- reg_btf = btf_vmlinux;
- reg_ref_id = *reg2btf_ids[base_type(reg->type)];
- }
-
- reg_ref_t = btf_type_skip_modifiers(reg_btf, reg_ref_id,
- &reg_ref_id);
- reg_ref_tname = btf_name_by_offset(reg_btf,
- reg_ref_t->name_off);
- if (!btf_struct_ids_match(log, reg_btf, reg_ref_id,
- reg->off, btf, ref_id,
- trusted_args || (rel && reg->ref_obj_id))) {
- bpf_log(log, "kernel function %s args#%d expected pointer to %s %s but R%d has a pointer to %s %s\n",
- func_name, i,
- btf_type_str(ref_t), ref_tname,
- regno, btf_type_str(reg_ref_t),
- reg_ref_tname);
- return -EINVAL;
- }
} else if (ptr_to_mem_ok && processing_call) {
const struct btf_type *resolve_ret;
u32 type_size;
- if (is_kfunc) {
- bool arg_mem_size = i + 1 < nargs && is_kfunc_arg_mem_size(btf, &args[i + 1], &regs[regno + 1]);
- bool arg_dynptr = btf_type_is_struct(ref_t) &&
- !strcmp(ref_tname,
- stringify_struct(bpf_dynptr_kern));
-
- /* Permit pointer to mem, but only when argument
- * type is pointer to scalar, or struct composed
- * (recursively) of scalars.
- * When arg_mem_size is true, the pointer can be
- * void *.
- * Also permit initialized local dynamic pointers.
- */
- if (!btf_type_is_scalar(ref_t) &&
- !__btf_type_is_scalar_struct(log, btf, ref_t, 0) &&
- !arg_dynptr &&
- (arg_mem_size ? !btf_type_is_void(ref_t) : 1)) {
- bpf_log(log,
- "arg#%d pointer type %s %s must point to %sscalar, or struct with scalar\n",
- i, btf_type_str(ref_t), ref_tname, arg_mem_size ? "void, " : "");
- return -EINVAL;
- }
-
- if (arg_dynptr) {
- if (reg->type != PTR_TO_STACK) {
- bpf_log(log, "arg#%d pointer type %s %s not to stack\n",
- i, btf_type_str(ref_t),
- ref_tname);
- return -EINVAL;
- }
-
- if (!is_dynptr_reg_valid_init(env, reg)) {
- bpf_log(log,
- "arg#%d pointer type %s %s must be valid and initialized\n",
- i, btf_type_str(ref_t),
- ref_tname);
- return -EINVAL;
- }
-
- if (!is_dynptr_type_expected(env, reg,
- ARG_PTR_TO_DYNPTR | DYNPTR_TYPE_LOCAL)) {
- bpf_log(log,
- "arg#%d pointer type %s %s points to unsupported dynamic pointer type\n",
- i, btf_type_str(ref_t),
- ref_tname);
- return -EINVAL;
- }
-
- continue;
- }
-
- /* Check for mem, len pair */
- if (arg_mem_size) {
- if (check_kfunc_mem_size_reg(env, &regs[regno + 1], regno + 1)) {
- bpf_log(log, "arg#%d arg#%d memory, len pair leads to invalid memory access\n",
- i, i + 1);
- return -EINVAL;
- }
- i++;
- continue;
- }
- }
-
resolve_ret = btf_resolve_size(btf, ref_t, &type_size);
if (IS_ERR(resolve_ret)) {
bpf_log(log,
@@ -6535,36 +6856,13 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env,
if (check_mem_reg(env, reg, regno, type_size))
return -EINVAL;
} else {
- bpf_log(log, "reg type unsupported for arg#%d %sfunction %s#%d\n", i,
- is_kfunc ? "kernel " : "", func_name, func_id);
+ bpf_log(log, "reg type unsupported for arg#%d function %s#%d\n", i,
+ func_name, func_id);
return -EINVAL;
}
}
- /* Either both are set, or neither */
- WARN_ON_ONCE((ref_obj_id && !ref_regno) || (!ref_obj_id && ref_regno));
- /* We already made sure ref_obj_id is set only for one argument. We do
- * allow (!rel && ref_obj_id), so that passing such referenced
- * PTR_TO_BTF_ID to other kfuncs works. Note that rel is only true when
- * is_kfunc is true.
- */
- if (rel && !ref_obj_id) {
- bpf_log(log, "release kernel function %s expects refcounted PTR_TO_BTF_ID\n",
- func_name);
- return -EINVAL;
- }
-
- if (sleepable && !env->prog->aux->sleepable) {
- bpf_log(log, "kernel function %s is sleepable but the program is not\n",
- func_name);
- return -EINVAL;
- }
-
- if (kfunc_meta && ref_obj_id)
- kfunc_meta->ref_obj_id = ref_obj_id;
-
- /* returns argument register number > 0 in case of reference release kfunc */
- return rel ? ref_regno : 0;
+ return 0;
}
/* Compare BTF of a function declaration with given bpf_reg_state.
@@ -6594,7 +6892,7 @@ int btf_check_subprog_arg_match(struct bpf_verifier_env *env, int subprog,
return -EINVAL;
is_global = prog->aux->func_info_aux[subprog].linkage == BTF_FUNC_GLOBAL;
- err = btf_check_func_arg_match(env, btf, btf_id, regs, is_global, NULL, false);
+ err = btf_check_func_arg_match(env, btf, btf_id, regs, is_global, false);
/* Compiler optimizations can remove arguments from static functions
* or mismatched type can be passed into a global function.
@@ -6637,7 +6935,7 @@ int btf_check_subprog_call(struct bpf_verifier_env *env, int subprog,
return -EINVAL;
is_global = prog->aux->func_info_aux[subprog].linkage == BTF_FUNC_GLOBAL;
- err = btf_check_func_arg_match(env, btf, btf_id, regs, is_global, NULL, true);
+ err = btf_check_func_arg_match(env, btf, btf_id, regs, is_global, true);
/* Compiler optimizations can remove arguments from static functions
* or mismatched type can be passed into a global function.
@@ -6648,14 +6946,6 @@ int btf_check_subprog_call(struct bpf_verifier_env *env, int subprog,
return err;
}
-int btf_check_kfunc_arg_match(struct bpf_verifier_env *env,
- const struct btf *btf, u32 func_id,
- struct bpf_reg_state *regs,
- struct bpf_kfunc_arg_meta *meta)
-{
- return btf_check_func_arg_match(env, btf, func_id, regs, true, meta, true);
-}
-
/* Convert BTF of a function into bpf_reg_state if possible
* Returns:
* EFAULT - there is a verifier bug. Abort verification.
@@ -7038,23 +7328,6 @@ bool btf_is_module(const struct btf *btf)
return btf->kernel_btf && strcmp(btf->name, "vmlinux") != 0;
}
-static int btf_id_cmp_func(const void *a, const void *b)
-{
- const int *pa = a, *pb = b;
-
- return *pa - *pb;
-}
-
-bool btf_id_set_contains(const struct btf_id_set *set, u32 id)
-{
- return bsearch(&id, set->ids, set->cnt, sizeof(u32), btf_id_cmp_func) != NULL;
-}
-
-static void *btf_id_set8_contains(const struct btf_id_set8 *set, u32 id)
-{
- return bsearch(&id, set->pairs, set->cnt, sizeof(set->pairs[0]), btf_id_cmp_func);
-}
-
enum {
BTF_MODULE_F_LIVE = (1 << 0),
};
@@ -7106,11 +7379,14 @@ static int btf_module_notify(struct notifier_block *nb, unsigned long op,
}
btf = btf_parse_module(mod->name, mod->btf_data, mod->btf_data_size);
if (IS_ERR(btf)) {
- pr_warn("failed to validate module [%s] BTF: %ld\n",
- mod->name, PTR_ERR(btf));
kfree(btf_mod);
- if (!IS_ENABLED(CONFIG_MODULE_ALLOW_BTF_MISMATCH))
+ if (!IS_ENABLED(CONFIG_MODULE_ALLOW_BTF_MISMATCH)) {
+ pr_warn("failed to validate module [%s] BTF: %ld\n",
+ mod->name, PTR_ERR(btf));
err = PTR_ERR(btf);
+ } else {
+ pr_warn_once("Kernel module BTF mismatch detected, BTF debug info may be unavailable for some modules\n");
+ }
goto out;
}
err = btf_alloc_id(btf);
@@ -7415,6 +7691,8 @@ static u32 *__btf_kfunc_id_set_contains(const struct btf *btf,
static int bpf_prog_type_to_kfunc_hook(enum bpf_prog_type prog_type)
{
switch (prog_type) {
+ case BPF_PROG_TYPE_UNSPEC:
+ return BTF_KFUNC_HOOK_COMMON;
case BPF_PROG_TYPE_XDP:
return BTF_KFUNC_HOOK_XDP;
case BPF_PROG_TYPE_SCHED_CLS:
@@ -7443,16 +7721,24 @@ u32 *btf_kfunc_id_set_contains(const struct btf *btf,
u32 kfunc_btf_id)
{
enum btf_kfunc_hook hook;
+ u32 *kfunc_flags;
+
+ kfunc_flags = __btf_kfunc_id_set_contains(btf, BTF_KFUNC_HOOK_COMMON, kfunc_btf_id);
+ if (kfunc_flags)
+ return kfunc_flags;
hook = bpf_prog_type_to_kfunc_hook(prog_type);
return __btf_kfunc_id_set_contains(btf, hook, kfunc_btf_id);
}
-/* This function must be invoked only from initcalls/module init functions */
-int register_btf_kfunc_id_set(enum bpf_prog_type prog_type,
- const struct btf_kfunc_id_set *kset)
+u32 *btf_kfunc_is_modify_return(const struct btf *btf, u32 kfunc_btf_id)
+{
+ return __btf_kfunc_id_set_contains(btf, BTF_KFUNC_HOOK_FMODRET, kfunc_btf_id);
+}
+
+static int __register_btf_kfunc_id_set(enum btf_kfunc_hook hook,
+ const struct btf_kfunc_id_set *kset)
{
- enum btf_kfunc_hook hook;
struct btf *btf;
int ret;
@@ -7471,13 +7757,29 @@ int register_btf_kfunc_id_set(enum bpf_prog_type prog_type,
if (IS_ERR(btf))
return PTR_ERR(btf);
- hook = bpf_prog_type_to_kfunc_hook(prog_type);
ret = btf_populate_kfunc_set(btf, hook, kset->set);
btf_put(btf);
return ret;
}
+
+/* This function must be invoked only from initcalls/module init functions */
+int register_btf_kfunc_id_set(enum bpf_prog_type prog_type,
+ const struct btf_kfunc_id_set *kset)
+{
+ enum btf_kfunc_hook hook;
+
+ hook = bpf_prog_type_to_kfunc_hook(prog_type);
+ return __register_btf_kfunc_id_set(hook, kset);
+}
EXPORT_SYMBOL_GPL(register_btf_kfunc_id_set);
+/* This function must be invoked only from initcalls/module init functions */
+int register_btf_fmodret_id_set(const struct btf_kfunc_id_set *kset)
+{
+ return __register_btf_kfunc_id_set(BTF_KFUNC_HOOK_FMODRET, kset);
+}
+EXPORT_SYMBOL_GPL(register_btf_fmodret_id_set);
+
s32 btf_find_dtor_kfunc(struct btf *btf, u32 btf_id)
{
struct btf_id_dtor_kfunc_tab *tab = btf->dtor_kfunc_tab;
@@ -7602,9 +7904,9 @@ int register_btf_id_dtor_kfuncs(const struct btf_id_dtor_kfunc *dtors, u32 add_c
sort(tab->dtors, tab->cnt, sizeof(tab->dtors[0]), btf_id_cmp_func, NULL);
- return 0;
end:
- btf_free_dtor_kfunc_tab(btf);
+ if (ret)
+ btf_free_dtor_kfunc_tab(btf);
btf_put(btf);
return ret;
}
@@ -8030,3 +8332,119 @@ out:
}
return err;
}
+
+bool btf_nested_type_is_trusted(struct bpf_verifier_log *log,
+ const struct bpf_reg_state *reg,
+ int off)
+{
+ struct btf *btf = reg->btf;
+ const struct btf_type *walk_type, *safe_type;
+ const char *tname;
+ char safe_tname[64];
+ long ret, safe_id;
+ const struct btf_member *member, *m_walk = NULL;
+ u32 i;
+ const char *walk_name;
+
+ walk_type = btf_type_by_id(btf, reg->btf_id);
+ if (!walk_type)
+ return false;
+
+ tname = btf_name_by_offset(btf, walk_type->name_off);
+
+ ret = snprintf(safe_tname, sizeof(safe_tname), "%s__safe_fields", tname);
+ if (ret < 0)
+ return false;
+
+ safe_id = btf_find_by_name_kind(btf, safe_tname, BTF_INFO_KIND(walk_type->info));
+ if (safe_id < 0)
+ return false;
+
+ safe_type = btf_type_by_id(btf, safe_id);
+ if (!safe_type)
+ return false;
+
+ for_each_member(i, walk_type, member) {
+ u32 moff;
+
+ /* We're looking for the PTR_TO_BTF_ID member in the struct
+ * type we're walking which matches the specified offset.
+ * Below, we'll iterate over the fields in the safe variant of
+ * the struct and see if any of them has a matching type /
+ * name.
+ */
+ moff = __btf_member_bit_offset(walk_type, member) / 8;
+ if (off == moff) {
+ m_walk = member;
+ break;
+ }
+ }
+ if (m_walk == NULL)
+ return false;
+
+ walk_name = __btf_name_by_offset(btf, m_walk->name_off);
+ for_each_member(i, safe_type, member) {
+ const char *m_name = __btf_name_by_offset(btf, member->name_off);
+
+ /* If we match on both type and name, the field is considered trusted. */
+ if (m_walk->type == member->type && !strcmp(walk_name, m_name))
+ return true;
+ }
+
+ return false;
+}
+
+bool btf_type_ids_nocast_alias(struct bpf_verifier_log *log,
+ const struct btf *reg_btf, u32 reg_id,
+ const struct btf *arg_btf, u32 arg_id)
+{
+ const char *reg_name, *arg_name, *search_needle;
+ const struct btf_type *reg_type, *arg_type;
+ int reg_len, arg_len, cmp_len;
+ size_t pattern_len = sizeof(NOCAST_ALIAS_SUFFIX) - sizeof(char);
+
+ reg_type = btf_type_by_id(reg_btf, reg_id);
+ if (!reg_type)
+ return false;
+
+ arg_type = btf_type_by_id(arg_btf, arg_id);
+ if (!arg_type)
+ return false;
+
+ reg_name = btf_name_by_offset(reg_btf, reg_type->name_off);
+ arg_name = btf_name_by_offset(arg_btf, arg_type->name_off);
+
+ reg_len = strlen(reg_name);
+ arg_len = strlen(arg_name);
+
+ /* Exactly one of the two type names may be suffixed with ___init, so
+ * if the strings are the same size, they can't possibly be no-cast
+ * aliases of one another. If you have two of the same type names, e.g.
+ * they're both nf_conn___init, it would be improper to return true
+ * because they are _not_ no-cast aliases, they are the same type.
+ */
+ if (reg_len == arg_len)
+ return false;
+
+ /* One of the two names must be the other name suffixed with ___init. */
+ if ((reg_len != arg_len + pattern_len) &&
+ (arg_len != reg_len + pattern_len))
+ return false;
+
+ if (reg_len < arg_len) {
+ search_needle = strstr(arg_name, NOCAST_ALIAS_SUFFIX);
+ cmp_len = reg_len;
+ } else {
+ search_needle = strstr(reg_name, NOCAST_ALIAS_SUFFIX);
+ cmp_len = arg_len;
+ }
+
+ if (!search_needle)
+ return false;
+
+ /* ___init suffix must come at the end of the name */
+ if (*(search_needle + pattern_len) != '\0')
+ return false;
+
+ return !strncmp(reg_name, arg_name, cmp_len);
+}
diff --git a/kernel/bpf/cgroup_iter.c b/kernel/bpf/cgroup_iter.c
index 9fcf09f2ef00..06989d278846 100644
--- a/kernel/bpf/cgroup_iter.c
+++ b/kernel/bpf/cgroup_iter.c
@@ -157,23 +157,37 @@ static const struct seq_operations cgroup_iter_seq_ops = {
.show = cgroup_iter_seq_show,
};
-BTF_ID_LIST_SINGLE(bpf_cgroup_btf_id, struct, cgroup)
+BTF_ID_LIST_GLOBAL_SINGLE(bpf_cgroup_btf_id, struct, cgroup)
static int cgroup_iter_seq_init(void *priv, struct bpf_iter_aux_info *aux)
{
struct cgroup_iter_priv *p = (struct cgroup_iter_priv *)priv;
struct cgroup *cgrp = aux->cgroup.start;
+ /* bpf_iter_attach_cgroup() has already acquired an extra reference
+ * for the start cgroup, but the reference may be released after
+ * cgroup_iter_seq_init(), so acquire another reference for the
+ * start cgroup.
+ */
p->start_css = &cgrp->self;
+ css_get(p->start_css);
p->terminate = false;
p->visited_all = false;
p->order = aux->cgroup.order;
return 0;
}
+static void cgroup_iter_seq_fini(void *priv)
+{
+ struct cgroup_iter_priv *p = (struct cgroup_iter_priv *)priv;
+
+ css_put(p->start_css);
+}
+
static const struct bpf_iter_seq_info cgroup_iter_seq_info = {
.seq_ops = &cgroup_iter_seq_ops,
.init_seq_private = cgroup_iter_seq_init,
+ .fini_seq_private = cgroup_iter_seq_fini,
.seq_priv_size = sizeof(struct cgroup_iter_priv),
};
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 25a54e04560e..b297e9f60ca1 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -34,6 +34,9 @@
#include <linux/log2.h>
#include <linux/bpf_verifier.h>
#include <linux/nodemask.h>
+#include <linux/nospec.h>
+#include <linux/bpf_mem_alloc.h>
+#include <linux/memcontrol.h>
#include <asm/barrier.h>
#include <asm/unaligned.h>
@@ -60,6 +63,9 @@
#define CTX regs[BPF_REG_CTX]
#define IMM insn->imm
+struct bpf_mem_alloc bpf_global_ma;
+bool bpf_global_ma_set;
+
/* No hurry in this branch
*
* Exported for the bpf jit load helper.
@@ -83,7 +89,7 @@ void *bpf_internal_load_pointer_neg_helper(const struct sk_buff *skb, int k, uns
struct bpf_prog *bpf_prog_alloc_no_stats(unsigned int size, gfp_t gfp_extra_flags)
{
- gfp_t gfp_flags = GFP_KERNEL_ACCOUNT | __GFP_ZERO | gfp_extra_flags;
+ gfp_t gfp_flags = bpf_memcg_flags(GFP_KERNEL | __GFP_ZERO | gfp_extra_flags);
struct bpf_prog_aux *aux;
struct bpf_prog *fp;
@@ -92,12 +98,12 @@ struct bpf_prog *bpf_prog_alloc_no_stats(unsigned int size, gfp_t gfp_extra_flag
if (fp == NULL)
return NULL;
- aux = kzalloc(sizeof(*aux), GFP_KERNEL_ACCOUNT | gfp_extra_flags);
+ aux = kzalloc(sizeof(*aux), bpf_memcg_flags(GFP_KERNEL | gfp_extra_flags));
if (aux == NULL) {
vfree(fp);
return NULL;
}
- fp->active = alloc_percpu_gfp(int, GFP_KERNEL_ACCOUNT | gfp_extra_flags);
+ fp->active = alloc_percpu_gfp(int, bpf_memcg_flags(GFP_KERNEL | gfp_extra_flags));
if (!fp->active) {
vfree(fp);
kfree(aux);
@@ -122,7 +128,7 @@ struct bpf_prog *bpf_prog_alloc_no_stats(unsigned int size, gfp_t gfp_extra_flag
struct bpf_prog *bpf_prog_alloc(unsigned int size, gfp_t gfp_extra_flags)
{
- gfp_t gfp_flags = GFP_KERNEL_ACCOUNT | __GFP_ZERO | gfp_extra_flags;
+ gfp_t gfp_flags = bpf_memcg_flags(GFP_KERNEL | __GFP_ZERO | gfp_extra_flags);
struct bpf_prog *prog;
int cpu;
@@ -155,7 +161,7 @@ int bpf_prog_alloc_jited_linfo(struct bpf_prog *prog)
prog->aux->jited_linfo = kvcalloc(prog->aux->nr_linfo,
sizeof(*prog->aux->jited_linfo),
- GFP_KERNEL_ACCOUNT | __GFP_NOWARN);
+ bpf_memcg_flags(GFP_KERNEL | __GFP_NOWARN));
if (!prog->aux->jited_linfo)
return -ENOMEM;
@@ -230,7 +236,7 @@ void bpf_prog_fill_jited_linfo(struct bpf_prog *prog,
struct bpf_prog *bpf_prog_realloc(struct bpf_prog *fp_old, unsigned int size,
gfp_t gfp_extra_flags)
{
- gfp_t gfp_flags = GFP_KERNEL_ACCOUNT | __GFP_ZERO | gfp_extra_flags;
+ gfp_t gfp_flags = bpf_memcg_flags(GFP_KERNEL | __GFP_ZERO | gfp_extra_flags);
struct bpf_prog *fp;
u32 pages;
@@ -864,8 +870,7 @@ static struct bpf_prog_pack *alloc_new_pack(bpf_jit_fill_hole_t bpf_fill_ill_ins
list_add_tail(&pack->list, &pack_list);
set_vm_flush_reset_perms(pack->ptr);
- set_memory_ro((unsigned long)pack->ptr, BPF_PROG_PACK_SIZE / PAGE_SIZE);
- set_memory_x((unsigned long)pack->ptr, BPF_PROG_PACK_SIZE / PAGE_SIZE);
+ set_memory_rox((unsigned long)pack->ptr, BPF_PROG_PACK_SIZE / PAGE_SIZE);
return pack;
}
@@ -883,8 +888,7 @@ void *bpf_prog_pack_alloc(u32 size, bpf_jit_fill_hole_t bpf_fill_ill_insns)
if (ptr) {
bpf_fill_ill_insns(ptr, size);
set_vm_flush_reset_perms(ptr);
- set_memory_ro((unsigned long)ptr, size / PAGE_SIZE);
- set_memory_x((unsigned long)ptr, size / PAGE_SIZE);
+ set_memory_rox((unsigned long)ptr, size / PAGE_SIZE);
}
goto out;
}
@@ -1032,7 +1036,7 @@ bpf_jit_binary_alloc(unsigned int proglen, u8 **image_ptr,
hdr->size = size;
hole = min_t(unsigned int, size - (proglen + sizeof(*hdr)),
PAGE_SIZE - sizeof(*hdr));
- start = prandom_u32_max(hole) & ~(alignment - 1);
+ start = get_random_u32_below(hole) & ~(alignment - 1);
/* Leave a random number of instructions before BPF code. */
*image_ptr = &hdr->image[start];
@@ -1094,7 +1098,7 @@ bpf_jit_binary_pack_alloc(unsigned int proglen, u8 **image_ptr,
hole = min_t(unsigned int, size - (proglen + sizeof(*ro_header)),
BPF_PROG_CHUNK_SIZE - sizeof(*ro_header));
- start = prandom_u32_max(hole) & ~(alignment - 1);
+ start = get_random_u32_below(hole) & ~(alignment - 1);
*image_ptr = &ro_header->image[start];
*rw_image = &(*rw_header)->image[start];
@@ -1908,9 +1912,7 @@ out:
* reuse preexisting logic from Spectre v1 mitigation that
* happens to produce the required code on x86 for v4 as well.
*/
-#ifdef CONFIG_X86
barrier_nospec();
-#endif
CONT;
#define LDST(SIZEOP, SIZE) \
STX_MEM_##SIZEOP: \
@@ -2088,22 +2090,31 @@ static unsigned int __bpf_prog_ret0_warn(const void *ctx,
bool bpf_prog_map_compatible(struct bpf_map *map,
const struct bpf_prog *fp)
{
+ enum bpf_prog_type prog_type = resolve_prog_type(fp);
bool ret;
if (fp->kprobe_override)
return false;
+ /* XDP programs inserted into maps are not guaranteed to run on
+ * a particular netdev (and can run outside driver context entirely
+ * in the case of devmap and cpumap). Until device checks
+ * are implemented, prohibit adding dev-bound programs to program maps.
+ */
+ if (bpf_prog_is_dev_bound(fp->aux))
+ return false;
+
spin_lock(&map->owner.lock);
if (!map->owner.type) {
/* There's no owner yet where we could check for
* compatibility.
*/
- map->owner.type = fp->type;
+ map->owner.type = prog_type;
map->owner.jited = fp->jited;
map->owner.xdp_has_frags = fp->aux->xdp_has_frags;
ret = true;
} else {
- ret = map->owner.type == fp->type &&
+ ret = map->owner.type == prog_type &&
map->owner.jited == fp->jited &&
map->owner.xdp_has_frags == fp->aux->xdp_has_frags;
}
@@ -2179,7 +2190,7 @@ struct bpf_prog *bpf_prog_select_runtime(struct bpf_prog *fp, int *err)
* valid program, which in this case would simply not
* be JITed, but falls back to the interpreter.
*/
- if (!bpf_prog_is_dev_bound(fp->aux)) {
+ if (!bpf_prog_is_offloaded(fp->aux)) {
*err = bpf_prog_alloc_jited_linfo(fp);
if (*err)
return fp;
@@ -2251,8 +2262,14 @@ static void __bpf_prog_array_free_sleepable_cb(struct rcu_head *rcu)
{
struct bpf_prog_array *progs;
+ /* If RCU Tasks Trace grace period implies RCU grace period, there is
+ * no need to call kfree_rcu(), just call kfree() directly.
+ */
progs = container_of(rcu, struct bpf_prog_array, rcu);
- kfree_rcu(progs, rcu);
+ if (rcu_trace_implies_rcu_gp())
+ kfree(progs);
+ else
+ kfree_rcu(progs, rcu);
}
void bpf_prog_array_free_sleepable(struct bpf_prog_array *progs)
@@ -2545,7 +2562,7 @@ static void bpf_prog_free_deferred(struct work_struct *work)
bpf_free_used_maps(aux);
bpf_free_used_btfs(aux);
if (bpf_prog_is_dev_bound(aux))
- bpf_prog_offload_destroy(aux->prog);
+ bpf_prog_dev_bound_destroy(aux->prog);
#ifdef CONFIG_PERF_EVENTS
if (aux->prog->has_callchain_buf)
put_callchain_buffers();
@@ -2740,6 +2757,18 @@ int __weak bpf_arch_text_invalidate(void *dst, size_t len)
return -ENOTSUPP;
}
+#ifdef CONFIG_BPF_SYSCALL
+static int __init bpf_global_ma_init(void)
+{
+ int ret;
+
+ ret = bpf_mem_alloc_init(&bpf_global_ma, 0, false);
+ bpf_global_ma_set = !ret;
+ return ret;
+}
+late_initcall(bpf_global_ma_init);
+#endif
+
DEFINE_STATIC_KEY_FALSE(bpf_stats_enabled_key);
EXPORT_SYMBOL(bpf_stats_enabled_key);
diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c
index b5ba34ddd4b6..d2110c1f6fa6 100644
--- a/kernel/bpf/cpumap.c
+++ b/kernel/bpf/cpumap.c
@@ -4,13 +4,16 @@
* Copyright (c) 2017 Jesper Dangaard Brouer, Red Hat Inc.
*/
-/* The 'cpumap' is primarily used as a backend map for XDP BPF helper
+/**
+ * DOC: cpu map
+ * The 'cpumap' is primarily used as a backend map for XDP BPF helper
* call bpf_redirect_map() and XDP_REDIRECT action, like 'devmap'.
*
- * Unlike devmap which redirects XDP frames out another NIC device,
+ * Unlike devmap which redirects XDP frames out to another NIC device,
* this map type redirects raw XDP frames to another CPU. The remote
* CPU will do SKB-allocation and call the normal network stack.
- *
+ */
+/*
* This is a scalability and isolation mechanism, that allow
* separating the early driver network XDP layer, from the rest of the
* netstack, and assigning dedicated CPUs for this stage. This
@@ -85,7 +88,6 @@ static struct bpf_map *cpu_map_alloc(union bpf_attr *attr)
{
u32 value_size = attr->value_size;
struct bpf_cpu_map *cmap;
- int err = -ENOMEM;
if (!bpf_capable())
return ERR_PTR(-EPERM);
@@ -97,29 +99,26 @@ static struct bpf_map *cpu_map_alloc(union bpf_attr *attr)
attr->map_flags & ~BPF_F_NUMA_NODE)
return ERR_PTR(-EINVAL);
+ /* Pre-limit array size based on NR_CPUS, not final CPU check */
+ if (attr->max_entries > NR_CPUS)
+ return ERR_PTR(-E2BIG);
+
cmap = bpf_map_area_alloc(sizeof(*cmap), NUMA_NO_NODE);
if (!cmap)
return ERR_PTR(-ENOMEM);
bpf_map_init_from_attr(&cmap->map, attr);
- /* Pre-limit array size based on NR_CPUS, not final CPU check */
- if (cmap->map.max_entries > NR_CPUS) {
- err = -E2BIG;
- goto free_cmap;
- }
-
/* Alloc array for possible remote "destination" CPUs */
cmap->cpu_map = bpf_map_area_alloc(cmap->map.max_entries *
sizeof(struct bpf_cpu_map_entry *),
cmap->map.numa_node);
- if (!cmap->cpu_map)
- goto free_cmap;
+ if (!cmap->cpu_map) {
+ bpf_map_area_free(cmap);
+ return ERR_PTR(-ENOMEM);
+ }
return &cmap->map;
-free_cmap:
- bpf_map_area_free(cmap);
- return ERR_PTR(err);
}
static void get_cpu_map_entry(struct bpf_cpu_map_entry *rcpu)
@@ -362,7 +361,7 @@ static int cpu_map_kthread_run(void *data)
/* Support running another XDP prog on this CPU */
nframes = cpu_map_bpf_prog_run(rcpu, frames, xdp_n, &stats, &list);
if (nframes) {
- m = kmem_cache_alloc_bulk(skbuff_head_cache, gfp, nframes, skbs);
+ m = kmem_cache_alloc_bulk(skbuff_cache, gfp, nframes, skbs);
if (unlikely(m == 0)) {
for (i = 0; i < nframes; i++)
skbs[i] = NULL; /* effect: xdp_return_frame */
@@ -668,9 +667,9 @@ static int cpu_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
return 0;
}
-static int cpu_map_redirect(struct bpf_map *map, u32 ifindex, u64 flags)
+static int cpu_map_redirect(struct bpf_map *map, u64 index, u64 flags)
{
- return __bpf_xdp_redirect_map(map, ifindex, flags, 0,
+ return __bpf_xdp_redirect_map(map, index, flags, 0,
__cpu_map_lookup_elem);
}
diff --git a/kernel/bpf/cpumask.c b/kernel/bpf/cpumask.c
new file mode 100644
index 000000000000..52b981512a35
--- /dev/null
+++ b/kernel/bpf/cpumask.c
@@ -0,0 +1,479 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Copyright (c) 2023 Meta, Inc */
+#include <linux/bpf.h>
+#include <linux/bpf_mem_alloc.h>
+#include <linux/btf.h>
+#include <linux/btf_ids.h>
+#include <linux/cpumask.h>
+
+/**
+ * struct bpf_cpumask - refcounted BPF cpumask wrapper structure
+ * @cpumask: The actual cpumask embedded in the struct.
+ * @usage: Object reference counter. When the refcount goes to 0, the
+ * memory is released back to the BPF allocator, which provides
+ * RCU safety.
+ *
+ * Note that we explicitly embed a cpumask_t rather than a cpumask_var_t. This
+ * is done to avoid confusing the verifier due to the typedef of cpumask_var_t
+ * changing depending on whether CONFIG_CPUMASK_OFFSTACK is defined or not. See
+ * the details in <linux/cpumask.h>. The consequence is that this structure is
+ * likely a bit larger than it needs to be when CONFIG_CPUMASK_OFFSTACK is
+ * defined due to embedding the whole NR_CPUS-size bitmap, but the extra memory
+ * overhead is minimal. For the more typical case of CONFIG_CPUMASK_OFFSTACK
+ * not being defined, the structure is the same size regardless.
+ */
+struct bpf_cpumask {
+ cpumask_t cpumask;
+ refcount_t usage;
+};
+
+static struct bpf_mem_alloc bpf_cpumask_ma;
+
+static bool cpu_valid(u32 cpu)
+{
+ return cpu < nr_cpu_ids;
+}
+
+__diag_push();
+__diag_ignore_all("-Wmissing-prototypes",
+ "Global kfuncs as their definitions will be in BTF");
+
+/**
+ * bpf_cpumask_create() - Create a mutable BPF cpumask.
+ *
+ * Allocates a cpumask that can be queried, mutated, acquired, and released by
+ * a BPF program. The cpumask returned by this function must either be embedded
+ * in a map as a kptr, or freed with bpf_cpumask_release().
+ *
+ * bpf_cpumask_create() allocates memory using the BPF memory allocator, and
+ * will not block. It may return NULL if no memory is available.
+ */
+__bpf_kfunc struct bpf_cpumask *bpf_cpumask_create(void)
+{
+ struct bpf_cpumask *cpumask;
+
+ /* cpumask must be the first element so struct bpf_cpumask can be cast to struct cpumask. */
+ BUILD_BUG_ON(offsetof(struct bpf_cpumask, cpumask) != 0);
+
+ cpumask = bpf_mem_alloc(&bpf_cpumask_ma, sizeof(*cpumask));
+ if (!cpumask)
+ return NULL;
+
+ memset(cpumask, 0, sizeof(*cpumask));
+ refcount_set(&cpumask->usage, 1);
+
+ return cpumask;
+}
+
+/**
+ * bpf_cpumask_acquire() - Acquire a reference to a BPF cpumask.
+ * @cpumask: The BPF cpumask being acquired. The cpumask must be a trusted
+ * pointer.
+ *
+ * Acquires a reference to a BPF cpumask. The cpumask returned by this function
+ * must either be embedded in a map as a kptr, or freed with
+ * bpf_cpumask_release().
+ */
+__bpf_kfunc struct bpf_cpumask *bpf_cpumask_acquire(struct bpf_cpumask *cpumask)
+{
+ refcount_inc(&cpumask->usage);
+ return cpumask;
+}
+
+/**
+ * bpf_cpumask_kptr_get() - Attempt to acquire a reference to a BPF cpumask
+ * stored in a map.
+ * @cpumaskp: A pointer to a BPF cpumask map value.
+ *
+ * Attempts to acquire a reference to a BPF cpumask stored in a map value. The
+ * cpumask returned by this function must either be embedded in a map as a
+ * kptr, or freed with bpf_cpumask_release(). This function may return NULL if
+ * no BPF cpumask was found in the specified map value.
+ */
+__bpf_kfunc struct bpf_cpumask *bpf_cpumask_kptr_get(struct bpf_cpumask **cpumaskp)
+{
+ struct bpf_cpumask *cpumask;
+
+ /* The BPF memory allocator frees memory backing its caches in an RCU
+ * callback. Thus, we can safely use RCU to ensure that the cpumask is
+ * safe to read.
+ */
+ rcu_read_lock();
+
+ cpumask = READ_ONCE(*cpumaskp);
+ if (cpumask && !refcount_inc_not_zero(&cpumask->usage))
+ cpumask = NULL;
+
+ rcu_read_unlock();
+ return cpumask;
+}
+
+/**
+ * bpf_cpumask_release() - Release a previously acquired BPF cpumask.
+ * @cpumask: The cpumask being released.
+ *
+ * Releases a previously acquired reference to a BPF cpumask. When the final
+ * reference of the BPF cpumask has been released, it is subsequently freed in
+ * an RCU callback in the BPF memory allocator.
+ */
+__bpf_kfunc void bpf_cpumask_release(struct bpf_cpumask *cpumask)
+{
+ if (!cpumask)
+ return;
+
+ if (refcount_dec_and_test(&cpumask->usage)) {
+ migrate_disable();
+ bpf_mem_free(&bpf_cpumask_ma, cpumask);
+ migrate_enable();
+ }
+}
+
+/**
+ * bpf_cpumask_first() - Get the index of the first nonzero bit in the cpumask.
+ * @cpumask: The cpumask being queried.
+ *
+ * Find the index of the first nonzero bit of the cpumask. A struct bpf_cpumask
+ * pointer may be safely passed to this function.
+ */
+__bpf_kfunc u32 bpf_cpumask_first(const struct cpumask *cpumask)
+{
+ return cpumask_first(cpumask);
+}
+
+/**
+ * bpf_cpumask_first_zero() - Get the index of the first unset bit in the
+ * cpumask.
+ * @cpumask: The cpumask being queried.
+ *
+ * Find the index of the first unset bit of the cpumask. A struct bpf_cpumask
+ * pointer may be safely passed to this function.
+ */
+__bpf_kfunc u32 bpf_cpumask_first_zero(const struct cpumask *cpumask)
+{
+ return cpumask_first_zero(cpumask);
+}
+
+/**
+ * bpf_cpumask_set_cpu() - Set a bit for a CPU in a BPF cpumask.
+ * @cpu: The CPU to be set in the cpumask.
+ * @cpumask: The BPF cpumask in which a bit is being set.
+ */
+__bpf_kfunc void bpf_cpumask_set_cpu(u32 cpu, struct bpf_cpumask *cpumask)
+{
+ if (!cpu_valid(cpu))
+ return;
+
+ cpumask_set_cpu(cpu, (struct cpumask *)cpumask);
+}
+
+/**
+ * bpf_cpumask_clear_cpu() - Clear a bit for a CPU in a BPF cpumask.
+ * @cpu: The CPU to be cleared from the cpumask.
+ * @cpumask: The BPF cpumask in which a bit is being cleared.
+ */
+__bpf_kfunc void bpf_cpumask_clear_cpu(u32 cpu, struct bpf_cpumask *cpumask)
+{
+ if (!cpu_valid(cpu))
+ return;
+
+ cpumask_clear_cpu(cpu, (struct cpumask *)cpumask);
+}
+
+/**
+ * bpf_cpumask_test_cpu() - Test whether a CPU is set in a cpumask.
+ * @cpu: The CPU being queried for.
+ * @cpumask: The cpumask being queried for containing a CPU.
+ *
+ * Return:
+ * * true - @cpu is set in the cpumask
+ * * false - @cpu was not set in the cpumask, or @cpu is an invalid cpu.
+ */
+__bpf_kfunc bool bpf_cpumask_test_cpu(u32 cpu, const struct cpumask *cpumask)
+{
+ if (!cpu_valid(cpu))
+ return false;
+
+ return cpumask_test_cpu(cpu, (struct cpumask *)cpumask);
+}
+
+/**
+ * bpf_cpumask_test_and_set_cpu() - Atomically test and set a CPU in a BPF cpumask.
+ * @cpu: The CPU being set and queried for.
+ * @cpumask: The BPF cpumask being set and queried for containing a CPU.
+ *
+ * Return:
+ * * true - @cpu was already set in the cpumask before this call
+ * * false - @cpu was not set in the cpumask, or @cpu is invalid.
+ */
+__bpf_kfunc bool bpf_cpumask_test_and_set_cpu(u32 cpu, struct bpf_cpumask *cpumask)
+{
+ if (!cpu_valid(cpu))
+ return false;
+
+ return cpumask_test_and_set_cpu(cpu, (struct cpumask *)cpumask);
+}
+
+/**
+ * bpf_cpumask_test_and_clear_cpu() - Atomically test and clear a CPU in a BPF
+ * cpumask.
+ * @cpu: The CPU being cleared and queried for.
+ * @cpumask: The BPF cpumask being cleared and queried for containing a CPU.
+ *
+ * Return:
+ * * true - @cpu was set in the cpumask before this call
+ * * false - @cpu was not set in the cpumask, or @cpu is invalid.
+ */
+__bpf_kfunc bool bpf_cpumask_test_and_clear_cpu(u32 cpu, struct bpf_cpumask *cpumask)
+{
+ if (!cpu_valid(cpu))
+ return false;
+
+ return cpumask_test_and_clear_cpu(cpu, (struct cpumask *)cpumask);
+}
+
+/**
+ * bpf_cpumask_setall() - Set all of the bits in a BPF cpumask.
+ * @cpumask: The BPF cpumask having all of its bits set.
+ */
+__bpf_kfunc void bpf_cpumask_setall(struct bpf_cpumask *cpumask)
+{
+ cpumask_setall((struct cpumask *)cpumask);
+}
+
+/**
+ * bpf_cpumask_clear() - Clear every bit of a BPF cpumask.
+ * @cpumask: The BPF cpumask whose bits are all being cleared.
+ */
+__bpf_kfunc void bpf_cpumask_clear(struct bpf_cpumask *cpumask)
+{
+	struct cpumask *mask = (struct cpumask *)cpumask;
+
+	cpumask_clear(mask);
+}
+
+/**
+ * bpf_cpumask_and() - Store the bitwise AND of two cpumasks.
+ * @dst: The BPF cpumask receiving the result.
+ * @src1: The first operand.
+ * @src2: The second operand.
+ *
+ * Return:
+ * * true - @dst has at least one bit set following the operation
+ * * false - @dst is empty following the operation
+ *
+ * struct bpf_cpumask pointers may be safely passed to @src1 and @src2.
+ */
+__bpf_kfunc bool bpf_cpumask_and(struct bpf_cpumask *dst,
+				 const struct cpumask *src1,
+				 const struct cpumask *src2)
+{
+	struct cpumask *out = (struct cpumask *)dst;
+
+	return cpumask_and(out, src1, src2);
+}
+
+/**
+ * bpf_cpumask_or() - Store the bitwise OR of two cpumasks.
+ * @dst: The BPF cpumask receiving the result.
+ * @src1: The first operand.
+ * @src2: The second operand.
+ *
+ * struct bpf_cpumask pointers may be safely passed to @src1 and @src2.
+ */
+__bpf_kfunc void bpf_cpumask_or(struct bpf_cpumask *dst,
+				const struct cpumask *src1,
+				const struct cpumask *src2)
+{
+	struct cpumask *out = (struct cpumask *)dst;
+
+	cpumask_or(out, src1, src2);
+}
+
+/**
+ * bpf_cpumask_xor() - Store the bitwise XOR of two cpumasks.
+ * @dst: The BPF cpumask receiving the result.
+ * @src1: The first operand.
+ * @src2: The second operand.
+ *
+ * struct bpf_cpumask pointers may be safely passed to @src1 and @src2.
+ */
+__bpf_kfunc void bpf_cpumask_xor(struct bpf_cpumask *dst,
+				 const struct cpumask *src1,
+				 const struct cpumask *src2)
+{
+	struct cpumask *out = (struct cpumask *)dst;
+
+	cpumask_xor(out, src1, src2);
+}
+
+/**
+ * bpf_cpumask_equal() - Compare two cpumasks for equality.
+ * @src1: The first cpumask.
+ * @src2: The second cpumask.
+ *
+ * Return:
+ * * true - @src1 and @src2 have the same bits set.
+ * * false - @src1 and @src2 differ in at least one bit.
+ *
+ * struct bpf_cpumask pointers may be safely passed to @src1 and @src2.
+ */
+__bpf_kfunc bool bpf_cpumask_equal(const struct cpumask *src1, const struct cpumask *src2)
+{
+	bool same = cpumask_equal(src1, src2);
+
+	return same;
+}
+
+/**
+ * bpf_cpumask_intersects() - Test whether two cpumasks overlap.
+ * @src1: The first cpumask.
+ * @src2: The second cpumask.
+ *
+ * Return:
+ * * true - @src1 and @src2 have at least one of the same bits set.
+ * * false - @src1 and @src2 don't have any of the same bits set.
+ *
+ * struct bpf_cpumask pointers may be safely passed to @src1 and @src2.
+ */
+__bpf_kfunc bool bpf_cpumask_intersects(const struct cpumask *src1, const struct cpumask *src2)
+{
+	bool overlap = cpumask_intersects(src1, src2);
+
+	return overlap;
+}
+
+/**
+ * bpf_cpumask_subset() - Test whether one cpumask is a subset of another.
+ * @src1: The cpumask checked as the subset.
+ * @src2: The cpumask checked as the superset.
+ *
+ * Return:
+ * * true - All of the bits of @src1 are set in @src2.
+ * * false - At least one bit in @src1 is not set in @src2.
+ *
+ * struct bpf_cpumask pointers may be safely passed to @src1 and @src2.
+ */
+__bpf_kfunc bool bpf_cpumask_subset(const struct cpumask *src1, const struct cpumask *src2)
+{
+	bool contained = cpumask_subset(src1, src2);
+
+	return contained;
+}
+
+/**
+ * bpf_cpumask_empty() - Test whether a cpumask has no bits set.
+ * @cpumask: The cpumask to inspect.
+ *
+ * Return:
+ * * true - None of the bits in @cpumask are set.
+ * * false - At least one bit in @cpumask is set.
+ *
+ * A struct bpf_cpumask pointer may be safely passed to @cpumask.
+ */
+__bpf_kfunc bool bpf_cpumask_empty(const struct cpumask *cpumask)
+{
+	bool none_set = cpumask_empty(cpumask);
+
+	return none_set;
+}
+
+/**
+ * bpf_cpumask_full() - Test whether a cpumask has every bit set.
+ * @cpumask: The cpumask to inspect.
+ *
+ * Return:
+ * * true - All of the bits in @cpumask are set.
+ * * false - At least one bit in @cpumask is cleared.
+ *
+ * A struct bpf_cpumask pointer may be safely passed to @cpumask.
+ */
+__bpf_kfunc bool bpf_cpumask_full(const struct cpumask *cpumask)
+{
+	bool all_set = cpumask_full(cpumask);
+
+	return all_set;
+}
+
+/**
+ * bpf_cpumask_copy() - Copy a cpumask's contents into a BPF cpumask.
+ * @dst: The BPF cpumask that receives the copy.
+ * @src: The cpumask whose contents are copied.
+ *
+ * A struct bpf_cpumask pointer may be safely passed to @src.
+ */
+__bpf_kfunc void bpf_cpumask_copy(struct bpf_cpumask *dst, const struct cpumask *src)
+{
+	struct cpumask *out = (struct cpumask *)dst;
+
+	cpumask_copy(out, src);
+}
+
+/**
+ * bpf_cpumask_any() - Return a random set CPU from a cpumask.
+ * @cpumask: The cpumask being queried.
+ *
+ * Return:
+ * * A random set bit within [0, num_cpus) if at least one bit is set.
+ * * >= num_cpus if no bit is set.
+ *
+ * A struct bpf_cpumask pointer may be safely passed to @cpumask.
+ */
+__bpf_kfunc u32 bpf_cpumask_any(const struct cpumask *cpumask)
+{
+	u32 cpu = cpumask_any(cpumask);
+
+	return cpu;
+}
+
+/**
+ * bpf_cpumask_any_and() - Return a random set CPU from the AND of two
+ *			   cpumasks.
+ * @src1: The first cpumask.
+ * @src2: The second cpumask.
+ *
+ * Return:
+ * * A random set bit within [0, num_cpus) if at least one bit is set.
+ * * >= num_cpus if no bit is set.
+ *
+ * struct bpf_cpumask pointers may be safely passed to @src1 and @src2.
+ */
+__bpf_kfunc u32 bpf_cpumask_any_and(const struct cpumask *src1, const struct cpumask *src2)
+{
+	u32 cpu = cpumask_any_and(src1, src2);
+
+	return cpu;
+}
+
+__diag_pop();
+
+/* BTF ID set naming each bpf_cpumask_* kfunc together with its KF_* flags.
+ * cpumask_kfunc_set.set points at this set.
+ */
+BTF_SET8_START(cpumask_kfunc_btf_ids)
+BTF_ID_FLAGS(func, bpf_cpumask_create, KF_ACQUIRE | KF_RET_NULL)
+BTF_ID_FLAGS(func, bpf_cpumask_release, KF_RELEASE | KF_TRUSTED_ARGS)
+BTF_ID_FLAGS(func, bpf_cpumask_acquire, KF_ACQUIRE | KF_TRUSTED_ARGS)
+BTF_ID_FLAGS(func, bpf_cpumask_kptr_get, KF_ACQUIRE | KF_KPTR_GET | KF_RET_NULL)
+BTF_ID_FLAGS(func, bpf_cpumask_first, KF_TRUSTED_ARGS)
+BTF_ID_FLAGS(func, bpf_cpumask_first_zero, KF_TRUSTED_ARGS)
+BTF_ID_FLAGS(func, bpf_cpumask_set_cpu, KF_TRUSTED_ARGS)
+BTF_ID_FLAGS(func, bpf_cpumask_clear_cpu, KF_TRUSTED_ARGS)
+BTF_ID_FLAGS(func, bpf_cpumask_test_cpu, KF_TRUSTED_ARGS)
+BTF_ID_FLAGS(func, bpf_cpumask_test_and_set_cpu, KF_TRUSTED_ARGS)
+BTF_ID_FLAGS(func, bpf_cpumask_test_and_clear_cpu, KF_TRUSTED_ARGS)
+BTF_ID_FLAGS(func, bpf_cpumask_setall, KF_TRUSTED_ARGS)
+BTF_ID_FLAGS(func, bpf_cpumask_clear, KF_TRUSTED_ARGS)
+BTF_ID_FLAGS(func, bpf_cpumask_and, KF_TRUSTED_ARGS)
+BTF_ID_FLAGS(func, bpf_cpumask_or, KF_TRUSTED_ARGS)
+BTF_ID_FLAGS(func, bpf_cpumask_xor, KF_TRUSTED_ARGS)
+BTF_ID_FLAGS(func, bpf_cpumask_equal, KF_TRUSTED_ARGS)
+BTF_ID_FLAGS(func, bpf_cpumask_intersects, KF_TRUSTED_ARGS)
+BTF_ID_FLAGS(func, bpf_cpumask_subset, KF_TRUSTED_ARGS)
+BTF_ID_FLAGS(func, bpf_cpumask_empty, KF_TRUSTED_ARGS)
+BTF_ID_FLAGS(func, bpf_cpumask_full, KF_TRUSTED_ARGS)
+BTF_ID_FLAGS(func, bpf_cpumask_copy, KF_TRUSTED_ARGS)
+BTF_ID_FLAGS(func, bpf_cpumask_any, KF_TRUSTED_ARGS)
+BTF_ID_FLAGS(func, bpf_cpumask_any_and, KF_TRUSTED_ARGS)
+BTF_SET8_END(cpumask_kfunc_btf_ids)
+
+/* kfunc ID set wrapping cpumask_kfunc_btf_ids; cpumask_kfunc_init() registers
+ * it for BPF_PROG_TYPE_TRACING and BPF_PROG_TYPE_STRUCT_OPS.
+ */
+static const struct btf_kfunc_id_set cpumask_kfunc_set = {
+ .owner = THIS_MODULE,
+ .set = &cpumask_kfunc_btf_ids,
+};
+
+/* BTF IDs consumed by cpumask_kfunc_init(): entry 0 is used as the .btf_id
+ * (struct bpf_cpumask) and entry 1 as the .kfunc_btf_id (bpf_cpumask_release),
+ * so the order of the two BTF_ID lines must not change.
+ */
+BTF_ID_LIST(cpumask_dtor_ids)
+BTF_ID(struct, bpf_cpumask)
+BTF_ID(func, bpf_cpumask_release)
+
+/* Set up the cpumask kfuncs; run as a late_initcall.
+ *
+ * Steps, in order: initialise bpf_cpumask_ma with bpf_mem_alloc_init(),
+ * register cpumask_kfunc_set for BPF_PROG_TYPE_TRACING and then for
+ * BPF_PROG_TYPE_STRUCT_OPS, and finally register the destructor kfunc entry
+ * built from cpumask_dtor_ids. The first step that fails ends the sequence
+ * and its error code is returned; 0 is returned when every step succeeds.
+ */
+static int __init cpumask_kfunc_init(void)
+{
+	const struct btf_id_dtor_kfunc cpumask_dtors[] = {
+		{
+			.btf_id	      = cpumask_dtor_ids[0],
+			.kfunc_btf_id = cpumask_dtor_ids[1]
+		},
+	};
+	int err;
+
+	err = bpf_mem_alloc_init(&bpf_cpumask_ma, 0, false);
+	if (err)
+		return err;
+
+	err = register_btf_kfunc_id_set(BPF_PROG_TYPE_TRACING, &cpumask_kfunc_set);
+	if (err)
+		return err;
+
+	err = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, &cpumask_kfunc_set);
+	if (err)
+		return err;
+
+	return register_btf_id_dtor_kfuncs(cpumask_dtors,
+					   ARRAY_SIZE(cpumask_dtors),
+					   THIS_MODULE);
+}
+
+late_initcall(cpumask_kfunc_init);
diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c
index f9a87dcc5535..2675fefc6cb6 100644
--- a/kernel/bpf/devmap.c
+++ b/kernel/bpf/devmap.c
@@ -474,7 +474,11 @@ static inline int __xdp_enqueue(struct net_device *dev, struct xdp_frame *xdpf,
{
int err;
- if (!dev->netdev_ops->ndo_xdp_xmit)
+ if (!(dev->xdp_features & NETDEV_XDP_ACT_NDO_XMIT))
+ return -EOPNOTSUPP;
+
+ if (unlikely(!(dev->xdp_features & NETDEV_XDP_ACT_NDO_XMIT_SG) &&
+ xdp_frame_has_frags(xdpf)))
return -EOPNOTSUPP;
err = xdp_ok_fwd_dev(dev, xdp_get_frame_len(xdpf));
@@ -532,8 +536,14 @@ int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_frame *xdpf,
static bool is_valid_dst(struct bpf_dtab_netdev *obj, struct xdp_frame *xdpf)
{
- if (!obj ||
- !obj->dev->netdev_ops->ndo_xdp_xmit)
+ if (!obj)
+ return false;
+
+ if (!(obj->dev->xdp_features & NETDEV_XDP_ACT_NDO_XMIT))
+ return false;
+
+ if (unlikely(!(obj->dev->xdp_features & NETDEV_XDP_ACT_NDO_XMIT_SG) &&
+ xdp_frame_has_frags(xdpf)))
return false;
if (xdp_ok_fwd_dev(obj->dev, xdp_get_frame_len(xdpf)))
@@ -992,14 +1002,14 @@ static int dev_map_hash_update_elem(struct bpf_map *map, void *key, void *value,
map, key, value, map_flags);
}
-static int dev_map_redirect(struct bpf_map *map, u32 ifindex, u64 flags)
+static int dev_map_redirect(struct bpf_map *map, u64 ifindex, u64 flags)
{
return __bpf_xdp_redirect_map(map, ifindex, flags,
BPF_F_BROADCAST | BPF_F_EXCLUDE_INGRESS,
__dev_map_lookup_elem);
}
-static int dev_hash_map_redirect(struct bpf_map *map, u32 ifindex, u64 flags)
+static int dev_hash_map_redirect(struct bpf_map *map, u64 ifindex, u64 flags)
{
return __bpf_xdp_redirect_map(map, ifindex, flags,
BPF_F_BROADCAST | BPF_F_EXCLUDE_INGRESS,
diff --git a/kernel/bpf/dispatcher.c b/kernel/bpf/dispatcher.c
index c19719f48ce0..fa3e9225aedc 100644
--- a/kernel/bpf/dispatcher.c
+++ b/kernel/bpf/dispatcher.c
@@ -125,6 +125,11 @@ static void bpf_dispatcher_update(struct bpf_dispatcher *d, int prev_num_progs)
__BPF_DISPATCHER_UPDATE(d, new ?: (void *)&bpf_dispatcher_nop_func);
+ /* Make sure all the callers executing the previous/old half of the
+ * image leave it, so following update call can modify it safely.
+ */
+ synchronize_rcu();
+
if (new)
d->image_off = noff;
}
diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
index f39ee3e05589..5dfcb5ad0d06 100644
--- a/kernel/bpf/hashtab.c
+++ b/kernel/bpf/hashtab.c
@@ -152,7 +152,7 @@ static inline int htab_lock_bucket(const struct bpf_htab *htab,
{
unsigned long flags;
- hash = hash & HASHTAB_MAP_LOCK_MASK;
+ hash = hash & min_t(u32, HASHTAB_MAP_LOCK_MASK, htab->n_buckets - 1);
preempt_disable();
if (unlikely(__this_cpu_inc_return(*(htab->map_locked[hash])) != 1)) {
@@ -171,7 +171,7 @@ static inline void htab_unlock_bucket(const struct bpf_htab *htab,
struct bucket *b, u32 hash,
unsigned long flags)
{
- hash = hash & HASHTAB_MAP_LOCK_MASK;
+ hash = hash & min_t(u32, HASHTAB_MAP_LOCK_MASK, htab->n_buckets - 1);
raw_spin_unlock_irqrestore(&b->raw_lock, flags);
__this_cpu_dec(*(htab->map_locked[hash]));
preempt_enable();
@@ -222,7 +222,7 @@ static void htab_free_prealloced_timers(struct bpf_htab *htab)
u32 num_entries = htab->map.max_entries;
int i;
- if (!map_value_has_timer(&htab->map))
+ if (!btf_record_has_field(htab->map.record, BPF_TIMER))
return;
if (htab_has_extra_elems(htab))
num_entries += num_possible_cpus();
@@ -231,28 +231,25 @@ static void htab_free_prealloced_timers(struct bpf_htab *htab)
struct htab_elem *elem;
elem = get_htab_elem(htab, i);
- bpf_timer_cancel_and_free(elem->key +
- round_up(htab->map.key_size, 8) +
- htab->map.timer_off);
+ bpf_obj_free_timer(htab->map.record, elem->key + round_up(htab->map.key_size, 8));
cond_resched();
}
}
-static void htab_free_prealloced_kptrs(struct bpf_htab *htab)
+static void htab_free_prealloced_fields(struct bpf_htab *htab)
{
u32 num_entries = htab->map.max_entries;
int i;
- if (!map_value_has_kptrs(&htab->map))
+ if (IS_ERR_OR_NULL(htab->map.record))
return;
if (htab_has_extra_elems(htab))
num_entries += num_possible_cpus();
-
for (i = 0; i < num_entries; i++) {
struct htab_elem *elem;
elem = get_htab_elem(htab, i);
- bpf_map_free_kptrs(&htab->map, elem->key + round_up(htab->map.key_size, 8));
+ bpf_obj_free_fields(htab->map.record, elem->key + round_up(htab->map.key_size, 8));
cond_resched();
}
}
@@ -764,10 +761,7 @@ static void check_and_free_fields(struct bpf_htab *htab,
{
void *map_value = elem->key + round_up(htab->map.key_size, 8);
- if (map_value_has_timer(&htab->map))
- bpf_timer_cancel_and_free(map_value + htab->map.timer_off);
- if (map_value_has_kptrs(&htab->map))
- bpf_map_free_kptrs(&htab->map, map_value);
+ bpf_obj_free_fields(htab->map.record, map_value);
}
/* It is called from the bpf_lru_list when the LRU needs to delete
@@ -1010,8 +1004,6 @@ static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key,
l_new = ERR_PTR(-ENOMEM);
goto dec_count;
}
- check_and_init_map_value(&htab->map,
- l_new->key + round_up(key_size, 8));
}
memcpy(l_new->key, key, key_size);
@@ -1091,7 +1083,7 @@ static int htab_map_update_elem(struct bpf_map *map, void *key, void *value,
head = &b->head;
if (unlikely(map_flags & BPF_F_LOCK)) {
- if (unlikely(!map_value_has_spin_lock(map)))
+ if (unlikely(!btf_record_has_field(map->record, BPF_SPIN_LOCK)))
return -EINVAL;
/* find an element without taking the bucket lock */
l_old = lookup_nulls_elem_raw(head, hash, key, key_size,
@@ -1474,12 +1466,8 @@ static void htab_free_malloced_timers(struct bpf_htab *htab)
struct htab_elem *l;
hlist_nulls_for_each_entry(l, n, head, hash_node) {
- /* We don't reset or free kptr on uref dropping to zero,
- * hence just free timer.
- */
- bpf_timer_cancel_and_free(l->key +
- round_up(htab->map.key_size, 8) +
- htab->map.timer_off);
+ /* We only free timer on uref dropping to zero */
+ bpf_obj_free_timer(htab->map.record, l->key + round_up(htab->map.key_size, 8));
}
cond_resched_rcu();
}
@@ -1490,8 +1478,8 @@ static void htab_map_free_timers(struct bpf_map *map)
{
struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
- /* We don't reset or free kptr on uref dropping to zero. */
- if (!map_value_has_timer(&htab->map))
+ /* We only free timer on uref dropping to zero */
+ if (!btf_record_has_field(htab->map.record, BPF_TIMER))
return;
if (!htab_is_prealloc(htab))
htab_free_malloced_timers(htab);
@@ -1517,11 +1505,10 @@ static void htab_map_free(struct bpf_map *map)
if (!htab_is_prealloc(htab)) {
delete_all_elements(htab);
} else {
- htab_free_prealloced_kptrs(htab);
+ htab_free_prealloced_fields(htab);
prealloc_destroy(htab);
}
- bpf_map_free_kptr_off_tab(map);
free_percpu(htab->extra_elems);
bpf_map_area_free(htab->buckets);
bpf_mem_alloc_destroy(&htab->pcpu_ma);
@@ -1603,6 +1590,7 @@ static int __htab_map_lookup_and_delete_elem(struct bpf_map *map, void *key,
else
copy_map_value(map, value, l->key +
roundup_key_size);
+ /* Zeroing special fields in the temp buffer */
check_and_init_map_value(map, value);
}
@@ -1675,7 +1663,7 @@ __htab_map_lookup_and_delete_batch(struct bpf_map *map,
elem_map_flags = attr->batch.elem_flags;
if ((elem_map_flags & ~BPF_F_LOCK) ||
- ((elem_map_flags & BPF_F_LOCK) && !map_value_has_spin_lock(map)))
+ ((elem_map_flags & BPF_F_LOCK) && !btf_record_has_field(map->record, BPF_SPIN_LOCK)))
return -EINVAL;
map_flags = attr->batch.flags;
@@ -1803,6 +1791,7 @@ again_nocopy:
true);
else
copy_map_value(map, dst_val, value);
+ /* Zeroing special fields in the temp buffer */
check_and_init_map_value(map, dst_val);
}
if (do_delete) {
diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index a6b04faed282..5b278a38ae58 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -4,6 +4,7 @@
#include <linux/bpf.h>
#include <linux/btf.h>
#include <linux/bpf-cgroup.h>
+#include <linux/cgroup.h>
#include <linux/rcupdate.h>
#include <linux/random.h>
#include <linux/smp.h>
@@ -19,6 +20,7 @@
#include <linux/proc_ns.h>
#include <linux/security.h>
#include <linux/btf_ids.h>
+#include <linux/bpf_mem_alloc.h>
#include "../../lib/kstrtox.h"
@@ -336,6 +338,7 @@ const struct bpf_func_proto bpf_spin_lock_proto = {
.gpl_only = false,
.ret_type = RET_VOID,
.arg1_type = ARG_PTR_TO_SPIN_LOCK,
+ .arg1_btf_id = BPF_PTR_POISON,
};
static inline void __bpf_spin_unlock_irqrestore(struct bpf_spin_lock *lock)
@@ -358,6 +361,7 @@ const struct bpf_func_proto bpf_spin_unlock_proto = {
.gpl_only = false,
.ret_type = RET_VOID,
.arg1_type = ARG_PTR_TO_SPIN_LOCK,
+ .arg1_btf_id = BPF_PTR_POISON,
};
void copy_map_value_locked(struct bpf_map *map, void *dst, void *src,
@@ -366,9 +370,9 @@ void copy_map_value_locked(struct bpf_map *map, void *dst, void *src,
struct bpf_spin_lock *lock;
if (lock_src)
- lock = src + map->spin_lock_off;
+ lock = src + map->record->spin_lock_off;
else
- lock = dst + map->spin_lock_off;
+ lock = dst + map->record->spin_lock_off;
preempt_disable();
__bpf_spin_lock_irqsave(lock);
copy_map_value(map, dst, src);
@@ -657,6 +661,7 @@ BPF_CALL_3(bpf_copy_from_user, void *, dst, u32, size,
const struct bpf_func_proto bpf_copy_from_user_proto = {
.func = bpf_copy_from_user,
.gpl_only = false,
+ .might_sleep = true,
.ret_type = RET_INTEGER,
.arg1_type = ARG_PTR_TO_UNINIT_MEM,
.arg2_type = ARG_CONST_SIZE_OR_ZERO,
@@ -687,6 +692,7 @@ BPF_CALL_5(bpf_copy_from_user_task, void *, dst, u32, size,
const struct bpf_func_proto bpf_copy_from_user_task_proto = {
.func = bpf_copy_from_user_task,
.gpl_only = true,
+ .might_sleep = true,
.ret_type = RET_INTEGER,
.arg1_type = ARG_PTR_TO_UNINIT_MEM,
.arg2_type = ARG_CONST_SIZE_OR_ZERO,
@@ -750,19 +756,20 @@ static int bpf_trace_copy_string(char *buf, void *unsafe_ptr, char fmt_ptype,
/* Per-cpu temp buffers used by printf-like helpers to store the bprintf binary
* arguments representation.
*/
-#define MAX_BPRINTF_BUF_LEN 512
+#define MAX_BPRINTF_BIN_ARGS 512
/* Support executing three nested bprintf helper calls on a given CPU */
#define MAX_BPRINTF_NEST_LEVEL 3
struct bpf_bprintf_buffers {
- char tmp_bufs[MAX_BPRINTF_NEST_LEVEL][MAX_BPRINTF_BUF_LEN];
+ char bin_args[MAX_BPRINTF_BIN_ARGS];
+ char buf[MAX_BPRINTF_BUF];
};
-static DEFINE_PER_CPU(struct bpf_bprintf_buffers, bpf_bprintf_bufs);
+
+static DEFINE_PER_CPU(struct bpf_bprintf_buffers[MAX_BPRINTF_NEST_LEVEL], bpf_bprintf_bufs);
static DEFINE_PER_CPU(int, bpf_bprintf_nest_level);
-static int try_get_fmt_tmp_buf(char **tmp_buf)
+static int try_get_buffers(struct bpf_bprintf_buffers **bufs)
{
- struct bpf_bprintf_buffers *bufs;
int nest_level;
preempt_disable();
@@ -772,18 +779,19 @@ static int try_get_fmt_tmp_buf(char **tmp_buf)
preempt_enable();
return -EBUSY;
}
- bufs = this_cpu_ptr(&bpf_bprintf_bufs);
- *tmp_buf = bufs->tmp_bufs[nest_level - 1];
+ *bufs = this_cpu_ptr(&bpf_bprintf_bufs[nest_level - 1]);
return 0;
}
-void bpf_bprintf_cleanup(void)
+void bpf_bprintf_cleanup(struct bpf_bprintf_data *data)
{
- if (this_cpu_read(bpf_bprintf_nest_level)) {
- this_cpu_dec(bpf_bprintf_nest_level);
- preempt_enable();
- }
+ if (!data->bin_args && !data->buf)
+ return;
+ if (WARN_ON_ONCE(this_cpu_read(bpf_bprintf_nest_level) == 0))
+ return;
+ this_cpu_dec(bpf_bprintf_nest_level);
+ preempt_enable();
}
/*
@@ -792,18 +800,20 @@ void bpf_bprintf_cleanup(void)
* Returns a negative value if fmt is an invalid format string or 0 otherwise.
*
* This can be used in two ways:
- * - Format string verification only: when bin_args is NULL
+ * - Format string verification only: when data->get_bin_args is false
* - Arguments preparation: in addition to the above verification, it writes in
- * bin_args a binary representation of arguments usable by bstr_printf where
- * pointers from BPF have been sanitized.
+ * data->bin_args a binary representation of arguments usable by bstr_printf
+ * where pointers from BPF have been sanitized.
*
* In argument preparation mode, if 0 is returned, safe temporary buffers are
* allocated and bpf_bprintf_cleanup should be called to free them after use.
*/
int bpf_bprintf_prepare(char *fmt, u32 fmt_size, const u64 *raw_args,
- u32 **bin_args, u32 num_args)
+ u32 num_args, struct bpf_bprintf_data *data)
{
+ bool get_buffers = (data->get_bin_args && num_args) || data->get_buf;
char *unsafe_ptr = NULL, *tmp_buf = NULL, *tmp_buf_end, *fmt_end;
+ struct bpf_bprintf_buffers *buffers = NULL;
size_t sizeof_cur_arg, sizeof_cur_ip;
int err, i, num_spec = 0;
u64 cur_arg;
@@ -814,14 +824,19 @@ int bpf_bprintf_prepare(char *fmt, u32 fmt_size, const u64 *raw_args,
return -EINVAL;
fmt_size = fmt_end - fmt;
- if (bin_args) {
- if (num_args && try_get_fmt_tmp_buf(&tmp_buf))
- return -EBUSY;
+ if (get_buffers && try_get_buffers(&buffers))
+ return -EBUSY;
- tmp_buf_end = tmp_buf + MAX_BPRINTF_BUF_LEN;
- *bin_args = (u32 *)tmp_buf;
+ if (data->get_bin_args) {
+ if (num_args)
+ tmp_buf = buffers->bin_args;
+ tmp_buf_end = tmp_buf + MAX_BPRINTF_BIN_ARGS;
+ data->bin_args = (u32 *)tmp_buf;
}
+ if (data->get_buf)
+ data->buf = buffers->buf;
+
for (i = 0; i < fmt_size; i++) {
if ((!isprint(fmt[i]) && !isspace(fmt[i])) || !isascii(fmt[i])) {
err = -EINVAL;
@@ -1015,31 +1030,33 @@ nocopy_fmt:
err = 0;
out:
if (err)
- bpf_bprintf_cleanup();
+ bpf_bprintf_cleanup(data);
return err;
}
BPF_CALL_5(bpf_snprintf, char *, str, u32, str_size, char *, fmt,
- const void *, data, u32, data_len)
+ const void *, args, u32, data_len)
{
+ struct bpf_bprintf_data data = {
+ .get_bin_args = true,
+ };
int err, num_args;
- u32 *bin_args;
if (data_len % 8 || data_len > MAX_BPRINTF_VARARGS * 8 ||
- (data_len && !data))
+ (data_len && !args))
return -EINVAL;
num_args = data_len / 8;
/* ARG_PTR_TO_CONST_STR guarantees that fmt is zero-terminated so we
* can safely give an unbounded size.
*/
- err = bpf_bprintf_prepare(fmt, UINT_MAX, data, &bin_args, num_args);
+ err = bpf_bprintf_prepare(fmt, UINT_MAX, args, num_args, &data);
if (err < 0)
return err;
- err = bstr_printf(str, str_size, fmt, bin_args);
+ err = bstr_printf(str, str_size, fmt, data.bin_args);
- bpf_bprintf_cleanup();
+ bpf_bprintf_cleanup(&data);
return err + 1;
}
@@ -1169,7 +1186,7 @@ BPF_CALL_3(bpf_timer_init, struct bpf_timer_kern *, timer, struct bpf_map *, map
ret = -ENOMEM;
goto out;
}
- t->value = (void *)timer - map->timer_off;
+ t->value = (void *)timer - map->record->timer_off;
t->map = map;
t->prog = NULL;
rcu_assign_pointer(t->callback_fn, NULL);
@@ -1398,7 +1415,7 @@ static const struct bpf_func_proto bpf_kptr_xchg_proto = {
#define DYNPTR_SIZE_MASK 0xFFFFFF
#define DYNPTR_RDONLY_BIT BIT(31)
-static bool bpf_dynptr_is_rdonly(struct bpf_dynptr_kern *ptr)
+static bool bpf_dynptr_is_rdonly(const struct bpf_dynptr_kern *ptr)
{
return ptr->size & DYNPTR_RDONLY_BIT;
}
@@ -1408,7 +1425,7 @@ static void bpf_dynptr_set_type(struct bpf_dynptr_kern *ptr, enum bpf_dynptr_typ
ptr->size |= type << DYNPTR_TYPE_SHIFT;
}
-u32 bpf_dynptr_get_size(struct bpf_dynptr_kern *ptr)
+u32 bpf_dynptr_get_size(const struct bpf_dynptr_kern *ptr)
{
return ptr->size & DYNPTR_SIZE_MASK;
}
@@ -1432,7 +1449,7 @@ void bpf_dynptr_set_null(struct bpf_dynptr_kern *ptr)
memset(ptr, 0, sizeof(*ptr));
}
-static int bpf_dynptr_check_off_len(struct bpf_dynptr_kern *ptr, u32 offset, u32 len)
+static int bpf_dynptr_check_off_len(const struct bpf_dynptr_kern *ptr, u32 offset, u32 len)
{
u32 size = bpf_dynptr_get_size(ptr);
@@ -1477,7 +1494,7 @@ static const struct bpf_func_proto bpf_dynptr_from_mem_proto = {
.arg4_type = ARG_PTR_TO_DYNPTR | DYNPTR_TYPE_LOCAL | MEM_UNINIT,
};
-BPF_CALL_5(bpf_dynptr_read, void *, dst, u32, len, struct bpf_dynptr_kern *, src,
+BPF_CALL_5(bpf_dynptr_read, void *, dst, u32, len, const struct bpf_dynptr_kern *, src,
u32, offset, u64, flags)
{
int err;
@@ -1489,7 +1506,11 @@ BPF_CALL_5(bpf_dynptr_read, void *, dst, u32, len, struct bpf_dynptr_kern *, src
if (err)
return err;
- memcpy(dst, src->data + src->offset + offset, len);
+ /* Source and destination may possibly overlap, hence use memmove to
+ * copy the data. E.g. bpf_dynptr_from_mem may create two dynptr
+ * pointing to overlapping PTR_TO_MAP_VALUE regions.
+ */
+ memmove(dst, src->data + src->offset + offset, len);
return 0;
}
@@ -1500,12 +1521,12 @@ static const struct bpf_func_proto bpf_dynptr_read_proto = {
.ret_type = RET_INTEGER,
.arg1_type = ARG_PTR_TO_UNINIT_MEM,
.arg2_type = ARG_CONST_SIZE_OR_ZERO,
- .arg3_type = ARG_PTR_TO_DYNPTR,
+ .arg3_type = ARG_PTR_TO_DYNPTR | MEM_RDONLY,
.arg4_type = ARG_ANYTHING,
.arg5_type = ARG_ANYTHING,
};
-BPF_CALL_5(bpf_dynptr_write, struct bpf_dynptr_kern *, dst, u32, offset, void *, src,
+BPF_CALL_5(bpf_dynptr_write, const struct bpf_dynptr_kern *, dst, u32, offset, void *, src,
u32, len, u64, flags)
{
int err;
@@ -1517,7 +1538,11 @@ BPF_CALL_5(bpf_dynptr_write, struct bpf_dynptr_kern *, dst, u32, offset, void *,
if (err)
return err;
- memcpy(dst->data + dst->offset + offset, src, len);
+ /* Source and destination may possibly overlap, hence use memmove to
+ * copy the data. E.g. bpf_dynptr_from_mem may create two dynptr
+ * pointing to overlapping PTR_TO_MAP_VALUE regions.
+ */
+ memmove(dst->data + dst->offset + offset, src, len);
return 0;
}
@@ -1526,14 +1551,14 @@ static const struct bpf_func_proto bpf_dynptr_write_proto = {
.func = bpf_dynptr_write,
.gpl_only = false,
.ret_type = RET_INTEGER,
- .arg1_type = ARG_PTR_TO_DYNPTR,
+ .arg1_type = ARG_PTR_TO_DYNPTR | MEM_RDONLY,
.arg2_type = ARG_ANYTHING,
.arg3_type = ARG_PTR_TO_MEM | MEM_RDONLY,
.arg4_type = ARG_CONST_SIZE_OR_ZERO,
.arg5_type = ARG_ANYTHING,
};
-BPF_CALL_3(bpf_dynptr_data, struct bpf_dynptr_kern *, ptr, u32, offset, u32, len)
+BPF_CALL_3(bpf_dynptr_data, const struct bpf_dynptr_kern *, ptr, u32, offset, u32, len)
{
int err;
@@ -1554,7 +1579,7 @@ static const struct bpf_func_proto bpf_dynptr_data_proto = {
.func = bpf_dynptr_data,
.gpl_only = false,
.ret_type = RET_PTR_TO_DYNPTR_MEM_OR_NULL,
- .arg1_type = ARG_PTR_TO_DYNPTR,
+ .arg1_type = ARG_PTR_TO_DYNPTR | MEM_RDONLY,
.arg2_type = ARG_ANYTHING,
.arg3_type = ARG_CONST_ALLOC_SIZE_OR_ZERO,
};
@@ -1663,6 +1688,12 @@ bpf_base_func_proto(enum bpf_func_id func_id)
return &bpf_dynptr_write_proto;
case BPF_FUNC_dynptr_data:
return &bpf_dynptr_data_proto;
+#ifdef CONFIG_CGROUPS
+ case BPF_FUNC_cgrp_storage_get:
+ return &bpf_cgrp_storage_get_proto;
+ case BPF_FUNC_cgrp_storage_delete:
+ return &bpf_cgrp_storage_delete_proto;
+#endif
default:
break;
}
@@ -1700,20 +1731,495 @@ bpf_base_func_proto(enum bpf_func_id func_id)
}
}
-BTF_SET8_START(tracing_btf_ids)
+/* Detach and free every object linked on a bpf_list_head.
+ *
+ * @field: btf_field describing the list; supplies graph_root.node_offset and
+ *	   graph_root.value_rec for the contained objects.
+ * @list_head: the list head, accessed as a struct list_head.
+ * @spin_lock: lock held only while the list is detached from @list_head.
+ */
+void bpf_list_head_free(const struct btf_field *field, void *list_head,
+			struct bpf_spin_lock *spin_lock)
+{
+	struct list_head *orig_head = list_head;
+	struct list_head *pos = orig_head;
+
+	BUILD_BUG_ON(sizeof(struct list_head) > sizeof(struct bpf_list_head));
+	BUILD_BUG_ON(__alignof__(struct list_head) > __alignof__(struct bpf_list_head));
+
+	/* Do the actual list draining outside the lock to not hold the lock for
+	 * too long, and also prevent deadlocks if tracing programs end up
+	 * executing on entry/exit of functions called inside the critical
+	 * section, and end up doing map ops that call bpf_list_head_free for
+	 * the same map value again.
+	 */
+	__bpf_spin_lock_irqsave(spin_lock);
+	/* Only step onto the first node when the head was initialised and is
+	 * non-empty; otherwise pos stays equal to orig_head and the drain loop
+	 * below does not run.
+	 */
+	if (pos->next && !list_empty(pos))
+		pos = pos->next;
+	INIT_LIST_HEAD(orig_head);
+	__bpf_spin_unlock_irqrestore(spin_lock);
+
+	while (pos != orig_head) {
+		void *obj = (void *)pos - field->graph_root.node_offset;
+
+		/* Advance before obj, which contains the current node, is freed. */
+		pos = pos->next;
+		/* The contained type can also have resources, including a
+		 * bpf_list_head which needs to be freed.
+		 */
+		bpf_obj_free_fields(field->graph_root.value_rec, obj);
+		/* bpf_mem_free requires migrate_disable(), since we can be
+		 * called from map free path as well apart from BPF program (as
+		 * part of map ops doing bpf_obj_free_fields).
+		 */
+		migrate_disable();
+		bpf_mem_free(&bpf_global_ma, obj);
+		migrate_enable();
+	}
+}
+
+/* Like rbtree_postorder_for_each_entry_safe, but 'pos' and 'n' are
+ * 'rb_node *', so field name of rb_node within containing struct is not
+ * needed.
+ *
+ * Since bpf_rb_tree's node type has a corresponding struct btf_field with
+ * graph_root.node_offset, it's not necessary to know field name
+ * or type of node struct
+ *
+ * 'n' is loaded with rb_next_postorder(pos) in the loop condition, i.e.
+ * before the loop body runs for 'pos', and the iteration then continues
+ * from 'n'.
+ */
+#define bpf_rbtree_postorder_for_each_entry_safe(pos, n, root) \
+ for (pos = rb_first_postorder(root); \
+ pos && ({ n = rb_next_postorder(pos); 1; }); \
+ pos = n)
+
+/* Detach and free every object linked in a bpf_rb_root.
+ *
+ * @field: btf_field describing the tree; supplies graph_root.node_offset and
+ *	   graph_root.value_rec for the contained objects.
+ * @rb_root: the tree root, accessed as a struct rb_root_cached.
+ * @spin_lock: lock held only while the root is copied and reset.
+ *
+ * The root is copied to a local and reset to RB_ROOT_CACHED under the lock;
+ * the postorder freeing walk then runs on the local copy after the lock has
+ * been dropped.
+ */
+void bpf_rb_root_free(const struct btf_field *field, void *rb_root,
+		      struct bpf_spin_lock *spin_lock)
+{
+	struct rb_root_cached *root = rb_root;
+	struct rb_root_cached detached;
+	struct rb_node *cur, *next;
+
+	BUILD_BUG_ON(sizeof(struct rb_root_cached) > sizeof(struct bpf_rb_root));
+	BUILD_BUG_ON(__alignof__(struct rb_root_cached) > __alignof__(struct bpf_rb_root));
+
+	__bpf_spin_lock_irqsave(spin_lock);
+	detached = *root;
+	*root = RB_ROOT_CACHED;
+	__bpf_spin_unlock_irqrestore(spin_lock);
+
+	bpf_rbtree_postorder_for_each_entry_safe(cur, next, &detached.rb_root) {
+		void *obj = (void *)cur - field->graph_root.node_offset;
+
+		bpf_obj_free_fields(field->graph_root.value_rec, obj);
+
+		migrate_disable();
+		bpf_mem_free(&bpf_global_ma, obj);
+		migrate_enable();
+	}
+}
+
+__diag_push();
+__diag_ignore_all("-Wmissing-prototypes",
+ "Global functions as their definitions will be in vmlinux BTF");
+
+/* Allocate an object from bpf_global_ma and initialise its special fields.
+ *
+ * @local_type_id__k: passed unchanged to bpf_mem_alloc() as the allocation
+ *		      size. NOTE(review): the name suggests a type ID, so the
+ *		      caller presumably rewrites it to a size beforehand -
+ *		      confirm against the verifier.
+ * @meta__ign: optional struct btf_struct_meta; when non-NULL its field_offs
+ *	       are handed to bpf_obj_init() for the new object.
+ *
+ * Returns the new object, or NULL if the allocation failed.
+ */
+__bpf_kfunc void *bpf_obj_new_impl(u64 local_type_id__k, void *meta__ign)
+{
+	struct btf_struct_meta *meta = meta__ign;
+	void *obj = bpf_mem_alloc(&bpf_global_ma, local_type_id__k);
+
+	if (obj && meta)
+		bpf_obj_init(meta->field_offs, obj);
+	return obj;
+}
+
+/* Release an object back to bpf_global_ma.
+ *
+ * @p__alloc: the object to free.
+ * @meta__ign: optional struct btf_struct_meta; when non-NULL the object's
+ *	       fields are released through bpf_obj_free_fields() using
+ *	       meta->record before the memory itself is freed.
+ */
+__bpf_kfunc void bpf_obj_drop_impl(void *p__alloc, void *meta__ign)
+{
+	struct btf_struct_meta *meta = meta__ign;
+
+	if (meta)
+		bpf_obj_free_fields(meta->record, p__alloc);
+	bpf_mem_free(&bpf_global_ma, p__alloc);
+}
+
+/* Link @node onto @head, at the tail when @tail is true and at the front
+ * otherwise. A head or node whose ->next is NULL is first set up with
+ * INIT_LIST_HEAD().
+ */
+static void __bpf_list_add(struct bpf_list_node *node, struct bpf_list_head *head, bool tail)
+{
+	struct list_head *entry = (void *)node;
+	struct list_head *list = (void *)head;
+
+	if (unlikely(!list->next))
+		INIT_LIST_HEAD(list);
+	if (unlikely(!entry->next))
+		INIT_LIST_HEAD(entry);
+
+	if (tail)
+		list_add_tail(entry, list);
+	else
+		list_add(entry, list);
+}
+
+/* Add @node at the front of @head (see __bpf_list_add()).
+ *
+ * The helper is invoked as a plain statement: a return statement carrying an
+ * expression is not permitted by ISO C in a function returning void.
+ */
+__bpf_kfunc void bpf_list_push_front(struct bpf_list_head *head, struct bpf_list_node *node)
+{
+	__bpf_list_add(node, head, false);
+}
+
+/* Add @node at the tail of @head (see __bpf_list_add()).
+ *
+ * The helper is invoked as a plain statement: a return statement carrying an
+ * expression is not permitted by ISO C in a function returning void.
+ */
+__bpf_kfunc void bpf_list_push_back(struct bpf_list_head *head, struct bpf_list_node *node)
+{
+	__bpf_list_add(node, head, true);
+}
+
+/* Unlink and return one node from @head: the last node when @tail is true,
+ * the first one otherwise. A head whose ->next is NULL is first set up with
+ * INIT_LIST_HEAD(). Returns NULL when the list is empty.
+ */
+static struct bpf_list_node *__bpf_list_del(struct bpf_list_head *head, bool tail)
+{
+	struct list_head *list = (void *)head;
+	struct list_head *victim;
+
+	if (unlikely(!list->next))
+		INIT_LIST_HEAD(list);
+	if (list_empty(list))
+		return NULL;
+
+	if (tail)
+		victim = list->prev;
+	else
+		victim = list->next;
+
+	list_del_init(victim);
+	return (struct bpf_list_node *)victim;
+}
+
+/* Remove and return the first node of @head, or NULL if the list is empty. */
+__bpf_kfunc struct bpf_list_node *bpf_list_pop_front(struct bpf_list_head *head)
+{
+	struct bpf_list_node *node = __bpf_list_del(head, false);
+
+	return node;
+}
+
+/* Remove and return the last node of @head, or NULL if the list is empty. */
+__bpf_kfunc struct bpf_list_node *bpf_list_pop_back(struct bpf_list_head *head)
+{
+	struct bpf_list_node *node = __bpf_list_del(head, true);
+
+	return node;
+}
+
+/* Erase @node from @root with rb_erase_cached(), mark it cleared with
+ * RB_CLEAR_NODE() and hand the same node back to the caller.
+ */
+__bpf_kfunc struct bpf_rb_node *bpf_rbtree_remove(struct bpf_rb_root *root,
+						  struct bpf_rb_node *node)
+{
+	struct rb_root_cached *tree = (struct rb_root_cached *)root;
+	struct rb_node *victim = (struct rb_node *)node;
+
+	rb_erase_cached(victim, tree);
+	RB_CLEAR_NODE(victim);
+
+	return (struct bpf_rb_node *)victim;
+}
+
+/* Insert @node into the cached rbtree @root.
+ *
+ * rbtree_add_cached's logic is open-coded here because the 'less' comparator
+ * is a BPF program, invoked through bpf_callback_t with the new node as first
+ * argument and the node under comparison as second. A true result descends
+ * to the left; any step to the right means the new node is not the leftmost.
+ */
+static void __bpf_rbtree_add(struct bpf_rb_root *root, struct bpf_rb_node *node,
+			     void *less)
+{
+	struct rb_root_cached *tree = (struct rb_root_cached *)root;
+	struct rb_node *new_node = (struct rb_node *)node;
+	struct rb_node **slot = &tree->rb_root.rb_node;
+	bpf_callback_t is_less = (bpf_callback_t)less;
+	struct rb_node *parent = NULL;
+	bool leftmost = true;
+
+	while (*slot) {
+		parent = *slot;
+		if (is_less((uintptr_t)node, (uintptr_t)parent, 0, 0, 0)) {
+			slot = &parent->rb_left;
+		} else {
+			slot = &parent->rb_right;
+			leftmost = false;
+		}
+	}
+
+	rb_link_node(new_node, parent, slot);
+	rb_insert_color_cached(new_node, tree, leftmost);
+}
+
+/* kfunc entry point for rbtree insertion: forwards @root, @node and the
+ * @less callback (as an untyped pointer) to __bpf_rbtree_add().
+ */
+__bpf_kfunc void bpf_rbtree_add(struct bpf_rb_root *root, struct bpf_rb_node *node,
+				bool (less)(struct bpf_rb_node *a, const struct bpf_rb_node *b))
+{
+	void *cmp = (void *)less;
+
+	__bpf_rbtree_add(root, node, cmp);
+}
+
+/* Return the result of rb_first_cached() on @root, cast to a bpf_rb_node. */
+__bpf_kfunc struct bpf_rb_node *bpf_rbtree_first(struct bpf_rb_root *root)
+{
+	return (struct bpf_rb_node *)rb_first_cached((struct rb_root_cached *)root);
+}
+
+/**
+ * bpf_task_acquire - Take a reference on a task. Unless the acquired task is
+ * stored in a map as a kptr, the reference must be dropped again by calling
+ * bpf_task_release().
+ * @p: The task on which a reference is being acquired.
+ */
+__bpf_kfunc struct task_struct *bpf_task_acquire(struct task_struct *p)
+{
+	struct task_struct *acquired = get_task_struct(p);
+
+	return acquired;
+}
+
+/**
+ * bpf_task_acquire_not_zero - Acquire a reference to a rcu task object. A task
+ * acquired by this kfunc which is not stored in a map as a kptr, must be
+ * released by calling bpf_task_release().
+ * @p: The task on which a reference is being acquired.
+ *
+ * Return: currently always NULL; @p is not touched. See the comment in the
+ * function body for the reason.
+ */
+__bpf_kfunc struct task_struct *bpf_task_acquire_not_zero(struct task_struct *p)
+{
+ /* For the time being this function returns NULL, as it's not currently
+ * possible to safely acquire a reference to a task with RCU protection
+ * using get_task_struct() and put_task_struct(). This is due to the
+ * slightly odd mechanics of p->rcu_users, and how task RCU protection
+ * works.
+ *
+ * A struct task_struct is refcounted by two different refcount_t
+ * fields:
+ *
+ * 1. p->usage: The "true" refcount field which tracks a task's
+ * lifetime. The task is freed as soon as this
+ * refcount drops to 0.
+ *
+ * 2. p->rcu_users: An "RCU users" refcount field which is statically
+ * initialized to 2, and is co-located in a union with
+ * a struct rcu_head field (p->rcu). p->rcu_users
+ * essentially encapsulates a single p->usage
+ * refcount, and when p->rcu_users goes to 0, an RCU
+ * callback is scheduled on the struct rcu_head which
+ * decrements the p->usage refcount.
+ *
+ * There are two important implications to this task refcounting logic
+ * described above. The first is that
+ * refcount_inc_not_zero(&p->rcu_users) cannot be used anywhere, as
+ * after the refcount goes to 0, the RCU callback being scheduled will
+ * cause the memory backing the refcount to again be nonzero due to the
+ * fields sharing a union. The other is that we can't rely on RCU to
+ * guarantee that a task is valid in a BPF program. This is because a
+ * task could have already transitioned to being in the TASK_DEAD
+ * state, had its rcu_users refcount go to 0, and its rcu callback
+ * invoked in which it drops its single p->usage reference. At this
+ * point the task will be freed as soon as the last p->usage reference
+ * goes to 0, without waiting for another RCU gp to elapse. The only
+ * way that a BPF program can guarantee that a task is valid in this
+ * scenario is to hold a p->usage refcount itself.
+ *
+ * Until we're able to resolve this issue, either by pulling
+ * p->rcu_users and p->rcu out of the union, or by getting rid of
+ * p->usage and just using p->rcu_users for refcounting, we'll just
+ * return NULL here.
+ */
+ return NULL;
+}
+
+/**
+ * bpf_task_kptr_get - Acquire a reference on a struct task_struct kptr. A task
+ * kptr acquired by this kfunc which is not subsequently stored in a map, must
+ * be released by calling bpf_task_release().
+ * @pp: A pointer to a task kptr on which a reference is being acquired.
+ *
+ * Return: currently always NULL; @pp is never dereferenced.
+ */
+__bpf_kfunc struct task_struct *bpf_task_kptr_get(struct task_struct **pp)
+{
+ /* We must return NULL here until we have clarity on how to properly
+ * leverage RCU for ensuring a task's lifetime. See the comment above
+ * in bpf_task_acquire_not_zero() for more details.
+ */
+ return NULL;
+}
+
+/**
+ * bpf_task_release - Drop a task reference previously acquired.
+ * @p: The task whose reference is being dropped. NULL is accepted and
+ *     ignored.
+ */
+__bpf_kfunc void bpf_task_release(struct task_struct *p)
+{
+	if (p)
+		put_task_struct(p);
+}
+
+#ifdef CONFIG_CGROUPS
+/**
+ * bpf_cgroup_acquire - Acquire a reference to a cgroup. A cgroup acquired by
+ * this kfunc which is not stored in a map as a kptr, must be released by
+ * calling bpf_cgroup_release().
+ * @cgrp: The cgroup on which a reference is being acquired.
+ *
+ * Return: @cgrp itself, after cgroup_get() has been called on it.
+ */
+__bpf_kfunc struct cgroup *bpf_cgroup_acquire(struct cgroup *cgrp)
+{
+ cgroup_get(cgrp);
+ return cgrp;
+}
+
+/**
+ * bpf_cgroup_kptr_get - Acquire a reference on a struct cgroup kptr. A cgroup
+ * kptr acquired by this kfunc which is not subsequently stored in a map, must
+ * be released by calling bpf_cgroup_release().
+ * @cgrpp: A pointer to a cgroup kptr on which a reference is being acquired.
+ *
+ * Return: the cgroup read from @cgrpp with a reference held, or NULL if the
+ * slot was empty or cgroup_tryget() failed.
+ */
+__bpf_kfunc struct cgroup *bpf_cgroup_kptr_get(struct cgroup **cgrpp)
+{
+ struct cgroup *cgrp;
+
+ rcu_read_lock();
+ /* Another context could remove the cgroup from the map and release it
+ * at any time, including after we've read the pointer just below. This
+ * is safe because we're in an RCU read region, so the cgroup is
+ * guaranteed to remain valid until at least the rcu_read_unlock()
+ * below.
+ */
+ cgrp = READ_ONCE(*cgrpp);
+
+ if (cgrp && !cgroup_tryget(cgrp))
+ /* If the cgroup had been removed from the map and freed as
+ * described above, cgroup_tryget() will return false. The
+ * cgroup will be freed at some point after the current RCU gp
+ * has ended, so just return NULL to the user.
+ */
+ cgrp = NULL;
+ rcu_read_unlock();
+
+ return cgrp;
+}
+
+/**
+ * bpf_cgroup_release - Drop a cgroup reference previously acquired.
+ * If this kfunc is invoked in an RCU read region, the cgroup is guaranteed to
+ * not be freed until the current grace period has ended, even if its refcount
+ * drops to 0.
+ * @cgrp: The cgroup whose reference is being dropped. NULL is accepted and
+ *	  ignored.
+ */
+__bpf_kfunc void bpf_cgroup_release(struct cgroup *cgrp)
+{
+	if (cgrp)
+		cgroup_put(cgrp);
+}
+
+/**
+ * bpf_cgroup_ancestor - Fetch the ancestor of a cgroup at a given level.
+ * @cgrp: The cgroup whose ancestor array is consulted.
+ * @level: Index into @cgrp's ancestor array; accepted values are
+ *	   0..@cgrp->level inclusive.
+ *
+ * A reference is taken on the returned ancestor with cgroup_get(); unless it
+ * is subsequently stored in a map, it must be released by calling
+ * bpf_cgroup_release(). NULL is returned when @level is out of range.
+ */
+__bpf_kfunc struct cgroup *bpf_cgroup_ancestor(struct cgroup *cgrp, int level)
+{
+	struct cgroup *found;
+
+	if (level < 0 || level > cgrp->level)
+		return NULL;
+
+	found = cgrp->ancestors[level];
+	cgroup_get(found);
+
+	return found;
+}
+#endif /* CONFIG_CGROUPS */
+
+/**
+ * bpf_task_from_pid - Look up a struct task_struct by pid in the root pid
+ * namespace (init_pid_ns). A task returned by this kfunc must either be
+ * stored in a map, or released with bpf_task_release().
+ * @pid: The pid of the task being looked up.
+ *
+ * The lookup and the reference acquisition both happen inside one RCU read
+ * section. NULL is returned when no task matches @pid.
+ */
+__bpf_kfunc struct task_struct *bpf_task_from_pid(s32 pid)
+{
+	struct task_struct *task;
+
+	rcu_read_lock();
+	task = find_task_by_pid_ns(pid, &init_pid_ns);
+	if (!task)
+		goto unlock;
+	bpf_task_acquire(task);
+unlock:
+	rcu_read_unlock();
+
+	return task;
+}
+
+__bpf_kfunc void *bpf_cast_to_kern_ctx(void *obj)
+{
+ return obj; /* no runtime work: the same pointer is handed back */
+}
+
+__bpf_kfunc void *bpf_rdonly_cast(void *obj__ign, u32 btf_id__k)
+{
+ return obj__ign; /* btf_id__k is unused in this body; pointer returned as-is */
+}
+
+__bpf_kfunc void bpf_rcu_read_lock(void)
+{
+ rcu_read_lock(); /* kfunc wrapper: simply forwards to rcu_read_lock() */
+}
+
+__bpf_kfunc void bpf_rcu_read_unlock(void)
+{
+ rcu_read_unlock(); /* kfunc wrapper: simply forwards to rcu_read_unlock() */
+}
+
+__diag_pop();
+
+BTF_SET8_START(generic_btf_ids)
#ifdef CONFIG_KEXEC_CORE
BTF_ID_FLAGS(func, crash_kexec, KF_DESTRUCTIVE)
#endif
-BTF_SET8_END(tracing_btf_ids)
+BTF_ID_FLAGS(func, bpf_obj_new_impl, KF_ACQUIRE | KF_RET_NULL)
+BTF_ID_FLAGS(func, bpf_obj_drop_impl, KF_RELEASE)
+BTF_ID_FLAGS(func, bpf_list_push_front)
+BTF_ID_FLAGS(func, bpf_list_push_back)
+BTF_ID_FLAGS(func, bpf_list_pop_front, KF_ACQUIRE | KF_RET_NULL)
+BTF_ID_FLAGS(func, bpf_list_pop_back, KF_ACQUIRE | KF_RET_NULL)
+BTF_ID_FLAGS(func, bpf_task_acquire, KF_ACQUIRE | KF_TRUSTED_ARGS)
+BTF_ID_FLAGS(func, bpf_task_acquire_not_zero, KF_ACQUIRE | KF_RCU | KF_RET_NULL)
+BTF_ID_FLAGS(func, bpf_task_kptr_get, KF_ACQUIRE | KF_KPTR_GET | KF_RET_NULL)
+BTF_ID_FLAGS(func, bpf_task_release, KF_RELEASE)
+BTF_ID_FLAGS(func, bpf_rbtree_remove, KF_ACQUIRE)
+BTF_ID_FLAGS(func, bpf_rbtree_add)
+BTF_ID_FLAGS(func, bpf_rbtree_first, KF_RET_NULL)
+
+#ifdef CONFIG_CGROUPS
+BTF_ID_FLAGS(func, bpf_cgroup_acquire, KF_ACQUIRE | KF_TRUSTED_ARGS)
+BTF_ID_FLAGS(func, bpf_cgroup_kptr_get, KF_ACQUIRE | KF_KPTR_GET | KF_RET_NULL)
+BTF_ID_FLAGS(func, bpf_cgroup_release, KF_RELEASE)
+BTF_ID_FLAGS(func, bpf_cgroup_ancestor, KF_ACQUIRE | KF_TRUSTED_ARGS | KF_RET_NULL)
+#endif
+BTF_ID_FLAGS(func, bpf_task_from_pid, KF_ACQUIRE | KF_RET_NULL)
+BTF_SET8_END(generic_btf_ids)
-static const struct btf_kfunc_id_set tracing_kfunc_set = {
+static const struct btf_kfunc_id_set generic_kfunc_set = {
.owner = THIS_MODULE,
- .set = &tracing_btf_ids,
+ .set = &generic_btf_ids,
+};
+
+
+BTF_ID_LIST(generic_dtor_ids) /* read by index in kfunc_init(): (type, destructor kfunc) pairs */
+BTF_ID(struct, task_struct) /* [0] */
+BTF_ID(func, bpf_task_release) /* [1] */
+#ifdef CONFIG_CGROUPS
+BTF_ID(struct, cgroup) /* [2] */
+BTF_ID(func, bpf_cgroup_release) /* [3] */
+#endif
+
+BTF_SET8_START(common_btf_ids) /* kfuncs registered under BPF_PROG_TYPE_UNSPEC by kfunc_init() */
+BTF_ID_FLAGS(func, bpf_cast_to_kern_ctx)
+BTF_ID_FLAGS(func, bpf_rdonly_cast)
+BTF_ID_FLAGS(func, bpf_rcu_read_lock)
+BTF_ID_FLAGS(func, bpf_rcu_read_unlock)
+BTF_SET8_END(common_btf_ids)
+
+static const struct btf_kfunc_id_set common_kfunc_set = {
+ .owner = THIS_MODULE,
+ .set = &common_btf_ids, /* the set declared just above */
};
static int __init kfunc_init(void)
{
- return register_btf_kfunc_id_set(BPF_PROG_TYPE_TRACING, &tracing_kfunc_set);
+ int ret;
+ const struct btf_id_dtor_kfunc generic_dtors[] = {
+ { /* task_struct, destroyed via bpf_task_release (generic_dtor_ids[0..1]) */
+ .btf_id = generic_dtor_ids[0],
+ .kfunc_btf_id = generic_dtor_ids[1]
+ },
+#ifdef CONFIG_CGROUPS
+ { /* cgroup, destroyed via bpf_cgroup_release (generic_dtor_ids[2..3]) */
+ .btf_id = generic_dtor_ids[2],
+ .kfunc_btf_id = generic_dtor_ids[3]
+ },
+#endif
+ };
+
+ ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_TRACING, &generic_kfunc_set); /* each "ret ?:" step below only runs while ret is still 0 */
+ ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_SCHED_CLS, &generic_kfunc_set);
+ ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, &generic_kfunc_set);
+ ret = ret ?: register_btf_id_dtor_kfuncs(generic_dtors,
+ ARRAY_SIZE(generic_dtors),
+ THIS_MODULE);
+ return ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_UNSPEC, &common_kfunc_set);
}
late_initcall(kfunc_init);
diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c
index 4f841e16779e..9948b542a470 100644
--- a/kernel/bpf/inode.c
+++ b/kernel/bpf/inode.c
@@ -122,7 +122,7 @@ static struct inode *bpf_get_inode(struct super_block *sb,
inode->i_mtime = inode->i_atime;
inode->i_ctime = inode->i_atime;
- inode_init_owner(&init_user_ns, inode, dir, mode);
+ inode_init_owner(&nop_mnt_idmap, inode, dir, mode);
return inode;
}
@@ -152,7 +152,7 @@ static void bpf_dentry_finalize(struct dentry *dentry, struct inode *inode,
dir->i_ctime = dir->i_mtime;
}
-static int bpf_mkdir(struct user_namespace *mnt_userns, struct inode *dir,
+static int bpf_mkdir(struct mnt_idmap *idmap, struct inode *dir,
struct dentry *dentry, umode_t mode)
{
struct inode *inode;
@@ -382,7 +382,7 @@ bpf_lookup(struct inode *dir, struct dentry *dentry, unsigned flags)
return simple_lookup(dir, dentry, flags);
}
-static int bpf_symlink(struct user_namespace *mnt_userns, struct inode *dir,
+static int bpf_symlink(struct mnt_idmap *idmap, struct inode *dir,
struct dentry *dentry, const char *target)
{
char *link = kstrdup(target, GFP_USER | __GFP_NOWARN);
@@ -559,7 +559,7 @@ int bpf_obj_get_user(const char __user *pathname, int flags)
static struct bpf_prog *__get_prog_inode(struct inode *inode, enum bpf_prog_type type)
{
struct bpf_prog *prog;
- int ret = inode_permission(&init_user_ns, inode, MAY_READ);
+ int ret = inode_permission(&nop_mnt_idmap, inode, MAY_READ);
if (ret)
return ERR_PTR(ret);
diff --git a/kernel/bpf/local_storage.c b/kernel/bpf/local_storage.c
index 098cf336fae6..e90d9f63edc5 100644
--- a/kernel/bpf/local_storage.c
+++ b/kernel/bpf/local_storage.c
@@ -151,7 +151,7 @@ static int cgroup_storage_update_elem(struct bpf_map *map, void *key,
return -EINVAL;
if (unlikely((flags & BPF_F_LOCK) &&
- !map_value_has_spin_lock(map)))
+ !btf_record_has_field(map->record, BPF_SPIN_LOCK)))
return -EINVAL;
storage = cgroup_storage_lookup((struct bpf_cgroup_storage_map *)map,
diff --git a/kernel/bpf/map_in_map.c b/kernel/bpf/map_in_map.c
index 135205d0d560..38136ec4e095 100644
--- a/kernel/bpf/map_in_map.c
+++ b/kernel/bpf/map_in_map.c
@@ -12,6 +12,7 @@ struct bpf_map *bpf_map_meta_alloc(int inner_map_ufd)
struct bpf_map *inner_map, *inner_map_meta;
u32 inner_map_meta_size;
struct fd f;
+ int ret;
f = fdget(inner_map_ufd);
inner_map = __bpf_map_get(f);
@@ -20,18 +21,13 @@ struct bpf_map *bpf_map_meta_alloc(int inner_map_ufd)
/* Does not support >1 level map-in-map */
if (inner_map->inner_map_meta) {
- fdput(f);
- return ERR_PTR(-EINVAL);
+ ret = -EINVAL;
+ goto put;
}
if (!inner_map->ops->map_meta_equal) {
- fdput(f);
- return ERR_PTR(-ENOTSUPP);
- }
-
- if (map_value_has_spin_lock(inner_map)) {
- fdput(f);
- return ERR_PTR(-ENOTSUPP);
+ ret = -ENOTSUPP;
+ goto put;
}
inner_map_meta_size = sizeof(*inner_map_meta);
@@ -41,8 +37,8 @@ struct bpf_map *bpf_map_meta_alloc(int inner_map_ufd)
inner_map_meta = kzalloc(inner_map_meta_size, GFP_USER);
if (!inner_map_meta) {
- fdput(f);
- return ERR_PTR(-ENOMEM);
+ ret = -ENOMEM;
+ goto put;
}
inner_map_meta->map_type = inner_map->map_type;
@@ -50,9 +46,33 @@ struct bpf_map *bpf_map_meta_alloc(int inner_map_ufd)
inner_map_meta->value_size = inner_map->value_size;
inner_map_meta->map_flags = inner_map->map_flags;
inner_map_meta->max_entries = inner_map->max_entries;
- inner_map_meta->spin_lock_off = inner_map->spin_lock_off;
- inner_map_meta->timer_off = inner_map->timer_off;
- inner_map_meta->kptr_off_tab = bpf_map_copy_kptr_off_tab(inner_map);
+
+ inner_map_meta->record = btf_record_dup(inner_map->record);
+ if (IS_ERR(inner_map_meta->record)) {
+ /* btf_record_dup returns NULL or valid pointer in case of
+ * invalid/empty/valid, but ERR_PTR in case of errors. During
+ * equality NULL or IS_ERR is equivalent.
+ */
+ ret = PTR_ERR(inner_map_meta->record);
+ goto free;
+ }
+ if (inner_map_meta->record) {
+ struct btf_field_offs *field_offs;
+ /* If btf_record is !IS_ERR_OR_NULL, then field_offs is always
+ * valid.
+ */
+ field_offs = kmemdup(inner_map->field_offs, sizeof(*inner_map->field_offs), GFP_KERNEL | __GFP_NOWARN);
+ if (!field_offs) {
+ ret = -ENOMEM;
+ goto free_rec;
+ }
+ inner_map_meta->field_offs = field_offs;
+ }
+ /* Note: We must use the same BTF, as we also used btf_record_dup above
+ * which relies on BTF being same for both maps, as some members like
+ * record->fields.list_head have pointers like value_rec pointing into
+ * inner_map->btf.
+ */
if (inner_map->btf) {
btf_get(inner_map->btf);
inner_map_meta->btf = inner_map->btf;
@@ -68,11 +88,19 @@ struct bpf_map *bpf_map_meta_alloc(int inner_map_ufd)
fdput(f);
return inner_map_meta;
+free_rec:
+ btf_record_free(inner_map_meta->record);
+free:
+ kfree(inner_map_meta);
+put:
+ fdput(f);
+ return ERR_PTR(ret);
}
void bpf_map_meta_free(struct bpf_map *map_meta)
{
- bpf_map_free_kptr_off_tab(map_meta);
+ kfree(map_meta->field_offs); /* the kmemdup()'d copy made in bpf_map_meta_alloc(), if any */
+ bpf_map_free_record(map_meta);
 btf_put(map_meta->btf);
 kfree(map_meta);
}
@@ -84,9 +112,8 @@ bool bpf_map_meta_equal(const struct bpf_map *meta0,
return meta0->map_type == meta1->map_type &&
meta0->key_size == meta1->key_size &&
meta0->value_size == meta1->value_size &&
- meta0->timer_off == meta1->timer_off &&
meta0->map_flags == meta1->map_flags &&
- bpf_map_equal_kptr_off_tab(meta0, meta1);
+ btf_record_equal(meta0->record, meta1->record);
}
void *bpf_map_fd_get_ptr(struct bpf_map *map,
diff --git a/kernel/bpf/memalloc.c b/kernel/bpf/memalloc.c
index 4901fa1048cd..5fcdacbb8439 100644
--- a/kernel/bpf/memalloc.c
+++ b/kernel/bpf/memalloc.c
@@ -71,7 +71,7 @@ static int bpf_mem_cache_idx(size_t size)
if (size <= 192)
return size_index[(size - 1) / 8] - 1;
- return fls(size - 1) - 1;
+ return fls(size - 1) - 2;
}
#define NUM_CACHES 11
@@ -143,7 +143,7 @@ static void *__alloc(struct bpf_mem_cache *c, int node)
return obj;
}
- return kmalloc_node(c->unit_size, flags, node);
+ return kmalloc_node(c->unit_size, flags | __GFP_ZERO, node);
}
static struct mem_cgroup *get_memcg(const struct bpf_mem_cache *c)
@@ -171,9 +171,24 @@ static void alloc_bulk(struct bpf_mem_cache *c, int cnt, int node)
memcg = get_memcg(c);
old_memcg = set_active_memcg(memcg);
for (i = 0; i < cnt; i++) {
- obj = __alloc(c, node);
- if (!obj)
- break;
+ /*
+ * free_by_rcu is only manipulated by irq work refill_work().
+ * IRQ works on the same CPU are called sequentially, so it is
+ * safe to use __llist_del_first() here. If alloc_bulk() is
+ * invoked by the initial prefill, there will be no running
+ * refill_work(), so __llist_del_first() is fine as well.
+ *
+ * In most cases, objects on free_by_rcu are from the same CPU.
+ * If some objects come from other CPUs, it doesn't incur any
+ * harm because NUMA_NO_NODE means the preference for current
+ * numa node and it is not a guarantee.
+ */
+ obj = __llist_del_first(&c->free_by_rcu);
+ if (!obj) {
+ obj = __alloc(c, node);
+ if (!obj)
+ break;
+ }
if (IS_ENABLED(CONFIG_PREEMPT_RT))
/* In RT irq_work runs in per-cpu kthread, so disable
* interrupts to avoid preemption and interrupts and
@@ -222,9 +237,13 @@ static void __free_rcu(struct rcu_head *head)
static void __free_rcu_tasks_trace(struct rcu_head *head)
{
- struct bpf_mem_cache *c = container_of(head, struct bpf_mem_cache, rcu);
-
- call_rcu(&c->rcu, __free_rcu);
+ /* If RCU Tasks Trace grace period implies RCU grace period,
+ * there is no need to invoke call_rcu(): __free_rcu() is run
+ * directly from this callback. Otherwise it is deferred through
+ * call_rcu() on the same rcu_head.
+ */
+ if (rcu_trace_implies_rcu_gp())
+ __free_rcu(head);
+ else
+ call_rcu(head, __free_rcu);
}
static void enque_to_free(struct bpf_mem_cache *c, void *obj)
@@ -253,8 +272,9 @@ static void do_call_rcu(struct bpf_mem_cache *c)
*/
__llist_add(llnode, &c->waiting_for_gp);
/* Use call_rcu_tasks_trace() to wait for sleepable progs to finish.
- * Then use call_rcu() to wait for normal progs to finish
- * and finally do free_one() on each element.
+ * If RCU Tasks Trace grace period implies RCU grace period, free
+ * these elements directly, else use call_rcu() to wait for normal
+ * progs to finish and finally do free_one() on each element.
*/
call_rcu_tasks_trace(&c->rcu, __free_rcu_tasks_trace);
}
@@ -375,7 +395,8 @@ int bpf_mem_alloc_init(struct bpf_mem_alloc *ma, int size, bool percpu)
unit_size = size;
#ifdef CONFIG_MEMCG_KMEM
- objcg = get_obj_cgroup_from_current();
+ if (memcg_bpf_enabled())
+ objcg = get_obj_cgroup_from_current();
#endif
for_each_possible_cpu(cpu) {
c = per_cpu_ptr(pc, cpu);
@@ -444,9 +465,17 @@ static void free_mem_alloc(struct bpf_mem_alloc *ma)
{
/* waiting_for_gp lists was drained, but __free_rcu might
* still execute. Wait for it now before we freeing percpu caches.
+ *
+ * rcu_barrier_tasks_trace() doesn't imply synchronize_rcu_tasks_trace(),
+ * but rcu_barrier_tasks_trace() and rcu_barrier() below are only used
+ * to wait for the pending __free_rcu_tasks_trace() and __free_rcu(),
+ * so if call_rcu(head, __free_rcu) is skipped due to
+ * rcu_trace_implies_rcu_gp(), it will be OK to skip rcu_barrier() by
+ * using rcu_trace_implies_rcu_gp() as well.
*/
rcu_barrier_tasks_trace();
- rcu_barrier();
+ if (!rcu_trace_implies_rcu_gp())
+ rcu_barrier();
free_mem_alloc_no_barrier(ma);
}
diff --git a/kernel/bpf/offload.c b/kernel/bpf/offload.c
index 13e4efc971e6..0c85e06f7ea7 100644
--- a/kernel/bpf/offload.c
+++ b/kernel/bpf/offload.c
@@ -41,7 +41,7 @@ struct bpf_offload_dev {
struct bpf_offload_netdev {
struct rhash_head l;
struct net_device *netdev;
- struct bpf_offload_dev *offdev;
+ struct bpf_offload_dev *offdev; /* NULL when bound-only */
struct list_head progs;
struct list_head maps;
struct list_head offdev_netdevs;
@@ -56,7 +56,6 @@ static const struct rhashtable_params offdevs_params = {
};
static struct rhashtable offdevs;
-static bool offdevs_inited;
static int bpf_dev_offload_check(struct net_device *netdev)
{
@@ -72,58 +71,218 @@ bpf_offload_find_netdev(struct net_device *netdev)
{
lockdep_assert_held(&bpf_devs_lock);
- if (!offdevs_inited)
- return NULL;
return rhashtable_lookup_fast(&offdevs, &netdev, offdevs_params);
}
-int bpf_prog_offload_init(struct bpf_prog *prog, union bpf_attr *attr)
+static int __bpf_offload_dev_netdev_register(struct bpf_offload_dev *offdev,
+ struct net_device *netdev)
{
 struct bpf_offload_netdev *ondev;
- struct bpf_prog_offload *offload;
 int err;
- if (attr->prog_type != BPF_PROG_TYPE_SCHED_CLS &&
- attr->prog_type != BPF_PROG_TYPE_XDP)
- return -EINVAL;
+ ondev = kzalloc(sizeof(*ondev), GFP_KERNEL);
+ if (!ondev)
+ return -ENOMEM;
- if (attr->prog_flags)
- return -EINVAL;
+ ondev->netdev = netdev;
+ ondev->offdev = offdev; /* NULL for a bound-only entry */
+ INIT_LIST_HEAD(&ondev->progs);
+ INIT_LIST_HEAD(&ondev->maps);
+
+ err = rhashtable_insert_fast(&offdevs, &ondev->l, offdevs_params);
+ if (err) {
+ netdev_warn(netdev, "failed to register for BPF offload\n");
+ goto err_free;
+ }
+
+ if (offdev) /* bound-only entries are not linked onto any offload device's netdev list */
+ list_add(&ondev->offdev_netdevs, &offdev->netdevs);
+ return 0;
+
+err_free: /* hashtable insertion failed: the entry was never published */
+ kfree(ondev);
+ return err;
+}
+
+/* Tear down the offload state attached to @prog: let the device destroy its
+ * per-program state when there is any, unlink the entry from its list, free
+ * it and clear prog->aux->offload.
+ */
+static void __bpf_prog_offload_destroy(struct bpf_prog *prog)
+{
+	struct bpf_prog_offload *state = prog->aux->offload;
+
+	if (state->dev_state)
+		state->offdev->ops->destroy(prog);
+
+	list_del_init(&state->offloads);
+	kfree(state);
+
+	prog->aux->offload = NULL;
+}
+
+/* Issue the map-related command @cmd for @offmap through the ndo_bpf()
+ * callback of the netdev the map is attached to. RTNL must be held.
+ */
+static int bpf_map_offload_ndo(struct bpf_offloaded_map *offmap,
+			       enum bpf_netdev_command cmd)
+{
+	struct net_device *dev;
+	struct netdev_bpf req = {};
+
+	ASSERT_RTNL();
+
+	req.command = cmd;
+	req.offmap = offmap;
+
+	/* Caller must make sure netdev is valid */
+	dev = offmap->netdev;
+	return dev->netdev_ops->ndo_bpf(dev, &req);
+}
+
+/* Ask the device to free @offmap (warning if that fails), then retire the
+ * map's id, unlink it from its netdev entry and drop the netdev pointer.
+ */
+static void __bpf_map_offload_destroy(struct bpf_offloaded_map *offmap)
+{
+	int err = bpf_map_offload_ndo(offmap, BPF_OFFLOAD_MAP_FREE);
+
+	WARN_ON(err);
+	/* Make sure BPF_MAP_GET_NEXT_ID can't find this dead map */
+	bpf_map_free_id(&offmap->map);
+	list_del_init(&offmap->offloads);
+	offmap->netdev = NULL;
+}
+
+static void __bpf_offload_dev_netdev_unregister(struct bpf_offload_dev *offdev,
+ struct net_device *netdev)
+{
+ struct bpf_offload_netdev *ondev, *altdev = NULL;
+ struct bpf_offloaded_map *offmap, *mtmp;
+ struct bpf_prog_offload *offload, *ptmp;
+
+ ASSERT_RTNL();
+
+ ondev = rhashtable_lookup_fast(&offdevs, &netdev, offdevs_params);
+ if (WARN_ON(!ondev))
+ return;
+
+ WARN_ON(rhashtable_remove_fast(&offdevs, &ondev->l, offdevs_params));
+
+ /* Try to move the objects to another netdev of the device. With
+ * offdev == NULL (bound-only entry) there is no such device, so
+ * altdev stays NULL and everything is destroyed below.
+ */
+ if (offdev) {
+ list_del(&ondev->offdev_netdevs);
+ altdev = list_first_entry_or_null(&offdev->netdevs,
+ struct bpf_offload_netdev,
+ offdev_netdevs);
+ }
+
+ if (altdev) { /* re-point progs and maps at the sibling netdev, then splice the lists over */
+ list_for_each_entry(offload, &ondev->progs, offloads)
+ offload->netdev = altdev->netdev;
+ list_splice_init(&ondev->progs, &altdev->progs);
+
+ list_for_each_entry(offmap, &ondev->maps, offloads)
+ offmap->netdev = altdev->netdev;
+ list_splice_init(&ondev->maps, &altdev->maps);
+ } else { /* nowhere to move them: destroy; _safe iteration because entries unlink themselves */
+ list_for_each_entry_safe(offload, ptmp, &ondev->progs, offloads)
+ __bpf_prog_offload_destroy(offload->prog);
+ list_for_each_entry_safe(offmap, mtmp, &ondev->maps, offloads)
+ __bpf_map_offload_destroy(offmap);
+ }
+
+ WARN_ON(!list_empty(&ondev->progs));
+ WARN_ON(!list_empty(&ondev->maps));
+ kfree(ondev);
+}
+
+static int __bpf_prog_dev_bound_init(struct bpf_prog *prog, struct net_device *netdev)
+{
+ struct bpf_offload_netdev *ondev;
+ struct bpf_prog_offload *offload;
+ int err;
 offload = kzalloc(sizeof(*offload), GFP_USER);
 if (!offload)
 return -ENOMEM;
 offload->prog = prog;
+ offload->netdev = netdev;
- offload->netdev = dev_get_by_index(current->nsproxy->net_ns,
- attr->prog_ifindex);
- err = bpf_dev_offload_check(offload->netdev);
- if (err)
- goto err_maybe_put;
-
- down_write(&bpf_devs_lock);
 ondev = bpf_offload_find_netdev(offload->netdev);
 if (!ondev) {
- err = -EINVAL;
- goto err_unlock;
+ if (bpf_prog_is_offloaded(prog->aux)) { /* an offloaded prog needs an already-registered netdev */
+ err = -EINVAL;
+ goto err_free;
+ }
+
+ /* When only binding to the device, explicitly
+ * create an entry in the hashtable. offdev is passed
+ * as NULL, which marks the entry as bound-only.
+ */
+ err = __bpf_offload_dev_netdev_register(NULL, offload->netdev);
+ if (err)
+ goto err_free;
+ ondev = bpf_offload_find_netdev(offload->netdev); /* look up the entry that was just inserted */
 }
 offload->offdev = ondev->offdev;
 prog->aux->offload = offload;
 list_add_tail(&offload->offloads, &ondev->progs);
- dev_put(offload->netdev);
- up_write(&bpf_devs_lock);
 return 0;
-err_unlock:
- up_write(&bpf_devs_lock);
-err_maybe_put:
- if (offload->netdev)
- dev_put(offload->netdev);
+err_free:
 kfree(offload);
 return err;
}
+/* Bind @prog to the netdev named by attr->prog_ifindex (resolved in the
+ * current task's network namespace).
+ *
+ * Only SCHED_CLS and XDP programs are accepted, BPF_F_XDP_DEV_BOUND_ONLY is
+ * the only flag allowed, and that flag is rejected for SCHED_CLS. Offload is
+ * requested unless the bound-only flag is set. Returns 0 or a negative errno.
+ */
+int bpf_prog_dev_bound_init(struct bpf_prog *prog, union bpf_attr *attr)
+{
+	const bool bound_only = !!(attr->prog_flags & BPF_F_XDP_DEV_BOUND_ONLY);
+	struct net_device *dev;
+	int ret;
+
+	if (attr->prog_type != BPF_PROG_TYPE_SCHED_CLS &&
+	    attr->prog_type != BPF_PROG_TYPE_XDP)
+		return -EINVAL;
+
+	if (attr->prog_flags & ~BPF_F_XDP_DEV_BOUND_ONLY)
+		return -EINVAL;
+
+	if (bound_only && attr->prog_type == BPF_PROG_TYPE_SCHED_CLS)
+		return -EINVAL;
+
+	dev = dev_get_by_index(current->nsproxy->net_ns, attr->prog_ifindex);
+	if (!dev)
+		return -EINVAL;
+
+	ret = bpf_dev_offload_check(dev);
+	if (!ret) {
+		prog->aux->offload_requested = !bound_only;
+
+		down_write(&bpf_devs_lock);
+		ret = __bpf_prog_dev_bound_init(prog, dev);
+		up_write(&bpf_devs_lock);
+	}
+
+	/* Drop the reference taken by dev_get_by_index() on every path. */
+	dev_put(dev);
+	return ret;
+}
+
+/* Make @new_prog device-bound to the same netdev as @old_prog.
+ *
+ * Returns 0 without doing anything when @old_prog is not device-bound.
+ * An offloaded @old_prog is rejected with -EINVAL, and so is one whose
+ * offload state is already gone. Otherwise the dev_bound and
+ * offload_requested flags are copied and the binding is set up under
+ * bpf_devs_lock.
+ */
+int bpf_prog_dev_bound_inherit(struct bpf_prog *new_prog, struct bpf_prog *old_prog)
+{
+	struct bpf_prog_offload *old_offload;
+	int ret = -EINVAL;
+
+	if (!bpf_prog_is_dev_bound(old_prog->aux))
+		return 0;
+	if (bpf_prog_is_offloaded(old_prog->aux))
+		return -EINVAL;
+
+	new_prog->aux->dev_bound = old_prog->aux->dev_bound;
+	new_prog->aux->offload_requested = old_prog->aux->offload_requested;
+
+	down_write(&bpf_devs_lock);
+	old_offload = old_prog->aux->offload;
+	if (old_offload)
+		ret = __bpf_prog_dev_bound_init(new_prog, old_offload->netdev);
+	up_write(&bpf_devs_lock);
+
+	return ret;
+}
+
int bpf_prog_offload_verifier_prep(struct bpf_prog *prog)
{
struct bpf_prog_offload *offload;
@@ -209,27 +368,25 @@ bpf_prog_offload_remove_insns(struct bpf_verifier_env *env, u32 off, u32 cnt)
up_read(&bpf_devs_lock);
}
-static void __bpf_prog_offload_destroy(struct bpf_prog *prog)
+void bpf_prog_dev_bound_destroy(struct bpf_prog *prog)
{
- struct bpf_prog_offload *offload = prog->aux->offload;
-
- if (offload->dev_state)
- offload->offdev->ops->destroy(prog);
-
- /* Make sure BPF_PROG_GET_NEXT_ID can't find this dead program */
- bpf_prog_free_id(prog, true);
-
- list_del_init(&offload->offloads);
- kfree(offload);
- prog->aux->offload = NULL;
-}
+ struct bpf_offload_netdev *ondev;
+ struct net_device *netdev;
-void bpf_prog_offload_destroy(struct bpf_prog *prog)
-{
+ rtnl_lock(); /* __bpf_offload_dev_netdev_unregister() below asserts RTNL */
 down_write(&bpf_devs_lock);
- if (prog->aux->offload)
+ if (prog->aux->offload) {
+ list_del_init(&prog->aux->offload->offloads); /* NOTE(review): __bpf_prog_offload_destroy() unlinks this entry as well */
+
+ netdev = prog->aux->offload->netdev; /* saved first: the offload struct is freed by the next call */
 __bpf_prog_offload_destroy(prog);
+
+ ondev = bpf_offload_find_netdev(netdev);
+ if (!ondev->offdev && list_empty(&ondev->progs)) /* last prog gone from a bound-only entry: drop the entry too */
+ __bpf_offload_dev_netdev_unregister(NULL, netdev);
+ }
 up_write(&bpf_devs_lock);
+ rtnl_unlock();
}
static int bpf_prog_offload_translate(struct bpf_prog *prog)
@@ -343,22 +500,6 @@ int bpf_prog_offload_info_fill(struct bpf_prog_info *info,
const struct bpf_prog_ops bpf_offload_prog_ops = {
};
-static int bpf_map_offload_ndo(struct bpf_offloaded_map *offmap,
- enum bpf_netdev_command cmd)
-{
- struct netdev_bpf data = {};
- struct net_device *netdev;
-
- ASSERT_RTNL();
-
- data.command = cmd;
- data.offmap = offmap;
- /* Caller must make sure netdev is valid */
- netdev = offmap->netdev;
-
- return netdev->netdev_ops->ndo_bpf(netdev, &data);
-}
-
struct bpf_map *bpf_map_offload_map_alloc(union bpf_attr *attr)
{
struct net *net = current->nsproxy->net_ns;
@@ -408,15 +549,6 @@ err_unlock:
return ERR_PTR(err);
}
-static void __bpf_map_offload_destroy(struct bpf_offloaded_map *offmap)
-{
- WARN_ON(bpf_map_offload_ndo(offmap, BPF_OFFLOAD_MAP_FREE));
- /* Make sure BPF_MAP_GET_NEXT_ID can't find this dead map */
- bpf_map_free_id(&offmap->map, true);
- list_del_init(&offmap->offloads);
- offmap->netdev = NULL;
-}
-
void bpf_map_offload_map_free(struct bpf_map *map)
{
struct bpf_offloaded_map *offmap = map_to_offmap(map);
@@ -576,12 +708,28 @@ bool bpf_offload_dev_match(struct bpf_prog *prog, struct net_device *netdev)
}
EXPORT_SYMBOL_GPL(bpf_offload_dev_match);
+/* Report whether @lhs and @rhs are bound to the same netdev.
+ *
+ * Programs that differ in their offloaded status never match. Otherwise both
+ * must still carry offload state, @lhs's netdev must be set, and the two
+ * netdev pointers must be equal. The comparison runs under bpf_devs_lock
+ * held for reading.
+ */
+bool bpf_prog_dev_bound_match(const struct bpf_prog *lhs, const struct bpf_prog *rhs)
+{
+	const struct bpf_prog_offload *l, *r;
+	bool same = false;
+
+	if (bpf_prog_is_offloaded(lhs->aux) != bpf_prog_is_offloaded(rhs->aux))
+		return false;
+
+	down_read(&bpf_devs_lock);
+	l = lhs->aux->offload;
+	r = rhs->aux->offload;
+	if (l && r && l->netdev)
+		same = l->netdev == r->netdev;
+	up_read(&bpf_devs_lock);
+
+	return same;
+}
+
bool bpf_offload_prog_map_match(struct bpf_prog *prog, struct bpf_map *map)
{
struct bpf_offloaded_map *offmap;
bool ret;
- if (!bpf_map_is_dev_bound(map))
+ if (!bpf_map_is_offloaded(map))
return bpf_map_offload_neutral(map);
offmap = map_to_offmap(map);
@@ -595,32 +743,11 @@ bool bpf_offload_prog_map_match(struct bpf_prog *prog, struct bpf_map *map)
int bpf_offload_dev_netdev_register(struct bpf_offload_dev *offdev,
struct net_device *netdev)
{
- struct bpf_offload_netdev *ondev;
int err;
- ondev = kzalloc(sizeof(*ondev), GFP_KERNEL);
- if (!ondev)
- return -ENOMEM;
-
- ondev->netdev = netdev;
- ondev->offdev = offdev;
- INIT_LIST_HEAD(&ondev->progs);
- INIT_LIST_HEAD(&ondev->maps);
-
down_write(&bpf_devs_lock);
- err = rhashtable_insert_fast(&offdevs, &ondev->l, offdevs_params);
- if (err) {
- netdev_warn(netdev, "failed to register for BPF offload\n");
- goto err_unlock_free;
- }
-
- list_add(&ondev->offdev_netdevs, &offdev->netdevs);
- up_write(&bpf_devs_lock);
- return 0;
-
-err_unlock_free:
+ err = __bpf_offload_dev_netdev_register(offdev, netdev);
up_write(&bpf_devs_lock);
- kfree(ondev);
return err;
}
EXPORT_SYMBOL_GPL(bpf_offload_dev_netdev_register);
@@ -628,43 +755,8 @@ EXPORT_SYMBOL_GPL(bpf_offload_dev_netdev_register);
void bpf_offload_dev_netdev_unregister(struct bpf_offload_dev *offdev,
struct net_device *netdev)
{
- struct bpf_offload_netdev *ondev, *altdev;
- struct bpf_offloaded_map *offmap, *mtmp;
- struct bpf_prog_offload *offload, *ptmp;
-
- ASSERT_RTNL();
-
down_write(&bpf_devs_lock);
- ondev = rhashtable_lookup_fast(&offdevs, &netdev, offdevs_params);
- if (WARN_ON(!ondev))
- goto unlock;
-
- WARN_ON(rhashtable_remove_fast(&offdevs, &ondev->l, offdevs_params));
- list_del(&ondev->offdev_netdevs);
-
- /* Try to move the objects to another netdev of the device */
- altdev = list_first_entry_or_null(&offdev->netdevs,
- struct bpf_offload_netdev,
- offdev_netdevs);
- if (altdev) {
- list_for_each_entry(offload, &ondev->progs, offloads)
- offload->netdev = altdev->netdev;
- list_splice_init(&ondev->progs, &altdev->progs);
-
- list_for_each_entry(offmap, &ondev->maps, offloads)
- offmap->netdev = altdev->netdev;
- list_splice_init(&ondev->maps, &altdev->maps);
- } else {
- list_for_each_entry_safe(offload, ptmp, &ondev->progs, offloads)
- __bpf_prog_offload_destroy(offload->prog);
- list_for_each_entry_safe(offmap, mtmp, &ondev->maps, offloads)
- __bpf_map_offload_destroy(offmap);
- }
-
- WARN_ON(!list_empty(&ondev->progs));
- WARN_ON(!list_empty(&ondev->maps));
- kfree(ondev);
-unlock:
+ __bpf_offload_dev_netdev_unregister(offdev, netdev);
up_write(&bpf_devs_lock);
}
EXPORT_SYMBOL_GPL(bpf_offload_dev_netdev_unregister);
@@ -673,18 +765,6 @@ struct bpf_offload_dev *
bpf_offload_dev_create(const struct bpf_prog_offload_ops *ops, void *priv)
{
struct bpf_offload_dev *offdev;
- int err;
-
- down_write(&bpf_devs_lock);
- if (!offdevs_inited) {
- err = rhashtable_init(&offdevs, &offdevs_params);
- if (err) {
- up_write(&bpf_devs_lock);
- return ERR_PTR(err);
- }
- offdevs_inited = true;
- }
- up_write(&bpf_devs_lock);
offdev = kzalloc(sizeof(*offdev), GFP_KERNEL);
if (!offdev)
@@ -710,3 +790,67 @@ void *bpf_offload_dev_priv(struct bpf_offload_dev *offdev)
return offdev->priv;
}
EXPORT_SYMBOL_GPL(bpf_offload_dev_priv);
+
+void bpf_dev_bound_netdev_unregister(struct net_device *dev)
+{
+ struct bpf_offload_netdev *ondev;
+
+ ASSERT_RTNL();
+
+ down_write(&bpf_devs_lock);
+ ondev = bpf_offload_find_netdev(dev);
+ if (ondev && !ondev->offdev)
+ __bpf_offload_dev_netdev_unregister(NULL, ondev->netdev);
+ up_write(&bpf_devs_lock);
+}
+
+int bpf_dev_bound_kfunc_check(struct bpf_verifier_log *log,
+ struct bpf_prog_aux *prog_aux)
+{
+ if (!bpf_prog_is_dev_bound(prog_aux)) {
+ bpf_log(log, "metadata kfuncs require device-bound program\n");
+ return -EINVAL;
+ }
+
+ if (bpf_prog_is_offloaded(prog_aux)) {
+ bpf_log(log, "metadata kfuncs can't be offloaded\n");
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+void *bpf_dev_bound_resolve_kfunc(struct bpf_prog *prog, u32 func_id)
+{
+ const struct xdp_metadata_ops *ops;
+ void *p = NULL;
+
+ /* We don't hold bpf_devs_lock while resolving several
+ * kfuncs and can race with the unregister_netdevice().
+ * We rely on bpf_dev_bound_match() check at attach
+ * to render this program unusable.
+ */
+ down_read(&bpf_devs_lock);
+ if (!prog->aux->offload)
+ goto out;
+
+ ops = prog->aux->offload->netdev->xdp_metadata_ops;
+ if (!ops)
+ goto out;
+
+ if (func_id == bpf_xdp_metadata_kfunc_id(XDP_METADATA_KFUNC_RX_TIMESTAMP))
+ p = ops->xmo_rx_timestamp;
+ else if (func_id == bpf_xdp_metadata_kfunc_id(XDP_METADATA_KFUNC_RX_HASH))
+ p = ops->xmo_rx_hash;
+out:
+ up_read(&bpf_devs_lock);
+
+ return p;
+}
+
+static int __init bpf_offload_init(void)
+{
+ return rhashtable_init(&offdevs, &offdevs_params);
+}
+
+late_initcall(bpf_offload_init);
diff --git a/kernel/bpf/preload/bpf_preload_kern.c b/kernel/bpf/preload/bpf_preload_kern.c
index 5106b5372f0c..b56f9f3314fd 100644
--- a/kernel/bpf/preload/bpf_preload_kern.c
+++ b/kernel/bpf/preload/bpf_preload_kern.c
@@ -3,7 +3,11 @@
#include <linux/init.h>
#include <linux/module.h>
#include "bpf_preload.h"
-#include "iterators/iterators.lskel.h"
+#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
+#include "iterators/iterators.lskel-little-endian.h"
+#else
+#include "iterators/iterators.lskel-big-endian.h"
+#endif
static struct bpf_link *maps_link, *progs_link;
static struct iterators_bpf *skel;
diff --git a/kernel/bpf/preload/iterators/Makefile b/kernel/bpf/preload/iterators/Makefile
index 6762b1260f2f..8937dc6bc8d0 100644
--- a/kernel/bpf/preload/iterators/Makefile
+++ b/kernel/bpf/preload/iterators/Makefile
@@ -35,20 +35,22 @@ endif
.PHONY: all clean
-all: iterators.lskel.h
+all: iterators.lskel-little-endian.h
+
+big: iterators.lskel-big-endian.h
clean:
$(call msg,CLEAN)
$(Q)rm -rf $(OUTPUT) iterators
-iterators.lskel.h: $(OUTPUT)/iterators.bpf.o | $(BPFTOOL)
+iterators.lskel-%.h: $(OUTPUT)/%/iterators.bpf.o | $(BPFTOOL)
$(call msg,GEN-SKEL,$@)
$(Q)$(BPFTOOL) gen skeleton -L $< > $@
-
-$(OUTPUT)/iterators.bpf.o: iterators.bpf.c $(BPFOBJ) | $(OUTPUT)
+$(OUTPUT)/%/iterators.bpf.o: iterators.bpf.c $(BPFOBJ) | $(OUTPUT)
$(call msg,BPF,$@)
- $(Q)$(CLANG) -g -O2 -target bpf $(INCLUDES) \
+ $(Q)mkdir -p $(@D)
+ $(Q)$(CLANG) -g -O2 -target bpf -m$* $(INCLUDES) \
-c $(filter %.c,$^) -o $@ && \
$(LLVM_STRIP) -g $@
diff --git a/kernel/bpf/preload/iterators/README b/kernel/bpf/preload/iterators/README
index 7fd6d39a9ad2..98e7c90ea012 100644
--- a/kernel/bpf/preload/iterators/README
+++ b/kernel/bpf/preload/iterators/README
@@ -1,4 +1,7 @@
WARNING:
-If you change "iterators.bpf.c" do "make -j" in this directory to rebuild "iterators.skel.h".
+If you change "iterators.bpf.c" do "make -j" in this directory to
+rebuild "iterators.lskel-little-endian.h". Then, on a big-endian
+machine, do "make -j big" in this directory to rebuild
+"iterators.lskel-big-endian.h". Commit both resulting headers.
Make sure to have clang 10 installed.
See Documentation/bpf/bpf_devel_QA.rst
diff --git a/kernel/bpf/preload/iterators/iterators.lskel-big-endian.h b/kernel/bpf/preload/iterators/iterators.lskel-big-endian.h
new file mode 100644
index 000000000000..ebdc6c0cdb70
--- /dev/null
+++ b/kernel/bpf/preload/iterators/iterators.lskel-big-endian.h
@@ -0,0 +1,419 @@
+/* SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) */
+/* THIS FILE IS AUTOGENERATED BY BPFTOOL! */
+#ifndef __ITERATORS_BPF_SKEL_H__
+#define __ITERATORS_BPF_SKEL_H__
+
+#include <bpf/skel_internal.h>
+
+struct iterators_bpf {
+ struct bpf_loader_ctx ctx;
+ struct {
+ struct bpf_map_desc rodata;
+ } maps;
+ struct {
+ struct bpf_prog_desc dump_bpf_map;
+ struct bpf_prog_desc dump_bpf_prog;
+ } progs;
+ struct {
+ int dump_bpf_map_fd;
+ int dump_bpf_prog_fd;
+ } links;
+};
+
+static inline int
+iterators_bpf__dump_bpf_map__attach(struct iterators_bpf *skel)
+{
+ int prog_fd = skel->progs.dump_bpf_map.prog_fd;
+ int fd = skel_link_create(prog_fd, 0, BPF_TRACE_ITER);
+
+ if (fd > 0)
+ skel->links.dump_bpf_map_fd = fd;
+ return fd;
+}
+
+static inline int
+iterators_bpf__dump_bpf_prog__attach(struct iterators_bpf *skel)
+{
+ int prog_fd = skel->progs.dump_bpf_prog.prog_fd;
+ int fd = skel_link_create(prog_fd, 0, BPF_TRACE_ITER);
+
+ if (fd > 0)
+ skel->links.dump_bpf_prog_fd = fd;
+ return fd;
+}
+
+static inline int
+iterators_bpf__attach(struct iterators_bpf *skel)
+{
+ int ret = 0;
+
+ ret = ret < 0 ? ret : iterators_bpf__dump_bpf_map__attach(skel);
+ ret = ret < 0 ? ret : iterators_bpf__dump_bpf_prog__attach(skel);
+ return ret < 0 ? ret : 0;
+}
+
+static inline void
+iterators_bpf__detach(struct iterators_bpf *skel)
+{
+ skel_closenz(skel->links.dump_bpf_map_fd);
+ skel_closenz(skel->links.dump_bpf_prog_fd);
+}
+static void
+iterators_bpf__destroy(struct iterators_bpf *skel)
+{
+ if (!skel)
+ return;
+ iterators_bpf__detach(skel);
+ skel_closenz(skel->progs.dump_bpf_map.prog_fd);
+ skel_closenz(skel->progs.dump_bpf_prog.prog_fd);
+ skel_closenz(skel->maps.rodata.map_fd);
+ skel_free(skel);
+}
+static inline struct iterators_bpf *
+iterators_bpf__open(void)
+{
+ struct iterators_bpf *skel;
+
+ skel = skel_alloc(sizeof(*skel));
+ if (!skel)
+ goto cleanup;
+ skel->ctx.sz = (void *)&skel->links - (void *)skel;
+ return skel;
+cleanup:
+ iterators_bpf__destroy(skel);
+ return NULL;
+}
+
+static inline int
+iterators_bpf__load(struct iterators_bpf *skel)
+{
+ struct bpf_load_and_run_opts opts = {};
+ int err;
+
+ opts.ctx = (struct bpf_loader_ctx *)skel;
+ opts.data_sz = 6008;
+ opts.data = (void *)"\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\xeb\x9f\x01\0\
+\0\0\0\x18\0\0\0\0\0\0\x04\x1c\0\0\x04\x1c\0\0\x05\x18\0\0\0\0\x02\0\0\0\0\0\0\
+\x02\0\0\0\x01\x04\0\0\x02\0\0\0\x10\0\0\0\x13\0\0\0\x03\0\0\0\0\0\0\0\x18\0\0\
+\0\x04\0\0\0\x40\0\0\0\0\x02\0\0\0\0\0\0\x08\0\0\0\0\x02\0\0\0\0\0\0\x0d\0\0\0\
+\0\x0d\0\0\x01\0\0\0\x06\0\0\0\x1c\0\0\0\x01\0\0\0\x20\x01\0\0\0\0\0\0\x04\x01\
+\0\0\x20\0\0\0\x24\x0c\0\0\x01\0\0\0\x05\0\0\0\xc2\x04\0\0\x03\0\0\0\x18\0\0\0\
+\xd0\0\0\0\x09\0\0\0\0\0\0\0\xd4\0\0\0\x0b\0\0\0\x40\0\0\0\xdf\0\0\0\x0b\0\0\0\
+\x80\0\0\0\0\x02\0\0\0\0\0\0\x0a\0\0\0\xe7\x07\0\0\0\0\0\0\0\0\0\0\xf0\x08\0\0\
+\0\0\0\0\x0c\0\0\0\xf6\x01\0\0\0\0\0\0\x08\0\0\0\x40\0\0\x01\xb3\x04\0\0\x03\0\
+\0\0\x18\0\0\x01\xbb\0\0\0\x0e\0\0\0\0\0\0\x01\xbe\0\0\0\x11\0\0\0\x20\0\0\x01\
+\xc3\0\0\0\x0e\0\0\0\xa0\0\0\x01\xcf\x08\0\0\0\0\0\0\x0f\0\0\x01\xd5\x01\0\0\0\
+\0\0\0\x04\0\0\0\x20\0\0\x01\xe2\x01\0\0\0\0\0\0\x01\x01\0\0\x08\0\0\0\0\x03\0\
+\0\0\0\0\0\0\0\0\0\x10\0\0\0\x12\0\0\0\x10\0\0\x01\xe7\x01\0\0\0\0\0\0\x04\0\0\
+\0\x20\0\0\0\0\x02\0\0\0\0\0\0\x14\0\0\x02\x4b\x04\0\0\x02\0\0\0\x10\0\0\0\x13\
+\0\0\0\x03\0\0\0\0\0\0\x02\x5e\0\0\0\x15\0\0\0\x40\0\0\0\0\x02\0\0\0\0\0\0\x18\
+\0\0\0\0\x0d\0\0\x01\0\0\0\x06\0\0\0\x1c\0\0\0\x13\0\0\x02\x63\x0c\0\0\x01\0\0\
+\0\x16\0\0\x02\xaf\x04\0\0\x01\0\0\0\x08\0\0\x02\xb8\0\0\0\x19\0\0\0\0\0\0\0\0\
+\x02\0\0\0\0\0\0\x1a\0\0\x03\x09\x04\0\0\x06\0\0\0\x38\0\0\x01\xbb\0\0\0\x0e\0\
+\0\0\0\0\0\x01\xbe\0\0\0\x11\0\0\0\x20\0\0\x03\x16\0\0\0\x1b\0\0\0\xc0\0\0\x03\
+\x27\0\0\0\x15\0\0\x01\0\0\0\x03\x30\0\0\0\x1d\0\0\x01\x40\0\0\x03\x3a\0\0\0\
+\x1e\0\0\x01\x80\0\0\0\0\x02\0\0\0\0\0\0\x1c\0\0\0\0\x0a\0\0\0\0\0\0\x10\0\0\0\
+\0\x02\0\0\0\0\0\0\x1f\0\0\0\0\x02\0\0\0\0\0\0\x20\0\0\x03\x84\x04\0\0\x02\0\0\
+\0\x08\0\0\x03\x92\0\0\0\x0e\0\0\0\0\0\0\x03\x9b\0\0\0\x0e\0\0\0\x20\0\0\x03\
+\x3a\x04\0\0\x03\0\0\0\x18\0\0\x03\xa5\0\0\0\x1b\0\0\0\0\0\0\x03\xad\0\0\0\x21\
+\0\0\0\x40\0\0\x03\xb3\0\0\0\x23\0\0\0\x80\0\0\0\0\x02\0\0\0\0\0\0\x22\0\0\0\0\
+\x02\0\0\0\0\0\0\x24\0\0\x03\xb7\x04\0\0\x01\0\0\0\x04\0\0\x03\xc2\0\0\0\x0e\0\
+\0\0\0\0\0\x04\x2b\x04\0\0\x01\0\0\0\x04\0\0\x04\x34\0\0\0\x0e\0\0\0\0\0\0\0\0\
+\x03\0\0\0\0\0\0\0\0\0\0\x1c\0\0\0\x12\0\0\0\x23\0\0\x04\xaa\x0e\0\0\0\0\0\0\
+\x25\0\0\0\0\0\0\0\0\x03\0\0\0\0\0\0\0\0\0\0\x1c\0\0\0\x12\0\0\0\x0e\0\0\x04\
+\xbe\x0e\0\0\0\0\0\0\x27\0\0\0\0\0\0\0\0\x03\0\0\0\0\0\0\0\0\0\0\x1c\0\0\0\x12\
+\0\0\0\x20\0\0\x04\xd4\x0e\0\0\0\0\0\0\x29\0\0\0\0\0\0\0\0\x03\0\0\0\0\0\0\0\0\
+\0\0\x1c\0\0\0\x12\0\0\0\x11\0\0\x04\xe9\x0e\0\0\0\0\0\0\x2b\0\0\0\0\0\0\0\0\
+\x03\0\0\0\0\0\0\0\0\0\0\x10\0\0\0\x12\0\0\0\x04\0\0\x05\0\x0e\0\0\0\0\0\0\x2d\
+\0\0\0\x01\0\0\x05\x08\x0f\0\0\x04\0\0\0\x62\0\0\0\x26\0\0\0\0\0\0\0\x23\0\0\0\
+\x28\0\0\0\x23\0\0\0\x0e\0\0\0\x2a\0\0\0\x31\0\0\0\x20\0\0\0\x2c\0\0\0\x51\0\0\
+\0\x11\0\0\x05\x10\x0f\0\0\x01\0\0\0\x04\0\0\0\x2e\0\0\0\0\0\0\0\x04\0\x62\x70\
+\x66\x5f\x69\x74\x65\x72\x5f\x5f\x62\x70\x66\x5f\x6d\x61\x70\0\x6d\x65\x74\x61\
+\0\x6d\x61\x70\0\x63\x74\x78\0\x69\x6e\x74\0\x64\x75\x6d\x70\x5f\x62\x70\x66\
+\x5f\x6d\x61\x70\0\x69\x74\x65\x72\x2f\x62\x70\x66\x5f\x6d\x61\x70\0\x30\x3a\
+\x30\0\x2f\x68\x6f\x6d\x65\x2f\x69\x69\x69\x2f\x6c\x69\x6e\x75\x78\x2d\x6b\x65\
+\x72\x6e\x65\x6c\x2d\x74\x6f\x6f\x6c\x63\x68\x61\x69\x6e\x2f\x73\x72\x63\x2f\
+\x6c\x69\x6e\x75\x78\x2f\x6b\x65\x72\x6e\x65\x6c\x2f\x62\x70\x66\x2f\x70\x72\
+\x65\x6c\x6f\x61\x64\x2f\x69\x74\x65\x72\x61\x74\x6f\x72\x73\x2f\x69\x74\x65\
+\x72\x61\x74\x6f\x72\x73\x2e\x62\x70\x66\x2e\x63\0\x09\x73\x74\x72\x75\x63\x74\
+\x20\x73\x65\x71\x5f\x66\x69\x6c\x65\x20\x2a\x73\x65\x71\x20\x3d\x20\x63\x74\
+\x78\x2d\x3e\x6d\x65\x74\x61\x2d\x3e\x73\x65\x71\x3b\0\x62\x70\x66\x5f\x69\x74\
+\x65\x72\x5f\x6d\x65\x74\x61\0\x73\x65\x71\0\x73\x65\x73\x73\x69\x6f\x6e\x5f\
+\x69\x64\0\x73\x65\x71\x5f\x6e\x75\x6d\0\x73\x65\x71\x5f\x66\x69\x6c\x65\0\x5f\
+\x5f\x75\x36\x34\0\x75\x6e\x73\x69\x67\x6e\x65\x64\x20\x6c\x6f\x6e\x67\x20\x6c\
+\x6f\x6e\x67\0\x30\x3a\x31\0\x09\x73\x74\x72\x75\x63\x74\x20\x62\x70\x66\x5f\
+\x6d\x61\x70\x20\x2a\x6d\x61\x70\x20\x3d\x20\x63\x74\x78\x2d\x3e\x6d\x61\x70\
+\x3b\0\x09\x69\x66\x20\x28\x21\x6d\x61\x70\x29\0\x30\x3a\x32\0\x09\x5f\x5f\x75\
+\x36\x34\x20\x73\x65\x71\x5f\x6e\x75\x6d\x20\x3d\x20\x63\x74\x78\x2d\x3e\x6d\
+\x65\x74\x61\x2d\x3e\x73\x65\x71\x5f\x6e\x75\x6d\x3b\0\x09\x69\x66\x20\x28\x73\
+\x65\x71\x5f\x6e\x75\x6d\x20\x3d\x3d\x20\x30\x29\0\x09\x09\x42\x50\x46\x5f\x53\
+\x45\x51\x5f\x50\x52\x49\x4e\x54\x46\x28\x73\x65\x71\x2c\x20\x22\x20\x20\x69\
+\x64\x20\x6e\x61\x6d\x65\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\
+\x6d\x61\x78\x5f\x65\x6e\x74\x72\x69\x65\x73\x5c\x6e\x22\x29\x3b\0\x62\x70\x66\
+\x5f\x6d\x61\x70\0\x69\x64\0\x6e\x61\x6d\x65\0\x6d\x61\x78\x5f\x65\x6e\x74\x72\
+\x69\x65\x73\0\x5f\x5f\x75\x33\x32\0\x75\x6e\x73\x69\x67\x6e\x65\x64\x20\x69\
+\x6e\x74\0\x63\x68\x61\x72\0\x5f\x5f\x41\x52\x52\x41\x59\x5f\x53\x49\x5a\x45\
+\x5f\x54\x59\x50\x45\x5f\x5f\0\x09\x42\x50\x46\x5f\x53\x45\x51\x5f\x50\x52\x49\
+\x4e\x54\x46\x28\x73\x65\x71\x2c\x20\x22\x25\x34\x75\x20\x25\x2d\x31\x36\x73\
+\x25\x36\x64\x5c\x6e\x22\x2c\x20\x6d\x61\x70\x2d\x3e\x69\x64\x2c\x20\x6d\x61\
+\x70\x2d\x3e\x6e\x61\x6d\x65\x2c\x20\x6d\x61\x70\x2d\x3e\x6d\x61\x78\x5f\x65\
+\x6e\x74\x72\x69\x65\x73\x29\x3b\0\x7d\0\x62\x70\x66\x5f\x69\x74\x65\x72\x5f\
+\x5f\x62\x70\x66\x5f\x70\x72\x6f\x67\0\x70\x72\x6f\x67\0\x64\x75\x6d\x70\x5f\
+\x62\x70\x66\x5f\x70\x72\x6f\x67\0\x69\x74\x65\x72\x2f\x62\x70\x66\x5f\x70\x72\
+\x6f\x67\0\x09\x73\x74\x72\x75\x63\x74\x20\x62\x70\x66\x5f\x70\x72\x6f\x67\x20\
+\x2a\x70\x72\x6f\x67\x20\x3d\x20\x63\x74\x78\x2d\x3e\x70\x72\x6f\x67\x3b\0\x09\
+\x69\x66\x20\x28\x21\x70\x72\x6f\x67\x29\0\x62\x70\x66\x5f\x70\x72\x6f\x67\0\
+\x61\x75\x78\0\x09\x61\x75\x78\x20\x3d\x20\x70\x72\x6f\x67\x2d\x3e\x61\x75\x78\
+\x3b\0\x09\x09\x42\x50\x46\x5f\x53\x45\x51\x5f\x50\x52\x49\x4e\x54\x46\x28\x73\
+\x65\x71\x2c\x20\x22\x20\x20\x69\x64\x20\x6e\x61\x6d\x65\x20\x20\x20\x20\x20\
+\x20\x20\x20\x20\x20\x20\x20\x20\x61\x74\x74\x61\x63\x68\x65\x64\x5c\x6e\x22\
+\x29\x3b\0\x62\x70\x66\x5f\x70\x72\x6f\x67\x5f\x61\x75\x78\0\x61\x74\x74\x61\
+\x63\x68\x5f\x66\x75\x6e\x63\x5f\x6e\x61\x6d\x65\0\x64\x73\x74\x5f\x70\x72\x6f\
+\x67\0\x66\x75\x6e\x63\x5f\x69\x6e\x66\x6f\0\x62\x74\x66\0\x09\x42\x50\x46\x5f\
+\x53\x45\x51\x5f\x50\x52\x49\x4e\x54\x46\x28\x73\x65\x71\x2c\x20\x22\x25\x34\
+\x75\x20\x25\x2d\x31\x36\x73\x20\x25\x73\x20\x25\x73\x5c\x6e\x22\x2c\x20\x61\
+\x75\x78\x2d\x3e\x69\x64\x2c\0\x30\x3a\x34\0\x30\x3a\x35\0\x09\x69\x66\x20\x28\
+\x21\x62\x74\x66\x29\0\x62\x70\x66\x5f\x66\x75\x6e\x63\x5f\x69\x6e\x66\x6f\0\
+\x69\x6e\x73\x6e\x5f\x6f\x66\x66\0\x74\x79\x70\x65\x5f\x69\x64\0\x30\0\x73\x74\
+\x72\x69\x6e\x67\x73\0\x74\x79\x70\x65\x73\0\x68\x64\x72\0\x62\x74\x66\x5f\x68\
+\x65\x61\x64\x65\x72\0\x73\x74\x72\x5f\x6c\x65\x6e\0\x09\x74\x79\x70\x65\x73\
+\x20\x3d\x20\x62\x74\x66\x2d\x3e\x74\x79\x70\x65\x73\x3b\0\x09\x62\x70\x66\x5f\
+\x70\x72\x6f\x62\x65\x5f\x72\x65\x61\x64\x5f\x6b\x65\x72\x6e\x65\x6c\x28\x26\
+\x74\x2c\x20\x73\x69\x7a\x65\x6f\x66\x28\x74\x29\x2c\x20\x74\x79\x70\x65\x73\
+\x20\x2b\x20\x62\x74\x66\x5f\x69\x64\x29\x3b\0\x09\x73\x74\x72\x20\x3d\x20\x62\
+\x74\x66\x2d\x3e\x73\x74\x72\x69\x6e\x67\x73\x3b\0\x62\x74\x66\x5f\x74\x79\x70\
+\x65\0\x6e\x61\x6d\x65\x5f\x6f\x66\x66\0\x09\x6e\x61\x6d\x65\x5f\x6f\x66\x66\
+\x20\x3d\x20\x42\x50\x46\x5f\x43\x4f\x52\x45\x5f\x52\x45\x41\x44\x28\x74\x2c\
+\x20\x6e\x61\x6d\x65\x5f\x6f\x66\x66\x29\x3b\0\x30\x3a\x32\x3a\x30\0\x09\x69\
+\x66\x20\x28\x6e\x61\x6d\x65\x5f\x6f\x66\x66\x20\x3e\x3d\x20\x62\x74\x66\x2d\
+\x3e\x68\x64\x72\x2e\x73\x74\x72\x5f\x6c\x65\x6e\x29\0\x09\x72\x65\x74\x75\x72\
+\x6e\x20\x73\x74\x72\x20\x2b\x20\x6e\x61\x6d\x65\x5f\x6f\x66\x66\x3b\0\x30\x3a\
+\x33\0\x64\x75\x6d\x70\x5f\x62\x70\x66\x5f\x6d\x61\x70\x2e\x5f\x5f\x5f\x66\x6d\
+\x74\0\x64\x75\x6d\x70\x5f\x62\x70\x66\x5f\x6d\x61\x70\x2e\x5f\x5f\x5f\x66\x6d\
+\x74\x2e\x31\0\x64\x75\x6d\x70\x5f\x62\x70\x66\x5f\x70\x72\x6f\x67\x2e\x5f\x5f\
+\x5f\x66\x6d\x74\0\x64\x75\x6d\x70\x5f\x62\x70\x66\x5f\x70\x72\x6f\x67\x2e\x5f\
+\x5f\x5f\x66\x6d\x74\x2e\x32\0\x4c\x49\x43\x45\x4e\x53\x45\0\x2e\x72\x6f\x64\
+\x61\x74\x61\0\x6c\x69\x63\x65\x6e\x73\x65\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\x09\x4c\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x02\0\0\0\x04\0\0\0\x62\0\0\0\
+\x01\0\0\0\x80\0\0\0\0\0\0\0\0\x69\x74\x65\x72\x61\x74\x6f\x72\x2e\x72\x6f\x64\
+\x61\x74\x61\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x2f\0\0\0\0\0\0\0\0\0\0\0\0\x20\
+\x20\x69\x64\x20\x6e\x61\x6d\x65\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\
+\x20\x20\x6d\x61\x78\x5f\x65\x6e\x74\x72\x69\x65\x73\x0a\0\x25\x34\x75\x20\x25\
+\x2d\x31\x36\x73\x25\x36\x64\x0a\0\x20\x20\x69\x64\x20\x6e\x61\x6d\x65\x20\x20\
+\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x61\x74\x74\x61\x63\x68\x65\x64\
+\x0a\0\x25\x34\x75\x20\x25\x2d\x31\x36\x73\x20\x25\x73\x20\x25\x73\x0a\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\x47\x50\x4c\0\0\0\0\0\x79\x21\0\0\0\0\0\0\x79\x62\0\0\
+\0\0\0\0\x79\x71\0\x08\0\0\0\0\x15\x70\0\x1a\0\0\0\0\x79\x12\0\x10\0\0\0\0\x55\
+\x10\0\x08\0\0\0\0\xbf\x4a\0\0\0\0\0\0\x07\x40\0\0\xff\xff\xff\xe8\xbf\x16\0\0\
+\0\0\0\0\x18\x26\0\0\0\0\0\0\0\0\0\0\0\0\0\0\xb7\x30\0\0\0\0\0\x23\xb7\x50\0\0\
+\0\0\0\0\x85\0\0\0\0\0\0\x7e\x61\x17\0\0\0\0\0\0\x7b\xa1\xff\xe8\0\0\0\0\xb7\
+\x10\0\0\0\0\0\x04\xbf\x27\0\0\0\0\0\0\x0f\x21\0\0\0\0\0\0\x7b\xa2\xff\xf0\0\0\
+\0\0\x61\x17\0\x14\0\0\0\0\x7b\xa1\xff\xf8\0\0\0\0\xbf\x4a\0\0\0\0\0\0\x07\x40\
+\0\0\xff\xff\xff\xe8\xbf\x16\0\0\0\0\0\0\x18\x26\0\0\0\0\0\0\0\0\0\0\0\0\0\x23\
+\xb7\x30\0\0\0\0\0\x0e\xb7\x50\0\0\0\0\0\x18\x85\0\0\0\0\0\0\x7e\xb7\0\0\0\0\0\
+\0\0\x95\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x07\0\0\0\0\0\0\0\x42\0\0\0\x9a\0\x01\x3c\
+\x1e\0\0\0\x01\0\0\0\x42\0\0\0\x9a\0\x01\x3c\x24\0\0\0\x02\0\0\0\x42\0\0\x01\
+\x0d\0\x01\x44\x1d\0\0\0\x03\0\0\0\x42\0\0\x01\x2e\0\x01\x4c\x06\0\0\0\x04\0\0\
+\0\x42\0\0\x01\x3d\0\x01\x40\x1d\0\0\0\x05\0\0\0\x42\0\0\x01\x62\0\x01\x58\x06\
+\0\0\0\x07\0\0\0\x42\0\0\x01\x75\0\x01\x5c\x03\0\0\0\x0e\0\0\0\x42\0\0\x01\xfb\
+\0\x01\x64\x02\0\0\0\x1e\0\0\0\x42\0\0\x02\x49\0\x01\x6c\x01\0\0\0\0\0\0\0\x02\
+\0\0\0\x3e\0\0\0\0\0\0\0\x08\0\0\0\x08\0\0\0\x3e\0\0\0\0\0\0\0\x10\0\0\0\x02\0\
+\0\x01\x09\0\0\0\0\0\0\0\x20\0\0\0\x08\0\0\x01\x39\0\0\0\0\0\0\0\x70\0\0\0\x0d\
+\0\0\0\x3e\0\0\0\0\0\0\0\x80\0\0\0\x0d\0\0\x01\x09\0\0\0\0\0\0\0\xa0\0\0\0\x0d\
+\0\0\x01\x39\0\0\0\0\0\0\0\x1a\0\0\0\x20\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x64\x75\x6d\x70\x5f\x62\x70\x66\x5f\
+\x6d\x61\x70\0\0\0\0\0\0\0\0\0\0\0\x1c\0\0\0\0\0\0\0\x08\0\0\0\0\0\0\0\0\0\0\0\
+\x01\0\0\0\x10\0\0\0\0\0\0\0\0\0\0\0\x09\0\0\0\x01\0\0\0\0\0\0\0\x07\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x10\0\0\0\0\x62\x70\x66\x5f\x69\x74\x65\x72\x5f\
+\x62\x70\x66\x5f\x6d\x61\x70\0\0\0\0\0\0\0\0\x47\x50\x4c\0\0\0\0\0\x79\x21\0\0\
+\0\0\0\0\x79\x62\0\0\0\0\0\0\x79\x11\0\x08\0\0\0\0\x15\x10\0\x3b\0\0\0\0\x79\
+\x71\0\0\0\0\0\0\x79\x12\0\x10\0\0\0\0\x55\x10\0\x08\0\0\0\0\xbf\x4a\0\0\0\0\0\
+\0\x07\x40\0\0\xff\xff\xff\xd0\xbf\x16\0\0\0\0\0\0\x18\x26\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\x31\xb7\x30\0\0\0\0\0\x20\xb7\x50\0\0\0\0\0\0\x85\0\0\0\0\0\0\x7e\x7b\
+\xa6\xff\xc8\0\0\0\0\x61\x17\0\0\0\0\0\0\x7b\xa1\xff\xd0\0\0\0\0\xb7\x30\0\0\0\
+\0\0\x04\xbf\x97\0\0\0\0\0\0\x0f\x93\0\0\0\0\0\0\x79\x17\0\x28\0\0\0\0\x79\x87\
+\0\x30\0\0\0\0\x15\x80\0\x18\0\0\0\0\xb7\x20\0\0\0\0\0\0\x0f\x12\0\0\0\0\0\0\
+\x61\x11\0\x04\0\0\0\0\x79\x38\0\x08\0\0\0\0\x67\x10\0\0\0\0\0\x03\x0f\x31\0\0\
+\0\0\0\0\x79\x68\0\0\0\0\0\0\xbf\x1a\0\0\0\0\0\0\x07\x10\0\0\xff\xff\xff\xf8\
+\xb7\x20\0\0\0\0\0\x08\x85\0\0\0\0\0\0\x71\xb7\x10\0\0\0\0\0\0\x79\x3a\xff\xf8\
+\0\0\0\0\x0f\x31\0\0\0\0\0\0\xbf\x1a\0\0\0\0\0\0\x07\x10\0\0\xff\xff\xff\xf4\
+\xb7\x20\0\0\0\0\0\x04\x85\0\0\0\0\0\0\x71\xb7\x30\0\0\0\0\0\x04\x61\x1a\xff\
+\xf4\0\0\0\0\x61\x28\0\x10\0\0\0\0\x3d\x12\0\x02\0\0\0\0\x0f\x61\0\0\0\0\0\0\
+\xbf\x96\0\0\0\0\0\0\x7b\xa9\xff\xd8\0\0\0\0\x79\x17\0\x18\0\0\0\0\x7b\xa1\xff\
+\xe0\0\0\0\0\x79\x17\0\x20\0\0\0\0\x79\x11\0\0\0\0\0\0\x0f\x13\0\0\0\0\0\0\x7b\
+\xa1\xff\xe8\0\0\0\0\xbf\x4a\0\0\0\0\0\0\x07\x40\0\0\xff\xff\xff\xd0\x79\x1a\
+\xff\xc8\0\0\0\0\x18\x26\0\0\0\0\0\0\0\0\0\0\0\0\0\x51\xb7\x30\0\0\0\0\0\x11\
+\xb7\x50\0\0\0\0\0\x20\x85\0\0\0\0\0\0\x7e\xb7\0\0\0\0\0\0\0\x95\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\x17\0\0\0\0\0\0\0\x42\0\0\0\x9a\0\x01\x80\x1e\0\0\0\x01\0\0\0\
+\x42\0\0\0\x9a\0\x01\x80\x24\0\0\0\x02\0\0\0\x42\0\0\x02\x7f\0\x01\x88\x1f\0\0\
+\0\x03\0\0\0\x42\0\0\x02\xa3\0\x01\x94\x06\0\0\0\x04\0\0\0\x42\0\0\x02\xbc\0\
+\x01\xa0\x0e\0\0\0\x05\0\0\0\x42\0\0\x01\x3d\0\x01\x84\x1d\0\0\0\x06\0\0\0\x42\
+\0\0\x01\x62\0\x01\xa4\x06\0\0\0\x08\0\0\0\x42\0\0\x02\xce\0\x01\xa8\x03\0\0\0\
+\x10\0\0\0\x42\0\0\x03\x3e\0\x01\xb0\x02\0\0\0\x17\0\0\0\x42\0\0\x03\x79\0\x01\
+\x04\x06\0\0\0\x1a\0\0\0\x42\0\0\x03\x3e\0\x01\xb0\x02\0\0\0\x1b\0\0\0\x42\0\0\
+\x03\xca\0\x01\x10\x0f\0\0\0\x1c\0\0\0\x42\0\0\x03\xdf\0\x01\x14\x2d\0\0\0\x1e\
+\0\0\0\x42\0\0\x04\x16\0\x01\x0c\x0d\0\0\0\x20\0\0\0\x42\0\0\x03\x3e\0\x01\xb0\
+\x02\0\0\0\x21\0\0\0\x42\0\0\x03\xdf\0\x01\x14\x02\0\0\0\x24\0\0\0\x42\0\0\x04\
+\x3d\0\x01\x18\x0d\0\0\0\x27\0\0\0\x42\0\0\x03\x3e\0\x01\xb0\x02\0\0\0\x28\0\0\
+\0\x42\0\0\x04\x3d\0\x01\x18\x0d\0\0\0\x2b\0\0\0\x42\0\0\x04\x3d\0\x01\x18\x0d\
+\0\0\0\x2c\0\0\0\x42\0\0\x04\x6b\0\x01\x1c\x1b\0\0\0\x2d\0\0\0\x42\0\0\x04\x6b\
+\0\x01\x1c\x06\0\0\0\x2e\0\0\0\x42\0\0\x04\x8e\0\x01\x24\x0d\0\0\0\x30\0\0\0\
+\x42\0\0\x03\x3e\0\x01\xb0\x02\0\0\0\x3f\0\0\0\x42\0\0\x02\x49\0\x01\xc0\x01\0\
+\0\0\0\0\0\0\x14\0\0\0\x3e\0\0\0\0\0\0\0\x08\0\0\0\x08\0\0\0\x3e\0\0\0\0\0\0\0\
+\x10\0\0\0\x14\0\0\x01\x09\0\0\0\0\0\0\0\x20\0\0\0\x18\0\0\0\x3e\0\0\0\0\0\0\0\
+\x28\0\0\0\x08\0\0\x01\x39\0\0\0\0\0\0\0\x80\0\0\0\x1a\0\0\0\x3e\0\0\0\0\0\0\0\
+\x90\0\0\0\x1a\0\0\x01\x09\0\0\0\0\0\0\0\xa8\0\0\0\x1a\0\0\x03\x71\0\0\0\0\0\0\
+\0\xb0\0\0\0\x1a\0\0\x03\x75\0\0\0\0\0\0\0\xc0\0\0\0\x1f\0\0\x03\xa3\0\0\0\0\0\
+\0\0\xd8\0\0\0\x20\0\0\x01\x09\0\0\0\0\0\0\0\xf0\0\0\0\x20\0\0\0\x3e\0\0\0\0\0\
+\0\x01\x18\0\0\0\x24\0\0\0\x3e\0\0\0\0\0\0\x01\x50\0\0\0\x1a\0\0\x01\x09\0\0\0\
+\0\0\0\x01\x60\0\0\0\x20\0\0\x04\x65\0\0\0\0\0\0\x01\x88\0\0\0\x1a\0\0\x01\x39\
+\0\0\0\0\0\0\x01\x98\0\0\0\x1a\0\0\x04\xa6\0\0\0\0\0\0\x01\xa0\0\0\0\x18\0\0\0\
+\x3e\0\0\0\0\0\0\0\x1a\0\0\0\x41\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x64\x75\x6d\x70\x5f\x62\x70\x66\x5f\x70\x72\
+\x6f\x67\0\0\0\0\0\0\0\0\0\0\x1c\0\0\0\0\0\0\0\x08\0\0\0\0\0\0\0\0\0\0\0\x01\0\
+\0\0\x10\0\0\0\0\0\0\0\0\0\0\0\x19\0\0\0\x01\0\0\0\0\0\0\0\x12\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\x10\0\0\0\0\x62\x70\x66\x5f\x69\x74\x65\x72\x5f\x62\x70\
+\x66\x5f\x70\x72\x6f\x67\0\0\0\0\0\0\0";
+ opts.insns_sz = 2216;
+ opts.insns = (void *)"\
+\xbf\x61\0\0\0\0\0\0\xbf\x1a\0\0\0\0\0\0\x07\x10\0\0\xff\xff\xff\x78\xb7\x20\0\
+\0\0\0\0\x88\xb7\x30\0\0\0\0\0\0\x85\0\0\0\0\0\0\x71\x05\0\0\x14\0\0\0\0\x61\
+\x1a\xff\x78\0\0\0\0\xd5\x10\0\x01\0\0\0\0\x85\0\0\0\0\0\0\xa8\x61\x1a\xff\x7c\
+\0\0\0\0\xd5\x10\0\x01\0\0\0\0\x85\0\0\0\0\0\0\xa8\x61\x1a\xff\x80\0\0\0\0\xd5\
+\x10\0\x01\0\0\0\0\x85\0\0\0\0\0\0\xa8\x61\x1a\xff\x84\0\0\0\0\xd5\x10\0\x01\0\
+\0\0\0\x85\0\0\0\0\0\0\xa8\x18\x06\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x61\x10\0\0\0\0\
+\0\0\xd5\x10\0\x02\0\0\0\0\xbf\x91\0\0\0\0\0\0\x85\0\0\0\0\0\0\xa8\xbf\x07\0\0\
+\0\0\0\0\x95\0\0\0\0\0\0\0\x61\x06\0\x08\0\0\0\0\x18\x16\0\0\0\0\0\0\0\0\0\0\0\
+\0\x0e\x68\x63\x10\0\0\0\0\0\0\x61\x06\0\x0c\0\0\0\0\x18\x16\0\0\0\0\0\0\0\0\0\
+\0\0\0\x0e\x64\x63\x10\0\0\0\0\0\0\x79\x06\0\x10\0\0\0\0\x18\x16\0\0\0\0\0\0\0\
+\0\0\0\0\0\x0e\x58\x7b\x10\0\0\0\0\0\0\x18\x06\0\0\0\0\0\0\0\0\0\0\0\0\x05\0\
+\x18\x16\0\0\0\0\0\0\0\0\0\0\0\0\x0e\x50\x7b\x10\0\0\0\0\0\0\xb7\x10\0\0\0\0\0\
+\x12\x18\x26\0\0\0\0\0\0\0\0\0\0\0\0\x0e\x50\xb7\x30\0\0\0\0\0\x1c\x85\0\0\0\0\
+\0\0\xa6\xbf\x70\0\0\0\0\0\0\xc5\x70\xff\xd4\0\0\0\0\x63\xa7\xff\x78\0\0\0\0\
+\x61\x0a\xff\x78\0\0\0\0\x18\x16\0\0\0\0\0\0\0\0\0\0\0\0\x0e\xa0\x63\x10\0\0\0\
+\0\0\0\x61\x06\0\x1c\0\0\0\0\x15\0\0\x03\0\0\0\0\x18\x16\0\0\0\0\0\0\0\0\0\0\0\
+\0\x0e\x7c\x63\x10\0\0\0\0\0\0\xb7\x10\0\0\0\0\0\0\x18\x26\0\0\0\0\0\0\0\0\0\0\
+\0\0\x0e\x70\xb7\x30\0\0\0\0\0\x48\x85\0\0\0\0\0\0\xa6\xbf\x70\0\0\0\0\0\0\xc5\
+\x70\xff\xc3\0\0\0\0\x18\x16\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x63\x17\0\0\0\0\0\0\
+\x79\x36\0\x20\0\0\0\0\x15\x30\0\x08\0\0\0\0\x18\x16\0\0\0\0\0\0\0\0\0\0\0\0\
+\x0e\xb8\xb7\x20\0\0\0\0\0\x62\x61\x06\0\x04\0\0\0\0\x45\0\0\x02\0\0\0\x01\x85\
+\0\0\0\0\0\0\x94\x05\0\0\x01\0\0\0\0\x85\0\0\0\0\0\0\x71\x18\x26\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\x61\x02\0\0\0\0\0\0\x18\x16\0\0\0\0\0\0\0\0\0\0\0\0\x0f\x28\x63\
+\x10\0\0\0\0\0\0\x18\x06\0\0\0\0\0\0\0\0\0\0\0\0\x0f\x20\x18\x16\0\0\0\0\0\0\0\
+\0\0\0\0\0\x0f\x30\x7b\x10\0\0\0\0\0\0\x18\x06\0\0\0\0\0\0\0\0\0\0\0\0\x0e\xb8\
+\x18\x16\0\0\0\0\0\0\0\0\0\0\0\0\x0f\x38\x7b\x10\0\0\0\0\0\0\xb7\x10\0\0\0\0\0\
+\x02\x18\x26\0\0\0\0\0\0\0\0\0\0\0\0\x0f\x28\xb7\x30\0\0\0\0\0\x20\x85\0\0\0\0\
+\0\0\xa6\xbf\x70\0\0\0\0\0\0\xc5\x70\xff\x9f\0\0\0\0\x18\x26\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\x61\x02\0\0\0\0\0\0\x18\x16\0\0\0\0\0\0\0\0\0\0\0\0\x0f\x48\x63\x10\
+\0\0\0\0\0\0\xb7\x10\0\0\0\0\0\x16\x18\x26\0\0\0\0\0\0\0\0\0\0\0\0\x0f\x48\xb7\
+\x30\0\0\0\0\0\x04\x85\0\0\0\0\0\0\xa6\xbf\x70\0\0\0\0\0\0\xc5\x70\xff\x92\0\0\
+\0\0\x18\x06\0\0\0\0\0\0\0\0\0\0\0\0\x0f\x50\x18\x16\0\0\0\0\0\0\0\0\0\0\0\0\
+\x11\x70\x7b\x10\0\0\0\0\0\0\x18\x06\0\0\0\0\0\0\0\0\0\0\0\0\x0f\x58\x18\x16\0\
+\0\0\0\0\0\0\0\0\0\0\0\x11\x68\x7b\x10\0\0\0\0\0\0\x18\x06\0\0\0\0\0\0\0\0\0\0\
+\0\0\x10\x58\x18\x16\0\0\0\0\0\0\0\0\0\0\0\0\x11\xb0\x7b\x10\0\0\0\0\0\0\x18\
+\x06\0\0\0\0\0\0\0\0\0\0\0\0\x10\x60\x18\x16\0\0\0\0\0\0\0\0\0\0\0\0\x11\xc0\
+\x7b\x10\0\0\0\0\0\0\x18\x06\0\0\0\0\0\0\0\0\0\0\0\0\x10\xf0\x18\x16\0\0\0\0\0\
+\0\0\0\0\0\0\0\x11\xe0\x7b\x10\0\0\0\0\0\0\x18\x06\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\x18\x16\0\0\0\0\0\0\0\0\0\0\0\0\x11\xd8\x7b\x10\0\0\0\0\0\0\x61\x06\0\x08\0\0\
+\0\0\x18\x16\0\0\0\0\0\0\0\0\0\0\0\0\x11\x78\x63\x10\0\0\0\0\0\0\x61\x06\0\x0c\
+\0\0\0\0\x18\x16\0\0\0\0\0\0\0\0\0\0\0\0\x11\x7c\x63\x10\0\0\0\0\0\0\x79\x06\0\
+\x10\0\0\0\0\x18\x16\0\0\0\0\0\0\0\0\0\0\0\0\x11\x80\x7b\x10\0\0\0\0\0\0\x61\
+\x0a\xff\x78\0\0\0\0\x18\x16\0\0\0\0\0\0\0\0\0\0\0\0\x11\xa8\x63\x10\0\0\0\0\0\
+\0\x18\x16\0\0\0\0\0\0\0\0\0\0\0\0\x11\xf0\xb7\x20\0\0\0\0\0\x11\xb7\x30\0\0\0\
+\0\0\x0c\xb7\x40\0\0\0\0\0\0\x85\0\0\0\0\0\0\xa7\xbf\x70\0\0\0\0\0\0\xc5\x70\
+\xff\x5c\0\0\0\0\x18\x06\0\0\0\0\0\0\0\0\0\0\0\0\x11\x60\x63\x07\0\x6c\0\0\0\0\
+\x77\x70\0\0\0\0\0\x20\x63\x07\0\x70\0\0\0\0\xb7\x10\0\0\0\0\0\x05\x18\x26\0\0\
+\0\0\0\0\0\0\0\0\0\0\x11\x60\xb7\x30\0\0\0\0\0\x8c\x85\0\0\0\0\0\0\xa6\xbf\x70\
+\0\0\0\0\0\0\x18\x06\0\0\0\0\0\0\0\0\0\0\0\0\x11\xd0\x61\x10\0\0\0\0\0\0\xd5\
+\x10\0\x02\0\0\0\0\xbf\x91\0\0\0\0\0\0\x85\0\0\0\0\0\0\xa8\xc5\x70\xff\x4a\0\0\
+\0\0\x63\xa7\xff\x80\0\0\0\0\x18\x06\0\0\0\0\0\0\0\0\0\0\0\0\x12\x08\x18\x16\0\
+\0\0\0\0\0\0\0\0\0\0\0\x16\xe0\x7b\x10\0\0\0\0\0\0\x18\x06\0\0\0\0\0\0\0\0\0\0\
+\0\0\x12\x10\x18\x16\0\0\0\0\0\0\0\0\0\0\0\0\x16\xd8\x7b\x10\0\0\0\0\0\0\x18\
+\x06\0\0\0\0\0\0\0\0\0\0\0\0\x14\x18\x18\x16\0\0\0\0\0\0\0\0\0\0\0\0\x17\x20\
+\x7b\x10\0\0\0\0\0\0\x18\x06\0\0\0\0\0\0\0\0\0\0\0\0\x14\x20\x18\x16\0\0\0\0\0\
+\0\0\0\0\0\0\0\x17\x30\x7b\x10\0\0\0\0\0\0\x18\x06\0\0\0\0\0\0\0\0\0\0\0\0\x15\
+\xb0\x18\x16\0\0\0\0\0\0\0\0\0\0\0\0\x17\x50\x7b\x10\0\0\0\0\0\0\x18\x06\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\x18\x16\0\0\0\0\0\0\0\0\0\0\0\0\x17\x48\x7b\x10\0\0\0\0\
+\0\0\x61\x06\0\x08\0\0\0\0\x18\x16\0\0\0\0\0\0\0\0\0\0\0\0\x16\xe8\x63\x10\0\0\
+\0\0\0\0\x61\x06\0\x0c\0\0\0\0\x18\x16\0\0\0\0\0\0\0\0\0\0\0\0\x16\xec\x63\x10\
+\0\0\0\0\0\0\x79\x06\0\x10\0\0\0\0\x18\x16\0\0\0\0\0\0\0\0\0\0\0\0\x16\xf0\x7b\
+\x10\0\0\0\0\0\0\x61\x0a\xff\x78\0\0\0\0\x18\x16\0\0\0\0\0\0\0\0\0\0\0\0\x17\
+\x18\x63\x10\0\0\0\0\0\0\x18\x16\0\0\0\0\0\0\0\0\0\0\0\0\x17\x60\xb7\x20\0\0\0\
+\0\0\x12\xb7\x30\0\0\0\0\0\x0c\xb7\x40\0\0\0\0\0\0\x85\0\0\0\0\0\0\xa7\xbf\x70\
+\0\0\0\0\0\0\xc5\x70\xff\x13\0\0\0\0\x18\x06\0\0\0\0\0\0\0\0\0\0\0\0\x16\xd0\
+\x63\x07\0\x6c\0\0\0\0\x77\x70\0\0\0\0\0\x20\x63\x07\0\x70\0\0\0\0\xb7\x10\0\0\
+\0\0\0\x05\x18\x26\0\0\0\0\0\0\0\0\0\0\0\0\x16\xd0\xb7\x30\0\0\0\0\0\x8c\x85\0\
+\0\0\0\0\0\xa6\xbf\x70\0\0\0\0\0\0\x18\x06\0\0\0\0\0\0\0\0\0\0\0\0\x17\x40\x61\
+\x10\0\0\0\0\0\0\xd5\x10\0\x02\0\0\0\0\xbf\x91\0\0\0\0\0\0\x85\0\0\0\0\0\0\xa8\
+\xc5\x70\xff\x01\0\0\0\0\x63\xa7\xff\x84\0\0\0\0\x61\x1a\xff\x78\0\0\0\0\xd5\
+\x10\0\x02\0\0\0\0\xbf\x91\0\0\0\0\0\0\x85\0\0\0\0\0\0\xa8\x61\x0a\xff\x80\0\0\
+\0\0\x63\x60\0\x28\0\0\0\0\x61\x0a\xff\x84\0\0\0\0\x63\x60\0\x2c\0\0\0\0\x18\
+\x16\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x61\x01\0\0\0\0\0\0\x63\x60\0\x18\0\0\0\0\xb7\
+\0\0\0\0\0\0\0\x95\0\0\0\0\0\0\0";
+ err = bpf_load_and_run(&opts);
+ if (err < 0)
+ return err;
+ return 0;
+}
+
+static inline struct iterators_bpf *
+iterators_bpf__open_and_load(void)
+{
+ struct iterators_bpf *skel;
+
+ skel = iterators_bpf__open();
+ if (!skel)
+ return NULL;
+ if (iterators_bpf__load(skel)) {
+ iterators_bpf__destroy(skel);
+ return NULL;
+ }
+ return skel;
+}
+
+__attribute__((unused)) static void
+iterators_bpf__assert(struct iterators_bpf *s __attribute__((unused)))
+{
+#ifdef __cplusplus
+#define _Static_assert static_assert
+#endif
+#ifdef __cplusplus
+#undef _Static_assert
+#endif
+}
+
+#endif /* __ITERATORS_BPF_SKEL_H__ */
diff --git a/kernel/bpf/preload/iterators/iterators.lskel.h b/kernel/bpf/preload/iterators/iterators.lskel-little-endian.h
index 70f236a82fe1..70f236a82fe1 100644
--- a/kernel/bpf/preload/iterators/iterators.lskel.h
+++ b/kernel/bpf/preload/iterators/iterators.lskel-little-endian.h
diff --git a/kernel/bpf/ringbuf.c b/kernel/bpf/ringbuf.c
index 9e832acf4692..80f4b4d88aaf 100644
--- a/kernel/bpf/ringbuf.c
+++ b/kernel/bpf/ringbuf.c
@@ -447,7 +447,7 @@ BPF_CALL_3(bpf_ringbuf_reserve, struct bpf_map *, map, u64, size, u64, flags)
const struct bpf_func_proto bpf_ringbuf_reserve_proto = {
.func = bpf_ringbuf_reserve,
- .ret_type = RET_PTR_TO_ALLOC_MEM_OR_NULL,
+ .ret_type = RET_PTR_TO_RINGBUF_MEM_OR_NULL,
.arg1_type = ARG_CONST_MAP_PTR,
.arg2_type = ARG_CONST_ALLOC_SIZE_OR_ZERO,
.arg3_type = ARG_ANYTHING,
@@ -490,7 +490,7 @@ BPF_CALL_2(bpf_ringbuf_submit, void *, sample, u64, flags)
const struct bpf_func_proto bpf_ringbuf_submit_proto = {
.func = bpf_ringbuf_submit,
.ret_type = RET_VOID,
- .arg1_type = ARG_PTR_TO_ALLOC_MEM | OBJ_RELEASE,
+ .arg1_type = ARG_PTR_TO_RINGBUF_MEM | OBJ_RELEASE,
.arg2_type = ARG_ANYTHING,
};
@@ -503,7 +503,7 @@ BPF_CALL_2(bpf_ringbuf_discard, void *, sample, u64, flags)
const struct bpf_func_proto bpf_ringbuf_discard_proto = {
.func = bpf_ringbuf_discard,
.ret_type = RET_VOID,
- .arg1_type = ARG_PTR_TO_ALLOC_MEM | OBJ_RELEASE,
+ .arg1_type = ARG_PTR_TO_RINGBUF_MEM | OBJ_RELEASE,
.arg2_type = ARG_ANYTHING,
};
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 7b373a5e861f..e3fcdc9836a6 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -175,13 +175,13 @@ static void maybe_wait_bpf_programs(struct bpf_map *map)
synchronize_rcu();
}
-static int bpf_map_update_value(struct bpf_map *map, struct fd f, void *key,
- void *value, __u64 flags)
+static int bpf_map_update_value(struct bpf_map *map, struct file *map_file,
+ void *key, void *value, __u64 flags)
{
int err;
/* Need to create a kthread, thus must support schedule */
- if (bpf_map_is_dev_bound(map)) {
+ if (bpf_map_is_offloaded(map)) {
return bpf_map_offload_update_elem(map, key, value, flags);
} else if (map->map_type == BPF_MAP_TYPE_CPUMAP ||
map->map_type == BPF_MAP_TYPE_STRUCT_OPS) {
@@ -190,7 +190,7 @@ static int bpf_map_update_value(struct bpf_map *map, struct fd f, void *key,
map->map_type == BPF_MAP_TYPE_SOCKMAP) {
return sock_map_update_elem_sys(map, key, value, flags);
} else if (IS_FD_PROG_ARRAY(map)) {
- return bpf_fd_array_map_update_elem(map, f.file, key, value,
+ return bpf_fd_array_map_update_elem(map, map_file, key, value,
flags);
}
@@ -205,12 +205,12 @@ static int bpf_map_update_value(struct bpf_map *map, struct fd f, void *key,
flags);
} else if (IS_FD_ARRAY(map)) {
rcu_read_lock();
- err = bpf_fd_array_map_update_elem(map, f.file, key, value,
+ err = bpf_fd_array_map_update_elem(map, map_file, key, value,
flags);
rcu_read_unlock();
} else if (map->map_type == BPF_MAP_TYPE_HASH_OF_MAPS) {
rcu_read_lock();
- err = bpf_fd_htab_map_update_elem(map, f.file, key, value,
+ err = bpf_fd_htab_map_update_elem(map, map_file, key, value,
flags);
rcu_read_unlock();
} else if (map->map_type == BPF_MAP_TYPE_REUSEPORT_SOCKARRAY) {
@@ -238,7 +238,7 @@ static int bpf_map_copy_value(struct bpf_map *map, void *key, void *value,
void *ptr;
int err;
- if (bpf_map_is_dev_bound(map))
+ if (bpf_map_is_offloaded(map))
return bpf_map_offload_lookup_elem(map, key, value);
bpf_disable_instrumentation();
@@ -309,7 +309,7 @@ static void *__bpf_map_area_alloc(u64 size, int numa_node, bool mmapable)
* __GFP_RETRY_MAYFAIL to avoid such situations.
*/
- const gfp_t gfp = __GFP_NOWARN | __GFP_ZERO | __GFP_ACCOUNT;
+ gfp_t gfp = bpf_memcg_flags(__GFP_NOWARN | __GFP_ZERO);
unsigned int flags = 0;
unsigned long align = 1;
void *area;
@@ -390,7 +390,7 @@ static int bpf_map_alloc_id(struct bpf_map *map)
return id > 0 ? 0 : id;
}
-void bpf_map_free_id(struct bpf_map *map, bool do_idr_lock)
+void bpf_map_free_id(struct bpf_map *map)
{
unsigned long flags;
@@ -402,18 +402,12 @@ void bpf_map_free_id(struct bpf_map *map, bool do_idr_lock)
if (!map->id)
return;
- if (do_idr_lock)
- spin_lock_irqsave(&map_idr_lock, flags);
- else
- __acquire(&map_idr_lock);
+ spin_lock_irqsave(&map_idr_lock, flags);
idr_remove(&map_idr, map->id);
map->id = 0;
- if (do_idr_lock)
- spin_unlock_irqrestore(&map_idr_lock, flags);
- else
- __release(&map_idr_lock);
+ spin_unlock_irqrestore(&map_idr_lock, flags);
}
#ifdef CONFIG_MEMCG_KMEM
@@ -424,7 +418,8 @@ static void bpf_map_save_memcg(struct bpf_map *map)
* So we have to check map->objcg for being NULL each time it's
* being used.
*/
- map->objcg = get_obj_cgroup_from_current();
+ if (memcg_bpf_enabled())
+ map->objcg = get_obj_cgroup_from_current();
}
static void bpf_map_release_memcg(struct bpf_map *map)
@@ -470,6 +465,21 @@ void *bpf_map_kzalloc(const struct bpf_map *map, size_t size, gfp_t flags)
return ptr;
}
+void *bpf_map_kvcalloc(struct bpf_map *map, size_t n, size_t size,
+ gfp_t flags)
+{
+ struct mem_cgroup *memcg, *old_memcg;
+ void *ptr;
+
+ memcg = bpf_map_get_memcg(map);
+ old_memcg = set_active_memcg(memcg);
+ ptr = kvcalloc(n, size, flags | __GFP_ACCOUNT);
+ set_active_memcg(old_memcg);
+ mem_cgroup_put(memcg);
+
+ return ptr;
+}
+
void __percpu *bpf_map_alloc_percpu(const struct bpf_map *map, size_t size,
size_t align, gfp_t flags)
{
@@ -495,114 +505,189 @@ static void bpf_map_release_memcg(struct bpf_map *map)
}
#endif
-static int bpf_map_kptr_off_cmp(const void *a, const void *b)
+static int btf_field_cmp(const void *a, const void *b)
{
- const struct bpf_map_value_off_desc *off_desc1 = a, *off_desc2 = b;
+ const struct btf_field *f1 = a, *f2 = b;
- if (off_desc1->offset < off_desc2->offset)
+ if (f1->offset < f2->offset)
return -1;
- else if (off_desc1->offset > off_desc2->offset)
+ else if (f1->offset > f2->offset)
return 1;
return 0;
}
-struct bpf_map_value_off_desc *bpf_map_kptr_off_contains(struct bpf_map *map, u32 offset)
+struct btf_field *btf_record_find(const struct btf_record *rec, u32 offset,
+ enum btf_field_type type)
{
- /* Since members are iterated in btf_find_field in increasing order,
- * offsets appended to kptr_off_tab are in increasing order, so we can
- * do bsearch to find exact match.
- */
- struct bpf_map_value_off *tab;
+ struct btf_field *field;
- if (!map_value_has_kptrs(map))
+ if (IS_ERR_OR_NULL(rec) || !(rec->field_mask & type))
+ return NULL;
+ field = bsearch(&offset, rec->fields, rec->cnt, sizeof(rec->fields[0]), btf_field_cmp);
+ if (!field || !(field->type & type))
return NULL;
- tab = map->kptr_off_tab;
- return bsearch(&offset, tab->off, tab->nr_off, sizeof(tab->off[0]), bpf_map_kptr_off_cmp);
+ return field;
}
-void bpf_map_free_kptr_off_tab(struct bpf_map *map)
+void btf_record_free(struct btf_record *rec)
{
- struct bpf_map_value_off *tab = map->kptr_off_tab;
int i;
- if (!map_value_has_kptrs(map))
+ if (IS_ERR_OR_NULL(rec))
return;
- for (i = 0; i < tab->nr_off; i++) {
- if (tab->off[i].kptr.module)
- module_put(tab->off[i].kptr.module);
- btf_put(tab->off[i].kptr.btf);
+ for (i = 0; i < rec->cnt; i++) {
+ switch (rec->fields[i].type) {
+ case BPF_KPTR_UNREF:
+ case BPF_KPTR_REF:
+ if (rec->fields[i].kptr.module)
+ module_put(rec->fields[i].kptr.module);
+ btf_put(rec->fields[i].kptr.btf);
+ break;
+ case BPF_LIST_HEAD:
+ case BPF_LIST_NODE:
+ case BPF_RB_ROOT:
+ case BPF_RB_NODE:
+ case BPF_SPIN_LOCK:
+ case BPF_TIMER:
+ /* Nothing to release */
+ break;
+ default:
+ WARN_ON_ONCE(1);
+ continue;
+ }
}
- kfree(tab);
- map->kptr_off_tab = NULL;
+ kfree(rec);
}
-struct bpf_map_value_off *bpf_map_copy_kptr_off_tab(const struct bpf_map *map)
+void bpf_map_free_record(struct bpf_map *map)
{
- struct bpf_map_value_off *tab = map->kptr_off_tab, *new_tab;
- int size, i;
+ btf_record_free(map->record);
+ map->record = NULL;
+}
- if (!map_value_has_kptrs(map))
- return ERR_PTR(-ENOENT);
- size = offsetof(struct bpf_map_value_off, off[tab->nr_off]);
- new_tab = kmemdup(tab, size, GFP_KERNEL | __GFP_NOWARN);
- if (!new_tab)
+struct btf_record *btf_record_dup(const struct btf_record *rec)
+{
+ const struct btf_field *fields;
+ struct btf_record *new_rec;
+ int ret, size, i;
+
+ if (IS_ERR_OR_NULL(rec))
+ return NULL;
+ size = offsetof(struct btf_record, fields[rec->cnt]);
+ new_rec = kmemdup(rec, size, GFP_KERNEL | __GFP_NOWARN);
+ if (!new_rec)
return ERR_PTR(-ENOMEM);
- /* Do a deep copy of the kptr_off_tab */
- for (i = 0; i < tab->nr_off; i++) {
- btf_get(tab->off[i].kptr.btf);
- if (tab->off[i].kptr.module && !try_module_get(tab->off[i].kptr.module)) {
- while (i--) {
- if (tab->off[i].kptr.module)
- module_put(tab->off[i].kptr.module);
- btf_put(tab->off[i].kptr.btf);
+ /* Do a deep copy of the btf_record */
+ fields = rec->fields;
+ new_rec->cnt = 0;
+ for (i = 0; i < rec->cnt; i++) {
+ switch (fields[i].type) {
+ case BPF_KPTR_UNREF:
+ case BPF_KPTR_REF:
+ btf_get(fields[i].kptr.btf);
+ if (fields[i].kptr.module && !try_module_get(fields[i].kptr.module)) {
+ ret = -ENXIO;
+ goto free;
}
- kfree(new_tab);
- return ERR_PTR(-ENXIO);
+ break;
+ case BPF_LIST_HEAD:
+ case BPF_LIST_NODE:
+ case BPF_RB_ROOT:
+ case BPF_RB_NODE:
+ case BPF_SPIN_LOCK:
+ case BPF_TIMER:
+ /* Nothing to acquire */
+ break;
+ default:
+ ret = -EFAULT;
+ WARN_ON_ONCE(1);
+ goto free;
}
+ new_rec->cnt++;
}
- return new_tab;
+ return new_rec;
+free:
+ btf_record_free(new_rec);
+ return ERR_PTR(ret);
}
-bool bpf_map_equal_kptr_off_tab(const struct bpf_map *map_a, const struct bpf_map *map_b)
+bool btf_record_equal(const struct btf_record *rec_a, const struct btf_record *rec_b)
{
- struct bpf_map_value_off *tab_a = map_a->kptr_off_tab, *tab_b = map_b->kptr_off_tab;
- bool a_has_kptr = map_value_has_kptrs(map_a), b_has_kptr = map_value_has_kptrs(map_b);
+ bool a_has_fields = !IS_ERR_OR_NULL(rec_a), b_has_fields = !IS_ERR_OR_NULL(rec_b);
int size;
- if (!a_has_kptr && !b_has_kptr)
+ if (!a_has_fields && !b_has_fields)
return true;
- if (a_has_kptr != b_has_kptr)
+ if (a_has_fields != b_has_fields)
return false;
- if (tab_a->nr_off != tab_b->nr_off)
+ if (rec_a->cnt != rec_b->cnt)
return false;
- size = offsetof(struct bpf_map_value_off, off[tab_a->nr_off]);
- return !memcmp(tab_a, tab_b, size);
+ size = offsetof(struct btf_record, fields[rec_a->cnt]);
+ /* btf_parse_fields uses kzalloc to allocate a btf_record, so unused
+ * members are zeroed out. So memcmp is safe to do without worrying
+ * about padding/unused fields.
+ *
+ * While spin_lock, timer, and kptr have no relation to map BTF,
+ * list_head metadata is specific to map BTF, the btf and value_rec
+ * members in particular. btf is the map BTF, while value_rec points to
+ * btf_record in that map BTF.
+ *
+ * So while by default, we don't rely on the map BTF (which the records
+ * were parsed from) matching for both records, which is not backwards
+ * compatible, in case list_head is part of it, we implicitly rely on
+ * that by way of depending on memcmp succeeding for it.
+ */
+ return !memcmp(rec_a, rec_b, size);
}
-/* Caller must ensure map_value_has_kptrs is true. Note that this function can
- * be called on a map value while the map_value is visible to BPF programs, as
- * it ensures the correct synchronization, and we already enforce the same using
- * the bpf_kptr_xchg helper on the BPF program side for referenced kptrs.
- */
-void bpf_map_free_kptrs(struct bpf_map *map, void *map_value)
+void bpf_obj_free_timer(const struct btf_record *rec, void *obj)
{
- struct bpf_map_value_off *tab = map->kptr_off_tab;
- unsigned long *btf_id_ptr;
- int i;
+ if (WARN_ON_ONCE(!btf_record_has_field(rec, BPF_TIMER)))
+ return;
+ bpf_timer_cancel_and_free(obj + rec->timer_off);
+}
- for (i = 0; i < tab->nr_off; i++) {
- struct bpf_map_value_off_desc *off_desc = &tab->off[i];
- unsigned long old_ptr;
+void bpf_obj_free_fields(const struct btf_record *rec, void *obj)
+{
+ const struct btf_field *fields;
+ int i;
- btf_id_ptr = map_value + off_desc->offset;
- if (off_desc->type == BPF_KPTR_UNREF) {
- u64 *p = (u64 *)btf_id_ptr;
+ if (IS_ERR_OR_NULL(rec))
+ return;
+ fields = rec->fields;
+ for (i = 0; i < rec->cnt; i++) {
+ const struct btf_field *field = &fields[i];
+ void *field_ptr = obj + field->offset;
- WRITE_ONCE(*p, 0);
+ switch (fields[i].type) {
+ case BPF_SPIN_LOCK:
+ break;
+ case BPF_TIMER:
+ bpf_timer_cancel_and_free(field_ptr);
+ break;
+ case BPF_KPTR_UNREF:
+ WRITE_ONCE(*(u64 *)field_ptr, 0);
+ break;
+ case BPF_KPTR_REF:
+ field->kptr.dtor((void *)xchg((unsigned long *)field_ptr, 0));
+ break;
+ case BPF_LIST_HEAD:
+ if (WARN_ON_ONCE(rec->spin_lock_off < 0))
+ continue;
+ bpf_list_head_free(field, field_ptr, obj + rec->spin_lock_off);
+ break;
+ case BPF_RB_ROOT:
+ if (WARN_ON_ONCE(rec->spin_lock_off < 0))
+ continue;
+ bpf_rb_root_free(field, field_ptr, obj + rec->spin_lock_off);
+ break;
+ case BPF_LIST_NODE:
+ case BPF_RB_NODE:
+ break;
+ default:
+ WARN_ON_ONCE(1);
continue;
}
- old_ptr = xchg(btf_id_ptr, 0);
- off_desc->kptr.dtor((void *)old_ptr);
}
}
@@ -610,14 +695,24 @@ void bpf_map_free_kptrs(struct bpf_map *map, void *map_value)
static void bpf_map_free_deferred(struct work_struct *work)
{
struct bpf_map *map = container_of(work, struct bpf_map, work);
+ struct btf_field_offs *foffs = map->field_offs;
+ struct btf_record *rec = map->record;
security_bpf_map_free(map);
- kfree(map->off_arr);
bpf_map_release_memcg(map);
- /* implementation dependent freeing, map_free callback also does
- * bpf_map_free_kptr_off_tab, if needed.
- */
+ /* implementation dependent freeing */
map->ops->map_free(map);
+ /* Delay freeing of field_offs and btf_record for maps, as map_free
+ * callback usually needs access to them. It is better to do it here
+ * than require each callback to do the free itself manually.
+ *
+ * Note that the btf_record stashed in map->inner_map_meta->record was
+ * already freed using the map_free callback for map in map case which
+ * eventually calls bpf_map_free_meta, since inner_map_meta is only a
+ * template bpf_map struct used during verification.
+ */
+ kfree(foffs);
+ btf_record_free(rec);
}
static void bpf_map_put_uref(struct bpf_map *map)
@@ -629,13 +724,13 @@ static void bpf_map_put_uref(struct bpf_map *map)
}
/* decrement map refcnt and schedule it for freeing via workqueue
- * (unrelying map implementation ops->map_free() might sleep)
+ * (underlying map implementation ops->map_free() might sleep)
*/
-static void __bpf_map_put(struct bpf_map *map, bool do_idr_lock)
+void bpf_map_put(struct bpf_map *map)
{
if (atomic64_dec_and_test(&map->refcnt)) {
/* bpf_map_free_id() must be called first */
- bpf_map_free_id(map, do_idr_lock);
+ bpf_map_free_id(map);
btf_put(map->btf);
INIT_WORK(&map->work, bpf_map_free_deferred);
/* Avoid spawning kworkers, since they all might contend
@@ -644,11 +739,6 @@ static void __bpf_map_put(struct bpf_map *map, bool do_idr_lock)
queue_work(system_unbound_wq, &map->work);
}
}
-
-void bpf_map_put(struct bpf_map *map)
-{
- __bpf_map_put(map, true);
-}
EXPORT_SYMBOL_GPL(bpf_map_put);
void bpf_map_put_with_uref(struct bpf_map *map)
@@ -778,8 +868,7 @@ static int bpf_map_mmap(struct file *filp, struct vm_area_struct *vma)
struct bpf_map *map = filp->private_data;
int err;
- if (!map->ops->map_mmap || map_value_has_spin_lock(map) ||
- map_value_has_timer(map) || map_value_has_kptrs(map))
+ if (!map->ops->map_mmap || !IS_ERR_OR_NULL(map->record))
return -ENOTSUPP;
if (!(vma->vm_flags & VM_SHARED))
@@ -906,84 +995,6 @@ int map_check_no_btf(const struct bpf_map *map,
return -ENOTSUPP;
}
-static int map_off_arr_cmp(const void *_a, const void *_b, const void *priv)
-{
- const u32 a = *(const u32 *)_a;
- const u32 b = *(const u32 *)_b;
-
- if (a < b)
- return -1;
- else if (a > b)
- return 1;
- return 0;
-}
-
-static void map_off_arr_swap(void *_a, void *_b, int size, const void *priv)
-{
- struct bpf_map *map = (struct bpf_map *)priv;
- u32 *off_base = map->off_arr->field_off;
- u32 *a = _a, *b = _b;
- u8 *sz_a, *sz_b;
-
- sz_a = map->off_arr->field_sz + (a - off_base);
- sz_b = map->off_arr->field_sz + (b - off_base);
-
- swap(*a, *b);
- swap(*sz_a, *sz_b);
-}
-
-static int bpf_map_alloc_off_arr(struct bpf_map *map)
-{
- bool has_spin_lock = map_value_has_spin_lock(map);
- bool has_timer = map_value_has_timer(map);
- bool has_kptrs = map_value_has_kptrs(map);
- struct bpf_map_off_arr *off_arr;
- u32 i;
-
- if (!has_spin_lock && !has_timer && !has_kptrs) {
- map->off_arr = NULL;
- return 0;
- }
-
- off_arr = kmalloc(sizeof(*map->off_arr), GFP_KERNEL | __GFP_NOWARN);
- if (!off_arr)
- return -ENOMEM;
- map->off_arr = off_arr;
-
- off_arr->cnt = 0;
- if (has_spin_lock) {
- i = off_arr->cnt;
-
- off_arr->field_off[i] = map->spin_lock_off;
- off_arr->field_sz[i] = sizeof(struct bpf_spin_lock);
- off_arr->cnt++;
- }
- if (has_timer) {
- i = off_arr->cnt;
-
- off_arr->field_off[i] = map->timer_off;
- off_arr->field_sz[i] = sizeof(struct bpf_timer);
- off_arr->cnt++;
- }
- if (has_kptrs) {
- struct bpf_map_value_off *tab = map->kptr_off_tab;
- u32 *off = &off_arr->field_off[off_arr->cnt];
- u8 *sz = &off_arr->field_sz[off_arr->cnt];
-
- for (i = 0; i < tab->nr_off; i++) {
- *off++ = tab->off[i].offset;
- *sz++ = sizeof(u64);
- }
- off_arr->cnt += tab->nr_off;
- }
-
- if (off_arr->cnt == 1)
- return 0;
- sort_r(off_arr->field_off, off_arr->cnt, sizeof(off_arr->field_off[0]),
- map_off_arr_cmp, map_off_arr_swap, map);
- return 0;
-}
-
static int map_check_btf(struct bpf_map *map, const struct btf *btf,
u32 btf_key_id, u32 btf_value_id)
{
@@ -1006,39 +1017,13 @@ static int map_check_btf(struct bpf_map *map, const struct btf *btf,
if (!value_type || value_size != map->value_size)
return -EINVAL;
- map->spin_lock_off = btf_find_spin_lock(btf, value_type);
-
- if (map_value_has_spin_lock(map)) {
- if (map->map_flags & BPF_F_RDONLY_PROG)
- return -EACCES;
- if (map->map_type != BPF_MAP_TYPE_HASH &&
- map->map_type != BPF_MAP_TYPE_ARRAY &&
- map->map_type != BPF_MAP_TYPE_CGROUP_STORAGE &&
- map->map_type != BPF_MAP_TYPE_SK_STORAGE &&
- map->map_type != BPF_MAP_TYPE_INODE_STORAGE &&
- map->map_type != BPF_MAP_TYPE_TASK_STORAGE)
- return -ENOTSUPP;
- if (map->spin_lock_off + sizeof(struct bpf_spin_lock) >
- map->value_size) {
- WARN_ONCE(1,
- "verifier bug spin_lock_off %d value_size %d\n",
- map->spin_lock_off, map->value_size);
- return -EFAULT;
- }
- }
-
- map->timer_off = btf_find_timer(btf, value_type);
- if (map_value_has_timer(map)) {
- if (map->map_flags & BPF_F_RDONLY_PROG)
- return -EACCES;
- if (map->map_type != BPF_MAP_TYPE_HASH &&
- map->map_type != BPF_MAP_TYPE_LRU_HASH &&
- map->map_type != BPF_MAP_TYPE_ARRAY)
- return -EOPNOTSUPP;
- }
+ map->record = btf_parse_fields(btf, value_type,
+ BPF_SPIN_LOCK | BPF_TIMER | BPF_KPTR | BPF_LIST_HEAD |
+ BPF_RB_ROOT,
+ map->value_size);
+ if (!IS_ERR_OR_NULL(map->record)) {
+ int i;
- map->kptr_off_tab = btf_parse_kptrs(btf, value_type);
- if (map_value_has_kptrs(map)) {
if (!bpf_capable()) {
ret = -EPERM;
goto free_map_tab;
@@ -1047,15 +1032,61 @@ static int map_check_btf(struct bpf_map *map, const struct btf *btf,
ret = -EACCES;
goto free_map_tab;
}
- if (map->map_type != BPF_MAP_TYPE_HASH &&
- map->map_type != BPF_MAP_TYPE_LRU_HASH &&
- map->map_type != BPF_MAP_TYPE_ARRAY &&
- map->map_type != BPF_MAP_TYPE_PERCPU_ARRAY) {
- ret = -EOPNOTSUPP;
- goto free_map_tab;
+ for (i = 0; i < sizeof(map->record->field_mask) * 8; i++) {
+ switch (map->record->field_mask & (1 << i)) {
+ case 0:
+ continue;
+ case BPF_SPIN_LOCK:
+ if (map->map_type != BPF_MAP_TYPE_HASH &&
+ map->map_type != BPF_MAP_TYPE_ARRAY &&
+ map->map_type != BPF_MAP_TYPE_CGROUP_STORAGE &&
+ map->map_type != BPF_MAP_TYPE_SK_STORAGE &&
+ map->map_type != BPF_MAP_TYPE_INODE_STORAGE &&
+ map->map_type != BPF_MAP_TYPE_TASK_STORAGE &&
+ map->map_type != BPF_MAP_TYPE_CGRP_STORAGE) {
+ ret = -EOPNOTSUPP;
+ goto free_map_tab;
+ }
+ break;
+ case BPF_TIMER:
+ if (map->map_type != BPF_MAP_TYPE_HASH &&
+ map->map_type != BPF_MAP_TYPE_LRU_HASH &&
+ map->map_type != BPF_MAP_TYPE_ARRAY) {
+ ret = -EOPNOTSUPP;
+ goto free_map_tab;
+ }
+ break;
+ case BPF_KPTR_UNREF:
+ case BPF_KPTR_REF:
+ if (map->map_type != BPF_MAP_TYPE_HASH &&
+ map->map_type != BPF_MAP_TYPE_LRU_HASH &&
+ map->map_type != BPF_MAP_TYPE_ARRAY &&
+ map->map_type != BPF_MAP_TYPE_PERCPU_ARRAY) {
+ ret = -EOPNOTSUPP;
+ goto free_map_tab;
+ }
+ break;
+ case BPF_LIST_HEAD:
+ case BPF_RB_ROOT:
+ if (map->map_type != BPF_MAP_TYPE_HASH &&
+ map->map_type != BPF_MAP_TYPE_LRU_HASH &&
+ map->map_type != BPF_MAP_TYPE_ARRAY) {
+ ret = -EOPNOTSUPP;
+ goto free_map_tab;
+ }
+ break;
+ default:
+ /* Fail if map_type checks are missing for a field type */
+ ret = -EOPNOTSUPP;
+ goto free_map_tab;
+ }
}
}
+ ret = btf_check_and_fixup_fields(btf, map->record);
+ if (ret < 0)
+ goto free_map_tab;
+
if (map->ops->map_check_btf) {
ret = map->ops->map_check_btf(map, btf, key_type, value_type);
if (ret < 0)
@@ -1064,7 +1095,7 @@ static int map_check_btf(struct bpf_map *map, const struct btf *btf,
return ret;
free_map_tab:
- bpf_map_free_kptr_off_tab(map);
+ bpf_map_free_record(map);
return ret;
}
@@ -1073,6 +1104,7 @@ free_map_tab:
static int map_create(union bpf_attr *attr)
{
int numa_node = bpf_map_attr_numa_node(attr);
+ struct btf_field_offs *foffs;
struct bpf_map *map;
int f_flags;
int err;
@@ -1117,8 +1149,6 @@ static int map_create(union bpf_attr *attr)
mutex_init(&map->freeze_mutex);
spin_lock_init(&map->owner.lock);
- map->spin_lock_off = -EINVAL;
- map->timer_off = -EINVAL;
if (attr->btf_key_type_id || attr->btf_value_type_id ||
/* Even the map's value is a kernel's struct,
* the bpf_prog.o must have BTF to begin with
@@ -1154,13 +1184,17 @@ static int map_create(union bpf_attr *attr)
attr->btf_vmlinux_value_type_id;
}
- err = bpf_map_alloc_off_arr(map);
- if (err)
+
+ foffs = btf_parse_field_offs(map->record);
+ if (IS_ERR(foffs)) {
+ err = PTR_ERR(foffs);
goto free_map;
+ }
+ map->field_offs = foffs;
err = security_bpf_map_alloc(map);
if (err)
- goto free_map_off_arr;
+ goto free_map_field_offs;
err = bpf_map_alloc_id(map);
if (err)
@@ -1184,8 +1218,8 @@ static int map_create(union bpf_attr *attr)
free_map_sec:
security_bpf_map_free(map);
-free_map_off_arr:
- kfree(map->off_arr);
+free_map_field_offs:
+ kfree(map->field_offs);
free_map:
btf_put(map->btf);
map->ops->map_free(map);
@@ -1332,7 +1366,7 @@ static int map_lookup_elem(union bpf_attr *attr)
}
if ((attr->flags & BPF_F_LOCK) &&
- !map_value_has_spin_lock(map)) {
+ !btf_record_has_field(map->record, BPF_SPIN_LOCK)) {
err = -EINVAL;
goto err_put;
}
@@ -1405,7 +1439,7 @@ static int map_update_elem(union bpf_attr *attr, bpfptr_t uattr)
}
if ((attr->flags & BPF_F_LOCK) &&
- !map_value_has_spin_lock(map)) {
+ !btf_record_has_field(map->record, BPF_SPIN_LOCK)) {
err = -EINVAL;
goto err_put;
}
@@ -1423,7 +1457,7 @@ static int map_update_elem(union bpf_attr *attr, bpfptr_t uattr)
goto free_key;
}
- err = bpf_map_update_value(map, f, key, value, attr->flags);
+ err = bpf_map_update_value(map, f.file, key, value, attr->flags);
kvfree(value);
free_key:
@@ -1464,7 +1498,7 @@ static int map_delete_elem(union bpf_attr *attr, bpfptr_t uattr)
goto err_put;
}
- if (bpf_map_is_dev_bound(map)) {
+ if (bpf_map_is_offloaded(map)) {
err = bpf_map_offload_delete_elem(map, key);
goto out;
} else if (IS_FD_PROG_ARRAY(map) ||
@@ -1528,7 +1562,7 @@ static int map_get_next_key(union bpf_attr *attr)
if (!next_key)
goto free_key;
- if (bpf_map_is_dev_bound(map)) {
+ if (bpf_map_is_offloaded(map)) {
err = bpf_map_offload_get_next_key(map, key, next_key);
goto out;
}
@@ -1568,7 +1602,7 @@ int generic_map_delete_batch(struct bpf_map *map,
return -EINVAL;
if ((attr->batch.elem_flags & BPF_F_LOCK) &&
- !map_value_has_spin_lock(map)) {
+ !btf_record_has_field(map->record, BPF_SPIN_LOCK)) {
return -EINVAL;
}
@@ -1586,7 +1620,7 @@ int generic_map_delete_batch(struct bpf_map *map,
map->key_size))
break;
- if (bpf_map_is_dev_bound(map)) {
+ if (bpf_map_is_offloaded(map)) {
err = bpf_map_offload_delete_elem(map, key);
break;
}
@@ -1609,23 +1643,21 @@ int generic_map_delete_batch(struct bpf_map *map,
return err;
}
-int generic_map_update_batch(struct bpf_map *map,
+int generic_map_update_batch(struct bpf_map *map, struct file *map_file,
const union bpf_attr *attr,
union bpf_attr __user *uattr)
{
void __user *values = u64_to_user_ptr(attr->batch.values);
void __user *keys = u64_to_user_ptr(attr->batch.keys);
u32 value_size, cp, max_count;
- int ufd = attr->batch.map_fd;
void *key, *value;
- struct fd f;
int err = 0;
if (attr->batch.elem_flags & ~BPF_F_LOCK)
return -EINVAL;
if ((attr->batch.elem_flags & BPF_F_LOCK) &&
- !map_value_has_spin_lock(map)) {
+ !btf_record_has_field(map->record, BPF_SPIN_LOCK)) {
return -EINVAL;
}
@@ -1645,7 +1677,6 @@ int generic_map_update_batch(struct bpf_map *map,
return -ENOMEM;
}
- f = fdget(ufd); /* bpf_map_do_batch() guarantees ufd is valid */
for (cp = 0; cp < max_count; cp++) {
err = -EFAULT;
if (copy_from_user(key, keys + cp * map->key_size,
@@ -1653,7 +1684,7 @@ int generic_map_update_batch(struct bpf_map *map,
copy_from_user(value, values + cp * value_size, value_size))
break;
- err = bpf_map_update_value(map, f, key, value,
+ err = bpf_map_update_value(map, map_file, key, value,
attr->batch.elem_flags);
if (err)
@@ -1666,7 +1697,6 @@ int generic_map_update_batch(struct bpf_map *map,
kvfree(value);
kvfree(key);
- fdput(f);
return err;
}
@@ -1688,7 +1718,7 @@ int generic_map_lookup_batch(struct bpf_map *map,
return -EINVAL;
if ((attr->batch.elem_flags & BPF_F_LOCK) &&
- !map_value_has_spin_lock(map))
+ !btf_record_has_field(map->record, BPF_SPIN_LOCK))
return -EINVAL;
value_size = bpf_map_value_size(map);
@@ -1810,7 +1840,7 @@ static int map_lookup_and_delete_elem(union bpf_attr *attr)
}
if ((attr->flags & BPF_F_LOCK) &&
- !map_value_has_spin_lock(map)) {
+ !btf_record_has_field(map->record, BPF_SPIN_LOCK)) {
err = -EINVAL;
goto err_put;
}
@@ -1836,7 +1866,7 @@ static int map_lookup_and_delete_elem(union bpf_attr *attr)
map->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
map->map_type == BPF_MAP_TYPE_LRU_HASH ||
map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH) {
- if (!bpf_map_is_dev_bound(map)) {
+ if (!bpf_map_is_offloaded(map)) {
bpf_disable_instrumentation();
rcu_read_lock();
err = map->ops->map_lookup_and_delete_elem(map, key, value, attr->flags);
@@ -1881,8 +1911,7 @@ static int map_freeze(const union bpf_attr *attr)
if (IS_ERR(map))
return PTR_ERR(map);
- if (map->map_type == BPF_MAP_TYPE_STRUCT_OPS ||
- map_value_has_timer(map) || map_value_has_kptrs(map)) {
+ if (map->map_type == BPF_MAP_TYPE_STRUCT_OPS || !IS_ERR_OR_NULL(map->record)) {
fdput(f);
return -ENOTSUPP;
}
@@ -1930,7 +1959,7 @@ static int find_prog_type(enum bpf_prog_type type, struct bpf_prog *prog)
if (!ops)
return -EINVAL;
- if (!bpf_prog_is_dev_bound(prog->aux))
+ if (!bpf_prog_is_offloaded(prog->aux))
prog->aux->ops = ops;
else
prog->aux->ops = &bpf_offload_prog_ops;
@@ -1958,7 +1987,7 @@ static void bpf_audit_prog(const struct bpf_prog *prog, unsigned int op)
return;
if (audit_enabled == AUDIT_OFF)
return;
- if (op == BPF_AUDIT_LOAD)
+ if (!in_irq() && !irqs_disabled())
ctx = audit_context();
ab = audit_log_start(ctx, GFP_ATOMIC, AUDIT_BPF);
if (unlikely(!ab))
@@ -1987,7 +2016,7 @@ static int bpf_prog_alloc_id(struct bpf_prog *prog)
return id > 0 ? 0 : id;
}
-void bpf_prog_free_id(struct bpf_prog *prog, bool do_idr_lock)
+void bpf_prog_free_id(struct bpf_prog *prog)
{
unsigned long flags;
@@ -1999,18 +2028,10 @@ void bpf_prog_free_id(struct bpf_prog *prog, bool do_idr_lock)
if (!prog->aux->id)
return;
- if (do_idr_lock)
- spin_lock_irqsave(&prog_idr_lock, flags);
- else
- __acquire(&prog_idr_lock);
-
+ spin_lock_irqsave(&prog_idr_lock, flags);
idr_remove(&prog_idr, prog->aux->id);
prog->aux->id = 0;
-
- if (do_idr_lock)
- spin_unlock_irqrestore(&prog_idr_lock, flags);
- else
- __release(&prog_idr_lock);
+ spin_unlock_irqrestore(&prog_idr_lock, flags);
}
static void __bpf_prog_put_rcu(struct rcu_head *rcu)
@@ -2053,17 +2074,15 @@ static void bpf_prog_put_deferred(struct work_struct *work)
prog = aux->prog;
perf_event_bpf_event(prog, PERF_BPF_EVENT_PROG_UNLOAD, 0);
bpf_audit_prog(prog, BPF_AUDIT_UNLOAD);
+ bpf_prog_free_id(prog);
__bpf_prog_put_noref(prog, true);
}
-static void __bpf_prog_put(struct bpf_prog *prog, bool do_idr_lock)
+static void __bpf_prog_put(struct bpf_prog *prog)
{
struct bpf_prog_aux *aux = prog->aux;
if (atomic64_dec_and_test(&aux->refcnt)) {
- /* bpf_prog_free_id() must be called first */
- bpf_prog_free_id(prog, do_idr_lock);
-
if (in_irq() || irqs_disabled()) {
INIT_WORK(&aux->work, bpf_prog_put_deferred);
schedule_work(&aux->work);
@@ -2075,7 +2094,7 @@ static void __bpf_prog_put(struct bpf_prog *prog, bool do_idr_lock)
void bpf_prog_put(struct bpf_prog *prog)
{
- __bpf_prog_put(prog, true);
+ __bpf_prog_put(prog);
}
EXPORT_SYMBOL_GPL(bpf_prog_put);
@@ -2117,11 +2136,11 @@ static void bpf_prog_get_stats(const struct bpf_prog *prog,
st = per_cpu_ptr(prog->stats, cpu);
do {
- start = u64_stats_fetch_begin_irq(&st->syncp);
+ start = u64_stats_fetch_begin(&st->syncp);
tnsecs = u64_stats_read(&st->nsecs);
tcnt = u64_stats_read(&st->cnt);
tmisses = u64_stats_read(&st->misses);
- } while (u64_stats_fetch_retry_irq(&st->syncp, start));
+ } while (u64_stats_fetch_retry(&st->syncp, start));
nsecs += tnsecs;
cnt += tcnt;
misses += tmisses;
@@ -2241,7 +2260,7 @@ bool bpf_prog_get_ok(struct bpf_prog *prog,
if (prog->type != *attach_type)
return false;
- if (bpf_prog_is_dev_bound(prog->aux) && !attach_drv)
+ if (bpf_prog_is_offloaded(prog->aux) && !attach_drv)
return false;
return true;
@@ -2477,7 +2496,8 @@ static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr)
BPF_F_TEST_STATE_FREQ |
BPF_F_SLEEPABLE |
BPF_F_TEST_RND_HI32 |
- BPF_F_XDP_HAS_FRAGS))
+ BPF_F_XDP_HAS_FRAGS |
+ BPF_F_XDP_DEV_BOUND_ONLY))
return -EINVAL;
if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) &&
@@ -2561,7 +2581,7 @@ static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr)
prog->aux->attach_btf = attach_btf;
prog->aux->attach_btf_id = attr->attach_btf_id;
prog->aux->dst_prog = dst_prog;
- prog->aux->offload_requested = !!attr->prog_ifindex;
+ prog->aux->dev_bound = !!attr->prog_ifindex;
prog->aux->sleepable = attr->prog_flags & BPF_F_SLEEPABLE;
prog->aux->xdp_has_frags = attr->prog_flags & BPF_F_XDP_HAS_FRAGS;
@@ -2585,7 +2605,14 @@ static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr)
prog->gpl_compatible = is_gpl ? 1 : 0;
if (bpf_prog_is_dev_bound(prog->aux)) {
- err = bpf_prog_offload_init(prog, attr);
+ err = bpf_prog_dev_bound_init(prog, attr);
+ if (err)
+ goto free_prog_sec;
+ }
+
+ if (type == BPF_PROG_TYPE_EXT && dst_prog &&
+ bpf_prog_is_dev_bound(dst_prog->aux)) {
+ err = bpf_prog_dev_bound_inherit(prog, dst_prog);
if (err)
goto free_prog_sec;
}
@@ -3504,9 +3531,9 @@ static int bpf_prog_attach(const union bpf_attr *attr)
case BPF_PROG_TYPE_LSM:
if (ptype == BPF_PROG_TYPE_LSM &&
prog->expected_attach_type != BPF_LSM_CGROUP)
- return -EINVAL;
-
- ret = cgroup_bpf_prog_attach(attr, ptype, prog);
+ ret = -EINVAL;
+ else
+ ret = cgroup_bpf_prog_attach(attr, ptype, prog);
break;
default:
ret = -EINVAL;
@@ -3983,7 +4010,7 @@ static int bpf_prog_get_info_by_fd(struct file *file,
return -EFAULT;
}
- if (bpf_prog_is_dev_bound(prog->aux)) {
+ if (bpf_prog_is_offloaded(prog->aux)) {
err = bpf_prog_offload_info_fill(&info, prog);
if (err)
return err;
@@ -4211,7 +4238,7 @@ static int bpf_map_get_info_by_fd(struct file *file,
}
info.btf_vmlinux_value_type_id = map->btf_vmlinux_value_type_id;
- if (bpf_map_is_dev_bound(map)) {
+ if (bpf_map_is_offloaded(map)) {
err = bpf_map_offload_info_fill(&info, map);
if (err)
return err;
@@ -4460,13 +4487,13 @@ put_file:
#define BPF_MAP_BATCH_LAST_FIELD batch.flags
-#define BPF_DO_BATCH(fn) \
+#define BPF_DO_BATCH(fn, ...) \
do { \
if (!fn) { \
err = -ENOTSUPP; \
goto err_put; \
} \
- err = fn(map, attr, uattr); \
+ err = fn(__VA_ARGS__); \
} while (0)
static int bpf_map_do_batch(const union bpf_attr *attr,
@@ -4500,13 +4527,13 @@ static int bpf_map_do_batch(const union bpf_attr *attr,
}
if (cmd == BPF_MAP_LOOKUP_BATCH)
- BPF_DO_BATCH(map->ops->map_lookup_batch);
+ BPF_DO_BATCH(map->ops->map_lookup_batch, map, attr, uattr);
else if (cmd == BPF_MAP_LOOKUP_AND_DELETE_BATCH)
- BPF_DO_BATCH(map->ops->map_lookup_and_delete_batch);
+ BPF_DO_BATCH(map->ops->map_lookup_and_delete_batch, map, attr, uattr);
else if (cmd == BPF_MAP_UPDATE_BATCH)
- BPF_DO_BATCH(map->ops->map_update_batch);
+ BPF_DO_BATCH(map->ops->map_update_batch, map, f.file, attr, uattr);
else
- BPF_DO_BATCH(map->ops->map_delete_batch);
+ BPF_DO_BATCH(map->ops->map_delete_batch, map, attr, uattr);
err_put:
if (has_write)
bpf_map_write_active_dec(map);
@@ -5133,13 +5160,14 @@ int kern_sys_bpf(int cmd, union bpf_attr *attr, unsigned int size)
run_ctx.bpf_cookie = 0;
run_ctx.saved_run_ctx = NULL;
- if (!__bpf_prog_enter_sleepable(prog, &run_ctx)) {
+ if (!__bpf_prog_enter_sleepable_recur(prog, &run_ctx)) {
/* recursion detected */
bpf_prog_put(prog);
return -EBUSY;
}
attr->test.retval = bpf_prog_run(prog, (void *) (long) attr->test.ctx_in);
- __bpf_prog_exit_sleepable(prog, 0 /* bpf_prog_run does runtime stats */, &run_ctx);
+ __bpf_prog_exit_sleepable_recur(prog, 0 /* bpf_prog_run does runtime stats */,
+ &run_ctx);
bpf_prog_put(prog);
return 0;
#endif
@@ -5304,7 +5332,6 @@ static struct ctl_table bpf_syscall_table[] = {
{
.procname = "bpf_stats_enabled",
.data = &bpf_stats_enabled_key.key,
- .maxlen = sizeof(bpf_stats_enabled_key),
.mode = 0644,
.proc_handler = bpf_stats_handler,
},
diff --git a/kernel/bpf/task_iter.c b/kernel/bpf/task_iter.c
index c2a2182ce570..c4ab9d6cdbe9 100644
--- a/kernel/bpf/task_iter.c
+++ b/kernel/bpf/task_iter.c
@@ -438,6 +438,7 @@ struct bpf_iter_seq_task_vma_info {
*/
struct bpf_iter_seq_task_common common;
struct task_struct *task;
+ struct mm_struct *mm;
struct vm_area_struct *vma;
u32 tid;
unsigned long prev_vm_start;
@@ -456,16 +457,19 @@ task_vma_seq_get_next(struct bpf_iter_seq_task_vma_info *info)
enum bpf_task_vma_iter_find_op op;
struct vm_area_struct *curr_vma;
struct task_struct *curr_task;
+ struct mm_struct *curr_mm;
u32 saved_tid = info->tid;
/* If this function returns a non-NULL vma, it holds a reference to
- * the task_struct, and holds read lock on vma->mm->mmap_lock.
+ * the task_struct, holds a refcount on mm->mm_users, and holds
+ * read lock on vma->mm->mmap_lock.
* If this function returns NULL, it does not hold any reference or
* lock.
*/
if (info->task) {
curr_task = info->task;
curr_vma = info->vma;
+ curr_mm = info->mm;
/* In case of lock contention, drop mmap_lock to unblock
* the writer.
*
@@ -504,13 +508,15 @@ task_vma_seq_get_next(struct bpf_iter_seq_task_vma_info *info)
* 4.2) VMA2 and VMA2' covers different ranges, process
* VMA2'.
*/
- if (mmap_lock_is_contended(curr_task->mm)) {
+ if (mmap_lock_is_contended(curr_mm)) {
info->prev_vm_start = curr_vma->vm_start;
info->prev_vm_end = curr_vma->vm_end;
op = task_vma_iter_find_vma;
- mmap_read_unlock(curr_task->mm);
- if (mmap_read_lock_killable(curr_task->mm))
+ mmap_read_unlock(curr_mm);
+ if (mmap_read_lock_killable(curr_mm)) {
+ mmput(curr_mm);
goto finish;
+ }
} else {
op = task_vma_iter_next_vma;
}
@@ -535,42 +541,47 @@ again:
op = task_vma_iter_find_vma;
}
- if (!curr_task->mm)
+ curr_mm = get_task_mm(curr_task);
+ if (!curr_mm)
goto next_task;
- if (mmap_read_lock_killable(curr_task->mm))
+ if (mmap_read_lock_killable(curr_mm)) {
+ mmput(curr_mm);
goto finish;
+ }
}
switch (op) {
case task_vma_iter_first_vma:
- curr_vma = find_vma(curr_task->mm, 0);
+ curr_vma = find_vma(curr_mm, 0);
break;
case task_vma_iter_next_vma:
- curr_vma = find_vma(curr_task->mm, curr_vma->vm_end);
+ curr_vma = find_vma(curr_mm, curr_vma->vm_end);
break;
case task_vma_iter_find_vma:
/* We dropped mmap_lock so it is necessary to use find_vma
* to find the next vma. This is similar to the mechanism
* in show_smaps_rollup().
*/
- curr_vma = find_vma(curr_task->mm, info->prev_vm_end - 1);
+ curr_vma = find_vma(curr_mm, info->prev_vm_end - 1);
/* case 1) and 4.2) above just use curr_vma */
/* check for case 2) or case 4.1) above */
if (curr_vma &&
curr_vma->vm_start == info->prev_vm_start &&
curr_vma->vm_end == info->prev_vm_end)
- curr_vma = find_vma(curr_task->mm, curr_vma->vm_end);
+ curr_vma = find_vma(curr_mm, curr_vma->vm_end);
break;
}
if (!curr_vma) {
/* case 3) above, or case 2) 4.1) with vma->next == NULL */
- mmap_read_unlock(curr_task->mm);
+ mmap_read_unlock(curr_mm);
+ mmput(curr_mm);
goto next_task;
}
info->task = curr_task;
info->vma = curr_vma;
+ info->mm = curr_mm;
return curr_vma;
next_task:
@@ -579,6 +590,7 @@ next_task:
put_task_struct(curr_task);
info->task = NULL;
+ info->mm = NULL;
info->tid++;
goto again;
@@ -587,6 +599,7 @@ finish:
put_task_struct(curr_task);
info->task = NULL;
info->vma = NULL;
+ info->mm = NULL;
return NULL;
}
@@ -658,7 +671,9 @@ static void task_vma_seq_stop(struct seq_file *seq, void *v)
*/
info->prev_vm_start = ~0UL;
info->prev_vm_end = info->vma->vm_end;
- mmap_read_unlock(info->task->mm);
+ mmap_read_unlock(info->mm);
+ mmput(info->mm);
+ info->mm = NULL;
put_task_struct(info->task);
info->task = NULL;
}
diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c
index bf0906e1e2b9..d0ed7d6f5eec 100644
--- a/kernel/bpf/trampoline.c
+++ b/kernel/bpf/trampoline.c
@@ -468,8 +468,7 @@ again:
if (err < 0)
goto out;
- set_memory_ro((long)im->image, 1);
- set_memory_x((long)im->image, 1);
+ set_memory_rox((long)im->image, 1);
WARN_ON(tr->cur_image && tr->selector == 0);
WARN_ON(!tr->cur_image && tr->selector);
@@ -489,6 +488,10 @@ again:
/* reset fops->func and fops->trampoline for re-register */
tr->fops->func = NULL;
tr->fops->trampoline = 0;
+
+ /* reset im->image memory attr for arch_prepare_bpf_trampoline */
+ set_memory_nx((long)im->image, 1);
+ set_memory_rw((long)im->image, 1);
goto again;
}
#endif
@@ -864,7 +867,7 @@ static __always_inline u64 notrace bpf_prog_start_time(void)
* [2..MAX_U64] - execute bpf prog and record execution time.
* This is start time.
*/
-u64 notrace __bpf_prog_enter(struct bpf_prog *prog, struct bpf_tramp_run_ctx *run_ctx)
+static u64 notrace __bpf_prog_enter_recur(struct bpf_prog *prog, struct bpf_tramp_run_ctx *run_ctx)
__acquires(RCU)
{
rcu_read_lock();
@@ -901,7 +904,8 @@ static void notrace update_prog_stats(struct bpf_prog *prog,
}
}
-void notrace __bpf_prog_exit(struct bpf_prog *prog, u64 start, struct bpf_tramp_run_ctx *run_ctx)
+static void notrace __bpf_prog_exit_recur(struct bpf_prog *prog, u64 start,
+ struct bpf_tramp_run_ctx *run_ctx)
__releases(RCU)
{
bpf_reset_run_ctx(run_ctx->saved_run_ctx);
@@ -912,8 +916,8 @@ void notrace __bpf_prog_exit(struct bpf_prog *prog, u64 start, struct bpf_tramp_
rcu_read_unlock();
}
-u64 notrace __bpf_prog_enter_lsm_cgroup(struct bpf_prog *prog,
- struct bpf_tramp_run_ctx *run_ctx)
+static u64 notrace __bpf_prog_enter_lsm_cgroup(struct bpf_prog *prog,
+ struct bpf_tramp_run_ctx *run_ctx)
__acquires(RCU)
{
/* Runtime stats are exported via actual BPF_LSM_CGROUP
@@ -927,8 +931,8 @@ u64 notrace __bpf_prog_enter_lsm_cgroup(struct bpf_prog *prog,
return NO_START_TIME;
}
-void notrace __bpf_prog_exit_lsm_cgroup(struct bpf_prog *prog, u64 start,
- struct bpf_tramp_run_ctx *run_ctx)
+static void notrace __bpf_prog_exit_lsm_cgroup(struct bpf_prog *prog, u64 start,
+ struct bpf_tramp_run_ctx *run_ctx)
__releases(RCU)
{
bpf_reset_run_ctx(run_ctx->saved_run_ctx);
@@ -937,7 +941,8 @@ void notrace __bpf_prog_exit_lsm_cgroup(struct bpf_prog *prog, u64 start,
rcu_read_unlock();
}
-u64 notrace __bpf_prog_enter_sleepable(struct bpf_prog *prog, struct bpf_tramp_run_ctx *run_ctx)
+u64 notrace __bpf_prog_enter_sleepable_recur(struct bpf_prog *prog,
+ struct bpf_tramp_run_ctx *run_ctx)
{
rcu_read_lock_trace();
migrate_disable();
@@ -953,8 +958,8 @@ u64 notrace __bpf_prog_enter_sleepable(struct bpf_prog *prog, struct bpf_tramp_r
return bpf_prog_start_time();
}
-void notrace __bpf_prog_exit_sleepable(struct bpf_prog *prog, u64 start,
- struct bpf_tramp_run_ctx *run_ctx)
+void notrace __bpf_prog_exit_sleepable_recur(struct bpf_prog *prog, u64 start,
+ struct bpf_tramp_run_ctx *run_ctx)
{
bpf_reset_run_ctx(run_ctx->saved_run_ctx);
@@ -964,8 +969,30 @@ void notrace __bpf_prog_exit_sleepable(struct bpf_prog *prog, u64 start,
rcu_read_unlock_trace();
}
-u64 notrace __bpf_prog_enter_struct_ops(struct bpf_prog *prog,
- struct bpf_tramp_run_ctx *run_ctx)
+static u64 notrace __bpf_prog_enter_sleepable(struct bpf_prog *prog,
+ struct bpf_tramp_run_ctx *run_ctx)
+{
+ rcu_read_lock_trace();
+ migrate_disable();
+ might_fault();
+
+ run_ctx->saved_run_ctx = bpf_set_run_ctx(&run_ctx->run_ctx);
+
+ return bpf_prog_start_time();
+}
+
+static void notrace __bpf_prog_exit_sleepable(struct bpf_prog *prog, u64 start,
+ struct bpf_tramp_run_ctx *run_ctx)
+{
+ bpf_reset_run_ctx(run_ctx->saved_run_ctx);
+
+ update_prog_stats(prog, start);
+ migrate_enable();
+ rcu_read_unlock_trace();
+}
+
+static u64 notrace __bpf_prog_enter(struct bpf_prog *prog,
+ struct bpf_tramp_run_ctx *run_ctx)
__acquires(RCU)
{
rcu_read_lock();
@@ -976,8 +1003,8 @@ u64 notrace __bpf_prog_enter_struct_ops(struct bpf_prog *prog,
return bpf_prog_start_time();
}
-void notrace __bpf_prog_exit_struct_ops(struct bpf_prog *prog, u64 start,
- struct bpf_tramp_run_ctx *run_ctx)
+static void notrace __bpf_prog_exit(struct bpf_prog *prog, u64 start,
+ struct bpf_tramp_run_ctx *run_ctx)
__releases(RCU)
{
bpf_reset_run_ctx(run_ctx->saved_run_ctx);
@@ -997,6 +1024,36 @@ void notrace __bpf_tramp_exit(struct bpf_tramp_image *tr)
percpu_ref_put(&tr->pcref);
}
+bpf_trampoline_enter_t bpf_trampoline_enter(const struct bpf_prog *prog)
+{
+ bool sleepable = prog->aux->sleepable;
+
+ if (bpf_prog_check_recur(prog))
+ return sleepable ? __bpf_prog_enter_sleepable_recur :
+ __bpf_prog_enter_recur;
+
+ if (resolve_prog_type(prog) == BPF_PROG_TYPE_LSM &&
+ prog->expected_attach_type == BPF_LSM_CGROUP)
+ return __bpf_prog_enter_lsm_cgroup;
+
+ return sleepable ? __bpf_prog_enter_sleepable : __bpf_prog_enter;
+}
+
+bpf_trampoline_exit_t bpf_trampoline_exit(const struct bpf_prog *prog)
+{
+ bool sleepable = prog->aux->sleepable;
+
+ if (bpf_prog_check_recur(prog))
+ return sleepable ? __bpf_prog_exit_sleepable_recur :
+ __bpf_prog_exit_recur;
+
+ if (resolve_prog_type(prog) == BPF_PROG_TYPE_LSM &&
+ prog->expected_attach_type == BPF_LSM_CGROUP)
+ return __bpf_prog_exit_lsm_cgroup;
+
+ return sleepable ? __bpf_prog_exit_sleepable : __bpf_prog_exit;
+}
+
int __weak
arch_prepare_bpf_trampoline(struct bpf_tramp_image *tr, void *image, void *image_end,
const struct btf_func_model *m, u32 flags,
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 264b3dc714cc..272563a0b770 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -190,6 +190,10 @@ struct bpf_verifier_stack_elem {
static int acquire_reference_state(struct bpf_verifier_env *env, int insn_idx);
static int release_reference(struct bpf_verifier_env *env, int ref_obj_id);
+static void invalidate_non_owning_refs(struct bpf_verifier_env *env);
+static bool in_rbtree_lock_required_cb(struct bpf_verifier_env *env);
+static int ref_set_non_owning(struct bpf_verifier_env *env,
+ struct bpf_reg_state *reg);
static bool bpf_map_ptr_poisoned(const struct bpf_insn_aux_data *aux)
{
@@ -255,6 +259,7 @@ struct bpf_call_arg_meta {
int mem_size;
u64 msize_max_value;
int ref_obj_id;
+ int dynptr_id;
int map_uid;
int func_id;
struct btf *btf;
@@ -262,7 +267,7 @@ struct bpf_call_arg_meta {
struct btf *ret_btf;
u32 ret_btf_id;
u32 subprogno;
- struct bpf_map_value_off_desc *kptr_off_desc;
+ struct btf_field *kptr_field;
u8 uninit_dynptr_regno;
};
@@ -451,17 +456,34 @@ static bool reg_type_not_null(enum bpf_reg_type type)
type == PTR_TO_SOCK_COMMON;
}
-static bool reg_may_point_to_spin_lock(const struct bpf_reg_state *reg)
+static bool type_is_ptr_alloc_obj(u32 type)
{
- return reg->type == PTR_TO_MAP_VALUE &&
- map_value_has_spin_lock(reg->map_ptr);
+ return base_type(type) == PTR_TO_BTF_ID && type_flag(type) & MEM_ALLOC;
}
-static bool reg_type_may_be_refcounted_or_null(enum bpf_reg_type type)
+static bool type_is_non_owning_ref(u32 type)
{
- type = base_type(type);
- return type == PTR_TO_SOCKET || type == PTR_TO_TCP_SOCK ||
- type == PTR_TO_MEM || type == PTR_TO_BTF_ID;
+ return type_is_ptr_alloc_obj(type) && type_flag(type) & NON_OWN_REF;
+}
+
+static struct btf_record *reg_btf_record(const struct bpf_reg_state *reg)
+{
+ struct btf_record *rec = NULL;
+ struct btf_struct_meta *meta;
+
+ if (reg->type == PTR_TO_MAP_VALUE) {
+ rec = reg->map_ptr->record;
+ } else if (type_is_ptr_alloc_obj(reg->type)) {
+ meta = btf_find_struct_meta(reg->btf, reg->btf_id);
+ if (meta)
+ rec = meta->record;
+ }
+ return rec;
+}
+
+static bool reg_may_point_to_spin_lock(const struct bpf_reg_state *reg)
+{
+ return btf_record_has_field(reg_btf_record(reg), BPF_SPIN_LOCK);
}
static bool type_is_rdonly_mem(u32 type)
@@ -511,6 +533,23 @@ static bool is_dynptr_ref_function(enum bpf_func_id func_id)
return func_id == BPF_FUNC_dynptr_data;
}
+static bool is_callback_calling_function(enum bpf_func_id func_id)
+{
+ return func_id == BPF_FUNC_for_each_map_elem ||
+ func_id == BPF_FUNC_timer_set_callback ||
+ func_id == BPF_FUNC_find_vma ||
+ func_id == BPF_FUNC_loop ||
+ func_id == BPF_FUNC_user_ringbuf_drain;
+}
+
+static bool is_storage_get_function(enum bpf_func_id func_id)
+{
+ return func_id == BPF_FUNC_sk_storage_get ||
+ func_id == BPF_FUNC_inode_storage_get ||
+ func_id == BPF_FUNC_task_storage_get ||
+ func_id == BPF_FUNC_cgrp_storage_get;
+}
+
static bool helper_multiple_ref_obj_use(enum bpf_func_id func_id,
const struct bpf_map *map)
{
@@ -541,7 +580,7 @@ static bool is_cmpxchg_insn(const struct bpf_insn *insn)
static const char *reg_type_str(struct bpf_verifier_env *env,
enum bpf_reg_type type)
{
- char postfix[16] = {0}, prefix[32] = {0};
+ char postfix[16] = {0}, prefix[64] = {0};
static const char * const str[] = {
[NOT_INIT] = "?",
[SCALAR_VALUE] = "scalar",
@@ -563,7 +602,7 @@ static const char *reg_type_str(struct bpf_verifier_env *env,
[PTR_TO_BUF] = "buf",
[PTR_TO_FUNC] = "func",
[PTR_TO_MAP_KEY] = "map_key",
- [PTR_TO_DYNPTR] = "dynptr_ptr",
+ [CONST_PTR_TO_DYNPTR] = "dynptr_ptr",
};
if (type & PTR_MAYBE_NULL) {
@@ -573,16 +612,15 @@ static const char *reg_type_str(struct bpf_verifier_env *env,
strncpy(postfix, "_or_null", 16);
}
- if (type & MEM_RDONLY)
- strncpy(prefix, "rdonly_", 32);
- if (type & MEM_ALLOC)
- strncpy(prefix, "alloc_", 32);
- if (type & MEM_USER)
- strncpy(prefix, "user_", 32);
- if (type & MEM_PERCPU)
- strncpy(prefix, "percpu_", 32);
- if (type & PTR_UNTRUSTED)
- strncpy(prefix, "untrusted_", 32);
+ snprintf(prefix, sizeof(prefix), "%s%s%s%s%s%s%s",
+ type & MEM_RDONLY ? "rdonly_" : "",
+ type & MEM_RINGBUF ? "ringbuf_" : "",
+ type & MEM_USER ? "user_" : "",
+ type & MEM_PERCPU ? "percpu_" : "",
+ type & MEM_RCU ? "rcu_" : "",
+ type & PTR_UNTRUSTED ? "untrusted_" : "",
+ type & PTR_TRUSTED ? "trusted_" : ""
+ );
snprintf(env->type_str_buf, TYPE_STR_BUF_LEN, "%s%s%s",
prefix, str[base_type(type)], postfix);
@@ -610,31 +648,57 @@ static void print_liveness(struct bpf_verifier_env *env,
verbose(env, "D");
}
-static int get_spi(s32 off)
+static int __get_spi(s32 off)
{
return (-off - 1) / BPF_REG_SIZE;
}
+static struct bpf_func_state *func(struct bpf_verifier_env *env,
+ const struct bpf_reg_state *reg)
+{
+ struct bpf_verifier_state *cur = env->cur_state;
+
+ return cur->frame[reg->frameno];
+}
+
static bool is_spi_bounds_valid(struct bpf_func_state *state, int spi, int nr_slots)
{
- int allocated_slots = state->allocated_stack / BPF_REG_SIZE;
+ int allocated_slots = state->allocated_stack / BPF_REG_SIZE;
- /* We need to check that slots between [spi - nr_slots + 1, spi] are
- * within [0, allocated_stack).
- *
- * Please note that the spi grows downwards. For example, a dynptr
- * takes the size of two stack slots; the first slot will be at
- * spi and the second slot will be at spi - 1.
- */
- return spi - nr_slots + 1 >= 0 && spi < allocated_slots;
+ /* We need to check that slots between [spi - nr_slots + 1, spi] are
+ * within [0, allocated_stack).
+ *
+ * Please note that the spi grows downwards. For example, a dynptr
+ * takes the size of two stack slots; the first slot will be at
+ * spi and the second slot will be at spi - 1.
+ */
+ return spi - nr_slots + 1 >= 0 && spi < allocated_slots;
}
-static struct bpf_func_state *func(struct bpf_verifier_env *env,
- const struct bpf_reg_state *reg)
+static int dynptr_get_spi(struct bpf_verifier_env *env, struct bpf_reg_state *reg)
{
- struct bpf_verifier_state *cur = env->cur_state;
+ int off, spi;
- return cur->frame[reg->frameno];
+ if (!tnum_is_const(reg->var_off)) {
+ verbose(env, "dynptr has to be at a constant offset\n");
+ return -EINVAL;
+ }
+
+ off = reg->off + reg->var_off.value;
+ if (off % BPF_REG_SIZE) {
+ verbose(env, "cannot pass in dynptr at an offset=%d\n", off);
+ return -EINVAL;
+ }
+
+ spi = __get_spi(off);
+ if (spi < 1) {
+ verbose(env, "cannot pass in dynptr at an offset=%d\n", off);
+ return -EINVAL;
+ }
+
+ if (!is_spi_bounds_valid(func(env, reg), spi, BPF_DYNPTR_NR_SLOTS))
+ return -ERANGE;
+ return spi;
}
static const char *kernel_type_name(const struct btf* btf, u32 id)
@@ -697,17 +761,60 @@ static bool dynptr_type_refcounted(enum bpf_dynptr_type type)
return type == BPF_DYNPTR_TYPE_RINGBUF;
}
+static void __mark_dynptr_reg(struct bpf_reg_state *reg,
+ enum bpf_dynptr_type type,
+ bool first_slot, int dynptr_id);
+
+static void __mark_reg_not_init(const struct bpf_verifier_env *env,
+ struct bpf_reg_state *reg);
+
+static void mark_dynptr_stack_regs(struct bpf_verifier_env *env,
+ struct bpf_reg_state *sreg1,
+ struct bpf_reg_state *sreg2,
+ enum bpf_dynptr_type type)
+{
+ int id = ++env->id_gen;
+
+ __mark_dynptr_reg(sreg1, type, true, id);
+ __mark_dynptr_reg(sreg2, type, false, id);
+}
+
+static void mark_dynptr_cb_reg(struct bpf_verifier_env *env,
+ struct bpf_reg_state *reg,
+ enum bpf_dynptr_type type)
+{
+ __mark_dynptr_reg(reg, type, true, ++env->id_gen);
+}
+
+static int destroy_if_dynptr_stack_slot(struct bpf_verifier_env *env,
+ struct bpf_func_state *state, int spi);
+
static int mark_stack_slots_dynptr(struct bpf_verifier_env *env, struct bpf_reg_state *reg,
enum bpf_arg_type arg_type, int insn_idx)
{
struct bpf_func_state *state = func(env, reg);
enum bpf_dynptr_type type;
- int spi, i, id;
-
- spi = get_spi(reg->off);
-
- if (!is_spi_bounds_valid(state, spi, BPF_DYNPTR_NR_SLOTS))
- return -EINVAL;
+ int spi, i, id, err;
+
+ spi = dynptr_get_spi(env, reg);
+ if (spi < 0)
+ return spi;
+
+ /* We cannot assume both spi and spi - 1 belong to the same dynptr,
+ * hence we need to call destroy_if_dynptr_stack_slot once for each
+ * slot. Consider the following example:
+ * [d1][d1][d2][d2]
+ * spi 3 2 1 0
+ * So marking spi = 2 should lead to destruction of both d1 and d2. In
+ * case they do belong to same dynptr, second call won't see slot_type
+ * as STACK_DYNPTR and will simply skip destruction.
+ */
+ err = destroy_if_dynptr_stack_slot(env, state, spi);
+ if (err)
+ return err;
+ err = destroy_if_dynptr_stack_slot(env, state, spi - 1);
+ if (err)
+ return err;
for (i = 0; i < BPF_REG_SIZE; i++) {
state->stack[spi].slot_type[i] = STACK_DYNPTR;
@@ -718,9 +825,8 @@ static int mark_stack_slots_dynptr(struct bpf_verifier_env *env, struct bpf_reg_
if (type == BPF_DYNPTR_TYPE_INVALID)
return -EINVAL;
- state->stack[spi].spilled_ptr.dynptr.first_slot = true;
- state->stack[spi].spilled_ptr.dynptr.type = type;
- state->stack[spi - 1].spilled_ptr.dynptr.type = type;
+ mark_dynptr_stack_regs(env, &state->stack[spi].spilled_ptr,
+ &state->stack[spi - 1].spilled_ptr, type);
if (dynptr_type_refcounted(type)) {
/* The id is used to track proper releasing */
@@ -728,10 +834,13 @@ static int mark_stack_slots_dynptr(struct bpf_verifier_env *env, struct bpf_reg_
if (id < 0)
return id;
- state->stack[spi].spilled_ptr.id = id;
- state->stack[spi - 1].spilled_ptr.id = id;
+ state->stack[spi].spilled_ptr.ref_obj_id = id;
+ state->stack[spi - 1].spilled_ptr.ref_obj_id = id;
}
+ state->stack[spi].spilled_ptr.live |= REG_LIVE_WRITTEN;
+ state->stack[spi - 1].spilled_ptr.live |= REG_LIVE_WRITTEN;
+
return 0;
}
@@ -740,10 +849,9 @@ static int unmark_stack_slots_dynptr(struct bpf_verifier_env *env, struct bpf_re
struct bpf_func_state *state = func(env, reg);
int spi, i;
- spi = get_spi(reg->off);
-
- if (!is_spi_bounds_valid(state, spi, BPF_DYNPTR_NR_SLOTS))
- return -EINVAL;
+ spi = dynptr_get_spi(env, reg);
+ if (spi < 0)
+ return spi;
for (i = 0; i < BPF_REG_SIZE; i++) {
state->stack[spi].slot_type[i] = STACK_INVALID;
@@ -751,46 +859,138 @@ static int unmark_stack_slots_dynptr(struct bpf_verifier_env *env, struct bpf_re
}
/* Invalidate any slices associated with this dynptr */
- if (dynptr_type_refcounted(state->stack[spi].spilled_ptr.dynptr.type)) {
- release_reference(env, state->stack[spi].spilled_ptr.id);
- state->stack[spi].spilled_ptr.id = 0;
- state->stack[spi - 1].spilled_ptr.id = 0;
- }
+ if (dynptr_type_refcounted(state->stack[spi].spilled_ptr.dynptr.type))
+ WARN_ON_ONCE(release_reference(env, state->stack[spi].spilled_ptr.ref_obj_id));
- state->stack[spi].spilled_ptr.dynptr.first_slot = false;
- state->stack[spi].spilled_ptr.dynptr.type = 0;
- state->stack[spi - 1].spilled_ptr.dynptr.type = 0;
+ __mark_reg_not_init(env, &state->stack[spi].spilled_ptr);
+ __mark_reg_not_init(env, &state->stack[spi - 1].spilled_ptr);
+
+ /* Why do we need to set REG_LIVE_WRITTEN for STACK_INVALID slot?
+ *
+ * While we don't allow reading STACK_INVALID, it is still possible to
+ * do <8 byte writes marking some but not all slots as STACK_MISC. Then,
+ * helpers or insns can do partial read of that part without failing,
+ * but check_stack_range_initialized, check_stack_read_var_off, and
+ * check_stack_read_fixed_off will do mark_reg_read for all 8-bytes of
+ * the slot conservatively. Hence we need to prevent those liveness
+ * marking walks.
+ *
+ * This was not a problem before because STACK_INVALID is only set by
+ * default (where the default reg state has its reg->parent as NULL), or
+ * in clean_live_states after REG_LIVE_DONE (at which point
+ * mark_reg_read won't walk reg->parent chain), but not randomly during
+ * verifier state exploration (like we did above). Hence, for our case
+ * parentage chain will still be live (i.e. reg->parent may be
+ * non-NULL), while earlier reg->parent was NULL, so we need
+ * REG_LIVE_WRITTEN to screen off read marker propagation when it is
+ * done later on reads or by mark_dynptr_read, so that registers in the
+ * verifier state are not marked unnecessarily.
+ */
+ state->stack[spi].spilled_ptr.live |= REG_LIVE_WRITTEN;
+ state->stack[spi - 1].spilled_ptr.live |= REG_LIVE_WRITTEN;
return 0;
}
-static bool is_dynptr_reg_valid_uninit(struct bpf_verifier_env *env, struct bpf_reg_state *reg)
+static void __mark_reg_unknown(const struct bpf_verifier_env *env,
+ struct bpf_reg_state *reg);
+
+static int destroy_if_dynptr_stack_slot(struct bpf_verifier_env *env,
+ struct bpf_func_state *state, int spi)
{
- struct bpf_func_state *state = func(env, reg);
- int spi = get_spi(reg->off);
- int i;
+ struct bpf_func_state *fstate;
+ struct bpf_reg_state *dreg;
+ int i, dynptr_id;
- if (!is_spi_bounds_valid(state, spi, BPF_DYNPTR_NR_SLOTS))
- return true;
+ /* We always ensure that STACK_DYNPTR is never set partially,
+ * hence just checking for slot_type[0] is enough. This is
+ * different for STACK_SPILL, where it may be only set for
+ * 1 byte, so code has to use is_spilled_reg.
+ */
+ if (state->stack[spi].slot_type[0] != STACK_DYNPTR)
+ return 0;
+
+ /* Reposition spi to first slot */
+ if (!state->stack[spi].spilled_ptr.dynptr.first_slot)
+ spi = spi + 1;
+ if (dynptr_type_refcounted(state->stack[spi].spilled_ptr.dynptr.type)) {
+ verbose(env, "cannot overwrite referenced dynptr\n");
+ return -EINVAL;
+ }
+
+ mark_stack_slot_scratched(env, spi);
+ mark_stack_slot_scratched(env, spi - 1);
+
+ /* Writing partially to one dynptr stack slot destroys both. */
for (i = 0; i < BPF_REG_SIZE; i++) {
- if (state->stack[spi].slot_type[i] == STACK_DYNPTR ||
- state->stack[spi - 1].slot_type[i] == STACK_DYNPTR)
- return false;
+ state->stack[spi].slot_type[i] = STACK_INVALID;
+ state->stack[spi - 1].slot_type[i] = STACK_INVALID;
}
+ dynptr_id = state->stack[spi].spilled_ptr.id;
+ /* Invalidate any slices associated with this dynptr */
+ bpf_for_each_reg_in_vstate(env->cur_state, fstate, dreg, ({
+ /* Dynptr slices are only PTR_TO_MEM_OR_NULL and PTR_TO_MEM */
+ if (dreg->type != (PTR_TO_MEM | PTR_MAYBE_NULL) && dreg->type != PTR_TO_MEM)
+ continue;
+ if (dreg->dynptr_id == dynptr_id) {
+ if (!env->allow_ptr_leaks)
+ __mark_reg_not_init(env, dreg);
+ else
+ __mark_reg_unknown(env, dreg);
+ }
+ }));
+
+ /* Do not release reference state, we are destroying dynptr on stack,
+ * not using some helper to release it. Just reset register.
+ */
+ __mark_reg_not_init(env, &state->stack[spi].spilled_ptr);
+ __mark_reg_not_init(env, &state->stack[spi - 1].spilled_ptr);
+
+ /* Same reason as unmark_stack_slots_dynptr above */
+ state->stack[spi].spilled_ptr.live |= REG_LIVE_WRITTEN;
+ state->stack[spi - 1].spilled_ptr.live |= REG_LIVE_WRITTEN;
+
+ return 0;
+}
+
+static bool is_dynptr_reg_valid_uninit(struct bpf_verifier_env *env, struct bpf_reg_state *reg,
+ int spi)
+{
+ if (reg->type == CONST_PTR_TO_DYNPTR)
+ return false;
+
+ /* For -ERANGE (i.e. spi not falling into allocated stack slots), we
+ * will do check_mem_access to check and update stack bounds later, so
+ * return true for that case.
+ */
+ if (spi < 0)
+ return spi == -ERANGE;
+ /* We allow overwriting existing unreferenced STACK_DYNPTR slots, see
+ * mark_stack_slots_dynptr which calls destroy_if_dynptr_stack_slot to
+ * ensure dynptr objects at the slots we are touching are completely
+ * destructed before we reinitialize them for a new one. For referenced
+ * ones, destroy_if_dynptr_stack_slot returns an error early instead of
+ * delaying it until the end where the user will get "Unreleased
+ * reference" error.
+ */
return true;
}
-bool is_dynptr_reg_valid_init(struct bpf_verifier_env *env,
- struct bpf_reg_state *reg)
+static bool is_dynptr_reg_valid_init(struct bpf_verifier_env *env, struct bpf_reg_state *reg,
+ int spi)
{
struct bpf_func_state *state = func(env, reg);
- int spi = get_spi(reg->off);
int i;
- if (!is_spi_bounds_valid(state, spi, BPF_DYNPTR_NR_SLOTS) ||
- !state->stack[spi].spilled_ptr.dynptr.first_slot)
+ /* This already represents first slot of initialized bpf_dynptr */
+ if (reg->type == CONST_PTR_TO_DYNPTR)
+ return true;
+
+ if (spi < 0)
+ return false;
+ if (!state->stack[spi].spilled_ptr.dynptr.first_slot)
return false;
for (i = 0; i < BPF_REG_SIZE; i++) {
@@ -802,21 +1002,26 @@ bool is_dynptr_reg_valid_init(struct bpf_verifier_env *env,
return true;
}
-bool is_dynptr_type_expected(struct bpf_verifier_env *env,
- struct bpf_reg_state *reg,
- enum bpf_arg_type arg_type)
+static bool is_dynptr_type_expected(struct bpf_verifier_env *env, struct bpf_reg_state *reg,
+ enum bpf_arg_type arg_type)
{
struct bpf_func_state *state = func(env, reg);
enum bpf_dynptr_type dynptr_type;
- int spi = get_spi(reg->off);
+ int spi;
/* ARG_PTR_TO_DYNPTR takes any type of dynptr */
if (arg_type == ARG_PTR_TO_DYNPTR)
return true;
dynptr_type = arg_to_dynptr_type(arg_type);
-
- return state->stack[spi].spilled_ptr.dynptr.type == dynptr_type;
+ if (reg->type == CONST_PTR_TO_DYNPTR) {
+ return reg->dynptr.type == dynptr_type;
+ } else {
+ spi = dynptr_get_spi(env, reg);
+ if (spi < 0)
+ return false;
+ return state->stack[spi].spilled_ptr.dynptr.type == dynptr_type;
+ }
}
/* The reg state of a pointer or a bounded scalar was saved when
@@ -875,8 +1080,10 @@ static void print_verifier_state(struct bpf_verifier_env *env,
if (reg->id)
verbose_a("id=%d", reg->id);
- if (reg_type_may_be_refcounted_or_null(t) && reg->ref_obj_id)
+ if (reg->ref_obj_id)
verbose_a("ref_obj_id=%d", reg->ref_obj_id);
+ if (type_is_non_owning_ref(reg->type))
+ verbose_a("%s", "non_own_ref");
if (t != SCALAR_VALUE)
verbose_a("off=%d", reg->off);
if (type_is_pkt_pointer(t))
@@ -1000,6 +1207,8 @@ static void print_insn_state(struct bpf_verifier_env *env,
*/
static void *copy_array(void *dst, const void *src, size_t n, size_t size, gfp_t flags)
{
+ size_t alloc_bytes;
+ void *orig = dst;
size_t bytes;
if (ZERO_OR_NULL_PTR(src))
@@ -1008,11 +1217,11 @@ static void *copy_array(void *dst, const void *src, size_t n, size_t size, gfp_t
if (unlikely(check_mul_overflow(n, size, &bytes)))
return NULL;
- if (ksize(dst) < bytes) {
- kfree(dst);
- dst = kmalloc_track_caller(bytes, flags);
- if (!dst)
- return NULL;
+ alloc_bytes = max(ksize(orig), kmalloc_size_roundup(bytes));
+ dst = krealloc(orig, alloc_bytes, flags);
+ if (!dst) {
+ kfree(orig);
+ return NULL;
}
memcpy(dst, src, bytes);
@@ -1027,12 +1236,14 @@ out:
*/
static void *realloc_array(void *arr, size_t old_n, size_t new_n, size_t size)
{
+ size_t alloc_size;
void *new_arr;
if (!new_n || old_n == new_n)
goto out;
- new_arr = krealloc_array(arr, new_n, size, GFP_KERNEL);
+ alloc_size = kmalloc_size_roundup(size_mul(new_n, size));
+ new_arr = krealloc(arr, alloc_size, GFP_KERNEL);
if (!new_arr) {
kfree(arr);
return NULL;
@@ -1204,8 +1415,10 @@ static int copy_verifier_state(struct bpf_verifier_state *dst_state,
dst_state->frame[i] = NULL;
}
dst_state->speculative = src->speculative;
+ dst_state->active_rcu_lock = src->active_rcu_lock;
dst_state->curframe = src->curframe;
- dst_state->active_spin_lock = src->active_spin_lock;
+ dst_state->active_lock.ptr = src->active_lock.ptr;
+ dst_state->active_lock.id = src->active_lock.id;
dst_state->branches = src->branches;
dst_state->parent = src->parent;
dst_state->first_insn_idx = src->first_insn_idx;
@@ -1324,9 +1537,6 @@ static const int caller_saved[CALLER_SAVED_REGS] = {
BPF_REG_0, BPF_REG_1, BPF_REG_2, BPF_REG_3, BPF_REG_4, BPF_REG_5
};
-static void __mark_reg_not_init(const struct bpf_verifier_env *env,
- struct bpf_reg_state *reg);
-
/* This helper doesn't clear reg->id */
static void ___mark_reg_known(struct bpf_reg_state *reg, u64 imm)
{
@@ -1347,9 +1557,11 @@ static void ___mark_reg_known(struct bpf_reg_state *reg, u64 imm)
*/
static void __mark_reg_known(struct bpf_reg_state *reg, u64 imm)
{
- /* Clear id, off, and union(map_ptr, range) */
+ /* Clear off and union(map_ptr, range) */
memset(((u8 *)reg) + sizeof(reg->type), 0,
offsetof(struct bpf_reg_state, var_off) - sizeof(reg->type));
+ reg->id = 0;
+ reg->ref_obj_id = 0;
___mark_reg_known(reg, imm);
}
@@ -1389,6 +1601,21 @@ static void mark_reg_known_zero(struct bpf_verifier_env *env,
__mark_reg_known_zero(regs + regno);
}
+static void __mark_dynptr_reg(struct bpf_reg_state *reg, enum bpf_dynptr_type type,
+ bool first_slot, int dynptr_id)
+{
+ /* reg->type has no meaning for STACK_DYNPTR, but when we set reg for
+ * callback arguments, it does need to be CONST_PTR_TO_DYNPTR, so simply
+ * set it unconditionally as it is ignored for STACK_DYNPTR anyway.
+ */
+ __mark_reg_known_zero(reg);
+ reg->type = CONST_PTR_TO_DYNPTR;
+ /* Give each dynptr a unique id to uniquely associate slices to it. */
+ reg->id = dynptr_id;
+ reg->dynptr.type = type;
+ reg->dynptr.first_slot = first_slot;
+}
+
static void mark_ptr_not_null_reg(struct bpf_reg_state *reg)
{
if (base_type(reg->type) == PTR_TO_MAP_VALUE) {
@@ -1400,7 +1627,7 @@ static void mark_ptr_not_null_reg(struct bpf_reg_state *reg)
/* transfer reg's id which is unique for every map_lookup_elem
* as UID of the inner map.
*/
- if (map_value_has_timer(map->inner_map_meta))
+ if (btf_record_has_field(map->inner_map_meta->record, BPF_TIMER))
reg->map_uid = reg->id;
} else if (map->map_type == BPF_MAP_TYPE_XSKMAP) {
reg->type = PTR_TO_XDP_SOCK;
@@ -1416,6 +1643,16 @@ static void mark_ptr_not_null_reg(struct bpf_reg_state *reg)
reg->type &= ~PTR_MAYBE_NULL;
}
+static void mark_reg_graph_node(struct bpf_reg_state *regs, u32 regno,
+ struct btf_field_graph_root *ds_head)
+{
+ __mark_reg_known_zero(&regs[regno]);
+ regs[regno].type = PTR_TO_BTF_ID | MEM_ALLOC;
+ regs[regno].btf = ds_head->btf;
+ regs[regno].btf_id = ds_head->value_btf_id;
+ regs[regno].off = ds_head->node_offset;
+}
+
static bool reg_is_pkt_pointer(const struct bpf_reg_state *reg)
{
return type_is_pkt_pointer(reg->type);
@@ -1682,14 +1919,16 @@ static void __mark_reg_unknown(const struct bpf_verifier_env *env,
struct bpf_reg_state *reg)
{
/*
- * Clear type, id, off, and union(map_ptr, range) and
+ * Clear type, off, and union(map_ptr, range) and
* padding between 'type' and union
*/
memset(reg, 0, offsetof(struct bpf_reg_state, var_off));
reg->type = SCALAR_VALUE;
+ reg->id = 0;
+ reg->ref_obj_id = 0;
reg->var_off = tnum_unknown;
reg->frameno = 0;
- reg->precise = env->subprog_cnt > 1 || !env->bpf_capable;
+ reg->precise = !env->bpf_capable;
__mark_reg_unbounded(reg);
}
@@ -2115,6 +2354,12 @@ static int add_kfunc_call(struct bpf_verifier_env *env, u32 func_id, s16 offset)
return -EINVAL;
}
+ if (bpf_dev_bound_kfunc_id(func_id)) {
+ err = bpf_dev_bound_kfunc_check(&env->log, prog_aux);
+ if (err)
+ return err;
+ }
+
desc = &tab->descs[tab->nr_descs++];
desc->func_id = func_id;
desc->imm = call_imm;
@@ -2316,6 +2561,32 @@ static int mark_reg_read(struct bpf_verifier_env *env,
return 0;
}
+static int mark_dynptr_read(struct bpf_verifier_env *env, struct bpf_reg_state *reg)
+{
+ struct bpf_func_state *state = func(env, reg);
+ int spi, ret;
+
+ /* For CONST_PTR_TO_DYNPTR, it must have already been done by
+ * check_reg_arg in check_helper_call and mark_btf_func_reg_size in
+ * check_kfunc_call.
+ */
+ if (reg->type == CONST_PTR_TO_DYNPTR)
+ return 0;
+ spi = dynptr_get_spi(env, reg);
+ if (spi < 0)
+ return spi;
+ /* Caller ensures dynptr is valid and initialized, which means spi is in
+ * bounds and spi is the first dynptr slot. Simply mark stack slot as
+ * read.
+ */
+ ret = mark_reg_read(env, &state->stack[spi].spilled_ptr,
+ state->stack[spi].spilled_ptr.parent, REG_LIVE_READ64);
+ if (ret)
+ return ret;
+ return mark_reg_read(env, &state->stack[spi - 1].spilled_ptr,
+ state->stack[spi - 1].spilled_ptr.parent, REG_LIVE_READ64);
+}
+
/* This function is supposed to be used by the following 32-bit optimization
* code only. It returns TRUE if the source or destination register operates
* on 64-bit, otherwise return FALSE.
@@ -2498,15 +2769,30 @@ static int check_reg_arg(struct bpf_verifier_env *env, u32 regno,
return 0;
}
+static void mark_jmp_point(struct bpf_verifier_env *env, int idx)
+{
+ env->insn_aux_data[idx].jmp_point = true;
+}
+
+static bool is_jmp_point(struct bpf_verifier_env *env, int insn_idx)
+{
+ return env->insn_aux_data[insn_idx].jmp_point;
+}
+
/* for any branch, call, exit record the history of jmps in the given state */
static int push_jmp_history(struct bpf_verifier_env *env,
struct bpf_verifier_state *cur)
{
u32 cnt = cur->jmp_history_cnt;
struct bpf_idx_pair *p;
+ size_t alloc_size;
+
+ if (!is_jmp_point(env, env->insn_idx))
+ return 0;
cnt++;
- p = krealloc(cur->jmp_history, cnt * sizeof(*p), GFP_USER);
+ alloc_size = kmalloc_size_roundup(size_mul(cnt, sizeof(*p)));
+ p = krealloc(cur->jmp_history, alloc_size, GFP_USER);
if (!p)
return -ENOMEM;
p[cnt - 1].idx = env->insn_idx;
@@ -2658,6 +2944,17 @@ static int backtrack_insn(struct bpf_verifier_env *env, int idx,
if (opcode == BPF_CALL) {
if (insn->src_reg == BPF_PSEUDO_CALL)
return -ENOTSUPP;
+ /* BPF helpers that invoke callback subprogs are
+ * equivalent to BPF_PSEUDO_CALL above
+ */
+ if (insn->src_reg == 0 && is_callback_calling_function(insn->imm))
+ return -ENOTSUPP;
+ /* kfunc with imm==0 is invalid and fixup_kfunc_call will
+ * catch this error later. Make backtracking conservative
+ * with ENOTSUPP.
+ */
+ if (insn->src_reg == BPF_PSEUDO_KFUNC_CALL && insn->imm == 0)
+ return -ENOTSUPP;
/* regular helper call sets R0 */
*reg_mask &= ~1;
if (*reg_mask & 0x3f) {
@@ -2747,8 +3044,11 @@ static void mark_all_scalars_precise(struct bpf_verifier_env *env,
/* big hammer: mark all scalars precise in this path.
* pop_stack may still get !precise scalars.
+ * We also skip current state and go straight to first parent state,
+ * because precision markings in current non-checkpointed state are
+ * not needed. See why in the comment in __mark_chain_precision below.
*/
- for (; st; st = st->parent)
+ for (st = st->parent; st; st = st->parent) {
for (i = 0; i <= st->curframe; i++) {
func = st->frame[i];
for (j = 0; j < BPF_REG_FP; j++) {
@@ -2766,9 +3066,122 @@ static void mark_all_scalars_precise(struct bpf_verifier_env *env,
reg->precise = true;
}
}
+ }
+}
+
+static void mark_all_scalars_imprecise(struct bpf_verifier_env *env, struct bpf_verifier_state *st)
+{
+ struct bpf_func_state *func;
+ struct bpf_reg_state *reg;
+ int i, j;
+
+ for (i = 0; i <= st->curframe; i++) {
+ func = st->frame[i];
+ for (j = 0; j < BPF_REG_FP; j++) {
+ reg = &func->regs[j];
+ if (reg->type != SCALAR_VALUE)
+ continue;
+ reg->precise = false;
+ }
+ for (j = 0; j < func->allocated_stack / BPF_REG_SIZE; j++) {
+ if (!is_spilled_reg(&func->stack[j]))
+ continue;
+ reg = &func->stack[j].spilled_ptr;
+ if (reg->type != SCALAR_VALUE)
+ continue;
+ reg->precise = false;
+ }
+ }
}
-static int __mark_chain_precision(struct bpf_verifier_env *env, int regno,
+/*
+ * __mark_chain_precision() backtracks BPF program instruction sequence and
+ * chain of verifier states making sure that register *regno* (if regno >= 0)
+ * and/or stack slot *spi* (if spi >= 0) are marked as precisely tracked
+ * SCALARS, as well as any other registers and slots that contribute to
+ * a tracked state of given registers/stack slots, depending on specific BPF
+ * assembly instructions (see backtrack_insn() for exact instruction handling
+ * logic). This backtracking relies on recorded jmp_history and is able to
+ * traverse entire chain of parent states. This process ends only when all the
+ * necessary registers/slots and their transitive dependencies are marked as
+ * precise.
+ *
+ * One important and subtle aspect is that precise marks *do not matter* in
+ * the currently verified state (current state). It is important to understand
+ * why this is the case.
+ *
+ * First, note that current state is the state that is not yet "checkpointed",
+ * i.e., it is not yet put into env->explored_states, and it has no children
+ * states as well. It's ephemeral, and can end up either a) being discarded if
+ * compatible explored state is found at some point or BPF_EXIT instruction is
+ * reached or b) checkpointed and put into env->explored_states, branching out
+ * into one or more children states.
+ *
+ * In the former case, precise markings in current state are completely
+ * ignored by state comparison code (see regsafe() for details). Only
+ * checkpointed ("old") state precise markings are important, and if old
+ * state's register/slot is precise, regsafe() assumes current state's
+ * register/slot as precise and checks value ranges exactly and precisely. If
+ * states turn out to be compatible, current state's necessary precise
+ * markings and any required parent states' precise markings are enforced
+ * after the fact with propagate_precision() logic. But it's
+ * important to realize that in this case, even after marking current state
+ * registers/slots as precise, we immediately discard current state. So what
+ * actually matters is any of the precise markings propagated into current
+ * state's parent states, which are always checkpointed (due to b) case above).
+ * As such, for scenario a) it doesn't matter if current state has precise
+ * markings set or not.
+ *
+ * Now, for the scenario b), checkpointing and forking into child(ren)
+ * state(s). Note that before current state gets to checkpointing step, any
+ * processed instruction always assumes precise SCALAR register/slot
+ * knowledge: if precise value or range is useful to prune jump branch, BPF
+ * verifier takes this opportunity enthusiastically. Similarly, when
+ * register's value is used to calculate offset or memory address, exact
+ * knowledge of SCALAR range is assumed, checked, and enforced. So, similar to
+ * what we mentioned above about state comparison ignoring precise markings
+ * during state comparison, BPF verifier ignores and also assumes precise
+ * markings *at will* during instruction verification process. But as verifier
+ * assumes precision, it also propagates any precision dependencies across
+ * parent states, which are not yet finalized, so can be further restricted
+ * based on new knowledge gained from restrictions enforced by their children
+ * states. This is so that once those parent states are finalized, i.e., when
+ * they have no more active children state, state comparison logic in
+ * is_state_visited() would enforce strict and precise SCALAR ranges, if
+ * required for correctness.
+ *
+ * To build a bit more intuition, note also that once a state is checkpointed,
+ * the path we took to get to that state is not important. This is a crucial
+ * property for state pruning. When state is checkpointed and finalized at
+ * some instruction index, it can be correctly and safely used to "short
+ * circuit" any *compatible* state that reaches exactly the same instruction
+ * index. I.e., if we jumped to that instruction from a completely different
+ * code path than original finalized state was derived from, it doesn't
+ * matter, current state can be discarded because from that instruction
+ * forward having a compatible state will ensure we will safely reach the
+ * exit. States describe preconditions for further exploration, but completely
+ * forget the history of how we got here.
+ *
+ * This also means that even if we needed precise SCALAR range to get to
+ * finalized state, but from that point forward *that same* SCALAR register is
+ * never used in a precise context (i.e., its precise value is not needed for
+ * correctness), it's correct and safe to mark such register as "imprecise"
+ * (i.e., precise marking set to false). This is what we rely on when we do
+ * not set precise marking in current state. If no child state requires
+ * precision for any given SCALAR register, it's safe to dictate that it can
+ * be imprecise. If any child state does require this register to be precise,
+ * we'll mark it precise later retroactively during precise markings
+ * propagation from child state to parent states.
+ *
+ * Skipping precise marking setting in current state is a mild version of
+ * relying on the above observation. But we can utilize this property even
+ * more aggressively by proactively forgetting any precise marking in the
+ * current state (which we inherited from the parent state), right before we
+ * checkpoint it and branch off into new child state. This is done by
+ * mark_all_scalars_imprecise() to hopefully get more permissive and generic
+ * finalized states which help in short circuiting more future states.
+ */
+static int __mark_chain_precision(struct bpf_verifier_env *env, int frame, int regno,
int spi)
{
struct bpf_verifier_state *st = env->cur_state;
@@ -2785,18 +3198,18 @@ static int __mark_chain_precision(struct bpf_verifier_env *env, int regno,
if (!env->bpf_capable)
return 0;
- func = st->frame[st->curframe];
+ /* Do sanity checks against current state of register and/or stack
+ * slot, but don't set precise flag in current state, as precision
+ * tracking in the current state is unnecessary.
+ */
+ func = st->frame[frame];
if (regno >= 0) {
reg = &func->regs[regno];
if (reg->type != SCALAR_VALUE) {
WARN_ONCE(1, "backtracing misuse");
return -EFAULT;
}
- if (!reg->precise)
- new_marks = true;
- else
- reg_mask = 0;
- reg->precise = true;
+ new_marks = true;
}
while (spi >= 0) {
@@ -2809,11 +3222,7 @@ static int __mark_chain_precision(struct bpf_verifier_env *env, int regno,
stack_mask = 0;
break;
}
- if (!reg->precise)
- new_marks = true;
- else
- stack_mask = 0;
- reg->precise = true;
+ new_marks = true;
break;
}
@@ -2821,12 +3230,42 @@ static int __mark_chain_precision(struct bpf_verifier_env *env, int regno,
return 0;
if (!reg_mask && !stack_mask)
return 0;
+
for (;;) {
DECLARE_BITMAP(mask, 64);
u32 history = st->jmp_history_cnt;
if (env->log.level & BPF_LOG_LEVEL2)
verbose(env, "last_idx %d first_idx %d\n", last_idx, first_idx);
+
+ if (last_idx < 0) {
+ /* we are at the entry into subprog, which
+ * is expected for global funcs, but only if
+ * requested precise registers are R1-R5
+ * (which are global func's input arguments)
+ */
+ if (st->curframe == 0 &&
+ st->frame[0]->subprogno > 0 &&
+ st->frame[0]->callsite == BPF_MAIN_FUNC &&
+ stack_mask == 0 && (reg_mask & ~0x3e) == 0) {
+ bitmap_from_u64(mask, reg_mask);
+ for_each_set_bit(i, mask, 32) {
+ reg = &st->frame[0]->regs[i];
+ if (reg->type != SCALAR_VALUE) {
+ reg_mask &= ~(1u << i);
+ continue;
+ }
+ reg->precise = true;
+ }
+ return 0;
+ }
+
+ verbose(env, "BUG backtracing func entry subprog %d reg_mask %x stack_mask %llx\n",
+ st->frame[0]->subprogno, reg_mask, stack_mask);
+ WARN_ONCE(1, "verifier backtracking bug");
+ return -EFAULT;
+ }
+
for (i = last_idx;;) {
if (skip_first) {
err = 0;
@@ -2866,7 +3305,7 @@ static int __mark_chain_precision(struct bpf_verifier_env *env, int regno,
break;
new_marks = false;
- func = st->frame[st->curframe];
+ func = st->frame[frame];
bitmap_from_u64(mask, reg_mask);
for_each_set_bit(i, mask, 32) {
reg = &func->regs[i];
@@ -2932,12 +3371,17 @@ static int __mark_chain_precision(struct bpf_verifier_env *env, int regno,
int mark_chain_precision(struct bpf_verifier_env *env, int regno)
{
- return __mark_chain_precision(env, regno, -1);
+ return __mark_chain_precision(env, env->cur_state->curframe, regno, -1);
+}
+
+static int mark_chain_precision_frame(struct bpf_verifier_env *env, int frame, int regno)
+{
+ return __mark_chain_precision(env, frame, regno, -1);
}
-static int mark_chain_precision_stack(struct bpf_verifier_env *env, int spi)
+static int mark_chain_precision_stack_frame(struct bpf_verifier_env *env, int frame, int spi)
{
- return __mark_chain_precision(env, -1, spi);
+ return __mark_chain_precision(env, frame, -1, spi);
}
static bool is_spillable_regtype(enum bpf_reg_type type)
@@ -3000,13 +3444,24 @@ static bool __is_pointer_value(bool allow_ptr_leaks,
return reg->type != SCALAR_VALUE;
}
+/* Copy src state preserving dst->parent and dst->live fields */
+static void copy_register_state(struct bpf_reg_state *dst, const struct bpf_reg_state *src)
+{
+ struct bpf_reg_state *parent = dst->parent;
+ enum bpf_reg_liveness live = dst->live;
+
+ *dst = *src;
+ dst->parent = parent;
+ dst->live = live;
+}
+
static void save_register_state(struct bpf_func_state *state,
int spi, struct bpf_reg_state *reg,
int size)
{
int i;
- state->stack[spi].spilled_ptr = *reg;
+ copy_register_state(&state->stack[spi].spilled_ptr, reg);
if (size == BPF_REG_SIZE)
state->stack[spi].spilled_ptr.live |= REG_LIVE_WRITTEN;
@@ -3018,6 +3473,11 @@ static void save_register_state(struct bpf_func_state *state,
scrub_spilled_slot(&state->stack[spi].slot_type[i - 1]);
}
+static bool is_bpf_st_mem(struct bpf_insn *insn)
+{
+ return BPF_CLASS(insn->code) == BPF_ST && BPF_MODE(insn->code) == BPF_MEM;
+}
+
/* check_stack_{read,write}_fixed_off functions track spill/fill of registers,
* stack boundary and alignment are checked in check_mem_access()
*/
@@ -3029,8 +3489,9 @@ static int check_stack_write_fixed_off(struct bpf_verifier_env *env,
{
struct bpf_func_state *cur; /* state of the current function */
int i, slot = -off - 1, spi = slot / BPF_REG_SIZE, err;
- u32 dst_reg = env->prog->insnsi[insn_idx].dst_reg;
+ struct bpf_insn *insn = &env->prog->insnsi[insn_idx];
struct bpf_reg_state *reg = NULL;
+ u32 dst_reg = insn->dst_reg;
err = grow_stack_state(state, round_up(slot + 1, BPF_REG_SIZE));
if (err)
@@ -3052,7 +3513,9 @@ static int check_stack_write_fixed_off(struct bpf_verifier_env *env,
bool sanitize = reg && is_spillable_regtype(reg->type);
for (i = 0; i < size; i++) {
- if (state->stack[spi].slot_type[i] == STACK_INVALID) {
+ u8 type = state->stack[spi].slot_type[i];
+
+ if (type != STACK_MISC && type != STACK_ZERO) {
sanitize = true;
break;
}
@@ -3062,6 +3525,10 @@ static int check_stack_write_fixed_off(struct bpf_verifier_env *env,
env->insn_aux_data[insn_idx].sanitize_stack_spill = true;
}
+ err = destroy_if_dynptr_stack_slot(env, state, spi);
+ if (err)
+ return err;
+
mark_stack_slot_scratched(env, spi);
if (reg && !(off % BPF_REG_SIZE) && register_is_bounded(reg) &&
!register_is_null(reg) && env->bpf_capable) {
@@ -3077,6 +3544,13 @@ static int check_stack_write_fixed_off(struct bpf_verifier_env *env,
return err;
}
save_register_state(state, spi, reg, size);
+ } else if (!reg && !(off % BPF_REG_SIZE) && is_bpf_st_mem(insn) &&
+ insn->imm != 0 && env->bpf_capable) {
+ struct bpf_reg_state fake_reg = {};
+
+ __mark_reg_known(&fake_reg, (u32)insn->imm);
+ fake_reg.type = SCALAR_VALUE;
+ save_register_state(state, spi, &fake_reg, size);
} else if (reg && is_spillable_regtype(reg->type)) {
/* register containing pointer is being spilled into stack */
if (size != BPF_REG_SIZE) {
@@ -3111,7 +3585,8 @@ static int check_stack_write_fixed_off(struct bpf_verifier_env *env,
state->stack[spi].spilled_ptr.live |= REG_LIVE_WRITTEN;
/* when we zero initialize stack slots mark them as such */
- if (reg && register_is_null(reg)) {
+ if ((reg && register_is_null(reg)) ||
+ (!reg && is_bpf_st_mem(insn) && insn->imm == 0)) {
/* backtracking doesn't work for STACK_ZERO yet. */
err = mark_chain_precision(env, value_regno);
if (err)
@@ -3156,6 +3631,7 @@ static int check_stack_write_var_off(struct bpf_verifier_env *env,
int min_off, max_off;
int i, err;
struct bpf_reg_state *ptr_reg = NULL, *value_reg = NULL;
+ struct bpf_insn *insn = &env->prog->insnsi[insn_idx];
bool writing_zero = false;
/* set if the fact that we're writing a zero is used to let any
* stack slots remain STACK_ZERO
@@ -3168,13 +3644,22 @@ static int check_stack_write_var_off(struct bpf_verifier_env *env,
max_off = ptr_reg->smax_value + off + size;
if (value_regno >= 0)
value_reg = &cur->regs[value_regno];
- if (value_reg && register_is_null(value_reg))
+ if ((value_reg && register_is_null(value_reg)) ||
+ (!value_reg && is_bpf_st_mem(insn) && insn->imm == 0))
writing_zero = true;
err = grow_stack_state(state, round_up(-min_off, BPF_REG_SIZE));
if (err)
return err;
+ for (i = min_off; i < max_off; i++) {
+ int spi;
+
+ spi = __get_spi(i);
+ err = destroy_if_dynptr_stack_slot(env, state, spi);
+ if (err)
+ return err;
+ }
/* Variable offset writes destroy any spilled pointers in range. */
for (i = min_off; i < max_off; i++) {
@@ -3186,14 +3671,17 @@ static int check_stack_write_var_off(struct bpf_verifier_env *env,
stype = &state->stack[spi].slot_type[slot % BPF_REG_SIZE];
mark_stack_slot_scratched(env, spi);
- if (!env->allow_ptr_leaks
- && *stype != NOT_INIT
- && *stype != SCALAR_VALUE) {
- /* Reject the write if there's are spilled pointers in
- * range. If we didn't reject here, the ptr status
- * would be erased below (even though not all slots are
- * actually overwritten), possibly opening the door to
- * leaks.
+ if (!env->allow_ptr_leaks && *stype != STACK_MISC && *stype != STACK_ZERO) {
+ /* Reject the write if range we may write to has not
+ * been initialized beforehand. If we didn't reject
+ * here, the ptr status would be erased below (even
+ * though not all slots are actually overwritten),
+ * possibly opening the door to leaks.
+ *
+ * We do however catch STACK_INVALID case below, and
+ * only allow reading possibly uninitialized memory
+ * later for CAP_PERFMON, as the write may not happen to
+ * that slot.
*/
verbose(env, "spilled ptr in range of var-offset stack write; insn %d, ptr off: %d",
insn_idx, i);
@@ -3329,7 +3817,7 @@ static int check_stack_read_fixed_off(struct bpf_verifier_env *env,
*/
s32 subreg_def = state->regs[dst_regno].subreg_def;
- state->regs[dst_regno] = *reg;
+ copy_register_state(&state->regs[dst_regno], reg);
state->regs[dst_regno].subreg_def = subreg_def;
} else {
for (i = 0; i < size; i++) {
@@ -3350,7 +3838,7 @@ static int check_stack_read_fixed_off(struct bpf_verifier_env *env,
if (dst_regno >= 0) {
/* restore register state from stack */
- state->regs[dst_regno] = *reg;
+ copy_register_state(&state->regs[dst_regno], reg);
/* mark reg as written since spilled pointer state likely
* has its liveness marks cleared by is_state_visited()
* which resets stack/reg liveness for state transitions
@@ -3683,15 +4171,15 @@ int check_ptr_off_reg(struct bpf_verifier_env *env,
}
static int map_kptr_match_type(struct bpf_verifier_env *env,
- struct bpf_map_value_off_desc *off_desc,
+ struct btf_field *kptr_field,
struct bpf_reg_state *reg, u32 regno)
{
- const char *targ_name = kernel_type_name(off_desc->kptr.btf, off_desc->kptr.btf_id);
- int perm_flags = PTR_MAYBE_NULL;
+ const char *targ_name = kernel_type_name(kptr_field->kptr.btf, kptr_field->kptr.btf_id);
+ int perm_flags = PTR_MAYBE_NULL | PTR_TRUSTED;
const char *reg_name = "";
/* Only unreferenced case accepts untrusted pointers */
- if (off_desc->type == BPF_KPTR_UNREF)
+ if (kptr_field->type == BPF_KPTR_UNREF)
perm_flags |= PTR_UNTRUSTED;
if (base_type(reg->type) != PTR_TO_BTF_ID || (type_flag(reg->type) & ~perm_flags))
@@ -3738,15 +4226,15 @@ static int map_kptr_match_type(struct bpf_verifier_env *env,
* strict mode to true for type match.
*/
if (!btf_struct_ids_match(&env->log, reg->btf, reg->btf_id, reg->off,
- off_desc->kptr.btf, off_desc->kptr.btf_id,
- off_desc->type == BPF_KPTR_REF))
+ kptr_field->kptr.btf, kptr_field->kptr.btf_id,
+ kptr_field->type == BPF_KPTR_REF))
goto bad_type;
return 0;
bad_type:
verbose(env, "invalid kptr access, R%d type=%s%s ", regno,
reg_type_str(env, reg->type), reg_name);
verbose(env, "expected=%s%s", reg_type_str(env, PTR_TO_BTF_ID), targ_name);
- if (off_desc->type == BPF_KPTR_UNREF)
+ if (kptr_field->type == BPF_KPTR_UNREF)
verbose(env, " or %s%s\n", reg_type_str(env, PTR_TO_BTF_ID | PTR_UNTRUSTED),
targ_name);
else
@@ -3756,7 +4244,7 @@ bad_type:
static int check_map_kptr_access(struct bpf_verifier_env *env, u32 regno,
int value_regno, int insn_idx,
- struct bpf_map_value_off_desc *off_desc)
+ struct btf_field *kptr_field)
{
struct bpf_insn *insn = &env->prog->insnsi[insn_idx];
int class = BPF_CLASS(insn->code);
@@ -3766,7 +4254,7 @@ static int check_map_kptr_access(struct bpf_verifier_env *env, u32 regno,
* - Reject cases where variable offset may touch kptr
* - size of access (must be BPF_DW)
* - tnum_is_const(reg->var_off)
- * - off_desc->offset == off + reg->var_off.value
+ * - kptr_field->offset == off + reg->var_off.value
*/
/* Only BPF_[LDX,STX,ST] | BPF_MEM | BPF_DW is supported */
if (BPF_MODE(insn->code) != BPF_MEM) {
@@ -3777,7 +4265,7 @@ static int check_map_kptr_access(struct bpf_verifier_env *env, u32 regno,
/* We only allow loading referenced kptr, since it will be marked as
* untrusted, similar to unreferenced kptr.
*/
- if (class != BPF_LDX && off_desc->type == BPF_KPTR_REF) {
+ if (class != BPF_LDX && kptr_field->type == BPF_KPTR_REF) {
verbose(env, "store to referenced kptr disallowed\n");
return -EACCES;
}
@@ -3787,19 +4275,19 @@ static int check_map_kptr_access(struct bpf_verifier_env *env, u32 regno,
/* We can simply mark the value_regno receiving the pointer
* value from map as PTR_TO_BTF_ID, with the correct type.
*/
- mark_btf_ld_reg(env, cur_regs(env), value_regno, PTR_TO_BTF_ID, off_desc->kptr.btf,
- off_desc->kptr.btf_id, PTR_MAYBE_NULL | PTR_UNTRUSTED);
+ mark_btf_ld_reg(env, cur_regs(env), value_regno, PTR_TO_BTF_ID, kptr_field->kptr.btf,
+ kptr_field->kptr.btf_id, PTR_MAYBE_NULL | PTR_UNTRUSTED);
/* For mark_ptr_or_null_reg */
val_reg->id = ++env->id_gen;
} else if (class == BPF_STX) {
val_reg = reg_state(env, value_regno);
if (!register_is_null(val_reg) &&
- map_kptr_match_type(env, off_desc, val_reg, value_regno))
+ map_kptr_match_type(env, kptr_field, val_reg, value_regno))
return -EACCES;
} else if (class == BPF_ST) {
if (insn->imm) {
verbose(env, "BPF_ST imm must be 0 when storing to kptr at off=%u\n",
- off_desc->offset);
+ kptr_field->offset);
return -EACCES;
}
} else {
@@ -3818,45 +4306,30 @@ static int check_map_access(struct bpf_verifier_env *env, u32 regno,
struct bpf_func_state *state = vstate->frame[vstate->curframe];
struct bpf_reg_state *reg = &state->regs[regno];
struct bpf_map *map = reg->map_ptr;
- int err;
+ struct btf_record *rec;
+ int err, i;
err = check_mem_region_access(env, regno, off, size, map->value_size,
zero_size_allowed);
if (err)
return err;
- if (map_value_has_spin_lock(map)) {
- u32 lock = map->spin_lock_off;
+ if (IS_ERR_OR_NULL(map->record))
+ return 0;
+ rec = map->record;
+ for (i = 0; i < rec->cnt; i++) {
+ struct btf_field *field = &rec->fields[i];
+ u32 p = field->offset;
- /* if any part of struct bpf_spin_lock can be touched by
- * load/store reject this program.
- * To check that [x1, x2) overlaps with [y1, y2)
+ /* If any part of a field can be touched by load/store, reject
+ * this program. To check that [x1, x2) overlaps with [y1, y2),
* it is sufficient to check x1 < y2 && y1 < x2.
*/
- if (reg->smin_value + off < lock + sizeof(struct bpf_spin_lock) &&
- lock < reg->umax_value + off + size) {
- verbose(env, "bpf_spin_lock cannot be accessed directly by load/store\n");
- return -EACCES;
- }
- }
- if (map_value_has_timer(map)) {
- u32 t = map->timer_off;
-
- if (reg->smin_value + off < t + sizeof(struct bpf_timer) &&
- t < reg->umax_value + off + size) {
- verbose(env, "bpf_timer cannot be accessed directly by load/store\n");
- return -EACCES;
- }
- }
- if (map_value_has_kptrs(map)) {
- struct bpf_map_value_off *tab = map->kptr_off_tab;
- int i;
-
- for (i = 0; i < tab->nr_off; i++) {
- u32 p = tab->off[i].offset;
-
- if (reg->smin_value + off < p + sizeof(u64) &&
- p < reg->umax_value + off + size) {
+ if (reg->smin_value + off < p + btf_field_type_size(field->type) &&
+ p < reg->umax_value + off + size) {
+ switch (field->type) {
+ case BPF_KPTR_UNREF:
+ case BPF_KPTR_REF:
if (src != ACCESS_DIRECT) {
verbose(env, "kptr cannot be accessed indirectly by helper\n");
return -EACCES;
@@ -3875,10 +4348,14 @@ static int check_map_access(struct bpf_verifier_env *env, u32 regno,
return -EACCES;
}
break;
+ default:
+ verbose(env, "%s cannot be accessed directly by load/store\n",
+ btf_field_type_name(field->type));
+ return -EACCES;
}
}
}
- return err;
+ return 0;
}
#define MAX_PACKET_OFF 0xffff
@@ -4095,6 +4572,30 @@ static bool is_flow_key_reg(struct bpf_verifier_env *env, int regno)
return reg->type == PTR_TO_FLOW_KEYS;
}
+static bool is_trusted_reg(const struct bpf_reg_state *reg)
+{
+ /* A referenced register is always trusted. */
+ if (reg->ref_obj_id)
+ return true;
+
+ /* If a register is not referenced, it is trusted if it has the
+ * MEM_ALLOC or PTR_TRUSTED type modifiers, and no others. Some of the
+ * other type modifiers may be safe, but we elect to take an opt-in
+ * approach here as some (e.g. PTR_UNTRUSTED and PTR_MAYBE_NULL) are
+ * not.
+ *
+ * Eventually, we should make PTR_TRUSTED the single source of truth
+ * for whether a register is trusted.
+ */
+ return type_flag(reg->type) & BPF_REG_TRUSTED_MODIFIERS &&
+ !bpf_type_has_unsafe_modifiers(reg->type);
+}
+
+static bool is_rcu_reg(const struct bpf_reg_state *reg)
+{
+ return reg->type & MEM_RCU;
+}
+
static int check_pkt_ptr_alignment(struct bpf_verifier_env *env,
const struct bpf_reg_state *reg,
int off, int size, bool strict)
@@ -4498,6 +4999,25 @@ static int bpf_map_direct_read(struct bpf_map *map, int off, int size, u64 *val)
return 0;
}
+#define BTF_TYPE_SAFE_NESTED(__type) __PASTE(__type, __safe_fields)
+
+BTF_TYPE_SAFE_NESTED(struct task_struct) {
+ const cpumask_t *cpus_ptr;
+};
+
+static bool nested_ptr_is_trusted(struct bpf_verifier_env *env,
+ struct bpf_reg_state *reg,
+ int off)
+{
+ /* If its parent is not trusted, it can't regain its trusted status. */
+ if (!is_trusted_reg(reg))
+ return false;
+
+ BTF_TYPE_EMIT(BTF_TYPE_SAFE_NESTED(struct task_struct));
+
+ return btf_nested_type_is_trusted(&env->log, reg, off);
+}
+
static int check_ptr_to_btf_access(struct bpf_verifier_env *env,
struct bpf_reg_state *regs,
int regno, int off, int size,
@@ -4511,6 +5031,18 @@ static int check_ptr_to_btf_access(struct bpf_verifier_env *env,
u32 btf_id;
int ret;
+ if (!env->allow_ptr_leaks) {
+ verbose(env,
+ "'struct %s' access is allowed only to CAP_PERFMON and CAP_SYS_ADMIN\n",
+ tname);
+ return -EPERM;
+ }
+ if (!env->prog->gpl_compatible && btf_is_kernel(reg->btf)) {
+ verbose(env,
+ "Cannot access kernel 'struct %s' from non-GPL compatible program\n",
+ tname);
+ return -EINVAL;
+ }
if (off < 0) {
verbose(env,
"R%d is ptr_%s invalid negative access: off=%d\n",
@@ -4541,17 +5073,29 @@ static int check_ptr_to_btf_access(struct bpf_verifier_env *env,
return -EACCES;
}
- if (env->ops->btf_struct_access) {
- ret = env->ops->btf_struct_access(&env->log, reg->btf, t,
- off, size, atype, &btf_id, &flag);
+ if (env->ops->btf_struct_access && !type_is_alloc(reg->type)) {
+ if (!btf_is_kernel(reg->btf)) {
+ verbose(env, "verifier internal error: reg->btf must be kernel btf\n");
+ return -EFAULT;
+ }
+ ret = env->ops->btf_struct_access(&env->log, reg, off, size, atype, &btf_id, &flag);
} else {
- if (atype != BPF_READ) {
+ /* Writes are permitted with default btf_struct_access for
+ * program allocated objects (which always have ref_obj_id > 0),
+ * but not for untrusted PTR_TO_BTF_ID | MEM_ALLOC.
+ */
+ if (atype != BPF_READ && reg->type != (PTR_TO_BTF_ID | MEM_ALLOC)) {
verbose(env, "only read is supported\n");
return -EACCES;
}
- ret = btf_struct_access(&env->log, reg->btf, t, off, size,
- atype, &btf_id, &flag);
+ if (type_is_alloc(reg->type) && !type_is_non_owning_ref(reg->type) &&
+ !reg->ref_obj_id) {
+ verbose(env, "verifier internal error: ref_obj_id for allocated object must be non-zero\n");
+ return -EFAULT;
+ }
+
+ ret = btf_struct_access(&env->log, reg, off, size, atype, &btf_id, &flag);
}
if (ret < 0)
@@ -4563,6 +5107,37 @@ static int check_ptr_to_btf_access(struct bpf_verifier_env *env,
if (type_flag(reg->type) & PTR_UNTRUSTED)
flag |= PTR_UNTRUSTED;
+ /* By default any pointer obtained from walking a trusted pointer is no
+ * longer trusted, unless the field being accessed has explicitly been
+ * marked as inheriting its parent's state of trust.
+ *
+ * An RCU-protected pointer can also be deemed trusted if we are in an
+ * RCU read region. This case is handled below.
+ */
+ if (nested_ptr_is_trusted(env, reg, off))
+ flag |= PTR_TRUSTED;
+ else
+ flag &= ~PTR_TRUSTED;
+
+ if (flag & MEM_RCU) {
+ /* Mark value register as MEM_RCU only if it is protected by
+ * bpf_rcu_read_lock() and the ptr reg is rcu or trusted. MEM_RCU
+ * itself can already indicate trustedness inside the rcu
+ * read lock region. Also mark rcu pointer as PTR_MAYBE_NULL since
+ * it could be null in some cases.
+ */
+ if (!env->cur_state->active_rcu_lock ||
+ !(is_trusted_reg(reg) || is_rcu_reg(reg)))
+ flag &= ~MEM_RCU;
+ else
+ flag |= PTR_MAYBE_NULL;
+ } else if (reg->type & MEM_RCU) {
+ /* ptr (reg) is marked as MEM_RCU, but the struct field is not tagged
+ * with __rcu. Mark the flag as PTR_UNTRUSTED conservatively.
+ */
+ flag |= PTR_UNTRUSTED;
+ }
+
if (atype == BPF_READ && value_regno >= 0)
mark_btf_ld_reg(env, regs, value_regno, ret, reg->btf, btf_id, flag);
@@ -4577,6 +5152,7 @@ static int check_ptr_to_map_access(struct bpf_verifier_env *env,
{
struct bpf_reg_state *reg = regs + regno;
struct bpf_map *map = reg->map_ptr;
+ struct bpf_reg_state map_reg;
enum bpf_type_flag flag = 0;
const struct btf_type *t;
const char *tname;
@@ -4597,9 +5173,9 @@ static int check_ptr_to_map_access(struct bpf_verifier_env *env,
t = btf_type_by_id(btf_vmlinux, *map->ops->map_btf_id);
tname = btf_name_by_offset(btf_vmlinux, t->name_off);
- if (!env->allow_ptr_to_map_access) {
+ if (!env->allow_ptr_leaks) {
verbose(env,
- "%s access is allowed only to CAP_PERFMON and CAP_SYS_ADMIN\n",
+ "'struct %s' access is allowed only to CAP_PERFMON and CAP_SYS_ADMIN\n",
tname);
return -EPERM;
}
@@ -4615,7 +5191,10 @@ static int check_ptr_to_map_access(struct bpf_verifier_env *env,
return -EACCES;
}
- ret = btf_struct_access(&env->log, btf_vmlinux, t, off, size, atype, &btf_id, &flag);
+ /* Simulate access to a PTR_TO_BTF_ID */
+ memset(&map_reg, 0, sizeof(map_reg));
+ mark_btf_ld_reg(env, &map_reg, 0, PTR_TO_BTF_ID, btf_vmlinux, *map->ops->map_btf_id, 0);
+ ret = btf_struct_access(&env->log, &map_reg, off, size, atype, &btf_id, &flag);
if (ret < 0)
return ret;
@@ -4751,7 +5330,7 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
if (value_regno >= 0)
mark_reg_unknown(env, regs, value_regno);
} else if (reg->type == PTR_TO_MAP_VALUE) {
- struct bpf_map_value_off_desc *kptr_off_desc = NULL;
+ struct btf_field *kptr_field = NULL;
if (t == BPF_WRITE && value_regno >= 0 &&
is_pointer_value(env, value_regno)) {
@@ -4765,11 +5344,10 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
if (err)
return err;
if (tnum_is_const(reg->var_off))
- kptr_off_desc = bpf_map_kptr_off_contains(reg->map_ptr,
- off + reg->var_off.value);
- if (kptr_off_desc) {
- err = check_map_kptr_access(env, regno, value_regno, insn_idx,
- kptr_off_desc);
+ kptr_field = btf_record_find(reg->map_ptr->record,
+ off + reg->var_off.value, BPF_KPTR);
+ if (kptr_field) {
+ err = check_map_kptr_access(env, regno, value_regno, insn_idx, kptr_field);
} else if (t == BPF_READ && value_regno >= 0) {
struct bpf_map *map = reg->map_ptr;
@@ -5136,6 +5714,31 @@ static int check_stack_range_initialized(
}
if (meta && meta->raw_mode) {
+ /* Ensure we won't be overwriting dynptrs when simulating byte
+ * by byte access in check_helper_call using meta.access_size.
+ * This would be a problem if we have a helper in the future
+ * which takes:
+ *
+ * helper(uninit_mem, len, dynptr)
+ *
+ * Now, uninit_mem may overlap with dynptr pointer. Hence, it
+ * may end up writing to dynptr itself when touching memory from
+ * arg 1. This can be relaxed on a case by case basis for known
+ * safe cases, but reject due to the possibility of aliasing by
+ * default.
+ */
+ for (i = min_off; i < max_off + access_size; i++) {
+ int stack_off = -i - 1;
+
+ spi = __get_spi(i);
+ /* raw_mode may write past allocated_stack */
+ if (state->allocated_stack <= stack_off)
+ continue;
+ if (state->stack[spi].slot_type[stack_off % BPF_REG_SIZE] == STACK_DYNPTR) {
+ verbose(env, "potential write to dynptr at off=%d disallowed\n", i);
+ return -EACCES;
+ }
+ }
meta->access_size = access_size;
meta->regno = regno;
return 0;
@@ -5160,10 +5763,6 @@ static int check_stack_range_initialized(
}
if (is_spilled_reg(&state->stack[spi]) &&
- base_type(state->stack[spi].spilled_ptr.type) == PTR_TO_BTF_ID)
- goto mark;
-
- if (is_spilled_reg(&state->stack[spi]) &&
(state->stack[spi].spilled_ptr.type == SCALAR_VALUE ||
env->allow_ptr_leaks)) {
if (clobber) {
@@ -5193,6 +5792,11 @@ mark:
mark_reg_read(env, &state->stack[spi].spilled_ptr,
state->stack[spi].spilled_ptr.parent,
REG_LIVE_READ64);
+ /* We do not set REG_LIVE_WRITTEN for stack slot, as we can not
+ * be sure whether stack slot is written to or not. Hence,
+ * we must still conservatively propagate reads upwards even if
+ * helper may write to the entire memory range.
+ */
}
return update_stack_depth(env, state, min_off);
}
@@ -5374,8 +5978,8 @@ int check_mem_reg(struct bpf_verifier_env *env, struct bpf_reg_state *reg,
return err;
}
-int check_kfunc_mem_size_reg(struct bpf_verifier_env *env, struct bpf_reg_state *reg,
- u32 regno)
+static int check_kfunc_mem_size_reg(struct bpf_verifier_env *env, struct bpf_reg_state *reg,
+ u32 regno)
{
struct bpf_reg_state *mem_reg = &cur_regs(env)[regno - 1];
bool may_be_null = type_may_be_null(mem_reg->type);
@@ -5403,23 +6007,26 @@ int check_kfunc_mem_size_reg(struct bpf_verifier_env *env, struct bpf_reg_state
}
/* Implementation details:
- * bpf_map_lookup returns PTR_TO_MAP_VALUE_OR_NULL
+ * bpf_map_lookup returns PTR_TO_MAP_VALUE_OR_NULL.
+ * bpf_obj_new returns PTR_TO_BTF_ID | MEM_ALLOC | PTR_MAYBE_NULL.
* Two bpf_map_lookups (even with the same key) will have different reg->id.
- * For traditional PTR_TO_MAP_VALUE the verifier clears reg->id after
- * value_or_null->value transition, since the verifier only cares about
- * the range of access to valid map value pointer and doesn't care about actual
- * address of the map element.
+ * Two separate bpf_obj_new will also have different reg->id.
+ * For traditional PTR_TO_MAP_VALUE or PTR_TO_BTF_ID | MEM_ALLOC, the verifier
+ * clears reg->id after value_or_null->value transition, since the verifier only
+ * cares about the range of access to valid map value pointer and doesn't care
+ * about actual address of the map element.
* For maps with 'struct bpf_spin_lock' inside map value the verifier keeps
* reg->id > 0 after value_or_null->value transition. By doing so
* two bpf_map_lookups will be considered two different pointers that
- * point to different bpf_spin_locks.
+ * point to different bpf_spin_locks. Likewise for pointers to allocated objects
+ * returned from bpf_obj_new.
* The verifier allows taking only one bpf_spin_lock at a time to avoid
* dead-locks.
* Since only one bpf_spin_lock is allowed the checks are simpler than
* reg_is_refcounted() logic. The verifier needs to remember only
* one spin_lock instead of array of acquired_refs.
- * cur_state->active_spin_lock remembers which map value element got locked
- * and clears it after bpf_spin_unlock.
+ * cur_state->active_lock remembers which map value element or allocated
+ * object got locked and clears it after bpf_spin_unlock.
*/
static int process_spin_lock(struct bpf_verifier_env *env, int regno,
bool is_lock)
@@ -5427,8 +6034,10 @@ static int process_spin_lock(struct bpf_verifier_env *env, int regno,
struct bpf_reg_state *regs = cur_regs(env), *reg = &regs[regno];
struct bpf_verifier_state *cur = env->cur_state;
bool is_const = tnum_is_const(reg->var_off);
- struct bpf_map *map = reg->map_ptr;
u64 val = reg->var_off.value;
+ struct bpf_map *map = NULL;
+ struct btf *btf = NULL;
+ struct btf_record *rec;
if (!is_const) {
verbose(env,
@@ -5436,49 +6045,62 @@ static int process_spin_lock(struct bpf_verifier_env *env, int regno,
regno);
return -EINVAL;
}
- if (!map->btf) {
- verbose(env,
- "map '%s' has to have BTF in order to use bpf_spin_lock\n",
- map->name);
- return -EINVAL;
- }
- if (!map_value_has_spin_lock(map)) {
- if (map->spin_lock_off == -E2BIG)
- verbose(env,
- "map '%s' has more than one 'struct bpf_spin_lock'\n",
- map->name);
- else if (map->spin_lock_off == -ENOENT)
+ if (reg->type == PTR_TO_MAP_VALUE) {
+ map = reg->map_ptr;
+ if (!map->btf) {
verbose(env,
- "map '%s' doesn't have 'struct bpf_spin_lock'\n",
- map->name);
- else
- verbose(env,
- "map '%s' is not a struct type or bpf_spin_lock is mangled\n",
+ "map '%s' has to have BTF in order to use bpf_spin_lock\n",
map->name);
+ return -EINVAL;
+ }
+ } else {
+ btf = reg->btf;
+ }
+
+ rec = reg_btf_record(reg);
+ if (!btf_record_has_field(rec, BPF_SPIN_LOCK)) {
+ verbose(env, "%s '%s' has no valid bpf_spin_lock\n", map ? "map" : "local",
+ map ? map->name : "kptr");
return -EINVAL;
}
- if (map->spin_lock_off != val + reg->off) {
- verbose(env, "off %lld doesn't point to 'struct bpf_spin_lock'\n",
- val + reg->off);
+ if (rec->spin_lock_off != val + reg->off) {
+ verbose(env, "off %lld doesn't point to 'struct bpf_spin_lock' that is at %d\n",
+ val + reg->off, rec->spin_lock_off);
return -EINVAL;
}
if (is_lock) {
- if (cur->active_spin_lock) {
+ if (cur->active_lock.ptr) {
verbose(env,
"Locking two bpf_spin_locks are not allowed\n");
return -EINVAL;
}
- cur->active_spin_lock = reg->id;
+ if (map)
+ cur->active_lock.ptr = map;
+ else
+ cur->active_lock.ptr = btf;
+ cur->active_lock.id = reg->id;
} else {
- if (!cur->active_spin_lock) {
+ void *ptr;
+
+ if (map)
+ ptr = map;
+ else
+ ptr = btf;
+
+ if (!cur->active_lock.ptr) {
verbose(env, "bpf_spin_unlock without taking a lock\n");
return -EINVAL;
}
- if (cur->active_spin_lock != reg->id) {
+ if (cur->active_lock.ptr != ptr ||
+ cur->active_lock.id != reg->id) {
verbose(env, "bpf_spin_unlock of different lock\n");
return -EINVAL;
}
- cur->active_spin_lock = 0;
+
+ invalidate_non_owning_refs(env);
+
+ cur->active_lock.ptr = NULL;
+ cur->active_lock.id = 0;
}
return 0;
}
@@ -5502,24 +6124,13 @@ static int process_timer_func(struct bpf_verifier_env *env, int regno,
map->name);
return -EINVAL;
}
- if (!map_value_has_timer(map)) {
- if (map->timer_off == -E2BIG)
- verbose(env,
- "map '%s' has more than one 'struct bpf_timer'\n",
- map->name);
- else if (map->timer_off == -ENOENT)
- verbose(env,
- "map '%s' doesn't have 'struct bpf_timer'\n",
- map->name);
- else
- verbose(env,
- "map '%s' is not a struct type or bpf_timer is mangled\n",
- map->name);
+ if (!btf_record_has_field(map->record, BPF_TIMER)) {
+ verbose(env, "map '%s' has no valid bpf_timer\n", map->name);
return -EINVAL;
}
- if (map->timer_off != val + reg->off) {
+ if (map->record->timer_off != val + reg->off) {
verbose(env, "off %lld doesn't point to 'struct bpf_timer' that is at %d\n",
- val + reg->off, map->timer_off);
+ val + reg->off, map->record->timer_off);
return -EINVAL;
}
if (meta->map_ptr) {
@@ -5535,10 +6146,9 @@ static int process_kptr_func(struct bpf_verifier_env *env, int regno,
struct bpf_call_arg_meta *meta)
{
struct bpf_reg_state *regs = cur_regs(env), *reg = &regs[regno];
- struct bpf_map_value_off_desc *off_desc;
struct bpf_map *map_ptr = reg->map_ptr;
+ struct btf_field *kptr_field;
u32 kptr_off;
- int ret;
if (!tnum_is_const(reg->var_off)) {
verbose(env,
@@ -5551,30 +6161,145 @@ static int process_kptr_func(struct bpf_verifier_env *env, int regno,
map_ptr->name);
return -EINVAL;
}
- if (!map_value_has_kptrs(map_ptr)) {
- ret = PTR_ERR_OR_ZERO(map_ptr->kptr_off_tab);
- if (ret == -E2BIG)
- verbose(env, "map '%s' has more than %d kptr\n", map_ptr->name,
- BPF_MAP_VALUE_OFF_MAX);
- else if (ret == -EEXIST)
- verbose(env, "map '%s' has repeating kptr BTF tags\n", map_ptr->name);
- else
- verbose(env, "map '%s' has no valid kptr\n", map_ptr->name);
+ if (!btf_record_has_field(map_ptr->record, BPF_KPTR)) {
+ verbose(env, "map '%s' has no valid kptr\n", map_ptr->name);
return -EINVAL;
}
meta->map_ptr = map_ptr;
kptr_off = reg->off + reg->var_off.value;
- off_desc = bpf_map_kptr_off_contains(map_ptr, kptr_off);
- if (!off_desc) {
+ kptr_field = btf_record_find(map_ptr->record, kptr_off, BPF_KPTR);
+ if (!kptr_field) {
verbose(env, "off=%d doesn't point to kptr\n", kptr_off);
return -EACCES;
}
- if (off_desc->type != BPF_KPTR_REF) {
+ if (kptr_field->type != BPF_KPTR_REF) {
verbose(env, "off=%d kptr isn't referenced kptr\n", kptr_off);
return -EACCES;
}
- meta->kptr_off_desc = off_desc;
+ meta->kptr_field = kptr_field;
+ return 0;
+}
+
+/* There are two register types representing a bpf_dynptr, one is PTR_TO_STACK
+ * which points to a stack slot, and the other is CONST_PTR_TO_DYNPTR.
+ *
+ * In both cases we deal with the first 8 bytes, but need to mark the next 8
+ * bytes as STACK_DYNPTR in case of PTR_TO_STACK. In case of
+ * CONST_PTR_TO_DYNPTR, we are guaranteed to get the beginning of the object.
+ *
+ * Mutability of bpf_dynptr is at two levels, one is at the level of struct
+ * bpf_dynptr itself, i.e. whether the helper is receiving a pointer to struct
+ * bpf_dynptr or pointer to const struct bpf_dynptr. In the former case, it can
+ * mutate the view of the dynptr and also possibly destroy it. In the latter
+ * case, it cannot mutate the bpf_dynptr itself but it can still mutate the
+ * memory that dynptr points to.
+ *
+ * The verifier will keep track of both levels of mutation (bpf_dynptr's in
+ * reg->type and the memory's in reg->dynptr.type), but there is no support for
+ * readonly dynptr view yet, hence only the first case is tracked and checked.
+ *
+ * This is consistent with how C applies the const modifier to a struct object,
+ * where the pointer itself inside bpf_dynptr becomes const but not what it
+ * points to.
+ *
+ * Helpers which do not mutate the bpf_dynptr set MEM_RDONLY in their argument
+ * type, and declare it as 'const struct bpf_dynptr *' in their prototype.
+ */
+int process_dynptr_func(struct bpf_verifier_env *env, int regno,
+ enum bpf_arg_type arg_type, struct bpf_call_arg_meta *meta)
+{
+ struct bpf_reg_state *regs = cur_regs(env), *reg = &regs[regno];
+ int spi = 0;
+
+ /* MEM_UNINIT and MEM_RDONLY are exclusive, when applied to an
+ * ARG_PTR_TO_DYNPTR (or ARG_PTR_TO_DYNPTR | DYNPTR_TYPE_*):
+ */
+ if ((arg_type & (MEM_UNINIT | MEM_RDONLY)) == (MEM_UNINIT | MEM_RDONLY)) {
+ verbose(env, "verifier internal error: misconfigured dynptr helper type flags\n");
+ return -EFAULT;
+ }
+ /* CONST_PTR_TO_DYNPTR already has fixed and var_off as 0 due to
+ * check_func_arg_reg_off's logic. We only need to check offset
+ * and its alignment for PTR_TO_STACK.
+ */
+ if (reg->type == PTR_TO_STACK) {
+ spi = dynptr_get_spi(env, reg);
+ if (spi < 0 && spi != -ERANGE)
+ return spi;
+ }
+
+ /* MEM_UNINIT - Points to memory that is an appropriate candidate for
+ * constructing a mutable bpf_dynptr object.
+ *
+ * Currently, this is only possible with PTR_TO_STACK
+ * pointing to a region of at least 16 bytes which doesn't
+ * contain an existing bpf_dynptr.
+ *
+ * MEM_RDONLY - Points to an initialized bpf_dynptr that will not be
+ * mutated or destroyed. However, the memory it points to
+ * may be mutated.
+ *
+ * None - Points to an initialized dynptr that can be mutated and
+ * destroyed, including mutation of the memory it points
+ * to.
+ */
+ if (arg_type & MEM_UNINIT) {
+ if (!is_dynptr_reg_valid_uninit(env, reg, spi)) {
+ verbose(env, "Dynptr has to be an uninitialized dynptr\n");
+ return -EINVAL;
+ }
+
+ /* We only support one dynptr being uninitialized at the moment,
+ * which is sufficient for the helper functions we have right now.
+ */
+ if (meta->uninit_dynptr_regno) {
+ verbose(env, "verifier internal error: multiple uninitialized dynptr args\n");
+ return -EFAULT;
+ }
+
+ meta->uninit_dynptr_regno = regno;
+ } else /* MEM_RDONLY and None case from above */ {
+ int err;
+
+ /* For the reg->type == PTR_TO_STACK case, bpf_dynptr is never const */
+ if (reg->type == CONST_PTR_TO_DYNPTR && !(arg_type & MEM_RDONLY)) {
+ verbose(env, "cannot pass pointer to const bpf_dynptr, the helper mutates it\n");
+ return -EINVAL;
+ }
+
+ if (!is_dynptr_reg_valid_init(env, reg, spi)) {
+ verbose(env,
+ "Expected an initialized dynptr as arg #%d\n",
+ regno);
+ return -EINVAL;
+ }
+
+ /* Fold modifiers (in this case, MEM_RDONLY) when checking expected type */
+ if (!is_dynptr_type_expected(env, reg, arg_type & ~MEM_RDONLY)) {
+ const char *err_extra = "";
+
+ switch (arg_type & DYNPTR_TYPE_FLAG_MASK) {
+ case DYNPTR_TYPE_LOCAL:
+ err_extra = "local";
+ break;
+ case DYNPTR_TYPE_RINGBUF:
+ err_extra = "ringbuf";
+ break;
+ default:
+ err_extra = "<unknown>";
+ break;
+ }
+ verbose(env,
+ "Expected a dynptr of type %s as arg #%d\n",
+ err_extra, regno);
+ return -EINVAL;
+ }
+
+ err = mark_dynptr_read(env, reg);
+ if (err)
+ return err;
+ }
return 0;
}
@@ -5639,16 +6364,6 @@ struct bpf_reg_types {
u32 *btf_id;
};
-static const struct bpf_reg_types map_key_value_types = {
- .types = {
- PTR_TO_STACK,
- PTR_TO_PACKET,
- PTR_TO_PACKET_META,
- PTR_TO_MAP_KEY,
- PTR_TO_MAP_VALUE,
- },
-};
-
static const struct bpf_reg_types sock_types = {
.types = {
PTR_TO_SOCK_COMMON,
@@ -5666,6 +6381,7 @@ static const struct bpf_reg_types btf_id_sock_common_types = {
PTR_TO_TCP_SOCK,
PTR_TO_XDP_SOCK,
PTR_TO_BTF_ID,
+ PTR_TO_BTF_ID | PTR_TRUSTED,
},
.btf_id = &btf_sock_ids[BTF_SOCK_TYPE_SOCK_COMMON],
};
@@ -5679,7 +6395,7 @@ static const struct bpf_reg_types mem_types = {
PTR_TO_MAP_KEY,
PTR_TO_MAP_VALUE,
PTR_TO_MEM,
- PTR_TO_MEM | MEM_ALLOC,
+ PTR_TO_MEM | MEM_RINGBUF,
PTR_TO_BUF,
},
};
@@ -5694,14 +6410,31 @@ static const struct bpf_reg_types int_ptr_types = {
},
};
+static const struct bpf_reg_types spin_lock_types = {
+ .types = {
+ PTR_TO_MAP_VALUE,
+ PTR_TO_BTF_ID | MEM_ALLOC,
+ }
+};
+
static const struct bpf_reg_types fullsock_types = { .types = { PTR_TO_SOCKET } };
static const struct bpf_reg_types scalar_types = { .types = { SCALAR_VALUE } };
static const struct bpf_reg_types context_types = { .types = { PTR_TO_CTX } };
-static const struct bpf_reg_types alloc_mem_types = { .types = { PTR_TO_MEM | MEM_ALLOC } };
+static const struct bpf_reg_types ringbuf_mem_types = { .types = { PTR_TO_MEM | MEM_RINGBUF } };
static const struct bpf_reg_types const_map_ptr_types = { .types = { CONST_PTR_TO_MAP } };
-static const struct bpf_reg_types btf_ptr_types = { .types = { PTR_TO_BTF_ID } };
-static const struct bpf_reg_types spin_lock_types = { .types = { PTR_TO_MAP_VALUE } };
-static const struct bpf_reg_types percpu_btf_ptr_types = { .types = { PTR_TO_BTF_ID | MEM_PERCPU } };
+static const struct bpf_reg_types btf_ptr_types = {
+ .types = {
+ PTR_TO_BTF_ID,
+ PTR_TO_BTF_ID | PTR_TRUSTED,
+ PTR_TO_BTF_ID | MEM_RCU,
+ },
+};
+static const struct bpf_reg_types percpu_btf_ptr_types = {
+ .types = {
+ PTR_TO_BTF_ID | MEM_PERCPU,
+ PTR_TO_BTF_ID | MEM_PERCPU | PTR_TRUSTED,
+ }
+};
static const struct bpf_reg_types func_ptr_types = { .types = { PTR_TO_FUNC } };
static const struct bpf_reg_types stack_ptr_types = { .types = { PTR_TO_STACK } };
static const struct bpf_reg_types const_str_ptr_types = { .types = { PTR_TO_MAP_VALUE } };
@@ -5710,13 +6443,13 @@ static const struct bpf_reg_types kptr_types = { .types = { PTR_TO_MAP_VALUE } }
static const struct bpf_reg_types dynptr_types = {
.types = {
PTR_TO_STACK,
- PTR_TO_DYNPTR | DYNPTR_TYPE_LOCAL,
+ CONST_PTR_TO_DYNPTR,
}
};
static const struct bpf_reg_types *compatible_reg_types[__BPF_ARG_TYPE_MAX] = {
- [ARG_PTR_TO_MAP_KEY] = &map_key_value_types,
- [ARG_PTR_TO_MAP_VALUE] = &map_key_value_types,
+ [ARG_PTR_TO_MAP_KEY] = &mem_types,
+ [ARG_PTR_TO_MAP_VALUE] = &mem_types,
[ARG_CONST_SIZE] = &scalar_types,
[ARG_CONST_SIZE_OR_ZERO] = &scalar_types,
[ARG_CONST_ALLOC_SIZE_OR_ZERO] = &scalar_types,
@@ -5730,7 +6463,7 @@ static const struct bpf_reg_types *compatible_reg_types[__BPF_ARG_TYPE_MAX] = {
[ARG_PTR_TO_BTF_ID] = &btf_ptr_types,
[ARG_PTR_TO_SPIN_LOCK] = &spin_lock_types,
[ARG_PTR_TO_MEM] = &mem_types,
- [ARG_PTR_TO_ALLOC_MEM] = &alloc_mem_types,
+ [ARG_PTR_TO_RINGBUF_MEM] = &ringbuf_mem_types,
[ARG_PTR_TO_INT] = &int_ptr_types,
[ARG_PTR_TO_LONG] = &int_ptr_types,
[ARG_PTR_TO_PERCPU_BTF_ID] = &percpu_btf_ptr_types,
@@ -5789,7 +6522,7 @@ static int check_reg_type(struct bpf_verifier_env *env, u32 regno,
return -EACCES;
found:
- if (reg->type == PTR_TO_BTF_ID) {
+ if (reg->type == PTR_TO_BTF_ID || reg->type & PTR_TRUSTED) {
/* For bpf_sk_release, it needs to match against first member
* 'struct sock_common', hence make an exception for it. This
* allows bpf_sk_release to work for multiple socket types.
@@ -5806,7 +6539,7 @@ found:
}
if (meta->func_id == BPF_FUNC_kptr_xchg) {
- if (map_kptr_match_type(env, meta->kptr_off_desc, reg, regno))
+ if (map_kptr_match_type(env, meta->kptr_field, reg, regno))
return -EACCES;
} else {
if (arg_btf_id == BPF_PTR_POISON) {
@@ -5825,75 +6558,140 @@ found:
return -EACCES;
}
}
+ } else if (type_is_alloc(reg->type)) {
+ if (meta->func_id != BPF_FUNC_spin_lock && meta->func_id != BPF_FUNC_spin_unlock) {
+ verbose(env, "verifier internal error: unimplemented handling of MEM_ALLOC\n");
+ return -EFAULT;
+ }
}
return 0;
}
+static struct btf_field *
+reg_find_field_offset(const struct bpf_reg_state *reg, s32 off, u32 fields)
+{
+ struct btf_field *field;
+ struct btf_record *rec;
+
+ rec = reg_btf_record(reg);
+ if (!rec)
+ return NULL;
+
+ field = btf_record_find(rec, off, fields);
+ if (!field)
+ return NULL;
+
+ return field;
+}
+
int check_func_arg_reg_off(struct bpf_verifier_env *env,
const struct bpf_reg_state *reg, int regno,
enum bpf_arg_type arg_type)
{
- enum bpf_reg_type type = reg->type;
- bool fixed_off_ok = false;
+ u32 type = reg->type;
- switch ((u32)type) {
- /* Pointer types where reg offset is explicitly allowed: */
- case PTR_TO_STACK:
- if (arg_type_is_dynptr(arg_type) && reg->off % BPF_REG_SIZE) {
- verbose(env, "cannot pass in dynptr at an offset\n");
+ /* When referenced register is passed to release function, its fixed
+ * offset must be 0.
+ *
+ * We will check arg_type_is_release reg has ref_obj_id when storing
+ * meta->release_regno.
+ */
+ if (arg_type_is_release(arg_type)) {
+ /* ARG_PTR_TO_DYNPTR with OBJ_RELEASE is a bit special, as it
+ * may not directly point to the object being released, but to
+ * dynptr pointing to such object, which might be at some offset
+ * on the stack. In that case, we simply fall back to the
+ * default handling.
+ */
+ if (arg_type_is_dynptr(arg_type) && type == PTR_TO_STACK)
+ return 0;
+
+ if ((type_is_ptr_alloc_obj(type) || type_is_non_owning_ref(type)) && reg->off) {
+ if (reg_find_field_offset(reg, reg->off, BPF_GRAPH_NODE_OR_ROOT))
+ return __check_ptr_off_reg(env, reg, regno, true);
+
+ verbose(env, "R%d must have zero offset when passed to release func\n",
+ regno);
+ verbose(env, "No graph node or root found at R%d type:%s off:%d\n", regno,
+ kernel_type_name(reg->btf, reg->btf_id), reg->off);
return -EINVAL;
}
- fallthrough;
+
+ /* Doing check_ptr_off_reg check for the offset will catch this
+ * because fixed_off_ok is false, but checking here allows us
+ * to give the user a better error message.
+ */
+ if (reg->off) {
+ verbose(env, "R%d must have zero offset when passed to release func or trusted arg to kfunc\n",
+ regno);
+ return -EINVAL;
+ }
+ return __check_ptr_off_reg(env, reg, regno, false);
+ }
+
+ switch (type) {
+ /* Pointer types where both fixed and variable offset is explicitly allowed: */
+ case PTR_TO_STACK:
case PTR_TO_PACKET:
case PTR_TO_PACKET_META:
case PTR_TO_MAP_KEY:
case PTR_TO_MAP_VALUE:
case PTR_TO_MEM:
case PTR_TO_MEM | MEM_RDONLY:
- case PTR_TO_MEM | MEM_ALLOC:
+ case PTR_TO_MEM | MEM_RINGBUF:
case PTR_TO_BUF:
case PTR_TO_BUF | MEM_RDONLY:
case SCALAR_VALUE:
- /* Some of the argument types nevertheless require a
- * zero register offset.
- */
- if (base_type(arg_type) != ARG_PTR_TO_ALLOC_MEM)
- return 0;
- break;
+ return 0;
/* All the rest must be rejected, except PTR_TO_BTF_ID which allows
* fixed offset.
*/
case PTR_TO_BTF_ID:
+ case PTR_TO_BTF_ID | MEM_ALLOC:
+ case PTR_TO_BTF_ID | PTR_TRUSTED:
+ case PTR_TO_BTF_ID | MEM_RCU:
+ case PTR_TO_BTF_ID | MEM_ALLOC | PTR_TRUSTED:
+ case PTR_TO_BTF_ID | MEM_ALLOC | NON_OWN_REF:
/* When referenced PTR_TO_BTF_ID is passed to release function,
- * it's fixed offset must be 0. In the other cases, fixed offset
- * can be non-zero.
- */
- if (arg_type_is_release(arg_type) && reg->off) {
- verbose(env, "R%d must have zero offset when passed to release func\n",
- regno);
- return -EINVAL;
- }
- /* For arg is release pointer, fixed_off_ok must be false, but
- * we already checked and rejected reg->off != 0 above, so set
- * to true to allow fixed offset for all other cases.
+ * its fixed offset must be 0. In the other cases, fixed offset
+ * can be non-zero. This was already checked above. So pass
+ * fixed_off_ok as true to allow fixed offset for all other
+ * cases. var_off always must be 0 for PTR_TO_BTF_ID, hence we
+ * still need to do checks instead of returning.
*/
- fixed_off_ok = true;
- break;
+ return __check_ptr_off_reg(env, reg, regno, true);
default:
- break;
+ return __check_ptr_off_reg(env, reg, regno, false);
}
- return __check_ptr_off_reg(env, reg, regno, fixed_off_ok);
}
-static u32 stack_slot_get_id(struct bpf_verifier_env *env, struct bpf_reg_state *reg)
+static int dynptr_id(struct bpf_verifier_env *env, struct bpf_reg_state *reg)
{
struct bpf_func_state *state = func(env, reg);
- int spi = get_spi(reg->off);
+ int spi;
+ if (reg->type == CONST_PTR_TO_DYNPTR)
+ return reg->id;
+ spi = dynptr_get_spi(env, reg);
+ if (spi < 0)
+ return spi;
return state->stack[spi].spilled_ptr.id;
}
+static int dynptr_ref_obj_id(struct bpf_verifier_env *env, struct bpf_reg_state *reg)
+{
+ struct bpf_func_state *state = func(env, reg);
+ int spi;
+
+ if (reg->type == CONST_PTR_TO_DYNPTR)
+ return reg->ref_obj_id;
+ spi = dynptr_get_spi(env, reg);
+ if (spi < 0)
+ return spi;
+ return state->stack[spi].spilled_ptr.ref_obj_id;
+}
+
static int check_func_arg(struct bpf_verifier_env *env, u32 arg,
struct bpf_call_arg_meta *meta,
const struct bpf_func_proto *fn)
@@ -5940,7 +6738,8 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 arg,
goto skip_type_check;
/* arg_btf_id and arg_size are in a union. */
- if (base_type(arg_type) == ARG_PTR_TO_BTF_ID)
+ if (base_type(arg_type) == ARG_PTR_TO_BTF_ID ||
+ base_type(arg_type) == ARG_PTR_TO_SPIN_LOCK)
arg_btf_id = fn->arg_btf_id[arg];
err = check_reg_type(env, regno, arg_type, arg_btf_id, meta);
@@ -5955,11 +6754,21 @@ skip_type_check:
if (arg_type_is_release(arg_type)) {
if (arg_type_is_dynptr(arg_type)) {
struct bpf_func_state *state = func(env, reg);
- int spi = get_spi(reg->off);
+ int spi;
- if (!is_spi_bounds_valid(state, spi, BPF_DYNPTR_NR_SLOTS) ||
- !state->stack[spi].spilled_ptr.id) {
- verbose(env, "arg %d is an unacquired reference\n", regno);
+ /* Only dynptr created on stack can be released, thus
+ * the get_spi and stack state checks for spilled_ptr
+ * should only be done before process_dynptr_func for
+ * PTR_TO_STACK.
+ */
+ if (reg->type == PTR_TO_STACK) {
+ spi = dynptr_get_spi(env, reg);
+ if (spi < 0 || !state->stack[spi].spilled_ptr.ref_obj_id) {
+ verbose(env, "arg %d is an unacquired reference\n", regno);
+ return -EINVAL;
+ }
+ } else {
+ verbose(env, "cannot release unowned const bpf_dynptr\n");
return -EINVAL;
}
} else if (!reg->ref_obj_id && !register_is_null(reg)) {
@@ -6055,20 +6864,27 @@ skip_type_check:
meta->ret_btf_id = reg->btf_id;
break;
case ARG_PTR_TO_SPIN_LOCK:
+ if (in_rbtree_lock_required_cb(env)) {
+ verbose(env, "can't spin_{lock,unlock} in rbtree cb\n");
+ return -EACCES;
+ }
if (meta->func_id == BPF_FUNC_spin_lock) {
- if (process_spin_lock(env, regno, true))
- return -EACCES;
+ err = process_spin_lock(env, regno, true);
+ if (err)
+ return err;
} else if (meta->func_id == BPF_FUNC_spin_unlock) {
- if (process_spin_lock(env, regno, false))
- return -EACCES;
+ err = process_spin_lock(env, regno, false);
+ if (err)
+ return err;
} else {
verbose(env, "verifier internal error\n");
return -EFAULT;
}
break;
case ARG_PTR_TO_TIMER:
- if (process_timer_func(env, regno, meta))
- return -EACCES;
+ err = process_timer_func(env, regno, meta);
+ if (err)
+ return err;
break;
case ARG_PTR_TO_FUNC:
meta->subprogno = reg->subprogno;
@@ -6091,52 +6907,9 @@ skip_type_check:
err = check_mem_size_reg(env, reg, regno, true, meta);
break;
case ARG_PTR_TO_DYNPTR:
- /* We only need to check for initialized / uninitialized helper
- * dynptr args if the dynptr is not PTR_TO_DYNPTR, as the
- * assumption is that if it is, that a helper function
- * initialized the dynptr on behalf of the BPF program.
- */
- if (base_type(reg->type) == PTR_TO_DYNPTR)
- break;
- if (arg_type & MEM_UNINIT) {
- if (!is_dynptr_reg_valid_uninit(env, reg)) {
- verbose(env, "Dynptr has to be an uninitialized dynptr\n");
- return -EINVAL;
- }
-
- /* We only support one dynptr being uninitialized at the moment,
- * which is sufficient for the helper functions we have right now.
- */
- if (meta->uninit_dynptr_regno) {
- verbose(env, "verifier internal error: multiple uninitialized dynptr args\n");
- return -EFAULT;
- }
-
- meta->uninit_dynptr_regno = regno;
- } else if (!is_dynptr_reg_valid_init(env, reg)) {
- verbose(env,
- "Expected an initialized dynptr as arg #%d\n",
- arg + 1);
- return -EINVAL;
- } else if (!is_dynptr_type_expected(env, reg, arg_type)) {
- const char *err_extra = "";
-
- switch (arg_type & DYNPTR_TYPE_FLAG_MASK) {
- case DYNPTR_TYPE_LOCAL:
- err_extra = "local";
- break;
- case DYNPTR_TYPE_RINGBUF:
- err_extra = "ringbuf";
- break;
- default:
- err_extra = "<unknown>";
- break;
- }
- verbose(env,
- "Expected a dynptr of type %s as arg #%d\n",
- err_extra, arg + 1);
- return -EINVAL;
- }
+ err = process_dynptr_func(env, regno, arg_type, meta);
+ if (err)
+ return err;
break;
case ARG_CONST_ALLOC_SIZE_OR_ZERO:
if (!tnum_is_const(reg->var_off)) {
@@ -6203,8 +6976,9 @@ skip_type_check:
break;
}
case ARG_PTR_TO_KPTR:
- if (process_kptr_func(env, regno, meta))
- return -EACCES;
+ err = process_kptr_func(env, regno, meta);
+ if (err)
+ return err;
break;
}
@@ -6365,6 +7139,11 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env,
func_id != BPF_FUNC_task_storage_delete)
goto error;
break;
+ case BPF_MAP_TYPE_CGRP_STORAGE:
+ if (func_id != BPF_FUNC_cgrp_storage_get &&
+ func_id != BPF_FUNC_cgrp_storage_delete)
+ goto error;
+ break;
case BPF_MAP_TYPE_BLOOM_FILTER:
if (func_id != BPF_FUNC_map_peek_elem &&
func_id != BPF_FUNC_map_push_elem)
@@ -6477,6 +7256,11 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env,
if (map->map_type != BPF_MAP_TYPE_TASK_STORAGE)
goto error;
break;
+ case BPF_FUNC_cgrp_storage_get:
+ case BPF_FUNC_cgrp_storage_delete:
+ if (map->map_type != BPF_MAP_TYPE_CGRP_STORAGE)
+ goto error;
+ break;
default:
break;
}
@@ -6548,9 +7332,10 @@ static bool check_btf_id_ok(const struct bpf_func_proto *fn)
int i;
for (i = 0; i < ARRAY_SIZE(fn->arg_type); i++) {
- if (base_type(fn->arg_type[i]) == ARG_PTR_TO_BTF_ID && !fn->arg_btf_id[i])
- return false;
-
+ if (base_type(fn->arg_type[i]) == ARG_PTR_TO_BTF_ID)
+ return !!fn->arg_btf_id[i];
+ if (base_type(fn->arg_type[i]) == ARG_PTR_TO_SPIN_LOCK)
+ return fn->arg_btf_id[i] == BPF_PTR_POISON;
if (base_type(fn->arg_type[i]) != ARG_PTR_TO_BTF_ID && fn->arg_btf_id[i] &&
/* arg_btf_id and arg_size are in a union. */
(base_type(fn->arg_type[i]) != ARG_PTR_TO_MEM ||
@@ -6634,6 +7419,17 @@ static int release_reference(struct bpf_verifier_env *env,
return 0;
}
+static void invalidate_non_owning_refs(struct bpf_verifier_env *env)
+{
+ struct bpf_func_state *unused;
+ struct bpf_reg_state *reg;
+
+ bpf_for_each_reg_in_vstate(env->cur_state, unused, reg, ({
+ if (type_is_non_owning_ref(reg->type))
+ __mark_reg_unknown(env, reg);
+ }));
+}
+
static void clear_caller_saved_regs(struct bpf_verifier_env *env,
struct bpf_reg_state *regs)
{
@@ -6651,6 +7447,12 @@ typedef int (*set_callee_state_fn)(struct bpf_verifier_env *env,
struct bpf_func_state *callee,
int insn_idx);
+static int set_callee_state(struct bpf_verifier_env *env,
+ struct bpf_func_state *caller,
+ struct bpf_func_state *callee, int insn_idx);
+
+static bool is_callback_calling_kfunc(u32 btf_id);
+
static int __check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
int *insn_idx, int subprog,
set_callee_state_fn set_callee_state_cb)
@@ -6701,6 +7503,24 @@ static int __check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn
}
}
+ /* set_callee_state is used for direct subprog calls, but we are
+ * interested in validating only BPF helpers that can call subprogs as
+ * callbacks
+ */
+ if (set_callee_state_cb != set_callee_state) {
+ if (bpf_pseudo_kfunc_call(insn) &&
+ !is_callback_calling_kfunc(insn->imm)) {
+ verbose(env, "verifier bug: kfunc %s#%d not marked as callback-calling\n",
+ func_id_name(insn->imm), insn->imm);
+ return -EFAULT;
+ } else if (!bpf_pseudo_kfunc_call(insn) &&
+ !is_callback_calling_function(insn->imm)) { /* helper */
+ verbose(env, "verifier bug: helper %s#%d not marked as callback-calling\n",
+ func_id_name(insn->imm), insn->imm);
+ return -EFAULT;
+ }
+ }
+
if (insn->code == (BPF_JMP | BPF_CALL) &&
insn->src_reg == 0 &&
insn->imm == BPF_FUNC_timer_set_callback) {
@@ -6947,11 +7767,10 @@ static int set_user_ringbuf_callback_state(struct bpf_verifier_env *env,
{
/* bpf_user_ringbuf_drain(struct bpf_map *map, void *callback_fn, void
* callback_ctx, u64 flags);
- * callback_fn(struct bpf_dynptr_t* dynptr, void *callback_ctx);
+ * callback_fn(const struct bpf_dynptr_t* dynptr, void *callback_ctx);
*/
__mark_reg_not_init(env, &callee->regs[BPF_REG_0]);
- callee->regs[BPF_REG_1].type = PTR_TO_DYNPTR | DYNPTR_TYPE_LOCAL;
- __mark_reg_known_zero(&callee->regs[BPF_REG_1]);
+ mark_dynptr_cb_reg(env, &callee->regs[BPF_REG_1], BPF_DYNPTR_TYPE_LOCAL);
callee->regs[BPF_REG_2] = caller->regs[BPF_REG_3];
/* unused */
@@ -6964,6 +7783,63 @@ static int set_user_ringbuf_callback_state(struct bpf_verifier_env *env,
return 0;
}
+/* Prepare the callee frame for the 'less' callback passed to bpf_rbtree_add().
+ *
+ * Callback R1 and R2 are initialized with mark_reg_graph_node() from the
+ * graph_root description of the BPF_RB_ROOT field found at caller R1, and are
+ * then flagged as non-owning references. R3-R5 are marked not-init, the frame
+ * is flagged as a callback, and callback_ret_range is set to tnum_range(0, 1).
+ *
+ * @insn_idx is unused here.
+ *
+ * Returns 0 on success, -EFAULT when no usable BPF_RB_ROOT field is found at
+ * caller R1, or the error reported by ref_set_non_owning().
+ */
+static int set_rbtree_add_callback_state(struct bpf_verifier_env *env,
+					 struct bpf_func_state *caller,
+					 struct bpf_func_state *callee,
+					 int insn_idx)
+{
+	/* void bpf_rbtree_add(struct bpf_rb_root *root, struct bpf_rb_node *node,
+	 *		       bool (less)(struct bpf_rb_node *a, const struct bpf_rb_node *b));
+	 *
+	 * 'struct bpf_rb_node *node' arg to bpf_rbtree_add is the same PTR_TO_BTF_ID w/ offset
+	 * that 'less' callback args will be receiving. However, 'node' arg was release_reference'd
+	 * by this point, so look at 'root'
+	 */
+	struct bpf_reg_state *root_reg = &caller->regs[BPF_REG_1];
+	struct btf_field *field;
+	int err;
+
+	field = reg_find_field_offset(root_reg, root_reg->off, BPF_RB_ROOT);
+	if (!field || !field->graph_root.value_btf_id)
+		return -EFAULT;
+
+	/* ref_set_non_owning() logs a verifier internal error on failure;
+	 * propagate it instead of continuing with an inconsistent register.
+	 */
+	mark_reg_graph_node(callee->regs, BPF_REG_1, &field->graph_root);
+	err = ref_set_non_owning(env, &callee->regs[BPF_REG_1]);
+	if (err)
+		return err;
+
+	mark_reg_graph_node(callee->regs, BPF_REG_2, &field->graph_root);
+	err = ref_set_non_owning(env, &callee->regs[BPF_REG_2]);
+	if (err)
+		return err;
+
+	__mark_reg_not_init(env, &callee->regs[BPF_REG_3]);
+	__mark_reg_not_init(env, &callee->regs[BPF_REG_4]);
+	__mark_reg_not_init(env, &callee->regs[BPF_REG_5]);
+	callee->in_callback_fn = true;
+	callee->callback_ret_range = tnum_range(0, 1);
+	return 0;
+}
+
+static bool is_rbtree_lock_required_kfunc(u32 btf_id);
+
+/* Tell whether the frame being verified right now is a callback entered from
+ * an rbtree kfunc that must be called with the lock held. When it is, there
+ * is no need to complain about an unreleased lock.
+ *
+ * NOTE(review): the call instruction's imm is used as a kfunc BTF id without
+ * checking that the instruction is a kfunc call - confirm this is intended.
+ */
+static bool in_rbtree_lock_required_cb(struct bpf_verifier_env *env)
+{
+	struct bpf_verifier_state *vstate = env->cur_state;
+	struct bpf_func_state *frame;
+	int callsite_imm;
+
+	/* curframe == 0: nothing to inspect */
+	if (!vstate->curframe)
+		return false;
+
+	frame = vstate->frame[vstate->curframe];
+	if (!frame->in_callback_fn)
+		return false;
+
+	/* imm of the instruction recorded as this frame's callsite */
+	callsite_imm = env->prog->insnsi[frame->callsite].imm;
+	return is_rbtree_lock_required_kfunc(callsite_imm);
+}
+
static int prepare_func_exit(struct bpf_verifier_env *env, int *insn_idx)
{
struct bpf_verifier_state *state = env->cur_state;
@@ -7156,6 +8032,7 @@ static int check_bpf_snprintf_call(struct bpf_verifier_env *env,
struct bpf_reg_state *fmt_reg = &regs[BPF_REG_3];
struct bpf_reg_state *data_len_reg = &regs[BPF_REG_5];
struct bpf_map *fmt_map = fmt_reg->map_ptr;
+ struct bpf_bprintf_data data = {};
int err, fmt_map_off, num_args;
u64 fmt_addr;
char *fmt;
@@ -7180,7 +8057,7 @@ static int check_bpf_snprintf_call(struct bpf_verifier_env *env,
/* We are also guaranteed that fmt+fmt_map_off is NULL terminated, we
* can focus on validating the format specifiers.
*/
- err = bpf_bprintf_prepare(fmt, UINT_MAX, NULL, NULL, num_args);
+ err = bpf_bprintf_prepare(fmt, UINT_MAX, NULL, num_args, &data);
if (err < 0)
verbose(env, "Invalid format string\n");
@@ -7283,6 +8160,11 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn
return -EINVAL;
}
+ if (!env->prog->aux->sleepable && fn->might_sleep) {
+ verbose(env, "helper call might sleep in a non-sleepable prog\n");
+ return -EINVAL;
+ }
+
/* With LD_ABS/IND some JITs save/restore skb from r1. */
changes_data = bpf_helper_changes_pkt_data(fn->func);
if (changes_data && fn->arg1_type != ARG_PTR_TO_CTX) {
@@ -7301,6 +8183,17 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn
return err;
}
+ if (env->cur_state->active_rcu_lock) {
+ if (fn->might_sleep) {
+ verbose(env, "sleepable helper %s#%d in rcu_read_lock region\n",
+ func_id_name(func_id), func_id);
+ return -EINVAL;
+ }
+
+ if (env->prog->aux->sleepable && is_storage_get_function(func_id))
+ env->insn_aux_data[insn_idx].storage_get_func_atomic = true;
+ }
+
meta.func_id = func_id;
/* check args */
for (i = 0; i < MAX_BPF_FUNC_REG_ARGS; i++) {
@@ -7329,7 +8222,15 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn
regs = cur_regs(env);
+ /* This can only be set for PTR_TO_STACK, as CONST_PTR_TO_DYNPTR cannot
+ * be reinitialized by any dynptr helper. Hence, mark_stack_slots_dynptr
+ * is safe to do directly.
+ */
if (meta.uninit_dynptr_regno) {
+ if (regs[meta.uninit_dynptr_regno].type == CONST_PTR_TO_DYNPTR) {
+ verbose(env, "verifier internal error: CONST_PTR_TO_DYNPTR cannot be initialized\n");
+ return -EFAULT;
+ }
/* we write BPF_DW bits (8 bytes) at a time */
for (i = 0; i < BPF_DYNPTR_SIZE; i += 8) {
err = check_mem_access(env, insn_idx, meta.uninit_dynptr_regno,
@@ -7347,15 +8248,24 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn
if (meta.release_regno) {
err = -EINVAL;
- if (arg_type_is_dynptr(fn->arg_type[meta.release_regno - BPF_REG_1]))
+ /* This can only be set for PTR_TO_STACK, as CONST_PTR_TO_DYNPTR cannot
+ * be released by any dynptr helper. Hence, unmark_stack_slots_dynptr
+ * is safe to do directly.
+ */
+ if (arg_type_is_dynptr(fn->arg_type[meta.release_regno - BPF_REG_1])) {
+ if (regs[meta.release_regno].type == CONST_PTR_TO_DYNPTR) {
+ verbose(env, "verifier internal error: CONST_PTR_TO_DYNPTR cannot be released\n");
+ return -EFAULT;
+ }
err = unmark_stack_slots_dynptr(env, &regs[meta.release_regno]);
- else if (meta.ref_obj_id)
+ } else if (meta.ref_obj_id) {
err = release_reference(env, meta.ref_obj_id);
- /* meta.ref_obj_id can only be 0 if register that is meant to be
- * released is NULL, which must be > R0.
- */
- else if (register_is_null(&regs[meta.release_regno]))
+ } else if (register_is_null(&regs[meta.release_regno])) {
+ /* meta.ref_obj_id can only be 0 if register that is meant to be
+ * released is NULL, which must be > R0.
+ */
err = 0;
+ }
if (err) {
verbose(env, "func %s#%d reference has not been acquired before\n",
func_id_name(func_id), func_id);
@@ -7423,17 +8333,32 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn
for (i = 0; i < MAX_BPF_FUNC_REG_ARGS; i++) {
if (arg_type_is_dynptr(fn->arg_type[i])) {
struct bpf_reg_state *reg = &regs[BPF_REG_1 + i];
+ int id, ref_obj_id;
+
+ if (meta.dynptr_id) {
+ verbose(env, "verifier internal error: meta.dynptr_id already set\n");
+ return -EFAULT;
+ }
if (meta.ref_obj_id) {
verbose(env, "verifier internal error: meta.ref_obj_id already set\n");
return -EFAULT;
}
- if (base_type(reg->type) != PTR_TO_DYNPTR)
- /* Find the id of the dynptr we're
- * tracking the reference of
- */
- meta.ref_obj_id = stack_slot_get_id(env, reg);
+ id = dynptr_id(env, reg);
+ if (id < 0) {
+ verbose(env, "verifier internal error: failed to obtain dynptr id\n");
+ return id;
+ }
+
+ ref_obj_id = dynptr_ref_obj_id(env, reg);
+ if (ref_obj_id < 0) {
+ verbose(env, "verifier internal error: failed to obtain dynptr ref_obj_id\n");
+ return ref_obj_id;
+ }
+
+ meta.dynptr_id = id;
+ meta.ref_obj_id = ref_obj_id;
break;
}
}
@@ -7488,7 +8413,7 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn
regs[BPF_REG_0].map_uid = meta.map_uid;
regs[BPF_REG_0].type = PTR_TO_MAP_VALUE | ret_flag;
if (!type_may_be_null(ret_type) &&
- map_value_has_spin_lock(meta.map_ptr)) {
+ btf_record_has_field(meta.map_ptr->record, BPF_SPIN_LOCK)) {
regs[BPF_REG_0].id = ++env->id_gen;
}
break;
@@ -7504,7 +8429,7 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn
mark_reg_known_zero(env, regs, BPF_REG_0);
regs[BPF_REG_0].type = PTR_TO_TCP_SOCK | ret_flag;
break;
- case RET_PTR_TO_ALLOC_MEM:
+ case RET_PTR_TO_MEM:
mark_reg_known_zero(env, regs, BPF_REG_0);
regs[BPF_REG_0].type = PTR_TO_MEM | ret_flag;
regs[BPF_REG_0].mem_size = meta.mem_size;
@@ -7552,8 +8477,8 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn
mark_reg_known_zero(env, regs, BPF_REG_0);
regs[BPF_REG_0].type = PTR_TO_BTF_ID | ret_flag;
if (func_id == BPF_FUNC_kptr_xchg) {
- ret_btf = meta.kptr_off_desc->kptr.btf;
- ret_btf_id = meta.kptr_off_desc->kptr.btf_id;
+ ret_btf = meta.kptr_field->kptr.btf;
+ ret_btf_id = meta.kptr_field->kptr.btf_id;
} else {
if (fn->ret_btf_id == BPF_PTR_POISON) {
verbose(env, "verifier internal error:");
@@ -7589,6 +8514,9 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn
return -EFAULT;
}
+ if (is_dynptr_ref_function(func_id))
+ regs[BPF_REG_0].dynptr_id = meta.dynptr_id;
+
if (is_ptr_cast_function(func_id) || is_dynptr_ref_function(func_id)) {
/* For release_reference() */
regs[BPF_REG_0].ref_obj_id = meta.ref_obj_id;
@@ -7667,19 +8595,1197 @@ static void mark_btf_func_reg_size(struct bpf_verifier_env *env, u32 regno,
}
}
+/* Metadata about a kfunc call: inputs describing the kfunc being called and
+ * outputs recorded while its arguments are checked.
+ */
+struct bpf_kfunc_call_arg_meta {
+ /* In parameters */
+ struct btf *btf; /* BTF used to look up the kfunc's parameter names/types */
+ u32 func_id; /* compared against special_kfunc_list[] entries */
+ u32 kfunc_flags; /* KF_* bits, tested by the is_kfunc_*() helpers */
+ const struct btf_type *func_proto;
+ const char *func_name; /* printed in verifier log messages */
+ /* Out parameters */
+ u32 ref_obj_id;
+ u8 release_regno;
+ bool r0_rdonly;
+ u32 ret_btf_id;
+ u64 r0_size;
+ u32 subprogno;
+ /* presumably the value of a "__k" constant argument - TODO confirm */
+ struct {
+ u64 value;
+ bool found;
+ } arg_constant;
+ struct {
+ struct btf *btf;
+ u32 btf_id;
+ } arg_obj_drop;
+ /* filled by process_kf_arg_ptr_to_list_head() */
+ struct {
+ struct btf_field *field;
+ } arg_list_head;
+ /* filled by process_kf_arg_ptr_to_rbtree_root() */
+ struct {
+ struct btf_field *field;
+ } arg_rbtree_root;
+};
+
+/* True when KF_ACQUIRE is set in the kfunc's flags. */
+static bool is_kfunc_acquire(struct bpf_kfunc_call_arg_meta *meta)
+{
+ return meta->kfunc_flags & KF_ACQUIRE;
+}
+
+/* True when KF_RET_NULL is set in the kfunc's flags. */
+static bool is_kfunc_ret_null(struct bpf_kfunc_call_arg_meta *meta)
+{
+ return meta->kfunc_flags & KF_RET_NULL;
+}
+
+/* True when KF_RELEASE is set in the kfunc's flags. */
+static bool is_kfunc_release(struct bpf_kfunc_call_arg_meta *meta)
+{
+ return meta->kfunc_flags & KF_RELEASE;
+}
+
+/* True when KF_TRUSTED_ARGS is set in the kfunc's flags. */
+static bool is_kfunc_trusted_args(struct bpf_kfunc_call_arg_meta *meta)
+{
+ return meta->kfunc_flags & KF_TRUSTED_ARGS;
+}
+
+/* True when KF_SLEEPABLE is set in the kfunc's flags. */
+static bool is_kfunc_sleepable(struct bpf_kfunc_call_arg_meta *meta)
+{
+ return meta->kfunc_flags & KF_SLEEPABLE;
+}
+
+/* True when KF_DESTRUCTIVE is set in the kfunc's flags. */
+static bool is_kfunc_destructive(struct bpf_kfunc_call_arg_meta *meta)
+{
+ return meta->kfunc_flags & KF_DESTRUCTIVE;
+}
+
+/* True when KF_RCU is set in the kfunc's flags. */
+static bool is_kfunc_rcu(struct bpf_kfunc_call_arg_meta *meta)
+{
+ return meta->kfunc_flags & KF_RCU;
+}
+
+/* True only for the first argument (@arg == 0) of a kfunc whose flags
+ * contain KF_KPTR_GET.
+ */
+static bool is_kfunc_arg_kptr_get(struct bpf_kfunc_call_arg_meta *meta, int arg)
+{
+ return arg == 0 && (meta->kfunc_flags & KF_KPTR_GET);
+}
+
+/* Check whether the name of kfunc parameter @arg ends with @suffix.
+ *
+ * Returns false when the parameter has no name or when the name is shorter
+ * than @suffix; otherwise returns true iff the last strlen(@suffix)
+ * characters of the name equal @suffix.
+ */
+static bool __kfunc_param_match_suffix(const struct btf *btf,
+				       const struct btf_param *arg,
+				       const char *suffix)
+{
+	/* In the future, this can be ported to use BTF tagging */
+	const char *name = btf_name_by_offset(btf, arg->name_off);
+	int want = strlen(suffix);
+	int have;
+
+	if (str_is_empty(name))
+		return false;
+
+	have = strlen(name);
+	if (have < want)
+		return false;
+
+	/* only the tail of the name is compared against the suffix */
+	return strncmp(name + (have - want), suffix, want) == 0;
+}
+
+/* True when @arg is a size argument: its BTF type (modifiers skipped) is a
+ * scalar, the register passed for it is a SCALAR_VALUE, and the parameter
+ * name carries the "__sz" suffix.
+ */
+static bool is_kfunc_arg_mem_size(const struct btf *btf,
+				  const struct btf_param *arg,
+				  const struct bpf_reg_state *reg)
+{
+	const struct btf_type *arg_type;
+
+	arg_type = btf_type_skip_modifiers(btf, arg->type, NULL);
+	if (btf_type_is_scalar(arg_type) && reg->type == SCALAR_VALUE)
+		return __kfunc_param_match_suffix(btf, arg, "__sz");
+
+	return false;
+}
+
+/* True when the name of parameter @arg ends with "__k". */
+static bool is_kfunc_arg_constant(const struct btf *btf, const struct btf_param *arg)
+{
+ return __kfunc_param_match_suffix(btf, arg, "__k");
+}
+
+/* True when the name of parameter @arg ends with "__ign". */
+static bool is_kfunc_arg_ignore(const struct btf *btf, const struct btf_param *arg)
+{
+ return __kfunc_param_match_suffix(btf, arg, "__ign");
+}
+
+/* True when the name of parameter @arg ends with "__alloc". */
+static bool is_kfunc_arg_alloc_obj(const struct btf *btf, const struct btf_param *arg)
+{
+ return __kfunc_param_match_suffix(btf, arg, "__alloc");
+}
+
+/* Check whether kfunc parameter @arg is named exactly @name.
+ *
+ * Returns false for a parameter without a name. Despite the function's
+ * name, the parameter's type is not inspected here; only its name is.
+ */
+static bool is_kfunc_arg_scalar_with_name(const struct btf *btf,
+					  const struct btf_param *arg,
+					  const char *name)
+{
+	const char *param_name;
+
+	param_name = btf_name_by_offset(btf, arg->name_off);
+	if (str_is_empty(param_name))
+		return false;
+
+	/* strcmp() already reports any length mismatch as inequality, so a
+	 * separate strlen() comparison is not needed.
+	 */
+	return !strcmp(param_name, name);
+}
+
+/* Indices into kf_arg_btf_ids[]; the order here must match the order of the
+ * BTF_ID() entries in that list.
+ */
+enum {
+ KF_ARG_DYNPTR_ID,
+ KF_ARG_LIST_HEAD_ID,
+ KF_ARG_LIST_NODE_ID,
+ KF_ARG_RB_ROOT_ID,
+ KF_ARG_RB_NODE_ID,
+};
+
+/* BTF IDs of the struct types recognized as special kfunc pointer arguments.
+ * Indexed by the KF_ARG_*_ID enumerators, so entries must stay in that order.
+ */
+BTF_ID_LIST(kf_arg_btf_ids)
+BTF_ID(struct, bpf_dynptr_kern)
+BTF_ID(struct, bpf_list_head)
+BTF_ID(struct, bpf_list_node)
+BTF_ID(struct, bpf_rb_root)
+BTF_ID(struct, bpf_rb_node)
+
+/* Check whether parameter @arg is a pointer whose pointee type (modifiers
+ * skipped on both levels) is the same type as the vmlinux BTF type stored at
+ * kf_arg_btf_ids[@type].
+ */
+static bool __is_kfunc_ptr_arg_type(const struct btf *btf,
+				    const struct btf_param *arg, int type)
+{
+	const struct btf_type *ptr_t, *pointee_t;
+	u32 pointee_id;
+
+	ptr_t = btf_type_skip_modifiers(btf, arg->type, NULL);
+	if (!ptr_t || !btf_type_is_ptr(ptr_t))
+		return false;
+
+	pointee_t = btf_type_skip_modifiers(btf, ptr_t->type, &pointee_id);
+	if (!pointee_t)
+		return false;
+
+	return btf_types_are_same(btf, pointee_id, btf_vmlinux, kf_arg_btf_ids[type]);
+}
+
+/* True when @arg is a pointer to the type at kf_arg_btf_ids[KF_ARG_DYNPTR_ID]
+ * (struct bpf_dynptr_kern).
+ */
+static bool is_kfunc_arg_dynptr(const struct btf *btf, const struct btf_param *arg)
+{
+ return __is_kfunc_ptr_arg_type(btf, arg, KF_ARG_DYNPTR_ID);
+}
+
+/* True when @arg is a pointer to the type at
+ * kf_arg_btf_ids[KF_ARG_LIST_HEAD_ID] (struct bpf_list_head).
+ */
+static bool is_kfunc_arg_list_head(const struct btf *btf, const struct btf_param *arg)
+{
+ return __is_kfunc_ptr_arg_type(btf, arg, KF_ARG_LIST_HEAD_ID);
+}
+
+/* True when @arg is a pointer to the type at
+ * kf_arg_btf_ids[KF_ARG_LIST_NODE_ID] (struct bpf_list_node).
+ */
+static bool is_kfunc_arg_list_node(const struct btf *btf, const struct btf_param *arg)
+{
+ return __is_kfunc_ptr_arg_type(btf, arg, KF_ARG_LIST_NODE_ID);
+}
+
+/* True when @arg is a pointer to the type at
+ * kf_arg_btf_ids[KF_ARG_RB_ROOT_ID] (struct bpf_rb_root).
+ */
+static bool is_kfunc_arg_rbtree_root(const struct btf *btf, const struct btf_param *arg)
+{
+ return __is_kfunc_ptr_arg_type(btf, arg, KF_ARG_RB_ROOT_ID);
+}
+
+/* True when @arg is a pointer to the type at
+ * kf_arg_btf_ids[KF_ARG_RB_NODE_ID] (struct bpf_rb_node).
+ */
+static bool is_kfunc_arg_rbtree_node(const struct btf *btf, const struct btf_param *arg)
+{
+ return __is_kfunc_ptr_arg_type(btf, arg, KF_ARG_RB_NODE_ID);
+}
+
+/* True when btf_type_resolve_func_ptr() resolves the type of @arg, i.e.
+ * returns non-NULL for it. @env is not used.
+ */
+static bool is_kfunc_arg_callback(struct bpf_verifier_env *env, const struct btf *btf,
+				  const struct btf_param *arg)
+{
+	return btf_type_resolve_func_ptr(btf, arg->type, NULL) != NULL;
+}
+
+/* Returns true if struct is composed of scalars, 4 levels of nesting allowed
+ *
+ * @t is the type to check and @rec the current nesting depth (the caller in
+ * this file starts at 0). Every member must be, after skipping modifiers:
+ *  - a scalar,
+ *  - a non-empty array whose element type is a scalar, or
+ *  - a nested struct that itself satisfies this check.
+ * Returns false if @t is not a struct, if any member fails the rules above,
+ * or (with a log message) if a nested struct is found at depth >= 3.
+ */
+static bool __btf_type_is_scalar_struct(struct bpf_verifier_env *env,
+ const struct btf *btf,
+ const struct btf_type *t, int rec)
+{
+ const struct btf_type *member_type;
+ const struct btf_member *member;
+ u32 i;
+
+ if (!btf_type_is_struct(t))
+ return false;
+
+ for_each_member(i, t, member) {
+ const struct btf_array *array;
+
+ member_type = btf_type_skip_modifiers(btf, member->type, NULL);
+ if (btf_type_is_struct(member_type)) {
+ /* depth limit is checked before recursing */
+ if (rec >= 3) {
+ verbose(env, "max struct nesting depth exceeded\n");
+ return false;
+ }
+ if (!__btf_type_is_scalar_struct(env, btf, member_type, rec + 1))
+ return false;
+ continue;
+ }
+ if (btf_type_is_array(member_type)) {
+ array = btf_array(member_type);
+ /* zero-length arrays are rejected */
+ if (!array->nelems)
+ return false;
+ /* only the direct element type is checked; it must be a scalar */
+ member_type = btf_type_skip_modifiers(btf, array->type, NULL);
+ if (!btf_type_is_scalar(member_type))
+ return false;
+ continue;
+ }
+ if (!btf_type_is_scalar(member_type))
+ return false;
+ }
+ return true;
+}
+
+
+/* Maps a register base type to a pointer to the BTF ID used for it when it
+ * is passed to a kfunc. Entries exist only under CONFIG_NET; every other
+ * slot of this static array is NULL, which get_kfunc_ptr_arg_type() uses as
+ * a "no conversion available" test.
+ */
+static u32 *reg2btf_ids[__BPF_REG_TYPE_MAX] = {
+#ifdef CONFIG_NET
+ [PTR_TO_SOCKET] = &btf_sock_ids[BTF_SOCK_TYPE_SOCK],
+ [PTR_TO_SOCK_COMMON] = &btf_sock_ids[BTF_SOCK_TYPE_SOCK_COMMON],
+ [PTR_TO_TCP_SOCK] = &btf_sock_ids[BTF_SOCK_TYPE_TCP],
+#endif
+};
+
+/* Classification of a kfunc pointer argument, as computed by
+ * get_kfunc_ptr_arg_type().
+ */
+enum kfunc_ptr_arg_type {
+ KF_ARG_PTR_TO_CTX,
+ KF_ARG_PTR_TO_ALLOC_BTF_ID, /* Allocated object */
+ KF_ARG_PTR_TO_KPTR, /* PTR_TO_KPTR but type specific */
+ KF_ARG_PTR_TO_DYNPTR,
+ KF_ARG_PTR_TO_LIST_HEAD,
+ KF_ARG_PTR_TO_LIST_NODE,
+ KF_ARG_PTR_TO_BTF_ID, /* Also covers reg2btf_ids conversions */
+ KF_ARG_PTR_TO_MEM,
+ KF_ARG_PTR_TO_MEM_SIZE, /* Size derived from next argument, skip it */
+ KF_ARG_PTR_TO_CALLBACK,
+ KF_ARG_PTR_TO_RB_ROOT,
+ KF_ARG_PTR_TO_RB_NODE,
+};
+
+/* Indices into special_kfunc_list[]; the enumerator order must match the
+ * order of the BTF_ID() entries in that list.
+ */
+enum special_kfunc_type {
+ KF_bpf_obj_new_impl,
+ KF_bpf_obj_drop_impl,
+ KF_bpf_list_push_front,
+ KF_bpf_list_push_back,
+ KF_bpf_list_pop_front,
+ KF_bpf_list_pop_back,
+ KF_bpf_cast_to_kern_ctx,
+ KF_bpf_rdonly_cast,
+ KF_bpf_rcu_read_lock,
+ KF_bpf_rcu_read_unlock,
+ KF_bpf_rbtree_remove,
+ KF_bpf_rbtree_add,
+ KF_bpf_rbtree_first,
+};
+
+/* BTF ID set of the special kfuncs. Unlike special_kfunc_list, this set does
+ * not contain bpf_rcu_read_lock / bpf_rcu_read_unlock.
+ */
+BTF_SET_START(special_kfunc_set)
+BTF_ID(func, bpf_obj_new_impl)
+BTF_ID(func, bpf_obj_drop_impl)
+BTF_ID(func, bpf_list_push_front)
+BTF_ID(func, bpf_list_push_back)
+BTF_ID(func, bpf_list_pop_front)
+BTF_ID(func, bpf_list_pop_back)
+BTF_ID(func, bpf_cast_to_kern_ctx)
+BTF_ID(func, bpf_rdonly_cast)
+BTF_ID(func, bpf_rbtree_remove)
+BTF_ID(func, bpf_rbtree_add)
+BTF_ID(func, bpf_rbtree_first)
+BTF_SET_END(special_kfunc_set)
+
+/* BTF IDs of the special kfuncs, indexed by enum special_kfunc_type. The
+ * entries must be kept in the same order as that enum.
+ */
+BTF_ID_LIST(special_kfunc_list)
+BTF_ID(func, bpf_obj_new_impl)
+BTF_ID(func, bpf_obj_drop_impl)
+BTF_ID(func, bpf_list_push_front)
+BTF_ID(func, bpf_list_push_back)
+BTF_ID(func, bpf_list_pop_front)
+BTF_ID(func, bpf_list_pop_back)
+BTF_ID(func, bpf_cast_to_kern_ctx)
+BTF_ID(func, bpf_rdonly_cast)
+BTF_ID(func, bpf_rcu_read_lock)
+BTF_ID(func, bpf_rcu_read_unlock)
+BTF_ID(func, bpf_rbtree_remove)
+BTF_ID(func, bpf_rbtree_add)
+BTF_ID(func, bpf_rbtree_first)
+
+/* True when the kfunc being called is bpf_rcu_read_lock, i.e. its func_id
+ * equals special_kfunc_list[KF_bpf_rcu_read_lock].
+ */
+static bool is_kfunc_bpf_rcu_read_lock(struct bpf_kfunc_call_arg_meta *meta)
+{
+ return meta->func_id == special_kfunc_list[KF_bpf_rcu_read_lock];
+}
+
+/* True when the kfunc being called is bpf_rcu_read_unlock, i.e. its func_id
+ * equals special_kfunc_list[KF_bpf_rcu_read_unlock].
+ */
+static bool is_kfunc_bpf_rcu_read_unlock(struct bpf_kfunc_call_arg_meta *meta)
+{
+ return meta->func_id == special_kfunc_list[KF_bpf_rcu_read_unlock];
+}
+
+/* Classify pointer argument @argno of a kfunc into a kfunc_ptr_arg_type.
+ *
+ * @t is the argument's BTF type, @ref_t / @ref_tname the pointed-to type and
+ * its name, @args the kfunc's parameter array and @nargs its length. The
+ * register examined is R(@argno + 1) of the current frame.
+ *
+ * The checks run in a fixed order and the first match wins:
+ * bpf_cast_to_kern_ctx, program ctx type, "__alloc" suffix, KF_KPTR_GET first
+ * argument, dynptr, list head, list node, rb root, rb node, PTR_TO_BTF_ID (or
+ * a reg2btf_ids conversion), callback, and finally plain memory (optionally
+ * with a following "__sz" size argument).
+ *
+ * Returns the KF_ARG_PTR_TO_* classification, or -EINVAL after logging when
+ * the BTF type is not supported for the matched category.
+ * NOTE(review): -EINVAL is returned through an enum return type; callers
+ * presumably test the result for < 0 - confirm.
+ */
+static enum kfunc_ptr_arg_type
+get_kfunc_ptr_arg_type(struct bpf_verifier_env *env,
+ struct bpf_kfunc_call_arg_meta *meta,
+ const struct btf_type *t, const struct btf_type *ref_t,
+ const char *ref_tname, const struct btf_param *args,
+ int argno, int nargs)
+{
+ u32 regno = argno + 1;
+ struct bpf_reg_state *regs = cur_regs(env);
+ struct bpf_reg_state *reg = &regs[regno];
+ bool arg_mem_size = false;
+
+ /* every pointer argument of bpf_cast_to_kern_ctx is treated as ctx */
+ if (meta->func_id == special_kfunc_list[KF_bpf_cast_to_kern_ctx])
+ return KF_ARG_PTR_TO_CTX;
+
+ /* In this function, we verify the kfunc's BTF as per the argument type,
+ * leaving the rest of the verification with respect to the register
+ * type to our caller. When a set of conditions hold in the BTF type of
+ * arguments, we resolve it to a known kfunc_ptr_arg_type.
+ */
+ if (btf_get_prog_ctx_type(&env->log, meta->btf, t, resolve_prog_type(env->prog), argno))
+ return KF_ARG_PTR_TO_CTX;
+
+ if (is_kfunc_arg_alloc_obj(meta->btf, &args[argno]))
+ return KF_ARG_PTR_TO_ALLOC_BTF_ID;
+
+ if (is_kfunc_arg_kptr_get(meta, argno)) {
+ if (!btf_type_is_ptr(ref_t)) {
+ verbose(env, "arg#0 BTF type must be a double pointer for kptr_get kfunc\n");
+ return -EINVAL;
+ }
+ /* step through the second pointer level to the pointed-to struct */
+ ref_t = btf_type_by_id(meta->btf, ref_t->type);
+ ref_tname = btf_name_by_offset(meta->btf, ref_t->name_off);
+ if (!btf_type_is_struct(ref_t)) {
+ verbose(env, "kernel function %s args#0 pointer type %s %s is not supported\n",
+ meta->func_name, btf_type_str(ref_t), ref_tname);
+ return -EINVAL;
+ }
+ return KF_ARG_PTR_TO_KPTR;
+ }
+
+ if (is_kfunc_arg_dynptr(meta->btf, &args[argno]))
+ return KF_ARG_PTR_TO_DYNPTR;
+
+ if (is_kfunc_arg_list_head(meta->btf, &args[argno]))
+ return KF_ARG_PTR_TO_LIST_HEAD;
+
+ if (is_kfunc_arg_list_node(meta->btf, &args[argno]))
+ return KF_ARG_PTR_TO_LIST_NODE;
+
+ if (is_kfunc_arg_rbtree_root(meta->btf, &args[argno]))
+ return KF_ARG_PTR_TO_RB_ROOT;
+
+ if (is_kfunc_arg_rbtree_node(meta->btf, &args[argno]))
+ return KF_ARG_PTR_TO_RB_NODE;
+
+ /* unlike the checks above, this one looks at the register's type */
+ if ((base_type(reg->type) == PTR_TO_BTF_ID || reg2btf_ids[base_type(reg->type)])) {
+ if (!btf_type_is_struct(ref_t)) {
+ verbose(env, "kernel function %s args#%d pointer type %s %s is not supported\n",
+ meta->func_name, argno, btf_type_str(ref_t), ref_tname);
+ return -EINVAL;
+ }
+ return KF_ARG_PTR_TO_BTF_ID;
+ }
+
+ if (is_kfunc_arg_callback(env, meta->btf, &args[argno]))
+ return KF_ARG_PTR_TO_CALLBACK;
+
+ /* a following "__sz" scalar argument gives this pointer's size */
+ if (argno + 1 < nargs && is_kfunc_arg_mem_size(meta->btf, &args[argno + 1], &regs[regno + 1]))
+ arg_mem_size = true;
+
+ /* This is the catch all argument type of register types supported by
+ * check_helper_mem_access. However, we only allow when argument type is
+ * pointer to scalar, or struct composed (recursively) of scalars. When
+ * arg_mem_size is true, the pointer can be void *.
+ */
+ if (!btf_type_is_scalar(ref_t) && !__btf_type_is_scalar_struct(env, meta->btf, ref_t, 0) &&
+ (arg_mem_size ? !btf_type_is_void(ref_t) : 1)) {
+ verbose(env, "arg#%d pointer type %s %s must point to %sscalar, or struct with scalar\n",
+ argno, btf_type_str(ref_t), ref_tname, arg_mem_size ? "void, " : "");
+ return -EINVAL;
+ }
+ return arg_mem_size ? KF_ARG_PTR_TO_MEM_SIZE : KF_ARG_PTR_TO_MEM;
+}
+
+/* Check that the BTF type carried by @reg is acceptable for kfunc argument
+ * @argno, whose expected type is @ref_t / @ref_id (named @ref_tname) in
+ * meta->btf.
+ *
+ * For PTR_TO_BTF_ID registers the register's own btf / btf_id are used;
+ * otherwise the type is taken from reg2btf_ids[] and resolved in btf_vmlinux.
+ *
+ * Returns 0 when btf_struct_ids_match() accepts the pair, -EINVAL (after
+ * logging the expected and actual types) otherwise.
+ */
+static int process_kf_arg_ptr_to_btf_id(struct bpf_verifier_env *env,
+ struct bpf_reg_state *reg,
+ const struct btf_type *ref_t,
+ const char *ref_tname, u32 ref_id,
+ struct bpf_kfunc_call_arg_meta *meta,
+ int argno)
+{
+ const struct btf_type *reg_ref_t;
+ bool strict_type_match = false;
+ const struct btf *reg_btf;
+ const char *reg_ref_tname;
+ u32 reg_ref_id;
+
+ if (base_type(reg->type) == PTR_TO_BTF_ID) {
+ reg_btf = reg->btf;
+ reg_ref_id = reg->btf_id;
+ } else {
+ /* no NULL check on the reg2btf_ids[] slot here; presumably the
+ * caller only gets here for types that have an entry - confirm
+ */
+ reg_btf = btf_vmlinux;
+ reg_ref_id = *reg2btf_ids[base_type(reg->type)];
+ }
+
+ /* Enforce strict type matching for calls to kfuncs that are acquiring
+ * or releasing a reference, or are no-cast aliases. We do _not_
+ * enforce strict matching for plain KF_TRUSTED_ARGS kfuncs by default,
+ * as we want to enable BPF programs to pass types that are bitwise
+ * equivalent without forcing them to explicitly cast with something
+ * like bpf_cast_to_kern_ctx().
+ *
+ * For example, say we had a type like the following:
+ *
+ * struct bpf_cpumask {
+ * cpumask_t cpumask;
+ * refcount_t usage;
+ * };
+ *
+ * Note that as specified in <linux/cpumask.h>, cpumask_t is typedef'ed
+ * to a struct cpumask, so it would be safe to pass a struct
+ * bpf_cpumask * to a kfunc expecting a struct cpumask *.
+ *
+ * The philosophy here is similar to how we allow scalars of different
+ * types to be passed to kfuncs as long as the size is the same. The
+ * only difference here is that we're simply allowing
+ * btf_struct_ids_match() to walk the struct at the 0th offset, and
+ * resolve types.
+ */
+ if (is_kfunc_acquire(meta) ||
+ (is_kfunc_release(meta) && reg->ref_obj_id) ||
+ btf_type_ids_nocast_alias(&env->log, reg_btf, reg_ref_id, meta->btf, ref_id))
+ strict_type_match = true;
+
+ /* warns once if a KF_TRUSTED_ARGS kfunc gets a register with non-zero off */
+ WARN_ON_ONCE(is_kfunc_trusted_args(meta) && reg->off);
+
+ /* reg_ref_id is updated to the id of the type with modifiers skipped */
+ reg_ref_t = btf_type_skip_modifiers(reg_btf, reg_ref_id, &reg_ref_id);
+ reg_ref_tname = btf_name_by_offset(reg_btf, reg_ref_t->name_off);
+ if (!btf_struct_ids_match(&env->log, reg_btf, reg_ref_id, reg->off, meta->btf, ref_id, strict_type_match)) {
+ verbose(env, "kernel function %s args#%d expected pointer to %s %s but R%d has a pointer to %s %s\n",
+ meta->func_name, argno, btf_type_str(ref_t), ref_tname, argno + 1,
+ btf_type_str(reg_ref_t), reg_ref_tname);
+ return -EINVAL;
+ }
+ return 0;
+}
+
+/* Validate a kptr argument: @reg must point, at a constant offset, to a
+ * referenced kptr field (BPF_KPTR_REF) of its map value, and the kptr's type
+ * must match the type the kfunc expects (@ref_t->type in meta->btf).
+ *
+ * Returns 0 on success, -EINVAL (after logging) on any mismatch.
+ */
+static int process_kf_arg_ptr_to_kptr(struct bpf_verifier_env *env,
+ struct bpf_reg_state *reg,
+ const struct btf_type *ref_t,
+ const char *ref_tname,
+ struct bpf_kfunc_call_arg_meta *meta,
+ int argno)
+{
+ struct btf_field *kptr_field;
+
+ /* check_func_arg_reg_off allows var_off for
+ * PTR_TO_MAP_VALUE, but we need fixed offset to find
+ * the kptr field.
+ */
+ if (!tnum_is_const(reg->var_off)) {
+ verbose(env, "arg#0 must have constant offset\n");
+ return -EINVAL;
+ }
+
+ /* look the field up at the total offset: fixed off plus constant var_off */
+ kptr_field = btf_record_find(reg->map_ptr->record, reg->off + reg->var_off.value, BPF_KPTR);
+ if (!kptr_field || kptr_field->type != BPF_KPTR_REF) {
+ verbose(env, "arg#0 no referenced kptr at map value offset=%llu\n",
+ reg->off + reg->var_off.value);
+ return -EINVAL;
+ }
+
+ /* last argument 'true' requests a strict type match */
+ if (!btf_struct_ids_match(&env->log, meta->btf, ref_t->type, 0, kptr_field->kptr.btf,
+ kptr_field->kptr.btf_id, true)) {
+ verbose(env, "kernel function %s args#%d expected pointer to %s %s\n",
+ meta->func_name, argno, btf_type_str(ref_t), ref_tname);
+ return -EINVAL;
+ }
+ return 0;
+}
+
+/* Flag @reg as a non-owning reference by setting NON_OWN_REF in its type.
+ *
+ * Returns 0 on success. Returns -EFAULT, after logging a verifier internal
+ * error, when no lock is currently active or when @reg already carries
+ * NON_OWN_REF.
+ */
+static int ref_set_non_owning(struct bpf_verifier_env *env, struct bpf_reg_state *reg)
+{
+	/* non-owning references may only be created while a lock is held */
+	if (!env->cur_state->active_lock.ptr) {
+		verbose(env, "verifier internal error: ref_set_non_owning w/o active lock\n");
+		return -EFAULT;
+	}
+
+	if (type_flag(reg->type) & NON_OWN_REF) {
+		verbose(env, "verifier internal error: NON_OWN_REF already set\n");
+		return -EFAULT;
+	}
+
+	reg->type |= NON_OWN_REF;
+	return 0;
+}
+
+/* Turn every register that holds owning reference @ref_obj_id into a
+ * non-owning reference: its ref_obj_id is cleared and NON_OWN_REF is set via
+ * ref_set_non_owning(). The entry in the function's refs[] array is not
+ * removed here.
+ *
+ * Returns 0 when @ref_obj_id is found among the current function's acquired
+ * references; -EFAULT (after logging) when @ref_obj_id is zero or no such
+ * reference state exists.
+ *
+ * NOTE(review): the return value of ref_set_non_owning() is ignored inside
+ * the register walk, so its failures do not affect the result - confirm
+ * that this is intended.
+ */
+static int ref_convert_owning_non_owning(struct bpf_verifier_env *env, u32 ref_obj_id)
+{
+ struct bpf_func_state *state, *unused;
+ struct bpf_reg_state *reg;
+ int i;
+
+ state = cur_func(env);
+
+ if (!ref_obj_id) {
+ verbose(env, "verifier internal error: ref_obj_id is zero for "
+ "owning -> non-owning conversion\n");
+ return -EFAULT;
+ }
+
+ for (i = 0; i < state->acquired_refs; i++) {
+ if (state->refs[i].id != ref_obj_id)
+ continue;
+
+ /* Clear ref_obj_id here so release_reference doesn't clobber
+ * the whole reg
+ */
+ bpf_for_each_reg_in_vstate(env->cur_state, unused, reg, ({
+ if (reg->ref_obj_id == ref_obj_id) {
+ reg->ref_obj_id = 0;
+ ref_set_non_owning(env, reg);
+ }
+ }));
+ return 0;
+ }
+
+ verbose(env, "verifier internal error: ref state missing for ref_obj_id\n");
+ return -EFAULT;
+}
+
+/* Implementation details:
+ *
+ * Each register points to some region of memory, which we define as an
+ * allocation. Each allocation may embed a bpf_spin_lock which protects any
+ * special BPF objects (bpf_list_head, bpf_rb_root, etc.) part of the same
+ * allocation. The lock and the data it protects are colocated in the same
+ * memory region.
+ *
+ * Hence, every time a register holds a pointer value pointing to such
+ * allocation, the verifier preserves a unique reg->id for it.
+ *
+ * The verifier remembers the lock 'ptr' and the lock 'id' whenever
+ * bpf_spin_lock is called.
+ *
+ * To enable this, lock state in the verifier captures two values:
+ * active_lock.ptr = Register's type specific pointer
+ * active_lock.id = A unique ID for each register pointer value
+ *
+ * Currently, PTR_TO_MAP_VALUE and PTR_TO_BTF_ID | MEM_ALLOC are the two
+ * supported register types.
+ *
+ * The active_lock.ptr in case of map values is the reg->map_ptr, and in case of
+ * allocated objects is the reg->btf pointer.
+ *
+ * The active_lock.id is non-unique for maps supporting direct_value_addr, as we
+ * can establish the provenance of the map value statically for each distinct
+ * lookup into such maps. They always contain a single map value hence unique
+ * IDs for each pseudo load pessimizes the algorithm and rejects valid programs.
+ *
+ * So, in case of global variables, they use array maps with max_entries = 1,
+ * hence their active_lock.ptr becomes map_ptr and id = 0 (since they all point
+ * into the same map value as max_entries is 1, as described above).
+ *
+ * In case of inner map lookups, the inner map pointer has same map_ptr as the
+ * outer map pointer (in verifier context), but each lookup into an inner map
+ * assigns a fresh reg->id to the lookup, so while lookups into distinct inner
+ * maps from the same outer map share the same map_ptr as active_lock.ptr, they
+ * will get different reg->id assigned to each lookup, hence different
+ * active_lock.id.
+ *
+ * In case of allocated objects, active_lock.ptr is the reg->btf, and the
+ * reg->id is a unique ID preserved after the NULL pointer check on the pointer
+ * returned from bpf_obj_new. Each allocation receives a new reg->id.
+ *
+ * Return values of check_reg_allocation_locked():
+ * 0 - the active lock belongs to the same allocation as @reg
+ * -EINVAL - no lock is held (not logged here), or the held lock and @reg
+ * are in different allocations (logged)
+ * -EFAULT - @reg has a register type this check does not support (logged)
+ */
+static int check_reg_allocation_locked(struct bpf_verifier_env *env, struct bpf_reg_state *reg)
+{
+ void *ptr;
+ u32 id;
+
+ /* the switch matches the full type including flags, not the base type */
+ switch ((int)reg->type) {
+ case PTR_TO_MAP_VALUE:
+ ptr = reg->map_ptr;
+ break;
+ case PTR_TO_BTF_ID | MEM_ALLOC:
+ case PTR_TO_BTF_ID | MEM_ALLOC | PTR_TRUSTED:
+ ptr = reg->btf;
+ break;
+ default:
+ verbose(env, "verifier internal error: unknown reg type for lock check\n");
+ return -EFAULT;
+ }
+ id = reg->id;
+
+ /* no lock held at all: fail without logging here */
+ if (!env->cur_state->active_lock.ptr)
+ return -EINVAL;
+ if (env->cur_state->active_lock.ptr != ptr ||
+ env->cur_state->active_lock.id != id) {
+ verbose(env, "held lock and object are not in the same allocation\n");
+ return -EINVAL;
+ }
+ return 0;
+}
+
+/* True when @btf_id is the BTF id of one of the linked-list kfuncs:
+ * bpf_list_push_front, bpf_list_push_back, bpf_list_pop_front or
+ * bpf_list_pop_back.
+ */
+static bool is_bpf_list_api_kfunc(u32 btf_id)
+{
+	if (btf_id == special_kfunc_list[KF_bpf_list_push_front])
+		return true;
+	if (btf_id == special_kfunc_list[KF_bpf_list_push_back])
+		return true;
+	if (btf_id == special_kfunc_list[KF_bpf_list_pop_front])
+		return true;
+
+	return btf_id == special_kfunc_list[KF_bpf_list_pop_back];
+}
+
+/* True when @btf_id is the BTF id of one of the rbtree kfuncs:
+ * bpf_rbtree_add, bpf_rbtree_remove or bpf_rbtree_first.
+ */
+static bool is_bpf_rbtree_api_kfunc(u32 btf_id)
+{
+	if (btf_id == special_kfunc_list[KF_bpf_rbtree_add])
+		return true;
+	if (btf_id == special_kfunc_list[KF_bpf_rbtree_remove])
+		return true;
+
+	return btf_id == special_kfunc_list[KF_bpf_rbtree_first];
+}
+
+/* True when @btf_id is either a linked-list kfunc or an rbtree kfunc. */
+static bool is_bpf_graph_api_kfunc(u32 btf_id)
+{
+ return is_bpf_list_api_kfunc(btf_id) || is_bpf_rbtree_api_kfunc(btf_id);
+}
+
+/* True when @btf_id is a kfunc marked as callback-calling; bpf_rbtree_add is
+ * the only kfunc recognized here.
+ */
+static bool is_callback_calling_kfunc(u32 btf_id)
+{
+ return btf_id == special_kfunc_list[KF_bpf_rbtree_add];
+}
+
+/* True when @btf_id is an rbtree kfunc; currently identical to
+ * is_bpf_rbtree_api_kfunc().
+ */
+static bool is_rbtree_lock_required_kfunc(u32 btf_id)
+{
+ return is_bpf_rbtree_api_kfunc(btf_id);
+}
+
+/* Check that kfunc @kfunc_btf_id belongs to the API family matching the graph
+ * root type @head_field_type: list kfuncs for BPF_LIST_HEAD, rbtree kfuncs
+ * for BPF_RB_ROOT.
+ *
+ * Returns true on a match. Returns false, after logging a verifier internal
+ * error, for any other root type or when the kfunc is not in the family.
+ */
+static bool check_kfunc_is_graph_root_api(struct bpf_verifier_env *env,
+					  enum btf_field_type head_field_type,
+					  u32 kfunc_btf_id)
+{
+	bool matches;
+
+	if (head_field_type == BPF_LIST_HEAD) {
+		matches = is_bpf_list_api_kfunc(kfunc_btf_id);
+	} else if (head_field_type == BPF_RB_ROOT) {
+		matches = is_bpf_rbtree_api_kfunc(kfunc_btf_id);
+	} else {
+		verbose(env, "verifier internal error: unexpected graph root argument type %s\n",
+			btf_field_type_name(head_field_type));
+		return false;
+	}
+
+	if (matches)
+		return true;
+
+	verbose(env, "verifier internal error: %s head arg for unknown kfunc\n",
+		btf_field_type_name(head_field_type));
+	return false;
+}
+
+/* Check that kfunc @kfunc_btf_id is one that takes a node of type
+ * @node_field_type: bpf_list_push_front / bpf_list_push_back for
+ * BPF_LIST_NODE, bpf_rbtree_remove / bpf_rbtree_add for BPF_RB_NODE.
+ *
+ * Returns true on a match. Returns false, after logging a verifier internal
+ * error, for any other node type or when the kfunc does not match.
+ */
+static bool check_kfunc_is_graph_node_api(struct bpf_verifier_env *env,
+					  enum btf_field_type node_field_type,
+					  u32 kfunc_btf_id)
+{
+	bool matches;
+
+	if (node_field_type == BPF_LIST_NODE) {
+		matches = kfunc_btf_id == special_kfunc_list[KF_bpf_list_push_front] ||
+			  kfunc_btf_id == special_kfunc_list[KF_bpf_list_push_back];
+	} else if (node_field_type == BPF_RB_NODE) {
+		matches = kfunc_btf_id == special_kfunc_list[KF_bpf_rbtree_remove] ||
+			  kfunc_btf_id == special_kfunc_list[KF_bpf_rbtree_add];
+	} else {
+		verbose(env, "verifier internal error: unexpected graph node argument type %s\n",
+			btf_field_type_name(node_field_type));
+		return false;
+	}
+
+	if (matches)
+		return true;
+
+	verbose(env, "verifier internal error: %s node arg for unknown kfunc\n",
+		btf_field_type_name(node_field_type));
+	return false;
+}
+
+/* Validate a graph root argument (bpf_list_head or bpf_rb_root, selected by
+ * @head_field_type) passed in @reg / R@regno, and store the btf_field found
+ * for it in *@head_field.
+ *
+ * Checks, in order: the kfunc comes from btf_vmlinux; the kfunc belongs to the
+ * API family of @head_field_type; @reg has a constant offset; a field of
+ * @head_field_type exists at that offset in the register's btf_record; the
+ * lock of the same allocation is held; *@head_field is not already set.
+ *
+ * Returns 0 on success, -EINVAL for program errors and -EFAULT for verifier
+ * internal errors (both logged).
+ */
+static int
+__process_kf_arg_ptr_to_graph_root(struct bpf_verifier_env *env,
+ struct bpf_reg_state *reg, u32 regno,
+ struct bpf_kfunc_call_arg_meta *meta,
+ enum btf_field_type head_field_type,
+ struct btf_field **head_field)
+{
+ const char *head_type_name;
+ struct btf_field *field;
+ struct btf_record *rec;
+ u32 head_off;
+
+ if (meta->btf != btf_vmlinux) {
+ verbose(env, "verifier internal error: unexpected btf mismatch in kfunc call\n");
+ return -EFAULT;
+ }
+
+ if (!check_kfunc_is_graph_root_api(env, head_field_type, meta->func_id))
+ return -EFAULT;
+
+ head_type_name = btf_field_type_name(head_field_type);
+ if (!tnum_is_const(reg->var_off)) {
+ verbose(env,
+ "R%d doesn't have constant offset. %s has to be at the constant offset\n",
+ regno, head_type_name);
+ return -EINVAL;
+ }
+
+ rec = reg_btf_record(reg);
+ /* total offset: fixed off plus the (constant) variable offset */
+ head_off = reg->off + reg->var_off.value;
+ field = btf_record_find(rec, head_off, head_field_type);
+ if (!field) {
+ verbose(env, "%s not found at offset=%u\n", head_type_name, head_off);
+ return -EINVAL;
+ }
+
+ /* All functions require the graph root (bpf_list_head or bpf_rb_root)
+ * to be protected using a bpf_spin_lock
+ */
+ if (check_reg_allocation_locked(env, reg)) {
+ verbose(env, "bpf_spin_lock at off=%d must be held for %s\n",
+ rec->spin_lock_off, head_type_name);
+ return -EINVAL;
+ }
+
+ /* the output slot must still be empty: only one root arg per call */
+ if (*head_field) {
+ verbose(env, "verifier internal error: repeating %s arg\n", head_type_name);
+ return -EFAULT;
+ }
+ *head_field = field;
+ return 0;
+}
+
+/* Validate a bpf_list_head argument; on success the matching field is stored
+ * in meta->arg_list_head.field. Return values are those of
+ * __process_kf_arg_ptr_to_graph_root().
+ */
+static int process_kf_arg_ptr_to_list_head(struct bpf_verifier_env *env,
+ struct bpf_reg_state *reg, u32 regno,
+ struct bpf_kfunc_call_arg_meta *meta)
+{
+ return __process_kf_arg_ptr_to_graph_root(env, reg, regno, meta, BPF_LIST_HEAD,
+ &meta->arg_list_head.field);
+}
+
+/* Validate a bpf_rb_root argument; on success the matching field is stored
+ * in meta->arg_rbtree_root.field. Return values are those of
+ * __process_kf_arg_ptr_to_graph_root().
+ */
+static int process_kf_arg_ptr_to_rbtree_root(struct bpf_verifier_env *env,
+ struct bpf_reg_state *reg, u32 regno,
+ struct bpf_kfunc_call_arg_meta *meta)
+{
+ return __process_kf_arg_ptr_to_graph_root(env, reg, regno, meta, BPF_RB_ROOT,
+ &meta->arg_rbtree_root.field);
+}
+
+/*
+ * Validate a kfunc argument that must point at a graph node field
+ * (@node_field_type) and check it against the graph root already recorded in
+ * @meta.
+ *
+ * Despite its name, @node_field is the root-field slot: the list-node and
+ * rbtree-node wrappers pass &meta->arg_list_head.field or
+ * &meta->arg_rbtree_root.field, the same slots the root processing fills in.
+ *
+ * Returns 0 when the node field exists at the register's constant offset, the
+ * register's type matches the root's value type, and the offset equals the
+ * node offset the root expects. Returns -EFAULT for verifier-internal
+ * inconsistencies and -EINVAL for errors in the program being verified.
+ */
+static int
+__process_kf_arg_ptr_to_graph_node(struct bpf_verifier_env *env,
+ struct bpf_reg_state *reg, u32 regno,
+ struct bpf_kfunc_call_arg_meta *meta,
+ enum btf_field_type head_field_type,
+ enum btf_field_type node_field_type,
+ struct btf_field **node_field)
+{
+ const char *node_type_name;
+ const struct btf_type *et, *t;
+ struct btf_field *field;
+ u32 node_off;
+
+ if (meta->btf != btf_vmlinux) {
+ verbose(env, "verifier internal error: unexpected btf mismatch in kfunc call\n");
+ return -EFAULT;
+ }
+
+ if (!check_kfunc_is_graph_node_api(env, node_field_type, meta->func_id))
+ return -EFAULT;
+
+ node_type_name = btf_field_type_name(node_field_type);
+ if (!tnum_is_const(reg->var_off)) {
+ verbose(env,
+ "R%d doesn't have constant offset. %s has to be at the constant offset\n",
+ regno, node_type_name);
+ return -EINVAL;
+ }
+
+ /* The node field must start exactly at the register's offset. */
+ node_off = reg->off + reg->var_off.value;
+ field = reg_find_field_offset(reg, node_off, node_field_type);
+ if (!field || field->offset != node_off) {
+ verbose(env, "%s not found at offset=%u\n", node_type_name, node_off);
+ return -EINVAL;
+ }
+
+ /* From here on 'field' is the graph root field, not the node field found
+ * above.
+ * NOTE(review): dereferenced below without a NULL check - presumably the
+ * root argument is always processed before the node argument; confirm.
+ */
+ field = *node_field;
+
+ /* et: value type declared for the root; t: type the register points to. */
+ et = btf_type_by_id(field->graph_root.btf, field->graph_root.value_btf_id);
+ t = btf_type_by_id(reg->btf, reg->btf_id);
+ if (!btf_struct_ids_match(&env->log, reg->btf, reg->btf_id, 0, field->graph_root.btf,
+ field->graph_root.value_btf_id, true)) {
+ verbose(env, "operation on %s expects arg#1 %s at offset=%d "
+ "in struct %s, but arg is at offset=%d in struct %s\n",
+ btf_field_type_name(head_field_type),
+ btf_field_type_name(node_field_type),
+ field->graph_root.node_offset,
+ btf_name_by_offset(field->graph_root.btf, et->name_off),
+ node_off, btf_name_by_offset(reg->btf, t->name_off));
+ return -EINVAL;
+ }
+
+ /* Types match; the node must also sit at the offset the root expects. */
+ if (node_off != field->graph_root.node_offset) {
+ verbose(env, "arg#1 offset=%d, but expected %s at offset=%d in struct %s\n",
+ node_off, btf_field_type_name(node_field_type),
+ field->graph_root.node_offset,
+ btf_name_by_offset(field->graph_root.btf, et->name_off));
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+/* List flavour of the graph node check: expects a BPF_LIST_NODE field and
+ * compares it against the list head stored in meta->arg_list_head.field.
+ */
+static int process_kf_arg_ptr_to_list_node(struct bpf_verifier_env *env,
+ struct bpf_reg_state *reg, u32 regno,
+ struct bpf_kfunc_call_arg_meta *meta)
+{
+ return __process_kf_arg_ptr_to_graph_node(env, reg, regno, meta,
+ BPF_LIST_HEAD, BPF_LIST_NODE,
+ &meta->arg_list_head.field);
+}
+
+/* Rbtree flavour of the graph node check: expects a BPF_RB_NODE field and
+ * compares it against the rbtree root stored in meta->arg_rbtree_root.field.
+ */
+static int process_kf_arg_ptr_to_rbtree_node(struct bpf_verifier_env *env,
+ struct bpf_reg_state *reg, u32 regno,
+ struct bpf_kfunc_call_arg_meta *meta)
+{
+ return __process_kf_arg_ptr_to_graph_node(env, reg, regno, meta,
+ BPF_RB_ROOT, BPF_RB_NODE,
+ &meta->arg_rbtree_root.field);
+}
+
+/*
+ * Check every argument of a kfunc call against the kfunc's BTF prototype.
+ *
+ * Argument i of the prototype is checked against register i + 1 of the
+ * current frame. Scalars are handled inline; pointers are classified by
+ * get_kfunc_ptr_arg_type() and then validated per class. Facts discovered on
+ * the way (constant argument, return buffer size, reference to release, graph
+ * root fields, callback subprog, ...) are stored in @meta.
+ *
+ * Returns 0 on success or a negative errno: -EFAULT for verifier-internal
+ * inconsistencies, -EINVAL/-EACCES for errors in the program, or whatever a
+ * helper returned.
+ */
+static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_arg_meta *meta)
+{
+ const char *func_name = meta->func_name, *ref_tname;
+ const struct btf *btf = meta->btf;
+ const struct btf_param *args;
+ u32 i, nargs;
+ int ret;
+
+ /* The parameter array is laid out directly after the func_proto type. */
+ args = (const struct btf_param *)(meta->func_proto + 1);
+ nargs = btf_type_vlen(meta->func_proto);
+ if (nargs > MAX_BPF_FUNC_REG_ARGS) {
+ verbose(env, "Function %s has %d > %d args\n", func_name, nargs,
+ MAX_BPF_FUNC_REG_ARGS);
+ return -EINVAL;
+ }
+
+ /* Check that BTF function arguments match actual types that the
+ * verifier sees.
+ */
+ for (i = 0; i < nargs; i++) {
+ struct bpf_reg_state *regs = cur_regs(env), *reg = &regs[i + 1];
+ const struct btf_type *t, *ref_t, *resolve_ret;
+ enum bpf_arg_type arg_type = ARG_DONTCARE;
+ u32 regno = i + 1, ref_id, type_size;
+ bool is_ret_buf_sz = false;
+ int kf_arg_type;
+
+ t = btf_type_skip_modifiers(btf, args[i].type, NULL);
+
+ if (is_kfunc_arg_ignore(btf, &args[i]))
+ continue;
+
+ /* Scalar parameters: the register must be a scalar; constant and
+ * buffer-size parameters additionally need a known value.
+ */
+ if (btf_type_is_scalar(t)) {
+ if (reg->type != SCALAR_VALUE) {
+ verbose(env, "R%d is not a scalar\n", regno);
+ return -EINVAL;
+ }
+
+ if (is_kfunc_arg_constant(meta->btf, &args[i])) {
+ if (meta->arg_constant.found) {
+ verbose(env, "verifier internal error: only one constant argument permitted\n");
+ return -EFAULT;
+ }
+ if (!tnum_is_const(reg->var_off)) {
+ verbose(env, "R%d must be a known constant\n", regno);
+ return -EINVAL;
+ }
+ ret = mark_chain_precision(env, regno);
+ if (ret < 0)
+ return ret;
+ meta->arg_constant.found = true;
+ meta->arg_constant.value = reg->var_off.value;
+ } else if (is_kfunc_arg_scalar_with_name(btf, &args[i], "rdonly_buf_size")) {
+ meta->r0_rdonly = true;
+ is_ret_buf_sz = true;
+ } else if (is_kfunc_arg_scalar_with_name(btf, &args[i], "rdwr_buf_size")) {
+ is_ret_buf_sz = true;
+ }
+
+ /* A buffer-size parameter supplies meta->r0_size; a non-zero
+ * r0_size is taken to mean one was already seen.
+ */
+ if (is_ret_buf_sz) {
+ if (meta->r0_size) {
+ /* NOTE(review): unlike the other messages in this function,
+ * this one has no trailing \n.
+ */
+ verbose(env, "2 or more rdonly/rdwr_buf_size parameters for kfunc");
+ return -EINVAL;
+ }
+
+ if (!tnum_is_const(reg->var_off)) {
+ verbose(env, "R%d is not a const\n", regno);
+ return -EINVAL;
+ }
+
+ meta->r0_size = reg->var_off.value;
+ ret = mark_chain_precision(env, regno);
+ if (ret)
+ return ret;
+ }
+ continue;
+ }
+
+ /* Anything that is neither a scalar nor a pointer is rejected. */
+ if (!btf_type_is_ptr(t)) {
+ verbose(env, "Unrecognized arg#%d type %s\n", i, btf_type_str(t));
+ return -EINVAL;
+ }
+
+ if (is_kfunc_trusted_args(meta) &&
+ (register_is_null(reg) || type_may_be_null(reg->type))) {
+ verbose(env, "Possibly NULL pointer passed to trusted arg%d\n", i);
+ return -EACCES;
+ }
+
+ /* Record the reference carried by this argument. A release kfunc may
+ * have only one referenced argument, whose register number is kept in
+ * meta->release_regno; for other kfuncs a later referenced argument
+ * simply overwrites meta->ref_obj_id.
+ */
+ if (reg->ref_obj_id) {
+ if (is_kfunc_release(meta) && meta->ref_obj_id) {
+ verbose(env, "verifier internal error: more than one arg with ref_obj_id R%d %u %u\n",
+ regno, reg->ref_obj_id,
+ meta->ref_obj_id);
+ return -EFAULT;
+ }
+ meta->ref_obj_id = reg->ref_obj_id;
+ if (is_kfunc_release(meta))
+ meta->release_regno = regno;
+ }
+
+ /* ref_t is the pointee type with modifiers stripped; ref_id its BTF id. */
+ ref_t = btf_type_skip_modifiers(btf, t->type, &ref_id);
+ ref_tname = btf_name_by_offset(btf, ref_t->name_off);
+
+ kf_arg_type = get_kfunc_ptr_arg_type(env, meta, t, ref_t, ref_tname, args, i, nargs);
+ if (kf_arg_type < 0)
+ return kf_arg_type;
+
+ /* First pass over the class: enforce trusted/RCU requirements and
+ * decide whether check_func_arg_reg_off() below should apply the
+ * OBJ_RELEASE offset rules.
+ */
+ switch (kf_arg_type) {
+ case KF_ARG_PTR_TO_ALLOC_BTF_ID:
+ case KF_ARG_PTR_TO_BTF_ID:
+ /* Only kfuncs flagged as trusted-args or RCU need the checks below. */
+ if (!is_kfunc_trusted_args(meta) && !is_kfunc_rcu(meta))
+ break;
+
+ /* An untrusted register is tolerated only for an RCU kfunc, and
+ * then only if it is an RCU pointer.
+ */
+ if (!is_trusted_reg(reg)) {
+ if (!is_kfunc_rcu(meta)) {
+ verbose(env, "R%d must be referenced or trusted\n", regno);
+ return -EINVAL;
+ }
+ if (!is_rcu_reg(reg)) {
+ verbose(env, "R%d must be a rcu pointer\n", regno);
+ return -EINVAL;
+ }
+ }
+
+ fallthrough;
+ case KF_ARG_PTR_TO_CTX:
+ /* Trusted arguments have the same offset checks as release arguments */
+ arg_type |= OBJ_RELEASE;
+ break;
+ case KF_ARG_PTR_TO_KPTR:
+ case KF_ARG_PTR_TO_DYNPTR:
+ case KF_ARG_PTR_TO_LIST_HEAD:
+ case KF_ARG_PTR_TO_LIST_NODE:
+ case KF_ARG_PTR_TO_RB_ROOT:
+ case KF_ARG_PTR_TO_RB_NODE:
+ case KF_ARG_PTR_TO_MEM:
+ case KF_ARG_PTR_TO_MEM_SIZE:
+ case KF_ARG_PTR_TO_CALLBACK:
+ /* Trusted by default */
+ break;
+ default:
+ WARN_ON_ONCE(1);
+ return -EFAULT;
+ }
+
+ if (is_kfunc_release(meta) && reg->ref_obj_id)
+ arg_type |= OBJ_RELEASE;
+ ret = check_func_arg_reg_off(env, reg, regno, arg_type);
+ if (ret < 0)
+ return ret;
+
+ /* Second pass over the class: register-type checks and class-specific
+ * processing.
+ */
+ switch (kf_arg_type) {
+ case KF_ARG_PTR_TO_CTX:
+ if (reg->type != PTR_TO_CTX) {
+ verbose(env, "arg#%d expected pointer to ctx, but got %s\n", i, btf_type_str(t));
+ return -EINVAL;
+ }
+
+ /* bpf_cast_to_kern_ctx: remember the BTF id looked up for the
+ * program type in meta->ret_btf_id.
+ */
+ if (meta->func_id == special_kfunc_list[KF_bpf_cast_to_kern_ctx]) {
+ ret = get_kern_ctx_btf_id(&env->log, resolve_prog_type(env->prog));
+ if (ret < 0)
+ return -EINVAL;
+ meta->ret_btf_id = ret;
+ }
+ break;
+ case KF_ARG_PTR_TO_ALLOC_BTF_ID:
+ if (reg->type != (PTR_TO_BTF_ID | MEM_ALLOC)) {
+ verbose(env, "arg#%d expected pointer to allocated object\n", i);
+ return -EINVAL;
+ }
+ if (!reg->ref_obj_id) {
+ verbose(env, "allocated object must be referenced\n");
+ return -EINVAL;
+ }
+ /* bpf_obj_drop_impl: remember the type of the object being dropped. */
+ if (meta->btf == btf_vmlinux &&
+ meta->func_id == special_kfunc_list[KF_bpf_obj_drop_impl]) {
+ meta->arg_obj_drop.btf = reg->btf;
+ meta->arg_obj_drop.btf_id = reg->btf_id;
+ }
+ break;
+ case KF_ARG_PTR_TO_KPTR:
+ if (reg->type != PTR_TO_MAP_VALUE) {
+ /* NOTE(review): message hard-codes "arg#0" instead of printing i
+ * like the sibling cases.
+ */
+ verbose(env, "arg#0 expected pointer to map value\n");
+ return -EINVAL;
+ }
+ ret = process_kf_arg_ptr_to_kptr(env, reg, ref_t, ref_tname, meta, i);
+ if (ret < 0)
+ return ret;
+ break;
+ case KF_ARG_PTR_TO_DYNPTR:
+ if (reg->type != PTR_TO_STACK &&
+ reg->type != CONST_PTR_TO_DYNPTR) {
+ verbose(env, "arg#%d expected pointer to stack or dynptr_ptr\n", i);
+ return -EINVAL;
+ }
+
+ ret = process_dynptr_func(env, regno, ARG_PTR_TO_DYNPTR | MEM_RDONLY, NULL);
+ if (ret < 0)
+ return ret;
+ break;
+ case KF_ARG_PTR_TO_LIST_HEAD:
+ /* The root may live in a map value or in a referenced allocated object. */
+ if (reg->type != PTR_TO_MAP_VALUE &&
+ reg->type != (PTR_TO_BTF_ID | MEM_ALLOC)) {
+ verbose(env, "arg#%d expected pointer to map value or allocated object\n", i);
+ return -EINVAL;
+ }
+ if (reg->type == (PTR_TO_BTF_ID | MEM_ALLOC) && !reg->ref_obj_id) {
+ verbose(env, "allocated object must be referenced\n");
+ return -EINVAL;
+ }
+ ret = process_kf_arg_ptr_to_list_head(env, reg, regno, meta);
+ if (ret < 0)
+ return ret;
+ break;
+ case KF_ARG_PTR_TO_RB_ROOT:
+ /* Same register requirements as for a list head. */
+ if (reg->type != PTR_TO_MAP_VALUE &&
+ reg->type != (PTR_TO_BTF_ID | MEM_ALLOC)) {
+ verbose(env, "arg#%d expected pointer to map value or allocated object\n", i);
+ return -EINVAL;
+ }
+ if (reg->type == (PTR_TO_BTF_ID | MEM_ALLOC) && !reg->ref_obj_id) {
+ verbose(env, "allocated object must be referenced\n");
+ return -EINVAL;
+ }
+ ret = process_kf_arg_ptr_to_rbtree_root(env, reg, regno, meta);
+ if (ret < 0)
+ return ret;
+ break;
+ case KF_ARG_PTR_TO_LIST_NODE:
+ if (reg->type != (PTR_TO_BTF_ID | MEM_ALLOC)) {
+ verbose(env, "arg#%d expected pointer to allocated object\n", i);
+ return -EINVAL;
+ }
+ if (!reg->ref_obj_id) {
+ verbose(env, "allocated object must be referenced\n");
+ return -EINVAL;
+ }
+ ret = process_kf_arg_ptr_to_list_node(env, reg, regno, meta);
+ if (ret < 0)
+ return ret;
+ break;
+ case KF_ARG_PTR_TO_RB_NODE:
+ /* bpf_rbtree_remove takes a non-owning reference (no ref_obj_id)
+ * and is refused inside an rbtree callback; every other rbtree
+ * node kfunc takes a referenced allocated object.
+ */
+ if (meta->func_id == special_kfunc_list[KF_bpf_rbtree_remove]) {
+ if (!type_is_non_owning_ref(reg->type) || reg->ref_obj_id) {
+ verbose(env, "rbtree_remove node input must be non-owning ref\n");
+ return -EINVAL;
+ }
+ if (in_rbtree_lock_required_cb(env)) {
+ verbose(env, "rbtree_remove not allowed in rbtree cb\n");
+ return -EINVAL;
+ }
+ } else {
+ if (reg->type != (PTR_TO_BTF_ID | MEM_ALLOC)) {
+ verbose(env, "arg#%d expected pointer to allocated object\n", i);
+ return -EINVAL;
+ }
+ if (!reg->ref_obj_id) {
+ verbose(env, "allocated object must be referenced\n");
+ return -EINVAL;
+ }
+ }
+
+ ret = process_kf_arg_ptr_to_rbtree_node(env, reg, regno, meta);
+ if (ret < 0)
+ return ret;
+ break;
+ case KF_ARG_PTR_TO_BTF_ID:
+ /* Only base_type is checked, further checks are done here */
+ /* Accepted when the register is a PTR_TO_BTF_ID without unsafe
+ * modifiers (RCU registers excepted), or when its base type has
+ * an entry in reg2btf_ids.
+ */
+ if ((base_type(reg->type) != PTR_TO_BTF_ID ||
+ (bpf_type_has_unsafe_modifiers(reg->type) && !is_rcu_reg(reg))) &&
+ !reg2btf_ids[base_type(reg->type)]) {
+ verbose(env, "arg#%d is %s ", i, reg_type_str(env, reg->type));
+ verbose(env, "expected %s or socket\n",
+ reg_type_str(env, base_type(reg->type) |
+ (type_flag(reg->type) & BPF_REG_TRUSTED_MODIFIERS)));
+ return -EINVAL;
+ }
+ ret = process_kf_arg_ptr_to_btf_id(env, reg, ref_t, ref_tname, ref_id, meta, i);
+ if (ret < 0)
+ return ret;
+ break;
+ case KF_ARG_PTR_TO_MEM:
+ /* Resolve the pointee size, then check the register as memory of
+ * that size.
+ */
+ resolve_ret = btf_resolve_size(btf, ref_t, &type_size);
+ if (IS_ERR(resolve_ret)) {
+ verbose(env, "arg#%d reference type('%s %s') size cannot be determined: %ld\n",
+ i, btf_type_str(ref_t), ref_tname, PTR_ERR(resolve_ret));
+ return -EINVAL;
+ }
+ ret = check_mem_reg(env, reg, regno, type_size);
+ if (ret < 0)
+ return ret;
+ break;
+ case KF_ARG_PTR_TO_MEM_SIZE:
+ /* The check is driven by the following register (regno + 1),
+ * which holds the length paired with this pointer.
+ */
+ ret = check_kfunc_mem_size_reg(env, &regs[regno + 1], regno + 1);
+ if (ret < 0) {
+ verbose(env, "arg#%d arg#%d memory, len pair leads to invalid memory access\n", i, i + 1);
+ return ret;
+ }
+ /* Skip next '__sz' argument */
+ i++;
+ break;
+ case KF_ARG_PTR_TO_CALLBACK:
+ meta->subprogno = reg->subprogno;
+ break;
+ }
+ }
+
+ /* release_regno is only set above for an argument with a ref_obj_id, so
+ * a release kfunc that never received one is rejected here.
+ */
+ if (is_kfunc_release(meta) && !meta->release_regno) {
+ verbose(env, "release kernel function %s expects refcounted PTR_TO_BTF_ID\n",
+ func_name);
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
int *insn_idx_p)
{
const struct btf_type *t, *func, *func_proto, *ptr_type;
+ u32 i, nargs, func_id, ptr_type_id, release_ref_obj_id;
struct bpf_reg_state *regs = cur_regs(env);
- struct bpf_kfunc_arg_meta meta = { 0 };
const char *func_name, *ptr_type_name;
- u32 i, nargs, func_id, ptr_type_id;
+ bool sleepable, rcu_lock, rcu_unlock;
+ struct bpf_kfunc_call_arg_meta meta;
int err, insn_idx = *insn_idx_p;
const struct btf_param *args;
+ const struct btf_type *ret_t;
struct btf *desc_btf;
u32 *kfunc_flags;
- bool acq;
/* skip for now, but return error when we find this in fixup_kfunc_call */
if (!insn->imm)
@@ -7700,24 +9806,68 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
func_name);
return -EACCES;
}
- if (*kfunc_flags & KF_DESTRUCTIVE && !capable(CAP_SYS_BOOT)) {
- verbose(env, "destructive kfunc calls require CAP_SYS_BOOT capabilities\n");
+
+ /* Prepare kfunc call metadata */
+ memset(&meta, 0, sizeof(meta));
+ meta.btf = desc_btf;
+ meta.func_id = func_id;
+ meta.kfunc_flags = *kfunc_flags;
+ meta.func_proto = func_proto;
+ meta.func_name = func_name;
+
+ if (is_kfunc_destructive(&meta) && !capable(CAP_SYS_BOOT)) {
+ verbose(env, "destructive kfunc calls require CAP_SYS_BOOT capability\n");
+ return -EACCES;
+ }
+
+ sleepable = is_kfunc_sleepable(&meta);
+ if (sleepable && !env->prog->aux->sleepable) {
+ verbose(env, "program must be sleepable to call sleepable kfunc %s\n", func_name);
return -EACCES;
}
- acq = *kfunc_flags & KF_ACQUIRE;
+ rcu_lock = is_kfunc_bpf_rcu_read_lock(&meta);
+ rcu_unlock = is_kfunc_bpf_rcu_read_unlock(&meta);
+ if ((rcu_lock || rcu_unlock) && !env->rcu_tag_supported) {
+ verbose(env, "no vmlinux btf rcu tag support for kfunc %s\n", func_name);
+ return -EACCES;
+ }
- meta.flags = *kfunc_flags;
+ if (env->cur_state->active_rcu_lock) {
+ struct bpf_func_state *state;
+ struct bpf_reg_state *reg;
+
+ if (rcu_lock) {
+ verbose(env, "nested rcu read lock (kernel function %s)\n", func_name);
+ return -EINVAL;
+ } else if (rcu_unlock) {
+ bpf_for_each_reg_in_vstate(env->cur_state, state, reg, ({
+ if (reg->type & MEM_RCU) {
+ reg->type &= ~(MEM_RCU | PTR_MAYBE_NULL);
+ reg->type |= PTR_UNTRUSTED;
+ }
+ }));
+ env->cur_state->active_rcu_lock = false;
+ } else if (sleepable) {
+ verbose(env, "kernel func %s is sleepable within rcu_read_lock region\n", func_name);
+ return -EACCES;
+ }
+ } else if (rcu_lock) {
+ env->cur_state->active_rcu_lock = true;
+ } else if (rcu_unlock) {
+ verbose(env, "unmatched rcu read unlock (kernel function %s)\n", func_name);
+ return -EINVAL;
+ }
/* Check the arguments */
- err = btf_check_kfunc_arg_match(env, desc_btf, func_id, regs, &meta);
+ err = check_kfunc_args(env, &meta);
if (err < 0)
return err;
/* In case of release function, we get register number of refcounted
- * PTR_TO_BTF_ID back from btf_check_kfunc_arg_match, do the release now
+ * PTR_TO_BTF_ID in bpf_kfunc_call_arg_meta, do the release now.
*/
- if (err) {
- err = release_reference(env, regs[err].ref_obj_id);
+ if (meta.release_regno) {
+ err = release_reference(env, regs[meta.release_regno].ref_obj_id);
if (err) {
verbose(env, "kfunc %s#%d reference has not been acquired before\n",
func_name, func_id);
@@ -7725,24 +9875,128 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
}
}
+ if (meta.func_id == special_kfunc_list[KF_bpf_list_push_front] ||
+ meta.func_id == special_kfunc_list[KF_bpf_list_push_back] ||
+ meta.func_id == special_kfunc_list[KF_bpf_rbtree_add]) {
+ release_ref_obj_id = regs[BPF_REG_2].ref_obj_id;
+ err = ref_convert_owning_non_owning(env, release_ref_obj_id);
+ if (err) {
+ verbose(env, "kfunc %s#%d conversion of owning ref to non-owning failed\n",
+ func_name, func_id);
+ return err;
+ }
+
+ err = release_reference(env, release_ref_obj_id);
+ if (err) {
+ verbose(env, "kfunc %s#%d reference has not been acquired before\n",
+ func_name, func_id);
+ return err;
+ }
+ }
+
+ if (meta.func_id == special_kfunc_list[KF_bpf_rbtree_add]) {
+ err = __check_func_call(env, insn, insn_idx_p, meta.subprogno,
+ set_rbtree_add_callback_state);
+ if (err) {
+ verbose(env, "kfunc %s#%d failed callback verification\n",
+ func_name, func_id);
+ return err;
+ }
+ }
+
for (i = 0; i < CALLER_SAVED_REGS; i++)
mark_reg_not_init(env, regs, caller_saved[i]);
/* Check return type */
t = btf_type_skip_modifiers(desc_btf, func_proto->type, NULL);
- if (acq && !btf_type_is_struct_ptr(desc_btf, t)) {
- verbose(env, "acquire kernel function does not return PTR_TO_BTF_ID\n");
- return -EINVAL;
+ if (is_kfunc_acquire(&meta) && !btf_type_is_struct_ptr(meta.btf, t)) {
+ /* Only exception is bpf_obj_new_impl */
+ if (meta.btf != btf_vmlinux || meta.func_id != special_kfunc_list[KF_bpf_obj_new_impl]) {
+ verbose(env, "acquire kernel function does not return PTR_TO_BTF_ID\n");
+ return -EINVAL;
+ }
}
if (btf_type_is_scalar(t)) {
mark_reg_unknown(env, regs, BPF_REG_0);
mark_btf_func_reg_size(env, BPF_REG_0, t->size);
} else if (btf_type_is_ptr(t)) {
- ptr_type = btf_type_skip_modifiers(desc_btf, t->type,
- &ptr_type_id);
- if (!btf_type_is_struct(ptr_type)) {
+ ptr_type = btf_type_skip_modifiers(desc_btf, t->type, &ptr_type_id);
+
+ if (meta.btf == btf_vmlinux && btf_id_set_contains(&special_kfunc_set, meta.func_id)) {
+ if (meta.func_id == special_kfunc_list[KF_bpf_obj_new_impl]) {
+ struct btf *ret_btf;
+ u32 ret_btf_id;
+
+ if (unlikely(!bpf_global_ma_set))
+ return -ENOMEM;
+
+ if (((u64)(u32)meta.arg_constant.value) != meta.arg_constant.value) {
+ verbose(env, "local type ID argument must be in range [0, U32_MAX]\n");
+ return -EINVAL;
+ }
+
+ ret_btf = env->prog->aux->btf;
+ ret_btf_id = meta.arg_constant.value;
+
+ /* This may be NULL due to user not supplying a BTF */
+ if (!ret_btf) {
+ verbose(env, "bpf_obj_new requires prog BTF\n");
+ return -EINVAL;
+ }
+
+ ret_t = btf_type_by_id(ret_btf, ret_btf_id);
+ if (!ret_t || !__btf_type_is_struct(ret_t)) {
+ verbose(env, "bpf_obj_new type ID argument must be of a struct\n");
+ return -EINVAL;
+ }
+
+ mark_reg_known_zero(env, regs, BPF_REG_0);
+ regs[BPF_REG_0].type = PTR_TO_BTF_ID | MEM_ALLOC;
+ regs[BPF_REG_0].btf = ret_btf;
+ regs[BPF_REG_0].btf_id = ret_btf_id;
+
+ env->insn_aux_data[insn_idx].obj_new_size = ret_t->size;
+ env->insn_aux_data[insn_idx].kptr_struct_meta =
+ btf_find_struct_meta(ret_btf, ret_btf_id);
+ } else if (meta.func_id == special_kfunc_list[KF_bpf_obj_drop_impl]) {
+ env->insn_aux_data[insn_idx].kptr_struct_meta =
+ btf_find_struct_meta(meta.arg_obj_drop.btf,
+ meta.arg_obj_drop.btf_id);
+ } else if (meta.func_id == special_kfunc_list[KF_bpf_list_pop_front] ||
+ meta.func_id == special_kfunc_list[KF_bpf_list_pop_back]) {
+ struct btf_field *field = meta.arg_list_head.field;
+
+ mark_reg_graph_node(regs, BPF_REG_0, &field->graph_root);
+ } else if (meta.func_id == special_kfunc_list[KF_bpf_rbtree_remove] ||
+ meta.func_id == special_kfunc_list[KF_bpf_rbtree_first]) {
+ struct btf_field *field = meta.arg_rbtree_root.field;
+
+ mark_reg_graph_node(regs, BPF_REG_0, &field->graph_root);
+ } else if (meta.func_id == special_kfunc_list[KF_bpf_cast_to_kern_ctx]) {
+ mark_reg_known_zero(env, regs, BPF_REG_0);
+ regs[BPF_REG_0].type = PTR_TO_BTF_ID | PTR_TRUSTED;
+ regs[BPF_REG_0].btf = desc_btf;
+ regs[BPF_REG_0].btf_id = meta.ret_btf_id;
+ } else if (meta.func_id == special_kfunc_list[KF_bpf_rdonly_cast]) {
+ ret_t = btf_type_by_id(desc_btf, meta.arg_constant.value);
+ if (!ret_t || !btf_type_is_struct(ret_t)) {
+ verbose(env,
+ "kfunc bpf_rdonly_cast type ID argument must be of a struct\n");
+ return -EINVAL;
+ }
+
+ mark_reg_known_zero(env, regs, BPF_REG_0);
+ regs[BPF_REG_0].type = PTR_TO_BTF_ID | PTR_UNTRUSTED;
+ regs[BPF_REG_0].btf = desc_btf;
+ regs[BPF_REG_0].btf_id = meta.arg_constant.value;
+ } else {
+ verbose(env, "kernel function %s unhandled dynamic return type\n",
+ meta.func_name);
+ return -EFAULT;
+ }
+ } else if (!__btf_type_is_struct(ptr_type)) {
if (!meta.r0_size) {
ptr_type_name = btf_name_by_offset(desc_btf,
ptr_type->name_off);
@@ -7770,20 +10024,30 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
regs[BPF_REG_0].type = PTR_TO_BTF_ID;
regs[BPF_REG_0].btf_id = ptr_type_id;
}
- if (*kfunc_flags & KF_RET_NULL) {
+
+ if (is_kfunc_ret_null(&meta)) {
regs[BPF_REG_0].type |= PTR_MAYBE_NULL;
/* For mark_ptr_or_null_reg, see 93c230e3f5bd6 */
regs[BPF_REG_0].id = ++env->id_gen;
}
mark_btf_func_reg_size(env, BPF_REG_0, sizeof(void *));
- if (acq) {
+ if (is_kfunc_acquire(&meta)) {
int id = acquire_reference_state(env, insn_idx);
if (id < 0)
return id;
- regs[BPF_REG_0].id = id;
+ if (is_kfunc_ret_null(&meta))
+ regs[BPF_REG_0].id = id;
regs[BPF_REG_0].ref_obj_id = id;
+ } else if (meta.func_id == special_kfunc_list[KF_bpf_rbtree_first]) {
+ ref_set_non_owning(env, &regs[BPF_REG_0]);
}
+
+ if (meta.func_id == special_kfunc_list[KF_bpf_rbtree_remove])
+ invalidate_non_owning_refs(env);
+
+ if (reg_may_point_to_spin_lock(&regs[BPF_REG_0]) && !regs[BPF_REG_0].id)
+ regs[BPF_REG_0].id = ++env->id_gen;
} /* else { add_kfunc_call() ensures it is btf_type_is_void(t) } */
nargs = btf_type_vlen(func_proto);
@@ -8068,7 +10332,7 @@ do_sim:
*/
if (!ptr_is_dst_reg) {
tmp = *dst_reg;
- *dst_reg = *ptr_reg;
+ copy_register_state(dst_reg, ptr_reg);
}
ret = sanitize_speculative_path(env, NULL, env->insn_idx + 1,
env->insn_idx);
@@ -9211,6 +11475,11 @@ static int adjust_reg_min_max_vals(struct bpf_verifier_env *env,
return err;
return adjust_ptr_min_max_vals(env, insn,
dst_reg, src_reg);
+ } else if (dst_reg->precise) {
+ /* if dst_reg is precise, src_reg should be precise as well */
+ err = mark_chain_precision(env, insn->src_reg);
+ if (err)
+ return err;
}
} else {
/* Pretend the src is a reg with a known value, since we only
@@ -9316,7 +11585,7 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
* to propagate min/max range.
*/
src_reg->id = ++env->id_gen;
- *dst_reg = *src_reg;
+ copy_register_state(dst_reg, src_reg);
dst_reg->live |= REG_LIVE_WRITTEN;
dst_reg->subreg_def = DEF_NOT_SUBREG;
} else {
@@ -9327,7 +11596,7 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
insn->src_reg);
return -EACCES;
} else if (src_reg->type == SCALAR_VALUE) {
- *dst_reg = *src_reg;
+ copy_register_state(dst_reg, src_reg);
/* Make sure ID is cleared otherwise
* dst_reg min/max could be incorrectly
* propagated into src_reg by find_equal_scalars()
@@ -9950,17 +12219,22 @@ static void mark_ptr_or_null_reg(struct bpf_func_state *state,
bool is_null)
{
if (type_may_be_null(reg->type) && reg->id == id &&
- !WARN_ON_ONCE(!reg->id)) {
- if (WARN_ON_ONCE(reg->smin_value || reg->smax_value ||
- !tnum_equals_const(reg->var_off, 0) ||
- reg->off)) {
- /* Old offset (both fixed and variable parts) should
- * have been known-zero, because we don't allow pointer
- * arithmetic on pointers that might be NULL. If we
- * see this happening, don't convert the register.
- */
+ (is_rcu_reg(reg) || !WARN_ON_ONCE(!reg->id))) {
+ /* Old offset (both fixed and variable parts) should have been
+ * known-zero, because we don't allow pointer arithmetic on
+ * pointers that might be NULL. If we see this happening, don't
+ * convert the register.
+ *
+ * But in some cases, some helpers that return local kptrs
+ * advance offset for the returned pointer. In those cases, it
+ * is fine to expect to see reg->off.
+ */
+ if (WARN_ON_ONCE(reg->smin_value || reg->smax_value || !tnum_equals_const(reg->var_off, 0)))
return;
- }
+ if (!(type_is_ptr_alloc_obj(reg->type) || type_is_non_owning_ref(reg->type)) &&
+ WARN_ON_ONCE(reg->off))
+ return;
+
if (is_null) {
reg->type = SCALAR_VALUE;
/* We don't need id and ref_obj_id from this point
@@ -10123,7 +12397,7 @@ static void find_equal_scalars(struct bpf_verifier_state *vstate,
bpf_for_each_reg_in_vstate(vstate, state, reg, ({
if (reg->type == SCALAR_VALUE && reg->id == known_reg->id)
- *reg = *known_reg;
+ copy_register_state(reg, known_reg);
}));
}
@@ -10134,6 +12408,7 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env,
struct bpf_verifier_state *other_branch;
struct bpf_reg_state *regs = this_branch->frame[this_branch->curframe]->regs;
struct bpf_reg_state *dst_reg, *other_branch_regs, *src_reg = NULL;
+ struct bpf_reg_state *eq_branch_regs;
u8 opcode = BPF_OP(insn->code);
bool is_jmp32;
int pred = -1;
@@ -10243,8 +12518,8 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env,
/* detect if we are comparing against a constant value so we can adjust
* our min/max values for our dst register.
* this is only legit if both are scalars (or pointers to the same
- * object, I suppose, but we don't support that right now), because
- * otherwise the different base pointers mean the offsets aren't
+ * object, I suppose, see the PTR_MAYBE_NULL related if block below),
+ * because otherwise the different base pointers mean the offsets aren't
* comparable.
*/
if (BPF_SRC(insn->code) == BPF_X) {
@@ -10293,6 +12568,43 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env,
find_equal_scalars(other_branch, &other_branch_regs[insn->dst_reg]);
}
+ /* if one pointer register is compared to another pointer
+ * register check if PTR_MAYBE_NULL could be lifted.
+ * E.g. register A - maybe null
+ * register B - not null
+ * for JNE A, B, ... - A is not null in the false branch;
+ * for JEQ A, B, ... - A is not null in the true branch.
+ *
+ * Since PTR_TO_BTF_ID points to a kernel struct that does
+ * not need to be null checked by the BPF program, i.e.,
+ * could be null even without PTR_MAYBE_NULL marking, so
+ * only propagate nullness when neither reg is that type.
+ */
+ if (!is_jmp32 && BPF_SRC(insn->code) == BPF_X &&
+ __is_pointer_value(false, src_reg) && __is_pointer_value(false, dst_reg) &&
+ type_may_be_null(src_reg->type) != type_may_be_null(dst_reg->type) &&
+ base_type(src_reg->type) != PTR_TO_BTF_ID &&
+ base_type(dst_reg->type) != PTR_TO_BTF_ID) {
+ eq_branch_regs = NULL;
+ switch (opcode) {
+ case BPF_JEQ:
+ eq_branch_regs = other_branch_regs;
+ break;
+ case BPF_JNE:
+ eq_branch_regs = regs;
+ break;
+ default:
+ /* do nothing */
+ break;
+ }
+ if (eq_branch_regs) {
+ if (type_may_be_null(src_reg->type))
+ mark_ptr_not_null_reg(&eq_branch_regs[insn->src_reg]);
+ else
+ mark_ptr_not_null_reg(&eq_branch_regs[insn->dst_reg]);
+ }
+ }
+
/* detect if R == 0 where R is returned from bpf_map_lookup_elem().
* NOTE: these optimizations below are related with pointer comparison
* which will never be JMP32.
@@ -10399,8 +12711,8 @@ static int check_ld_imm(struct bpf_verifier_env *env, struct bpf_insn *insn)
insn->src_reg == BPF_PSEUDO_MAP_IDX_VALUE) {
dst_reg->type = PTR_TO_MAP_VALUE;
dst_reg->off = aux->map_off;
- if (map_value_has_spin_lock(map))
- dst_reg->id = ++env->id_gen;
+ WARN_ON_ONCE(map->max_entries != 1);
+ /* We want reg->id to be same (0) as map_value is not distinct */
} else if (insn->src_reg == BPF_PSEUDO_MAP_FD ||
insn->src_reg == BPF_PSEUDO_MAP_IDX) {
dst_reg->type = CONST_PTR_TO_MAP;
@@ -10478,11 +12790,16 @@ static int check_ld_abs(struct bpf_verifier_env *env, struct bpf_insn *insn)
return err;
}
- if (env->cur_state->active_spin_lock) {
+ if (env->cur_state->active_lock.ptr) {
verbose(env, "BPF_LD_[ABS|IND] cannot be used inside bpf_spin_lock-ed region\n");
return -EINVAL;
}
+ if (env->cur_state->active_rcu_lock) {
+ verbose(env, "BPF_LD_[ABS|IND] cannot be used inside bpf_rcu_read_lock-ed region\n");
+ return -EINVAL;
+ }
+
if (regs[ctx_reg].type != PTR_TO_CTX) {
verbose(env,
"at the time of BPF_LD_ABS|IND R6 != pointer to skb\n");
@@ -10684,7 +13001,7 @@ static int check_return_code(struct bpf_verifier_env *env)
* 3 let S be a stack
* 4 S.push(v)
* 5 while S is not empty
- * 6 t <- S.pop()
+ * 6 t <- S.peek()
* 7 if t is what we're looking for:
* 8 return t
* 9 for all edges e in G.adjacentEdges(t) do
@@ -10733,11 +13050,16 @@ static struct bpf_verifier_state_list **explored_state(
return &env->explored_states[(idx ^ state->callsite) % state_htab_size(env)];
}
-static void init_explored_state(struct bpf_verifier_env *env, int idx)
+static void mark_prune_point(struct bpf_verifier_env *env, int idx)
{
env->insn_aux_data[idx].prune_point = true;
}
+/* Return the prune_point flag recorded for instruction @insn_idx. */
+static bool is_prune_point(struct bpf_verifier_env *env, int insn_idx)
+{
+ return env->insn_aux_data[insn_idx].prune_point;
+}
+
enum {
DONE_EXPLORING = 0,
KEEP_EXPLORING = 1,
@@ -10766,9 +13088,11 @@ static int push_insn(int t, int w, int e, struct bpf_verifier_env *env,
return -EINVAL;
}
- if (e == BRANCH)
+ if (e == BRANCH) {
/* mark branch target for state pruning */
- init_explored_state(env, w);
+ mark_prune_point(env, w);
+ mark_jmp_point(env, w);
+ }
if (insn_state[w] == 0) {
/* tree-edge */
@@ -10795,8 +13119,7 @@ static int push_insn(int t, int w, int e, struct bpf_verifier_env *env,
return DONE_EXPLORING;
}
-static int visit_func_call_insn(int t, int insn_cnt,
- struct bpf_insn *insns,
+static int visit_func_call_insn(int t, struct bpf_insn *insns,
struct bpf_verifier_env *env,
bool visit_callee)
{
@@ -10806,10 +13129,12 @@ static int visit_func_call_insn(int t, int insn_cnt,
if (ret)
return ret;
- if (t + 1 < insn_cnt)
- init_explored_state(env, t + 1);
+ mark_prune_point(env, t + 1);
+ /* when we exit from subprog, we need to record non-linear history */
+ mark_jmp_point(env, t + 1);
+
if (visit_callee) {
- init_explored_state(env, t);
+ mark_prune_point(env, t);
ret = push_insn(t, t + insns[t].imm + 1, BRANCH, env,
/* It's ok to allow recursion from CFG point of
* view. __check_func_call() will do the actual
@@ -10825,13 +13150,13 @@ static int visit_func_call_insn(int t, int insn_cnt,
* DONE_EXPLORING - the instruction was fully explored
* KEEP_EXPLORING - there is still work to be done before it is fully explored
*/
-static int visit_insn(int t, int insn_cnt, struct bpf_verifier_env *env)
+static int visit_insn(int t, struct bpf_verifier_env *env)
{
struct bpf_insn *insns = env->prog->insnsi;
int ret;
if (bpf_pseudo_func(insns + t))
- return visit_func_call_insn(t, insn_cnt, insns, env, true);
+ return visit_func_call_insn(t, insns, env, true);
/* All non-branch instructions have a single fall-through edge. */
if (BPF_CLASS(insns[t].code) != BPF_JMP &&
@@ -10844,13 +13169,13 @@ static int visit_insn(int t, int insn_cnt, struct bpf_verifier_env *env)
case BPF_CALL:
if (insns[t].imm == BPF_FUNC_timer_set_callback)
- /* Mark this call insn to trigger is_state_visited() check
- * before call itself is processed by __check_func_call().
- * Otherwise new async state will be pushed for further
- * exploration.
+ /* Mark this call insn as a prune point to trigger
+ * is_state_visited() check before call itself is
+ * processed by __check_func_call(). Otherwise new
+ * async state will be pushed for further exploration.
*/
- init_explored_state(env, t);
- return visit_func_call_insn(t, insn_cnt, insns, env,
+ mark_prune_point(env, t);
+ return visit_func_call_insn(t, insns, env,
insns[t].src_reg == BPF_PSEUDO_CALL);
case BPF_JA:
@@ -10863,22 +13188,15 @@ static int visit_insn(int t, int insn_cnt, struct bpf_verifier_env *env)
if (ret)
return ret;
- /* unconditional jmp is not a good pruning point,
- * but it's marked, since backtracking needs
- * to record jmp history in is_state_visited().
- */
- init_explored_state(env, t + insns[t].off + 1);
- /* tell verifier to check for equivalent states
- * after every call and jump
- */
- if (t + 1 < insn_cnt)
- init_explored_state(env, t + 1);
+ mark_prune_point(env, t + insns[t].off + 1);
+ mark_jmp_point(env, t + insns[t].off + 1);
return ret;
default:
/* conditional jump with two edges */
- init_explored_state(env, t);
+ mark_prune_point(env, t);
+
ret = push_insn(t, t + 1, FALLTHROUGH, env, true);
if (ret)
return ret;
@@ -10914,7 +13232,7 @@ static int check_cfg(struct bpf_verifier_env *env)
while (env->cfg.cur_stack > 0) {
int t = insn_stack[env->cfg.cur_stack - 1];
- ret = visit_insn(t, insn_cnt, env);
+ ret = visit_insn(t, env);
switch (ret) {
case DONE_EXPLORING:
insn_state[t] = EXPLORED;
@@ -11382,6 +13700,13 @@ static bool check_ids(u32 old_id, u32 cur_id, struct bpf_id_pair *idmap)
{
unsigned int i;
+ /* either both IDs should be set or both should be zero */
+ if (!!old_id != !!cur_id)
+ return false;
+
+ if (old_id == 0) /* cur_id == 0 as well */
+ return true;
+
for (i = 0; i < BPF_ID_MAP_SIZE; i++) {
if (!idmap[i].old) {
/* Reached an empty slot; haven't seen this id before */
@@ -11493,85 +13818,74 @@ next:
}
}
+/* Exact register match: every field laid out before 'id' must be
+ * byte-identical, and both 'id' and 'ref_obj_id' must pass check_ids()
+ * against idmap. The checks run in this order and stop at the first failure.
+ */
+static bool regs_exact(const struct bpf_reg_state *rold,
+ const struct bpf_reg_state *rcur,
+ struct bpf_id_pair *idmap)
+{
+ if (memcmp(rold, rcur, offsetof(struct bpf_reg_state, id)) != 0)
+ return false;
+ if (!check_ids(rold->id, rcur->id, idmap))
+ return false;
+ return check_ids(rold->ref_obj_id, rcur->ref_obj_id, idmap);
+}
+
/* Returns true if (rold safe implies rcur safe) */
static bool regsafe(struct bpf_verifier_env *env, struct bpf_reg_state *rold,
struct bpf_reg_state *rcur, struct bpf_id_pair *idmap)
{
- bool equal;
-
if (!(rold->live & REG_LIVE_READ))
/* explored state didn't use this */
return true;
-
- equal = memcmp(rold, rcur, offsetof(struct bpf_reg_state, parent)) == 0;
-
- if (rold->type == PTR_TO_STACK)
- /* two stack pointers are equal only if they're pointing to
- * the same stack frame, since fp-8 in foo != fp-8 in bar
- */
- return equal && rold->frameno == rcur->frameno;
-
- if (equal)
- return true;
-
if (rold->type == NOT_INIT)
/* explored state can't have used this */
return true;
if (rcur->type == NOT_INIT)
return false;
+
+ /* Enforce that register types have to match exactly, including their
+ * modifiers (like PTR_MAYBE_NULL, MEM_RDONLY, etc), as a general
+ * rule.
+ *
+ * One can make a point that using a pointer register as unbounded
+ * SCALAR would be technically acceptable, but this could lead to
+ * pointer leaks because scalars are allowed to leak while pointers
+ * are not. We could make this safe in special cases if root is
+ * calling us, but it's probably not worth the hassle.
+ *
+ * Also, register types that are *not* MAYBE_NULL could technically be
+ * safe to use as their MAYBE_NULL variants (e.g., PTR_TO_MAP_VALUE
+ * is safe to be used as PTR_TO_MAP_VALUE_OR_NULL, provided both point
+ * to the same map).
+ * However, if the old MAYBE_NULL register then got NULL checked,
+ * doing so could have affected others with the same id, and we can't
+ * check for that because we lost the id when we converted to
+ * a non-MAYBE_NULL variant.
+ * So, as a general rule we don't allow mixing MAYBE_NULL and
+ * non-MAYBE_NULL registers as well.
+ */
+ if (rold->type != rcur->type)
+ return false;
+
switch (base_type(rold->type)) {
case SCALAR_VALUE:
+ if (regs_exact(rold, rcur, idmap))
+ return true;
if (env->explore_alu_limits)
return false;
- if (rcur->type == SCALAR_VALUE) {
- if (!rold->precise && !rcur->precise)
- return true;
- /* new val must satisfy old val knowledge */
- return range_within(rold, rcur) &&
- tnum_in(rold->var_off, rcur->var_off);
- } else {
- /* We're trying to use a pointer in place of a scalar.
- * Even if the scalar was unbounded, this could lead to
- * pointer leaks because scalars are allowed to leak
- * while pointers are not. We could make this safe in
- * special cases if root is calling us, but it's
- * probably not worth the hassle.
- */
- return false;
- }
+ if (!rold->precise)
+ return true;
+ /* new val must satisfy old val knowledge */
+ return range_within(rold, rcur) &&
+ tnum_in(rold->var_off, rcur->var_off);
case PTR_TO_MAP_KEY:
case PTR_TO_MAP_VALUE:
- /* a PTR_TO_MAP_VALUE could be safe to use as a
- * PTR_TO_MAP_VALUE_OR_NULL into the same map.
- * However, if the old PTR_TO_MAP_VALUE_OR_NULL then got NULL-
- * checked, doing so could have affected others with the same
- * id, and we can't check for that because we lost the id when
- * we converted to a PTR_TO_MAP_VALUE.
- */
- if (type_may_be_null(rold->type)) {
- if (!type_may_be_null(rcur->type))
- return false;
- if (memcmp(rold, rcur, offsetof(struct bpf_reg_state, id)))
- return false;
- /* Check our ids match any regs they're supposed to */
- return check_ids(rold->id, rcur->id, idmap);
- }
-
/* If the new min/max/var_off satisfy the old ones and
* everything else matches, we are OK.
- * 'id' is not compared, since it's only used for maps with
- * bpf_spin_lock inside map element and in such cases if
- * the rest of the prog is valid for one map element then
- * it's valid for all map elements regardless of the key
- * used in bpf_map_lookup()
*/
- return memcmp(rold, rcur, offsetof(struct bpf_reg_state, id)) == 0 &&
+ return memcmp(rold, rcur, offsetof(struct bpf_reg_state, var_off)) == 0 &&
range_within(rold, rcur) &&
- tnum_in(rold->var_off, rcur->var_off);
+ tnum_in(rold->var_off, rcur->var_off) &&
+ check_ids(rold->id, rcur->id, idmap);
case PTR_TO_PACKET_META:
case PTR_TO_PACKET:
- if (rcur->type != rold->type)
- return false;
/* We must have at least as much range as the old ptr
* did, so that any accesses which were safe before are
* still safe. This is true even if old range < old off,
@@ -11586,30 +13900,19 @@ static bool regsafe(struct bpf_verifier_env *env, struct bpf_reg_state *rold,
if (rold->off != rcur->off)
return false;
/* id relations must be preserved */
- if (rold->id && !check_ids(rold->id, rcur->id, idmap))
+ if (!check_ids(rold->id, rcur->id, idmap))
return false;
/* new val must satisfy old val knowledge */
return range_within(rold, rcur) &&
tnum_in(rold->var_off, rcur->var_off);
- case PTR_TO_CTX:
- case CONST_PTR_TO_MAP:
- case PTR_TO_PACKET_END:
- case PTR_TO_FLOW_KEYS:
- case PTR_TO_SOCKET:
- case PTR_TO_SOCK_COMMON:
- case PTR_TO_TCP_SOCK:
- case PTR_TO_XDP_SOCK:
- /* Only valid matches are exact, which memcmp() above
- * would have accepted
+ case PTR_TO_STACK:
+ /* two stack pointers are equal only if they're pointing to
+ * the same stack frame, since fp-8 in foo != fp-8 in bar
*/
+ return regs_exact(rold, rcur, idmap) && rold->frameno == rcur->frameno;
default:
- /* Don't know what's going on, just say it's not safe */
- return false;
+ return regs_exact(rold, rcur, idmap);
}
-
- /* Shouldn't get here; if we do, say it's not safe */
- WARN_ON_ONCE(1);
- return false;
}
static bool stacksafe(struct bpf_verifier_env *env, struct bpf_func_state *old,
@@ -11656,10 +13959,9 @@ static bool stacksafe(struct bpf_verifier_env *env, struct bpf_func_state *old,
return false;
if (i % BPF_REG_SIZE != BPF_REG_SIZE - 1)
continue;
- if (!is_spilled_reg(&old->stack[spi]))
- continue;
- if (!regsafe(env, &old->stack[spi].spilled_ptr,
- &cur->stack[spi].spilled_ptr, idmap))
+ /* Both old and cur are having same slot_type */
+ switch (old->stack[spi].slot_type[BPF_REG_SIZE - 1]) {
+ case STACK_SPILL:
/* when explored and current stack slot are both storing
* spilled registers, check that stored pointers types
* are the same as well.
@@ -11670,17 +13972,48 @@ static bool stacksafe(struct bpf_verifier_env *env, struct bpf_func_state *old,
* such verifier states are not equivalent.
* return false to continue verification of this path
*/
+ if (!regsafe(env, &old->stack[spi].spilled_ptr,
+ &cur->stack[spi].spilled_ptr, idmap))
+ return false;
+ break;
+ case STACK_DYNPTR:
+ {
+ const struct bpf_reg_state *old_reg, *cur_reg;
+
+ old_reg = &old->stack[spi].spilled_ptr;
+ cur_reg = &cur->stack[spi].spilled_ptr;
+ if (old_reg->dynptr.type != cur_reg->dynptr.type ||
+ old_reg->dynptr.first_slot != cur_reg->dynptr.first_slot ||
+ !check_ids(old_reg->ref_obj_id, cur_reg->ref_obj_id, idmap))
+ return false;
+ break;
+ }
+ case STACK_MISC:
+ case STACK_ZERO:
+ case STACK_INVALID:
+ continue;
+ /* Ensure that new unhandled slot types return false by default */
+ default:
return false;
+ }
}
return true;
}
-static bool refsafe(struct bpf_func_state *old, struct bpf_func_state *cur)
+/* Reference states are safe when both functions hold the same number of
+ * acquired references and each pair of reference ids passes check_ids()
+ * against idmap, compared position by position.
+ */
+static bool refsafe(struct bpf_func_state *old, struct bpf_func_state *cur,
+ struct bpf_id_pair *idmap)
{
+ int idx = 0;
+
if (old->acquired_refs != cur->acquired_refs)
return false;
- return !memcmp(old->refs, cur->refs,
- sizeof(*old->refs) * old->acquired_refs);
+
+ while (idx < old->acquired_refs) {
+ if (!check_ids(old->refs[idx].id, cur->refs[idx].id, idmap))
+ return false;
+ idx++;
+ }
+
+ return true;
}
/* compare two verifier states
@@ -11714,7 +14047,6 @@ static bool func_states_equal(struct bpf_verifier_env *env, struct bpf_func_stat
{
int i;
- memset(env->idmap_scratch, 0, sizeof(env->idmap_scratch));
for (i = 0; i < MAX_BPF_REG; i++)
if (!regsafe(env, &old->regs[i], &cur->regs[i],
env->idmap_scratch))
@@ -11723,7 +14055,7 @@ static bool func_states_equal(struct bpf_verifier_env *env, struct bpf_func_stat
if (!stacksafe(env, old, cur, env->idmap_scratch))
return false;
- if (!refsafe(old, cur))
+ if (!refsafe(old, cur, env->idmap_scratch))
return false;
return true;
@@ -11738,13 +14070,28 @@ static bool states_equal(struct bpf_verifier_env *env,
if (old->curframe != cur->curframe)
return false;
+ memset(env->idmap_scratch, 0, sizeof(env->idmap_scratch));
+
/* Verification state from speculative execution simulation
* must never prune a non-speculative execution one.
*/
if (old->speculative && !cur->speculative)
return false;
- if (old->active_spin_lock != cur->active_spin_lock)
+ if (old->active_lock.ptr != cur->active_lock.ptr)
+ return false;
+
+ /* Old and cur active_lock's have to be either both present
+ * or both absent.
+ */
+ if (!!old->active_lock.id != !!cur->active_lock.id)
+ return false;
+
+ if (old->active_lock.id &&
+ !check_ids(old->active_lock.id, cur->active_lock.id, env->idmap_scratch))
+ return false;
+
+ if (old->active_rcu_lock != cur->active_rcu_lock)
return false;
/* for states to be equal callsites have to be the same
@@ -11847,34 +14194,36 @@ static int propagate_precision(struct bpf_verifier_env *env,
{
struct bpf_reg_state *state_reg;
struct bpf_func_state *state;
- int i, err = 0;
+ int i, err = 0, fr;
- state = old->frame[old->curframe];
- state_reg = state->regs;
- for (i = 0; i < BPF_REG_FP; i++, state_reg++) {
- if (state_reg->type != SCALAR_VALUE ||
- !state_reg->precise)
- continue;
- if (env->log.level & BPF_LOG_LEVEL2)
- verbose(env, "propagating r%d\n", i);
- err = mark_chain_precision(env, i);
- if (err < 0)
- return err;
- }
+ /* Visit every frame of the old state, from the current frame down to
+ * frame 0, and request precision for each scalar marked precise there.
+ * Returns the first negative error from the marking helpers, else 0.
+ */
+ for (fr = old->curframe; fr >= 0; fr--) {
+ state = old->frame[fr];
+ state_reg = state->regs;
+ /* registers r0 .. BPF_REG_FP - 1 of this frame */
+ for (i = 0; i < BPF_REG_FP; i++, state_reg++) {
+ if (state_reg->type != SCALAR_VALUE ||
+ !state_reg->precise)
+ continue;
+ /* argument order follows the format: frame first, then register */
+ if (env->log.level & BPF_LOG_LEVEL2)
+ verbose(env, "frame %d: propagating r%d\n", fr, i);
+ err = mark_chain_precision_frame(env, fr, i);
+ if (err < 0)
+ return err;
+ }
- for (i = 0; i < state->allocated_stack / BPF_REG_SIZE; i++) {
- if (!is_spilled_reg(&state->stack[i]))
- continue;
- state_reg = &state->stack[i].spilled_ptr;
- if (state_reg->type != SCALAR_VALUE ||
- !state_reg->precise)
- continue;
- if (env->log.level & BPF_LOG_LEVEL2)
- verbose(env, "propagating fp%d\n",
- (-i - 1) * BPF_REG_SIZE);
- err = mark_chain_precision_stack(env, i);
- if (err < 0)
- return err;
+ /* spilled registers held in this frame's stack slots */
+ for (i = 0; i < state->allocated_stack / BPF_REG_SIZE; i++) {
+ if (!is_spilled_reg(&state->stack[i]))
+ continue;
+ state_reg = &state->stack[i].spilled_ptr;
+ if (state_reg->type != SCALAR_VALUE ||
+ !state_reg->precise)
+ continue;
+ /* argument order follows the format: frame first, then fp offset */
+ if (env->log.level & BPF_LOG_LEVEL2)
+ verbose(env, "frame %d: propagating fp%d\n",
+ fr, (-i - 1) * BPF_REG_SIZE);
+ err = mark_chain_precision_stack_frame(env, fr, i);
+ if (err < 0)
+ return err;
+ }
}
return 0;
}
@@ -11906,13 +14255,6 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx)
int i, j, err, states_cnt = 0;
bool add_new_state = env->test_state_freq ? true : false;
- cur->last_insn_idx = env->prev_insn_idx;
- if (!env->insn_aux_data[insn_idx].prune_point)
- /* this 'insn_idx' instruction wasn't marked, so we will not
- * be doing state search here
- */
- return 0;
-
/* bpf progs typically have pruning point every 4 instructions
* http://vger.kernel.org/bpfconf2019.html#session-1
* Do not add new state for future pruning if the verifier hasn't seen
@@ -12047,10 +14389,10 @@ next:
env->max_states_per_insn = states_cnt;
if (!env->bpf_capable && states_cnt > BPF_COMPLEXITY_LIMIT_STATES)
- return push_jmp_history(env, cur);
+ return 0;
if (!add_new_state)
- return push_jmp_history(env, cur);
+ return 0;
/* There were no equivalent states, remember the current one.
* Technically the current state is not proven to be safe yet,
@@ -12069,6 +14411,10 @@ next:
env->prev_jmps_processed = env->jmps_processed;
env->prev_insn_processed = env->insn_processed;
+ /* forget precise markings we inherited, see __mark_chain_precision */
+ if (env->bpf_capable)
+ mark_all_scalars_imprecise(env, cur);
+
/* add new state to the head of linked list */
new = &new_sl->state;
err = copy_verifier_state(new, cur);
@@ -12186,21 +14532,31 @@ static int do_check(struct bpf_verifier_env *env)
return -E2BIG;
}
- err = is_state_visited(env, env->insn_idx);
- if (err < 0)
- return err;
- if (err == 1) {
- /* found equivalent state, can prune the search */
- if (env->log.level & BPF_LOG_LEVEL) {
- if (do_print_state)
- verbose(env, "\nfrom %d to %d%s: safe\n",
- env->prev_insn_idx, env->insn_idx,
- env->cur_state->speculative ?
- " (speculative execution)" : "");
- else
- verbose(env, "%d: safe\n", env->insn_idx);
+ state->last_insn_idx = env->prev_insn_idx;
+
+ if (is_prune_point(env, env->insn_idx)) {
+ err = is_state_visited(env, env->insn_idx);
+ if (err < 0)
+ return err;
+ if (err == 1) {
+ /* found equivalent state, can prune the search */
+ if (env->log.level & BPF_LOG_LEVEL) {
+ if (do_print_state)
+ verbose(env, "\nfrom %d to %d%s: safe\n",
+ env->prev_insn_idx, env->insn_idx,
+ env->cur_state->speculative ?
+ " (speculative execution)" : "");
+ else
+ verbose(env, "%d: safe\n", env->insn_idx);
+ }
+ goto process_bpf_exit;
}
- goto process_bpf_exit;
+ }
+
+ if (is_jmp_point(env, env->insn_idx)) {
+ err = push_jmp_history(env, state);
+ if (err)
+ return err;
}
if (signal_pending(current))
@@ -12236,7 +14592,7 @@ static int do_check(struct bpf_verifier_env *env)
env->prev_log_len = env->log.len_used;
}
- if (bpf_prog_is_dev_bound(env->prog->aux)) {
+ if (bpf_prog_is_offloaded(env->prog->aux)) {
err = bpf_prog_offload_verify_insn(env, env->insn_idx,
env->prev_insn_idx);
if (err)
@@ -12383,11 +14739,14 @@ static int do_check(struct bpf_verifier_env *env)
return -EINVAL;
}
- if (env->cur_state->active_spin_lock &&
- (insn->src_reg == BPF_PSEUDO_CALL ||
- insn->imm != BPF_FUNC_spin_unlock)) {
- verbose(env, "function calls are not allowed while holding a lock\n");
- return -EINVAL;
+ if (env->cur_state->active_lock.ptr) {
+ if ((insn->src_reg == BPF_REG_0 && insn->imm != BPF_FUNC_spin_unlock) ||
+ (insn->src_reg == BPF_PSEUDO_CALL) ||
+ (insn->src_reg == BPF_PSEUDO_KFUNC_CALL &&
+ (insn->off != 0 || !is_bpf_graph_api_kfunc(insn->imm)))) {
+ verbose(env, "function calls are not allowed while holding a lock\n");
+ return -EINVAL;
+ }
}
if (insn->src_reg == BPF_PSEUDO_CALL)
err = check_func_call(env, insn, &env->insn_idx);
@@ -12420,11 +14779,17 @@ static int do_check(struct bpf_verifier_env *env)
return -EINVAL;
}
- if (env->cur_state->active_spin_lock) {
+ if (env->cur_state->active_lock.ptr &&
+ !in_rbtree_lock_required_cb(env)) {
verbose(env, "bpf_spin_unlock is missing\n");
return -EINVAL;
}
+ if (env->cur_state->active_rcu_lock) {
+ verbose(env, "bpf_rcu_read_unlock is missing\n");
+ return -EINVAL;
+ }
+
/* We must do check_reference_leak here before
* prepare_func_exit to handle the case when
* state->curframe > 0, it may be a callback
@@ -12677,7 +15042,15 @@ static int check_map_prog_compatibility(struct bpf_verifier_env *env,
{
enum bpf_prog_type prog_type = resolve_prog_type(prog);
- if (map_value_has_spin_lock(map)) {
+ if (btf_record_has_field(map->record, BPF_LIST_HEAD) ||
+ btf_record_has_field(map->record, BPF_RB_ROOT)) {
+ if (is_tracing_prog_type(prog_type)) {
+ verbose(env, "tracing progs cannot use bpf_{list_head,rb_root} yet\n");
+ return -EINVAL;
+ }
+ }
+
+ if (btf_record_has_field(map->record, BPF_SPIN_LOCK)) {
if (prog_type == BPF_PROG_TYPE_SOCKET_FILTER) {
verbose(env, "socket filter progs cannot use bpf_spin_lock yet\n");
return -EINVAL;
@@ -12694,14 +15067,14 @@ static int check_map_prog_compatibility(struct bpf_verifier_env *env,
}
}
- if (map_value_has_timer(map)) {
+ if (btf_record_has_field(map->record, BPF_TIMER)) {
if (is_tracing_prog_type(prog_type)) {
verbose(env, "tracing progs cannot use bpf_timer yet\n");
return -EINVAL;
}
}
- if ((bpf_prog_is_dev_bound(prog->aux) || bpf_map_is_dev_bound(map)) &&
+ if ((bpf_prog_is_offloaded(prog->aux) || bpf_map_is_offloaded(map)) &&
!bpf_offload_prog_map_match(prog, map)) {
verbose(env, "offload device mismatch between prog and map\n");
return -EINVAL;
@@ -12727,10 +15100,11 @@ static int check_map_prog_compatibility(struct bpf_verifier_env *env,
case BPF_MAP_TYPE_INODE_STORAGE:
case BPF_MAP_TYPE_SK_STORAGE:
case BPF_MAP_TYPE_TASK_STORAGE:
+ case BPF_MAP_TYPE_CGRP_STORAGE:
break;
default:
verbose(env,
- "Sleepable programs can only use array, hash, and ringbuf maps\n");
+ "Sleepable programs can only use array, hash, ringbuf and local storage maps\n");
return -EINVAL;
}
@@ -13181,7 +15555,7 @@ static int verifier_remove_insns(struct bpf_verifier_env *env, u32 off, u32 cnt)
unsigned int orig_prog_len = env->prog->len;
int err;
- if (bpf_prog_is_dev_bound(env->prog->aux))
+ if (bpf_prog_is_offloaded(env->prog->aux))
bpf_prog_offload_remove_insns(env, off, cnt);
err = bpf_remove_insns(env->prog, off, cnt);
@@ -13262,7 +15636,7 @@ static void opt_hard_wire_dead_code_branches(struct bpf_verifier_env *env)
else
continue;
- if (bpf_prog_is_dev_bound(env->prog->aux))
+ if (bpf_prog_is_offloaded(env->prog->aux))
bpf_prog_offload_replace_insn(env, i, &ja);
memcpy(insn, &ja, sizeof(ja));
@@ -13386,6 +15760,10 @@ static int opt_subreg_zext_lo32_rnd_hi32(struct bpf_verifier_env *env,
if (!bpf_jit_needs_zext() && !is_cmpxchg_insn(&insn))
continue;
+ /* Zero-extension is done by the caller. */
+ if (bpf_pseudo_kfunc_call(&insn))
+ continue;
+
if (WARN_ON(load_reg == -1)) {
verbose(env, "verifier bug. zext_dst is set, but no reg is defined\n");
return -EFAULT;
@@ -13445,7 +15823,7 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env)
}
}
- if (bpf_prog_is_dev_bound(env->prog->aux))
+ if (bpf_prog_is_offloaded(env->prog->aux))
return 0;
insn = env->prog->insnsi + delta;
@@ -13513,6 +15891,13 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env)
break;
case PTR_TO_BTF_ID:
case PTR_TO_BTF_ID | PTR_UNTRUSTED:
+ /* PTR_TO_BTF_ID | MEM_ALLOC always has a valid lifetime, unlike
+ * PTR_TO_BTF_ID, and an active ref_obj_id, but the same cannot
+ * be said once it is marked PTR_UNTRUSTED, hence we must handle
+ * any faults for loads into such types. BPF_WRITE is disallowed
+ * for this case.
+ */
+ case PTR_TO_BTF_ID | MEM_ALLOC | PTR_UNTRUSTED:
if (type == BPF_READ) {
insn->code = BPF_LDX | BPF_PROBE_MEM |
BPF_SIZE((insn)->code);
@@ -13838,7 +16223,7 @@ static int fixup_call_args(struct bpf_verifier_env *env)
int err = 0;
if (env->prog->jit_requested &&
- !bpf_prog_is_dev_bound(env->prog->aux)) {
+ !bpf_prog_is_offloaded(env->prog->aux)) {
err = jit_subprogs(env);
if (err == 0)
return 0;
@@ -13878,18 +16263,31 @@ static int fixup_call_args(struct bpf_verifier_env *env)
return err;
}
-static int fixup_kfunc_call(struct bpf_verifier_env *env,
- struct bpf_insn *insn)
+static int fixup_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
+ struct bpf_insn *insn_buf, int insn_idx, int *cnt)
{
const struct bpf_kfunc_desc *desc;
+ void *xdp_kfunc;
if (!insn->imm) {
verbose(env, "invalid kernel function call not eliminated in verifier pass\n");
return -EINVAL;
}
+ *cnt = 0;
+
+ if (bpf_dev_bound_kfunc_id(insn->imm)) {
+ xdp_kfunc = bpf_dev_bound_resolve_kfunc(env->prog, insn->imm);
+ if (xdp_kfunc) {
+ insn->imm = BPF_CALL_IMM(xdp_kfunc);
+ return 0;
+ }
+
+ /* fallback to default kfunc when not supported by netdev */
+ }
+
/* insn->imm has the btf func_id. Replace it with
- * an address (relative to __bpf_base_call).
+ * an address (relative to __bpf_call_base).
*/
desc = find_kfunc_desc(env->prog, insn->imm, insn->off);
if (!desc) {
@@ -13899,7 +16297,31 @@ static int fixup_kfunc_call(struct bpf_verifier_env *env,
}
insn->imm = desc->imm;
-
+ if (insn->off)
+ return 0;
+ if (desc->func_id == special_kfunc_list[KF_bpf_obj_new_impl]) {
+ struct btf_struct_meta *kptr_struct_meta = env->insn_aux_data[insn_idx].kptr_struct_meta;
+ struct bpf_insn addr[2] = { BPF_LD_IMM64(BPF_REG_2, (long)kptr_struct_meta) };
+ u64 obj_new_size = env->insn_aux_data[insn_idx].obj_new_size;
+
+ insn_buf[0] = BPF_MOV64_IMM(BPF_REG_1, obj_new_size);
+ insn_buf[1] = addr[0];
+ insn_buf[2] = addr[1];
+ insn_buf[3] = *insn;
+ *cnt = 4;
+ } else if (desc->func_id == special_kfunc_list[KF_bpf_obj_drop_impl]) {
+ struct btf_struct_meta *kptr_struct_meta = env->insn_aux_data[insn_idx].kptr_struct_meta;
+ struct bpf_insn addr[2] = { BPF_LD_IMM64(BPF_REG_2, (long)kptr_struct_meta) };
+
+ insn_buf[0] = addr[0];
+ insn_buf[1] = addr[1];
+ insn_buf[2] = *insn;
+ *cnt = 3;
+ } else if (desc->func_id == special_kfunc_list[KF_bpf_cast_to_kern_ctx] ||
+ desc->func_id == special_kfunc_list[KF_bpf_rdonly_cast]) {
+ insn_buf[0] = BPF_MOV64_REG(BPF_REG_0, BPF_REG_1);
+ *cnt = 1;
+ }
return 0;
}
@@ -14041,9 +16463,19 @@ static int do_misc_fixups(struct bpf_verifier_env *env)
if (insn->src_reg == BPF_PSEUDO_CALL)
continue;
if (insn->src_reg == BPF_PSEUDO_KFUNC_CALL) {
- ret = fixup_kfunc_call(env, insn);
+ ret = fixup_kfunc_call(env, insn, insn_buf, i + delta, &cnt);
if (ret)
return ret;
+ if (cnt == 0)
+ continue;
+
+ new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt);
+ if (!new_prog)
+ return -ENOMEM;
+
+ delta += cnt - 1;
+ env->prog = prog = new_prog;
+ insn = new_prog->insnsi + i + delta;
continue;
}
@@ -14161,13 +16593,12 @@ static int do_misc_fixups(struct bpf_verifier_env *env)
goto patch_call_imm;
}
- if (insn->imm == BPF_FUNC_task_storage_get ||
- insn->imm == BPF_FUNC_sk_storage_get ||
- insn->imm == BPF_FUNC_inode_storage_get) {
- if (env->prog->aux->sleepable)
- insn_buf[0] = BPF_MOV64_IMM(BPF_REG_5, (__force __s32)GFP_KERNEL);
- else
+ if (is_storage_get_function(insn->imm)) {
+ if (!env->prog->aux->sleepable ||
+ env->insn_aux_data[i + delta].storage_get_func_atomic)
insn_buf[0] = BPF_MOV64_IMM(BPF_REG_5, (__force __s32)GFP_ATOMIC);
+ else
+ insn_buf[0] = BPF_MOV64_IMM(BPF_REG_5, (__force __s32)GFP_KERNEL);
insn_buf[1] = *insn;
cnt = 2;
@@ -14237,7 +16668,7 @@ static int do_misc_fixups(struct bpf_verifier_env *env)
BUILD_BUG_ON(!__same_type(ops->map_peek_elem,
(int (*)(struct bpf_map *map, void *value))NULL));
BUILD_BUG_ON(!__same_type(ops->map_redirect,
- (int (*)(struct bpf_map *map, u32 ifindex, u64 flags))NULL));
+ (int (*)(struct bpf_map *map, u64 index, u64 flags))NULL));
BUILD_BUG_ON(!__same_type(ops->map_for_each_callback,
(int (*)(struct bpf_map *map,
bpf_callback_t callback_fn,
@@ -14616,6 +17047,8 @@ static int do_check_common(struct bpf_verifier_env *env, int subprog)
BPF_MAIN_FUNC /* callsite */,
0 /* frameno */,
subprog);
+ state->first_insn_idx = env->subprog_info[subprog].start;
+ state->last_insn_idx = -1;
regs = state->frame[state->curframe]->regs;
if (subprog || env->prog->type == BPF_PROG_TYPE_EXT) {
@@ -14788,7 +17221,7 @@ static int check_struct_ops_btf_id(struct bpf_verifier_env *env)
}
if (st_ops->check_member) {
- int err = st_ops->check_member(t, member);
+ int err = st_ops->check_member(t, member, prog);
if (err) {
verbose(env, "attach to unsupported member %s of struct %s\n",
@@ -14869,6 +17302,12 @@ int bpf_check_attach_target(struct bpf_verifier_log *log,
if (tgt_prog) {
struct bpf_prog_aux *aux = tgt_prog->aux;
+ if (bpf_prog_is_dev_bound(prog->aux) &&
+ !bpf_prog_dev_bound_match(prog, tgt_prog)) {
+ bpf_log(log, "Target program bound device mismatch");
+ return -EINVAL;
+ }
+
for (i = 0; i < aux->func_info_cnt; i++)
if (aux->func_info[i].type_id == btf_id) {
subprog = i;
@@ -15025,12 +17464,22 @@ int bpf_check_attach_target(struct bpf_verifier_log *log,
ret = -EINVAL;
switch (prog->type) {
case BPF_PROG_TYPE_TRACING:
- /* fentry/fexit/fmod_ret progs can be sleepable only if they are
+
+ /* fentry/fexit/fmod_ret progs can be sleepable if they are
* attached to ALLOW_ERROR_INJECTION and are not in denylist.
*/
if (!check_non_sleepable_error_inject(btf_id) &&
within_error_injection_list(addr))
ret = 0;
+ /* fentry/fexit/fmod_ret progs can also be sleepable if they are
+ * in the fmodret id set with the KF_SLEEPABLE flag.
+ */
+ else {
+ u32 *flags = btf_kfunc_is_modify_return(btf, btf_id);
+
+ if (flags && (*flags & KF_SLEEPABLE))
+ ret = 0;
+ }
break;
case BPF_PROG_TYPE_LSM:
/* LSM progs check that they are attached to bpf_lsm_*() funcs.
@@ -15051,7 +17500,10 @@ int bpf_check_attach_target(struct bpf_verifier_log *log,
bpf_log(log, "can't modify return codes of BPF programs\n");
return -EINVAL;
}
- ret = check_attach_modify_return(addr, tname);
+ ret = -EINVAL;
+ if (btf_kfunc_is_modify_return(btf, btf_id) ||
+ !check_attach_modify_return(addr, tname))
+ ret = 0;
if (ret) {
bpf_log(log, "%s() is not modifiable\n", tname);
return ret;
@@ -15077,6 +17529,24 @@ BTF_ID(func, rcu_read_unlock_strict)
#endif
BTF_SET_END(btf_id_deny)
+/* Tell whether a program of this type (and, for tracing programs, this
+ * expected attach type) is allowed to be sleepable.
+ */
+static bool can_be_sleepable(struct bpf_prog *prog)
+{
+ /* Every type except tracing is decided by the program type alone. */
+ switch (prog->type) {
+ case BPF_PROG_TYPE_TRACING:
+ break;
+ case BPF_PROG_TYPE_LSM:
+ case BPF_PROG_TYPE_KPROBE: /* only for uprobes */
+ case BPF_PROG_TYPE_STRUCT_OPS:
+ return true;
+ default:
+ return false;
+ }
+
+ /* Tracing programs: only these expected attach types qualify. */
+ return prog->expected_attach_type == BPF_TRACE_FENTRY ||
+ prog->expected_attach_type == BPF_TRACE_FEXIT ||
+ prog->expected_attach_type == BPF_MODIFY_RETURN ||
+ prog->expected_attach_type == BPF_TRACE_ITER;
+}
+
static int check_attach_btf_id(struct bpf_verifier_env *env)
{
struct bpf_prog *prog = env->prog;
@@ -15095,9 +17565,8 @@ static int check_attach_btf_id(struct bpf_verifier_env *env)
return -EINVAL;
}
- if (prog->aux->sleepable && prog->type != BPF_PROG_TYPE_TRACING &&
- prog->type != BPF_PROG_TYPE_LSM && prog->type != BPF_PROG_TYPE_KPROBE) {
- verbose(env, "Only fentry/fexit/fmod_ret, lsm, and kprobe/uprobe programs can be sleepable\n");
+ if (prog->aux->sleepable && !can_be_sleepable(prog)) {
+ verbose(env, "Only fentry/fexit/fmod_ret, lsm, iter, uprobe, and struct_ops programs can be sleepable\n");
return -EINVAL;
}
@@ -15240,10 +17709,11 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, bpfptr_t uattr)
env->allow_ptr_leaks = bpf_allow_ptr_leaks();
env->allow_uninit_stack = bpf_allow_uninit_stack();
- env->allow_ptr_to_map_access = bpf_allow_ptr_to_map_access();
env->bypass_spec_v1 = bpf_bypass_spec_v1();
env->bypass_spec_v4 = bpf_bypass_spec_v4();
env->bpf_capable = bpf_capable();
+ env->rcu_tag_supported = btf_vmlinux &&
+ btf_find_by_name_kind(btf_vmlinux, "rcu", BTF_KIND_TYPE_TAG) > 0;
if (is_priv)
env->test_state_freq = attr->prog_flags & BPF_F_TEST_STATE_FREQ;
@@ -15275,7 +17745,7 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, bpfptr_t uattr)
if (ret < 0)
goto skip_full_check;
- if (bpf_prog_is_dev_bound(env->prog->aux)) {
+ if (bpf_prog_is_offloaded(env->prog->aux)) {
ret = bpf_prog_offload_verifier_prep(env->prog);
if (ret)
goto skip_full_check;
@@ -15288,7 +17758,7 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, bpfptr_t uattr)
ret = do_check_subprogs(env);
ret = ret ?: do_check_main(env);
- if (ret == 0 && bpf_prog_is_dev_bound(env->prog->aux))
+ if (ret == 0 && bpf_prog_is_offloaded(env->prog->aux))
ret = bpf_prog_offload_finalize(env);
skip_full_check:
@@ -15323,7 +17793,7 @@ skip_full_check:
/* do 32-bit optimization after insn patching has done so those patched
* insns could be handled correctly.
*/
- if (ret == 0 && !bpf_prog_is_dev_bound(env->prog->aux)) {
+ if (ret == 0 && !bpf_prog_is_offloaded(env->prog->aux)) {
ret = opt_subreg_zext_lo32_rnd_hi32(env, attr);
env->prog->aux->verifier_zext = bpf_jit_needs_zext() ? !ret
: false;
diff --git a/kernel/capability.c b/kernel/capability.c
index 765194f5d678..339a44dfe2f4 100644
--- a/kernel/capability.c
+++ b/kernel/capability.c
@@ -486,11 +486,11 @@ EXPORT_SYMBOL(file_ns_capable);
* Return true if the inode uid and gid are within the namespace.
*/
bool privileged_wrt_inode_uidgid(struct user_namespace *ns,
- struct user_namespace *mnt_userns,
+ struct mnt_idmap *idmap,
const struct inode *inode)
{
- return kuid_has_mapping(ns, i_uid_into_mnt(mnt_userns, inode)) &&
- kgid_has_mapping(ns, i_gid_into_mnt(mnt_userns, inode));
+ return vfsuid_has_mapping(ns, i_uid_into_vfsuid(idmap, inode)) &&
+ vfsgid_has_mapping(ns, i_gid_into_vfsgid(idmap, inode));
}
/**
@@ -502,13 +502,13 @@ bool privileged_wrt_inode_uidgid(struct user_namespace *ns,
* its own user namespace and that the given inode's uid and gid are
* mapped into the current user namespace.
*/
-bool capable_wrt_inode_uidgid(struct user_namespace *mnt_userns,
+bool capable_wrt_inode_uidgid(struct mnt_idmap *idmap,
const struct inode *inode, int cap)
{
struct user_namespace *ns = current_user_ns();
return ns_capable(ns, cap) &&
- privileged_wrt_inode_uidgid(ns, mnt_userns, inode);
+ privileged_wrt_inode_uidgid(ns, idmap, inode);
}
EXPORT_SYMBOL(capable_wrt_inode_uidgid);
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index 2319946715e0..935e8121b21e 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -248,6 +248,12 @@ static int cgroup_addrm_files(struct cgroup_subsys_state *css,
struct cgroup *cgrp, struct cftype cfts[],
bool is_add);
+#ifdef CONFIG_DEBUG_CGROUP_REF
+#define CGROUP_REF_FN_ATTRS noinline
+#define CGROUP_REF_EXPORT(fn) EXPORT_SYMBOL_GPL(fn);
+#include <linux/cgroup_refcnt.h>
+#endif
+
/**
* cgroup_ssid_enabled - cgroup subsys enabled test by subsys ID
* @ssid: subsys ID of interest
@@ -2860,14 +2866,12 @@ int cgroup_migrate(struct task_struct *leader, bool threadgroup,
* take an rcu_read_lock.
*/
spin_lock_irq(&css_set_lock);
- rcu_read_lock();
task = leader;
do {
cgroup_migrate_add_task(task, mgctx);
if (!threadgroup)
break;
} while_each_thread(leader, task);
- rcu_read_unlock();
spin_unlock_irq(&css_set_lock);
return cgroup_migrate_execute(mgctx);
@@ -5061,7 +5065,7 @@ static int cgroup_may_write(const struct cgroup *cgrp, struct super_block *sb)
if (!inode)
return -ENOMEM;
- ret = inode_permission(&init_user_ns, inode, MAY_WRITE);
+ ret = inode_permission(&nop_mnt_idmap, inode, MAY_WRITE);
iput(inode);
return ret;
}
@@ -5349,6 +5353,7 @@ static void css_free_rwork_fn(struct work_struct *work)
atomic_dec(&cgrp->root->nr_cgrps);
cgroup1_pidlist_destroy_all(cgrp);
cancel_work_sync(&cgrp->release_agent_work);
+ bpf_cgrp_storage_free(cgrp);
if (cgroup_parent(cgrp)) {
/*
diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
index b474289c15b8..636f1c682ac0 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -550,11 +550,15 @@ static void guarantee_online_mems(struct cpuset *cs, nodemask_t *pmask)
/*
* update task's spread flag if cpuset's page/slab spread flag is set
*
- * Call with callback_lock or cpuset_rwsem held.
+ * Call with callback_lock or cpuset_rwsem held. The check can be skipped
+ * if on default hierarchy.
*/
-static void cpuset_update_task_spread_flag(struct cpuset *cs,
+static void cpuset_update_task_spread_flags(struct cpuset *cs,
struct task_struct *tsk)
{
+ if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys))
+ return;
+
if (is_spread_page(cs))
task_set_spread_page(tsk);
else
@@ -1201,12 +1205,13 @@ void rebuild_sched_domains(void)
/**
* update_tasks_cpumask - Update the cpumasks of tasks in the cpuset.
* @cs: the cpuset in which each task's cpus_allowed mask needs to be changed
+ * @new_cpus: the temp variable for the new effective_cpus mask
*
* Iterate through each task of @cs updating its cpus_allowed to the
* effective cpuset's. As this function is called with cpuset_rwsem held,
* cpuset membership stays stable.
*/
-static void update_tasks_cpumask(struct cpuset *cs)
+static void update_tasks_cpumask(struct cpuset *cs, struct cpumask *new_cpus)
{
struct css_task_iter it;
struct task_struct *task;
@@ -1220,7 +1225,10 @@ static void update_tasks_cpumask(struct cpuset *cs)
if (top_cs && (task->flags & PF_KTHREAD) &&
kthread_is_per_cpu(task))
continue;
- set_cpus_allowed_ptr(task, cs->effective_cpus);
+
+ cpumask_and(new_cpus, cs->effective_cpus,
+ task_cpu_possible_mask(task));
+ set_cpus_allowed_ptr(task, new_cpus);
}
css_task_iter_end(&it);
}
@@ -1263,7 +1271,7 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs,
int turning_on);
/**
* update_parent_subparts_cpumask - update subparts_cpus mask of parent cpuset
- * @cpuset: The cpuset that requests change in partition root state
+ * @cs: The cpuset that requests change in partition root state
* @cmd: Partition root state change command
* @newmask: Optional new cpumask for partcmd_update
* @tmp: Temporary addmask and delmask
@@ -1342,7 +1350,7 @@ static int update_parent_subparts_cpumask(struct cpuset *cs, int cmd,
* A parent can be left with no CPU as long as there is no
* task directly associated with the parent partition.
*/
- if (!cpumask_intersects(cs->cpus_allowed, parent->effective_cpus) &&
+ if (cpumask_subset(parent->effective_cpus, cs->cpus_allowed) &&
partition_is_populated(parent, cs))
return PERR_NOCPUS;
@@ -1505,7 +1513,7 @@ static int update_parent_subparts_cpumask(struct cpuset *cs, int cmd,
spin_unlock_irq(&callback_lock);
if (adding || deleting)
- update_tasks_cpumask(parent);
+ update_tasks_cpumask(parent, tmp->new_cpus);
/*
* Set or clear CS_SCHED_LOAD_BALANCE when partcmd_update, if necessary.
@@ -1657,7 +1665,7 @@ update_parent_subparts:
WARN_ON(!is_in_v2_mode() &&
!cpumask_equal(cp->cpus_allowed, cp->effective_cpus));
- update_tasks_cpumask(cp);
+ update_tasks_cpumask(cp, tmp->new_cpus);
/*
* On legacy hierarchy, if the effective cpumask of any non-
@@ -2153,7 +2161,7 @@ static void update_tasks_flags(struct cpuset *cs)
css_task_iter_start(&cs->css, 0, &it);
while ((task = css_task_iter_next(&it)))
- cpuset_update_task_spread_flag(cs, task);
+ cpuset_update_task_spread_flags(cs, task);
css_task_iter_end(&it);
}
@@ -2305,7 +2313,7 @@ static int update_prstate(struct cpuset *cs, int new_prs)
}
}
- update_tasks_cpumask(parent);
+ update_tasks_cpumask(parent, tmpmask.new_cpus);
if (parent->child_ecpus_count)
update_sibling_cpumasks(parent, cs, &tmpmask);
@@ -2320,6 +2328,7 @@ out:
new_prs = -new_prs;
spin_lock_irq(&callback_lock);
cs->partition_root_state = new_prs;
+ WRITE_ONCE(cs->prs_err, err);
spin_unlock_irq(&callback_lock);
/*
* Update child cpusets, if present.
@@ -2509,12 +2518,28 @@ static void cpuset_attach(struct cgroup_taskset *tset)
struct cgroup_subsys_state *css;
struct cpuset *cs;
struct cpuset *oldcs = cpuset_attach_old_cs;
+ bool cpus_updated, mems_updated;
cgroup_taskset_first(tset, &css);
cs = css_cs(css);
lockdep_assert_cpus_held(); /* see cgroup_attach_lock() */
percpu_down_write(&cpuset_rwsem);
+ cpus_updated = !cpumask_equal(cs->effective_cpus,
+ oldcs->effective_cpus);
+ mems_updated = !nodes_equal(cs->effective_mems, oldcs->effective_mems);
+
+ /*
+ * In the default hierarchy, enabling cpuset in the child cgroups
+ * will trigger a number of cpuset_attach() calls with no change
+ * in effective cpus and mems. In that case, we can optimize out
+ * by skipping the task iteration and update.
+ */
+ if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys) &&
+ !cpus_updated && !mems_updated) {
+ cpuset_attach_nodemask_to = cs->effective_mems;
+ goto out;
+ }
guarantee_online_mems(cs, &cpuset_attach_nodemask_to);
@@ -2530,14 +2555,19 @@ static void cpuset_attach(struct cgroup_taskset *tset)
WARN_ON_ONCE(set_cpus_allowed_ptr(task, cpus_attach));
cpuset_change_task_nodemask(task, &cpuset_attach_nodemask_to);
- cpuset_update_task_spread_flag(cs, task);
+ cpuset_update_task_spread_flags(cs, task);
}
/*
* Change mm for all threadgroup leaders. This is expensive and may
- * sleep and should be moved outside migration path proper.
+ * sleep and should be moved outside migration path proper. Skip it
+ * if there is no change in effective_mems and CS_MEMORY_MIGRATE is
+ * not set.
*/
cpuset_attach_nodemask_to = cs->effective_mems;
+ if (!is_memory_migrate(cs) && !mems_updated)
+ goto out;
+
cgroup_taskset_for_each_leader(leader, css, tset) {
struct mm_struct *mm = get_task_mm(leader);
@@ -2560,6 +2590,7 @@ static void cpuset_attach(struct cgroup_taskset *tset)
}
}
+out:
cs->old_mems_allowed = cpuset_attach_nodemask_to;
cs->attach_in_progress--;
@@ -3046,11 +3077,15 @@ static struct cftype dfl_files[] = {
};
-/*
- * cpuset_css_alloc - allocate a cpuset css
- * cgrp: control group that the new cpuset will be part of
+/**
+ * cpuset_css_alloc - Allocate a cpuset css
+ * @parent_css: Parent css of the control group that the new cpuset will be
+ * part of
+ * Return: cpuset css on success, -ENOMEM on failure.
+ *
+ * Allocate and initialize a new cpuset css, for non-NULL @parent_css, return
+ * top cpuset css otherwise.
*/
-
static struct cgroup_subsys_state *
cpuset_css_alloc(struct cgroup_subsys_state *parent_css)
{
@@ -3251,8 +3286,6 @@ struct cgroup_subsys cpuset_cgrp_subsys = {
int __init cpuset_init(void)
{
- BUG_ON(percpu_init_rwsem(&cpuset_rwsem));
-
BUG_ON(!alloc_cpumask_var(&top_cpuset.cpus_allowed, GFP_KERNEL));
BUG_ON(!alloc_cpumask_var(&top_cpuset.effective_cpus, GFP_KERNEL));
BUG_ON(!zalloc_cpumask_var(&top_cpuset.subparts_cpus, GFP_KERNEL));
@@ -3317,7 +3350,7 @@ hotplug_update_tasks_legacy(struct cpuset *cs,
* as the tasks will be migrated to an ancestor.
*/
if (cpus_updated && !cpumask_empty(cs->cpus_allowed))
- update_tasks_cpumask(cs);
+ update_tasks_cpumask(cs, new_cpus);
if (mems_updated && !nodes_empty(cs->mems_allowed))
update_tasks_nodemask(cs);
@@ -3354,7 +3387,7 @@ hotplug_update_tasks(struct cpuset *cs,
spin_unlock_irq(&callback_lock);
if (cpus_updated)
- update_tasks_cpumask(cs);
+ update_tasks_cpumask(cs, new_cpus);
if (mems_updated)
update_tasks_nodemask(cs);
}
@@ -3630,11 +3663,6 @@ static int cpuset_track_online_nodes(struct notifier_block *self,
return NOTIFY_OK;
}
-static struct notifier_block cpuset_track_online_nodes_nb = {
- .notifier_call = cpuset_track_online_nodes,
- .priority = 10, /* ??! */
-};
-
/**
* cpuset_init_smp - initialize cpus_allowed
*
@@ -3652,7 +3680,7 @@ void __init cpuset_init_smp(void)
cpumask_copy(top_cpuset.effective_cpus, cpu_active_mask);
top_cpuset.effective_mems = node_states[N_MEMORY];
- register_hotmemory_notifier(&cpuset_track_online_nodes_nb);
+ hotplug_memory_notifier(cpuset_track_online_nodes, CPUSET_CALLBACK_PRI);
cpuset_migrate_mm_wq = alloc_ordered_workqueue("cpuset_migrate_mm", 0);
BUG_ON(!cpuset_migrate_mm_wq);
@@ -3666,15 +3694,38 @@ void __init cpuset_init_smp(void)
* Description: Returns the cpumask_var_t cpus_allowed of the cpuset
* attached to the specified @tsk. Guaranteed to return some non-empty
* subset of cpu_online_mask, even if this means going outside the
- * tasks cpuset.
+ * tasks cpuset, except when the task is in the top cpuset.
**/
void cpuset_cpus_allowed(struct task_struct *tsk, struct cpumask *pmask)
{
unsigned long flags;
+ struct cpuset *cs;
spin_lock_irqsave(&callback_lock, flags);
- guarantee_online_cpus(tsk, pmask);
+ rcu_read_lock();
+
+ cs = task_cs(tsk);
+ if (cs != &top_cpuset)
+ guarantee_online_cpus(tsk, pmask);
+ /*
+ * Tasks in the top cpuset won't get update to their cpumasks
+ * when a hotplug online/offline event happens. So we include all
+ * offline cpus in the allowed cpu list.
+ */
+ if ((cs == &top_cpuset) || cpumask_empty(pmask)) {
+ const struct cpumask *possible_mask = task_cpu_possible_mask(tsk);
+
+ /*
+ * We first exclude cpus allocated to partitions. If there is no
+ * allowable online cpu left, we fall back to all possible cpus.
+ */
+ cpumask_andnot(pmask, possible_mask, top_cpuset.subparts_cpus);
+ if (!cpumask_intersects(pmask, cpu_online_mask))
+ cpumask_copy(pmask, possible_mask);
+ }
+
+ rcu_read_unlock();
spin_unlock_irqrestore(&callback_lock, flags);
}
@@ -3854,8 +3905,7 @@ bool __cpuset_node_allowed(int node, gfp_t gfp_mask)
}
/**
- * cpuset_mem_spread_node() - On which node to begin search for a file page
- * cpuset_slab_spread_node() - On which node to begin search for a slab page
+ * cpuset_spread_node() - On which node to begin search for a page
*
* If a task is marked PF_SPREAD_PAGE or PF_SPREAD_SLAB (as for
* tasks in a cpuset with is_spread_page or is_spread_slab set),
@@ -3879,12 +3929,14 @@ bool __cpuset_node_allowed(int node, gfp_t gfp_mask)
* is passed an offline node, it will fall back to the local node.
* See kmem_cache_alloc_node().
*/
-
static int cpuset_spread_node(int *rotor)
{
return *rotor = next_node_in(*rotor, current->mems_allowed);
}
+/**
+ * cpuset_mem_spread_node() - On which node to begin search for a file page
+ */
int cpuset_mem_spread_node(void)
{
if (current->cpuset_mem_spread_rotor == NUMA_NO_NODE)
@@ -3894,6 +3946,9 @@ int cpuset_mem_spread_node(void)
return cpuset_spread_node(&current->cpuset_mem_spread_rotor);
}
+/**
+ * cpuset_slab_spread_node() - On which node to begin search for a slab page
+ */
int cpuset_slab_spread_node(void)
{
if (current->cpuset_slab_spread_rotor == NUMA_NO_NODE)
@@ -3902,7 +3957,6 @@ int cpuset_slab_spread_node(void)
return cpuset_spread_node(&current->cpuset_slab_spread_rotor);
}
-
EXPORT_SYMBOL_GPL(cpuset_mem_spread_node);
/**
diff --git a/kernel/cgroup/rstat.c b/kernel/cgroup/rstat.c
index 793ecff29038..831f1f472bb8 100644
--- a/kernel/cgroup/rstat.c
+++ b/kernel/cgroup/rstat.c
@@ -26,7 +26,7 @@ static struct cgroup_rstat_cpu *cgroup_rstat_cpu(struct cgroup *cgrp, int cpu)
* rstat_cpu->updated_children list. See the comment on top of
* cgroup_rstat_cpu definition for details.
*/
-void cgroup_rstat_updated(struct cgroup *cgrp, int cpu)
+__bpf_kfunc void cgroup_rstat_updated(struct cgroup *cgrp, int cpu)
{
raw_spinlock_t *cpu_lock = per_cpu_ptr(&cgroup_rstat_cpu_lock, cpu);
unsigned long flags;
@@ -231,7 +231,7 @@ static void cgroup_rstat_flush_locked(struct cgroup *cgrp, bool may_sleep)
*
* This function may block.
*/
-void cgroup_rstat_flush(struct cgroup *cgrp)
+__bpf_kfunc void cgroup_rstat_flush(struct cgroup *cgrp)
{
might_sleep();
diff --git a/kernel/configs/tiny.config b/kernel/configs/tiny.config
index 8a44b93da0f3..c2f9c912df1c 100644
--- a/kernel/configs/tiny.config
+++ b/kernel/configs/tiny.config
@@ -7,5 +7,6 @@ CONFIG_KERNEL_XZ=y
# CONFIG_KERNEL_LZO is not set
# CONFIG_KERNEL_LZ4 is not set
# CONFIG_SLAB is not set
-# CONFIG_SLUB is not set
-CONFIG_SLOB=y
+# CONFIG_SLOB_DEPRECATED is not set
+CONFIG_SLUB=y
+CONFIG_SLUB_TINY=y
diff --git a/kernel/context_tracking.c b/kernel/context_tracking.c
index 77978e372377..a09f1c19336a 100644
--- a/kernel/context_tracking.c
+++ b/kernel/context_tracking.c
@@ -510,7 +510,7 @@ void noinstr __ct_user_enter(enum ctx_state state)
* In this we case we don't care about any concurrency/ordering.
*/
if (!IS_ENABLED(CONFIG_CONTEXT_TRACKING_IDLE))
- atomic_set(&ct->state, state);
+ arch_atomic_set(&ct->state, state);
} else {
/*
* Even if context tracking is disabled on this CPU, because it's outside
@@ -527,7 +527,7 @@ void noinstr __ct_user_enter(enum ctx_state state)
*/
if (!IS_ENABLED(CONFIG_CONTEXT_TRACKING_IDLE)) {
/* Tracking for vtime only, no concurrent RCU EQS accounting */
- atomic_set(&ct->state, state);
+ arch_atomic_set(&ct->state, state);
} else {
/*
* Tracking for vtime and RCU EQS. Make sure we don't race
@@ -535,7 +535,7 @@ void noinstr __ct_user_enter(enum ctx_state state)
* RCU only requires RCU_DYNTICKS_IDX increments to be fully
* ordered.
*/
- atomic_add(state, &ct->state);
+ arch_atomic_add(state, &ct->state);
}
}
}
@@ -630,12 +630,12 @@ void noinstr __ct_user_exit(enum ctx_state state)
* In this we case we don't care about any concurrency/ordering.
*/
if (!IS_ENABLED(CONFIG_CONTEXT_TRACKING_IDLE))
- atomic_set(&ct->state, CONTEXT_KERNEL);
+ arch_atomic_set(&ct->state, CONTEXT_KERNEL);
} else {
if (!IS_ENABLED(CONFIG_CONTEXT_TRACKING_IDLE)) {
/* Tracking for vtime only, no concurrent RCU EQS accounting */
- atomic_set(&ct->state, CONTEXT_KERNEL);
+ arch_atomic_set(&ct->state, CONTEXT_KERNEL);
} else {
/*
* Tracking for vtime and RCU EQS. Make sure we don't race
@@ -643,7 +643,7 @@ void noinstr __ct_user_exit(enum ctx_state state)
* RCU only requires RCU_DYNTICKS_IDX increments to be fully
* ordered.
*/
- atomic_sub(state, &ct->state);
+ arch_atomic_sub(state, &ct->state);
}
}
}
diff --git a/kernel/cpu.c b/kernel/cpu.c
index bbad5e375d3b..6c0a92ca6bb5 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -663,21 +663,51 @@ static bool cpuhp_next_state(bool bringup,
return true;
}
-static int cpuhp_invoke_callback_range(bool bringup,
- unsigned int cpu,
- struct cpuhp_cpu_state *st,
- enum cpuhp_state target)
+static int __cpuhp_invoke_callback_range(bool bringup,
+ unsigned int cpu,
+ struct cpuhp_cpu_state *st,
+ enum cpuhp_state target,
+ bool nofail)
{
enum cpuhp_state state;
- int err = 0;
+ int ret = 0;
while (cpuhp_next_state(bringup, &state, st, target)) {
+ int err;
+
err = cpuhp_invoke_callback(cpu, state, bringup, NULL, NULL);
- if (err)
+ if (!err)
+ continue;
+
+ if (nofail) {
+ pr_warn("CPU %u %s state %s (%d) failed (%d)\n",
+ cpu, bringup ? "UP" : "DOWN",
+ cpuhp_get_step(st->state)->name,
+ st->state, err);
+ ret = -1;
+ } else {
+ ret = err;
break;
+ }
}
- return err;
+ return ret;
+}
+
+static inline int cpuhp_invoke_callback_range(bool bringup,
+ unsigned int cpu,
+ struct cpuhp_cpu_state *st,
+ enum cpuhp_state target)
+{
+ return __cpuhp_invoke_callback_range(bringup, cpu, st, target, false);
+}
+
+static inline void cpuhp_invoke_callback_range_nofail(bool bringup,
+ unsigned int cpu,
+ struct cpuhp_cpu_state *st,
+ enum cpuhp_state target)
+{
+ __cpuhp_invoke_callback_range(bringup, cpu, st, target, true);
}
static inline bool can_rollback_cpu(struct cpuhp_cpu_state *st)
@@ -999,7 +1029,6 @@ static int take_cpu_down(void *_param)
struct cpuhp_cpu_state *st = this_cpu_ptr(&cpuhp_state);
enum cpuhp_state target = max((int)st->target, CPUHP_AP_OFFLINE);
int err, cpu = smp_processor_id();
- int ret;
/* Ensure this CPU doesn't handle any more interrupts. */
err = __cpu_disable();
@@ -1012,13 +1041,10 @@ static int take_cpu_down(void *_param)
*/
WARN_ON(st->state != (CPUHP_TEARDOWN_CPU - 1));
- /* Invoke the former CPU_DYING callbacks */
- ret = cpuhp_invoke_callback_range(false, cpu, st, target);
-
/*
- * DYING must not fail!
+ * Invoke the former CPU_DYING callbacks. DYING must not fail!
*/
- WARN_ON_ONCE(ret);
+ cpuhp_invoke_callback_range_nofail(false, cpu, st, target);
/* Give up timekeeping duties */
tick_handover_do_timer();
@@ -1296,16 +1322,14 @@ void notify_cpu_starting(unsigned int cpu)
{
struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu);
enum cpuhp_state target = min((int)st->target, CPUHP_AP_ONLINE);
- int ret;
rcu_cpu_starting(cpu); /* Enables RCU usage on this CPU. */
cpumask_set_cpu(cpu, &cpus_booted_once_mask);
- ret = cpuhp_invoke_callback_range(true, cpu, st, target);
/*
* STARTING must not fail!
*/
- WARN_ON_ONCE(ret);
+ cpuhp_invoke_callback_range_nofail(true, cpu, st, target);
}
/*
@@ -2326,8 +2350,10 @@ static ssize_t target_store(struct device *dev, struct device_attribute *attr,
if (st->state < target)
ret = cpu_up(dev->id, target);
- else
+ else if (st->state > target)
ret = cpu_down(dev->id, target);
+ else if (WARN_ON(st->target != target))
+ st->target = target;
out:
unlock_device_hotplug();
return ret ? ret : count;
@@ -2688,6 +2714,7 @@ void __init boot_cpu_hotplug_init(void)
cpumask_set_cpu(smp_processor_id(), &cpus_booted_once_mask);
#endif
this_cpu_write(cpuhp_state.state, CPUHP_ONLINE);
+ this_cpu_write(cpuhp_state.target, CPUHP_ONLINE);
}
/*
diff --git a/kernel/cpu_pm.c b/kernel/cpu_pm.c
index ba4ba71facf9..b0f0d15085db 100644
--- a/kernel/cpu_pm.c
+++ b/kernel/cpu_pm.c
@@ -30,16 +30,9 @@ static int cpu_pm_notify(enum cpu_pm_event event)
{
int ret;
- /*
- * This introduces a RCU read critical section, which could be
- * disfunctional in cpu idle. Copy RCU_NONIDLE code to let RCU know
- * this.
- */
- ct_irq_enter_irqson();
rcu_read_lock();
ret = raw_notifier_call_chain(&cpu_pm_notifier.chain, event, NULL);
rcu_read_unlock();
- ct_irq_exit_irqson();
return notifier_to_errno(ret);
}
@@ -49,11 +42,9 @@ static int cpu_pm_notify_robust(enum cpu_pm_event event_up, enum cpu_pm_event ev
unsigned long flags;
int ret;
- ct_irq_enter_irqson();
raw_spin_lock_irqsave(&cpu_pm_notifier.lock, flags);
ret = raw_notifier_call_chain_robust(&cpu_pm_notifier.chain, event_up, event_down, NULL);
raw_spin_unlock_irqrestore(&cpu_pm_notifier.lock, flags);
- ct_irq_exit_irqson();
return notifier_to_errno(ret);
}
diff --git a/kernel/crash_core.c b/kernel/crash_core.c
index a0eb4d5cf557..87ef6096823f 100644
--- a/kernel/crash_core.c
+++ b/kernel/crash_core.c
@@ -383,6 +383,9 @@ void vmcoreinfo_append_str(const char *fmt, ...)
memcpy(&vmcoreinfo_data[vmcoreinfo_size], buf, r);
vmcoreinfo_size += r;
+
+ WARN_ONCE(vmcoreinfo_size == VMCOREINFO_BYTES,
+ "vmcoreinfo data exceeds allocated size, truncating");
}
/*
diff --git a/kernel/cred.c b/kernel/cred.c
index e10c15f51c1f..811ad654abd1 100644
--- a/kernel/cred.c
+++ b/kernel/cred.c
@@ -701,9 +701,9 @@ void __init cred_init(void)
* override a task's own credentials so that work can be done on behalf of that
* task that requires a different subjective context.
*
- * @daemon is used to provide a base for the security record, but can be NULL.
- * If @daemon is supplied, then the security data will be derived from that;
- * otherwise they'll be set to 0 and no groups, full capabilities and no keys.
+ * @daemon is used to provide a base cred, with the security data derived from
+ * that; if this is "&init_task", they'll be set to 0, no groups, full
+ * capabilities, and no keys.
*
* The caller may change these controls afterwards if desired.
*
@@ -714,17 +714,16 @@ struct cred *prepare_kernel_cred(struct task_struct *daemon)
const struct cred *old;
struct cred *new;
+ if (WARN_ON_ONCE(!daemon))
+ return NULL;
+
new = kmem_cache_alloc(cred_jar, GFP_KERNEL);
if (!new)
return NULL;
kdebug("prepare_kernel_cred() alloc %p", new);
- if (daemon)
- old = get_task_cred(daemon);
- else
- old = get_cred(&init_cred);
-
+ old = get_task_cred(daemon);
validate_creds(old);
*new = *old;
diff --git a/kernel/dma/mapping.c b/kernel/dma/mapping.c
index 33437d620644..68106e3791f6 100644
--- a/kernel/dma/mapping.c
+++ b/kernel/dma/mapping.c
@@ -498,6 +498,14 @@ void *dma_alloc_attrs(struct device *dev, size_t size, dma_addr_t *dma_handle,
WARN_ON_ONCE(!dev->coherent_dma_mask);
+ /*
+ * DMA allocations can never be turned back into a page pointer, so
+ * requesting compound pages doesn't make sense (and can't even be
+ * supported at all by various backends).
+ */
+ if (WARN_ON_ONCE(flag & __GFP_COMP))
+ return NULL;
+
if (dma_alloc_from_dev_coherent(dev, size, dma_handle, &cpu_addr))
return cpu_addr;
@@ -552,6 +560,8 @@ static struct page *__dma_alloc_pages(struct device *dev, size_t size,
return NULL;
if (WARN_ON_ONCE(gfp & (__GFP_DMA | __GFP_DMA32 | __GFP_HIGHMEM)))
return NULL;
+ if (WARN_ON_ONCE(gfp & __GFP_COMP))
+ return NULL;
size = PAGE_ALIGN(size);
if (dma_alloc_direct(dev, ops))
@@ -637,6 +647,8 @@ struct sg_table *dma_alloc_noncontiguous(struct device *dev, size_t size,
if (WARN_ON_ONCE(attrs & ~DMA_ATTR_ALLOC_SINGLE_PAGES))
return NULL;
+ if (WARN_ON_ONCE(gfp & __GFP_COMP))
+ return NULL;
if (ops && ops->alloc_noncontiguous)
sgt = ops->alloc_noncontiguous(dev, size, dir, gfp, attrs);
diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c
index 339a990554e7..a34c38bbe28f 100644
--- a/kernel/dma/swiotlb.c
+++ b/kernel/dma/swiotlb.c
@@ -300,6 +300,37 @@ static void swiotlb_init_io_tlb_mem(struct io_tlb_mem *mem, phys_addr_t start,
return;
}
+static void *swiotlb_memblock_alloc(unsigned long nslabs, unsigned int flags,
+ int (*remap)(void *tlb, unsigned long nslabs))
+{
+ size_t bytes = PAGE_ALIGN(nslabs << IO_TLB_SHIFT);
+ void *tlb;
+
+ /*
+ * By default allocate the bounce buffer memory from low memory, but
+ * allow to pick a location everywhere for hypervisors with guest
+ * memory encryption.
+ */
+ if (flags & SWIOTLB_ANY)
+ tlb = memblock_alloc(bytes, PAGE_SIZE);
+ else
+ tlb = memblock_alloc_low(bytes, PAGE_SIZE);
+
+ if (!tlb) {
+ pr_warn("%s: Failed to allocate %zu bytes tlb structure\n",
+ __func__, bytes);
+ return NULL;
+ }
+
+ if (remap && remap(tlb, nslabs) < 0) {
+ memblock_free(tlb, PAGE_ALIGN(bytes));
+ pr_warn("%s: Failed to remap %zu bytes\n", __func__, bytes);
+ return NULL;
+ }
+
+ return tlb;
+}
+
/*
* Statically reserve bounce buffer space and initialize bounce buffer data
* structures for the software IO TLB used to implement the DMA API.
@@ -310,7 +341,6 @@ void __init swiotlb_init_remap(bool addressing_limit, unsigned int flags,
struct io_tlb_mem *mem = &io_tlb_default_mem;
unsigned long nslabs;
size_t alloc_size;
- size_t bytes;
void *tlb;
if (!addressing_limit && !swiotlb_force_bounce)
@@ -326,31 +356,16 @@ void __init swiotlb_init_remap(bool addressing_limit, unsigned int flags,
swiotlb_adjust_nareas(num_possible_cpus());
nslabs = default_nslabs;
- /*
- * By default allocate the bounce buffer memory from low memory, but
- * allow to pick a location everywhere for hypervisors with guest
- * memory encryption.
- */
-retry:
- bytes = PAGE_ALIGN(nslabs << IO_TLB_SHIFT);
- if (flags & SWIOTLB_ANY)
- tlb = memblock_alloc(bytes, PAGE_SIZE);
- else
- tlb = memblock_alloc_low(bytes, PAGE_SIZE);
- if (!tlb) {
- pr_warn("%s: failed to allocate tlb structure\n", __func__);
- return;
- }
-
- if (remap && remap(tlb, nslabs) < 0) {
- memblock_free(tlb, PAGE_ALIGN(bytes));
-
+ while ((tlb = swiotlb_memblock_alloc(nslabs, flags, remap)) == NULL) {
+ if (nslabs <= IO_TLB_MIN_SLABS)
+ return;
nslabs = ALIGN(nslabs >> 1, IO_TLB_SEGSIZE);
- if (nslabs >= IO_TLB_MIN_SLABS)
- goto retry;
+ }
- pr_warn("%s: Failed to remap %zu bytes\n", __func__, bytes);
- return;
+ if (default_nslabs != nslabs) {
+ pr_info("SWIOTLB bounce buffer size adjusted %lu -> %lu slabs",
+ default_nslabs, nslabs);
+ default_nslabs = nslabs;
}
alloc_size = PAGE_ALIGN(array_size(sizeof(*mem->slots), nslabs));
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 7f04f995c975..7099c77bc53b 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -155,12 +155,6 @@ static int cpu_function_call(int cpu, remote_function_f func, void *info)
return data.ret;
}
-static inline struct perf_cpu_context *
-__get_cpu_context(struct perf_event_context *ctx)
-{
- return this_cpu_ptr(ctx->pmu->pmu_cpu_context);
-}
-
static void perf_ctx_lock(struct perf_cpu_context *cpuctx,
struct perf_event_context *ctx)
{
@@ -184,6 +178,14 @@ static bool is_kernel_event(struct perf_event *event)
return READ_ONCE(event->owner) == TASK_TOMBSTONE;
}
+static DEFINE_PER_CPU(struct perf_cpu_context, perf_cpu_context);
+
+struct perf_event_context *perf_cpu_task_ctx(void)
+{
+ lockdep_assert_irqs_disabled();
+ return this_cpu_ptr(&perf_cpu_context)->task_ctx;
+}
+
/*
* On task ctx scheduling...
*
@@ -217,7 +219,7 @@ static int event_function(void *info)
struct event_function_struct *efs = info;
struct perf_event *event = efs->event;
struct perf_event_context *ctx = event->ctx;
- struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
+ struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
struct perf_event_context *task_ctx = cpuctx->task_ctx;
int ret = 0;
@@ -314,7 +316,7 @@ again:
static void event_function_local(struct perf_event *event, event_f func, void *data)
{
struct perf_event_context *ctx = event->ctx;
- struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
+ struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
struct task_struct *task = READ_ONCE(ctx->task);
struct perf_event_context *task_ctx = NULL;
@@ -378,7 +380,6 @@ enum event_type_t {
/*
* perf_sched_events : >0 events exist
- * perf_cgroup_events: >0 per-cpu cgroup events exist on this cpu
*/
static void perf_sched_delayed(struct work_struct *work);
@@ -387,8 +388,6 @@ static DECLARE_DELAYED_WORK(perf_sched_work, perf_sched_delayed);
static DEFINE_MUTEX(perf_sched_mutex);
static atomic_t perf_sched_count;
-static DEFINE_PER_CPU(atomic_t, perf_cgroup_events);
-static DEFINE_PER_CPU(int, perf_sched_cb_usages);
static DEFINE_PER_CPU(struct pmu_event_list, pmu_sb_events);
static atomic_t nr_mmap_events __read_mostly;
@@ -448,7 +447,7 @@ static void update_perf_cpu_limits(void)
WRITE_ONCE(perf_sample_allowed_ns, tmp);
}
-static bool perf_rotate_context(struct perf_cpu_context *cpuctx);
+static bool perf_rotate_context(struct perf_cpu_pmu_context *cpc);
int perf_proc_update_handler(struct ctl_table *table, int write,
void *buffer, size_t *lenp, loff_t *ppos)
@@ -571,12 +570,6 @@ void perf_sample_event_took(u64 sample_len_ns)
static atomic64_t perf_event_id;
-static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx,
- enum event_type_t event_type);
-
-static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx,
- enum event_type_t event_type);
-
static void update_context_time(struct perf_event_context *ctx);
static u64 perf_event_time(struct perf_event *event);
@@ -691,13 +684,31 @@ do { \
___p; \
})
+static void perf_ctx_disable(struct perf_event_context *ctx)
+{
+ struct perf_event_pmu_context *pmu_ctx;
+
+ list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry)
+ perf_pmu_disable(pmu_ctx->pmu);
+}
+
+static void perf_ctx_enable(struct perf_event_context *ctx)
+{
+ struct perf_event_pmu_context *pmu_ctx;
+
+ list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry)
+ perf_pmu_enable(pmu_ctx->pmu);
+}
+
+static void ctx_sched_out(struct perf_event_context *ctx, enum event_type_t event_type);
+static void ctx_sched_in(struct perf_event_context *ctx, enum event_type_t event_type);
+
#ifdef CONFIG_CGROUP_PERF
static inline bool
perf_cgroup_match(struct perf_event *event)
{
- struct perf_event_context *ctx = event->ctx;
- struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
+ struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
/* @event doesn't care about cgroup */
if (!event->cgrp)
@@ -823,54 +834,46 @@ perf_cgroup_set_timestamp(struct perf_cpu_context *cpuctx)
}
}
-static DEFINE_PER_CPU(struct list_head, cgrp_cpuctx_list);
-
/*
* reschedule events based on the cgroup constraint of task.
*/
static void perf_cgroup_switch(struct task_struct *task)
{
+ struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
struct perf_cgroup *cgrp;
- struct perf_cpu_context *cpuctx, *tmp;
- struct list_head *list;
- unsigned long flags;
/*
- * Disable interrupts and preemption to avoid this CPU's
- * cgrp_cpuctx_entry to change under us.
+ * cpuctx->cgrp is set when the first cgroup event enabled,
+ * and is cleared when the last cgroup event disabled.
*/
- local_irq_save(flags);
+ if (READ_ONCE(cpuctx->cgrp) == NULL)
+ return;
- cgrp = perf_cgroup_from_task(task, NULL);
+ WARN_ON_ONCE(cpuctx->ctx.nr_cgroups == 0);
- list = this_cpu_ptr(&cgrp_cpuctx_list);
- list_for_each_entry_safe(cpuctx, tmp, list, cgrp_cpuctx_entry) {
- WARN_ON_ONCE(cpuctx->ctx.nr_cgroups == 0);
- if (READ_ONCE(cpuctx->cgrp) == cgrp)
- continue;
-
- perf_ctx_lock(cpuctx, cpuctx->task_ctx);
- perf_pmu_disable(cpuctx->ctx.pmu);
+ cgrp = perf_cgroup_from_task(task, NULL);
+ if (READ_ONCE(cpuctx->cgrp) == cgrp)
+ return;
- cpu_ctx_sched_out(cpuctx, EVENT_ALL);
- /*
- * must not be done before ctxswout due
- * to update_cgrp_time_from_cpuctx() in
- * ctx_sched_out()
- */
- cpuctx->cgrp = cgrp;
- /*
- * set cgrp before ctxsw in to allow
- * perf_cgroup_set_timestamp() in ctx_sched_in()
- * to not have to pass task around
- */
- cpu_ctx_sched_in(cpuctx, EVENT_ALL);
+ perf_ctx_lock(cpuctx, cpuctx->task_ctx);
+ perf_ctx_disable(&cpuctx->ctx);
- perf_pmu_enable(cpuctx->ctx.pmu);
- perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
- }
+ ctx_sched_out(&cpuctx->ctx, EVENT_ALL);
+ /*
+ * must not be done before ctxswout due
+ * to update_cgrp_time_from_cpuctx() in
+ * ctx_sched_out()
+ */
+ cpuctx->cgrp = cgrp;
+ /*
+ * set cgrp before ctxsw in to allow
+ * perf_cgroup_set_timestamp() in ctx_sched_in()
+ * to not have to pass task around
+ */
+ ctx_sched_in(&cpuctx->ctx, EVENT_ALL);
- local_irq_restore(flags);
+ perf_ctx_enable(&cpuctx->ctx);
+ perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
}
static int perf_cgroup_ensure_storage(struct perf_event *event,
@@ -888,7 +891,7 @@ static int perf_cgroup_ensure_storage(struct perf_event *event,
heap_size++;
for_each_possible_cpu(cpu) {
- cpuctx = per_cpu_ptr(event->pmu->pmu_cpu_context, cpu);
+ cpuctx = per_cpu_ptr(&perf_cpu_context, cpu);
if (heap_size <= cpuctx->heap_size)
continue;
@@ -972,8 +975,6 @@ perf_cgroup_event_enable(struct perf_event *event, struct perf_event_context *ct
return;
cpuctx->cgrp = perf_cgroup_from_task(current, ctx);
- list_add(&cpuctx->cgrp_cpuctx_entry,
- per_cpu_ptr(&cgrp_cpuctx_list, event->cpu));
}
static inline void
@@ -994,7 +995,6 @@ perf_cgroup_event_disable(struct perf_event *event, struct perf_event_context *c
return;
cpuctx->cgrp = NULL;
- list_del(&cpuctx->cgrp_cpuctx_entry);
}
#else /* !CONFIG_CGROUP_PERF */
@@ -1069,34 +1069,30 @@ static void perf_cgroup_switch(struct task_struct *task)
*/
static enum hrtimer_restart perf_mux_hrtimer_handler(struct hrtimer *hr)
{
- struct perf_cpu_context *cpuctx;
+ struct perf_cpu_pmu_context *cpc;
bool rotations;
lockdep_assert_irqs_disabled();
- cpuctx = container_of(hr, struct perf_cpu_context, hrtimer);
- rotations = perf_rotate_context(cpuctx);
+ cpc = container_of(hr, struct perf_cpu_pmu_context, hrtimer);
+ rotations = perf_rotate_context(cpc);
- raw_spin_lock(&cpuctx->hrtimer_lock);
+ raw_spin_lock(&cpc->hrtimer_lock);
if (rotations)
- hrtimer_forward_now(hr, cpuctx->hrtimer_interval);
+ hrtimer_forward_now(hr, cpc->hrtimer_interval);
else
- cpuctx->hrtimer_active = 0;
- raw_spin_unlock(&cpuctx->hrtimer_lock);
+ cpc->hrtimer_active = 0;
+ raw_spin_unlock(&cpc->hrtimer_lock);
return rotations ? HRTIMER_RESTART : HRTIMER_NORESTART;
}
-static void __perf_mux_hrtimer_init(struct perf_cpu_context *cpuctx, int cpu)
+static void __perf_mux_hrtimer_init(struct perf_cpu_pmu_context *cpc, int cpu)
{
- struct hrtimer *timer = &cpuctx->hrtimer;
- struct pmu *pmu = cpuctx->ctx.pmu;
+ struct hrtimer *timer = &cpc->hrtimer;
+ struct pmu *pmu = cpc->epc.pmu;
u64 interval;
- /* no multiplexing needed for SW PMU */
- if (pmu->task_ctx_nr == perf_sw_context)
- return;
-
/*
* check default is sane, if not set then force to
* default interval (1/tick)
@@ -1105,34 +1101,34 @@ static void __perf_mux_hrtimer_init(struct perf_cpu_context *cpuctx, int cpu)
if (interval < 1)
interval = pmu->hrtimer_interval_ms = PERF_CPU_HRTIMER;
- cpuctx->hrtimer_interval = ns_to_ktime(NSEC_PER_MSEC * interval);
+ cpc->hrtimer_interval = ns_to_ktime(NSEC_PER_MSEC * interval);
- raw_spin_lock_init(&cpuctx->hrtimer_lock);
+ raw_spin_lock_init(&cpc->hrtimer_lock);
hrtimer_init(timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED_HARD);
timer->function = perf_mux_hrtimer_handler;
}
-static int perf_mux_hrtimer_restart(struct perf_cpu_context *cpuctx)
+static int perf_mux_hrtimer_restart(struct perf_cpu_pmu_context *cpc)
{
- struct hrtimer *timer = &cpuctx->hrtimer;
- struct pmu *pmu = cpuctx->ctx.pmu;
+ struct hrtimer *timer = &cpc->hrtimer;
unsigned long flags;
- /* not for SW PMU */
- if (pmu->task_ctx_nr == perf_sw_context)
- return 0;
-
- raw_spin_lock_irqsave(&cpuctx->hrtimer_lock, flags);
- if (!cpuctx->hrtimer_active) {
- cpuctx->hrtimer_active = 1;
- hrtimer_forward_now(timer, cpuctx->hrtimer_interval);
+ raw_spin_lock_irqsave(&cpc->hrtimer_lock, flags);
+ if (!cpc->hrtimer_active) {
+ cpc->hrtimer_active = 1;
+ hrtimer_forward_now(timer, cpc->hrtimer_interval);
hrtimer_start_expires(timer, HRTIMER_MODE_ABS_PINNED_HARD);
}
- raw_spin_unlock_irqrestore(&cpuctx->hrtimer_lock, flags);
+ raw_spin_unlock_irqrestore(&cpc->hrtimer_lock, flags);
return 0;
}
+static int perf_mux_hrtimer_restart_ipi(void *arg)
+{
+ return perf_mux_hrtimer_restart(arg);
+}
+
void perf_pmu_disable(struct pmu *pmu)
{
int *count = this_cpu_ptr(pmu->pmu_disable_count);
@@ -1147,32 +1143,9 @@ void perf_pmu_enable(struct pmu *pmu)
pmu->pmu_enable(pmu);
}
-static DEFINE_PER_CPU(struct list_head, active_ctx_list);
-
-/*
- * perf_event_ctx_activate(), perf_event_ctx_deactivate(), and
- * perf_event_task_tick() are fully serialized because they're strictly cpu
- * affine and perf_event_ctx{activate,deactivate} are called with IRQs
- * disabled, while perf_event_task_tick is called from IRQ context.
- */
-static void perf_event_ctx_activate(struct perf_event_context *ctx)
+static void perf_assert_pmu_disabled(struct pmu *pmu)
{
- struct list_head *head = this_cpu_ptr(&active_ctx_list);
-
- lockdep_assert_irqs_disabled();
-
- WARN_ON(!list_empty(&ctx->active_ctx_list));
-
- list_add(&ctx->active_ctx_list, head);
-}
-
-static void perf_event_ctx_deactivate(struct perf_event_context *ctx)
-{
- lockdep_assert_irqs_disabled();
-
- WARN_ON(list_empty(&ctx->active_ctx_list));
-
- list_del_init(&ctx->active_ctx_list);
+ WARN_ON_ONCE(*this_cpu_ptr(pmu->pmu_disable_count) == 0);
}
static void get_ctx(struct perf_event_context *ctx)
@@ -1199,7 +1172,6 @@ static void free_ctx(struct rcu_head *head)
struct perf_event_context *ctx;
ctx = container_of(head, struct perf_event_context, rcu_head);
- free_task_ctx_data(ctx->pmu, ctx->task_ctx_data);
kfree(ctx);
}
@@ -1384,7 +1356,7 @@ static u64 primary_event_id(struct perf_event *event)
* the context could get moved to another task.
*/
static struct perf_event_context *
-perf_lock_task_context(struct task_struct *task, int ctxn, unsigned long *flags)
+perf_lock_task_context(struct task_struct *task, unsigned long *flags)
{
struct perf_event_context *ctx;
@@ -1400,7 +1372,7 @@ retry:
*/
local_irq_save(*flags);
rcu_read_lock();
- ctx = rcu_dereference(task->perf_event_ctxp[ctxn]);
+ ctx = rcu_dereference(task->perf_event_ctxp);
if (ctx) {
/*
* If this context is a clone of another, it might
@@ -1413,7 +1385,7 @@ retry:
* can't get swapped on us any more.
*/
raw_spin_lock(&ctx->lock);
- if (ctx != rcu_dereference(task->perf_event_ctxp[ctxn])) {
+ if (ctx != rcu_dereference(task->perf_event_ctxp)) {
raw_spin_unlock(&ctx->lock);
rcu_read_unlock();
local_irq_restore(*flags);
@@ -1440,12 +1412,12 @@ retry:
* reference count so that the context can't get freed.
*/
static struct perf_event_context *
-perf_pin_task_context(struct task_struct *task, int ctxn)
+perf_pin_task_context(struct task_struct *task)
{
struct perf_event_context *ctx;
unsigned long flags;
- ctx = perf_lock_task_context(task, ctxn, &flags);
+ ctx = perf_lock_task_context(task, &flags);
if (ctx) {
++ctx->pin_count;
raw_spin_unlock_irqrestore(&ctx->lock, flags);
@@ -1593,14 +1565,22 @@ static inline struct cgroup *event_cgroup(const struct perf_event *event)
* which provides ordering when rotating groups for the same CPU.
*/
static __always_inline int
-perf_event_groups_cmp(const int left_cpu, const struct cgroup *left_cgroup,
- const u64 left_group_index, const struct perf_event *right)
+perf_event_groups_cmp(const int left_cpu, const struct pmu *left_pmu,
+ const struct cgroup *left_cgroup, const u64 left_group_index,
+ const struct perf_event *right)
{
if (left_cpu < right->cpu)
return -1;
if (left_cpu > right->cpu)
return 1;
+ if (left_pmu) {
+ if (left_pmu < right->pmu_ctx->pmu)
+ return -1;
+ if (left_pmu > right->pmu_ctx->pmu)
+ return 1;
+ }
+
#ifdef CONFIG_CGROUP_PERF
{
const struct cgroup *right_cgroup = event_cgroup(right);
@@ -1643,12 +1623,13 @@ perf_event_groups_cmp(const int left_cpu, const struct cgroup *left_cgroup,
static inline bool __group_less(struct rb_node *a, const struct rb_node *b)
{
struct perf_event *e = __node_2_pe(a);
- return perf_event_groups_cmp(e->cpu, event_cgroup(e), e->group_index,
- __node_2_pe(b)) < 0;
+ return perf_event_groups_cmp(e->cpu, e->pmu_ctx->pmu, event_cgroup(e),
+ e->group_index, __node_2_pe(b)) < 0;
}
struct __group_key {
int cpu;
+ struct pmu *pmu;
struct cgroup *cgroup;
};
@@ -1657,14 +1638,25 @@ static inline int __group_cmp(const void *key, const struct rb_node *node)
const struct __group_key *a = key;
const struct perf_event *b = __node_2_pe(node);
- /* partial/subtree match: @cpu, @cgroup; ignore: @group_index */
- return perf_event_groups_cmp(a->cpu, a->cgroup, b->group_index, b);
+ /* partial/subtree match: @cpu, @pmu, @cgroup; ignore: @group_index */
+ return perf_event_groups_cmp(a->cpu, a->pmu, a->cgroup, b->group_index, b);
+}
+
+static inline int
+__group_cmp_ignore_cgroup(const void *key, const struct rb_node *node)
+{
+ const struct __group_key *a = key;
+ const struct perf_event *b = __node_2_pe(node);
+
+ /* partial/subtree match: @cpu, @pmu; ignore: @cgroup, @group_index */
+ return perf_event_groups_cmp(a->cpu, a->pmu, event_cgroup(b),
+ b->group_index, b);
}
/*
- * Insert @event into @groups' tree; using {@event->cpu, ++@groups->index} for
- * key (see perf_event_groups_less). This places it last inside the CPU
- * subtree.
+ * Insert @event into @groups' tree; using
+ * {@event->cpu, @event->pmu_ctx->pmu, event_cgroup(@event), ++@groups->index}
+ * as key. This places it last inside the {cpu,pmu,cgroup} subtree.
*/
static void
perf_event_groups_insert(struct perf_event_groups *groups,
@@ -1714,14 +1706,15 @@ del_event_from_groups(struct perf_event *event, struct perf_event_context *ctx)
}
/*
- * Get the leftmost event in the cpu/cgroup subtree.
+ * Get the leftmost event in the {cpu,pmu,cgroup} subtree.
*/
static struct perf_event *
perf_event_groups_first(struct perf_event_groups *groups, int cpu,
- struct cgroup *cgrp)
+ struct pmu *pmu, struct cgroup *cgrp)
{
struct __group_key key = {
.cpu = cpu,
+ .pmu = pmu,
.cgroup = cgrp,
};
struct rb_node *node;
@@ -1733,14 +1726,12 @@ perf_event_groups_first(struct perf_event_groups *groups, int cpu,
return NULL;
}
-/*
- * Like rb_entry_next_safe() for the @cpu subtree.
- */
static struct perf_event *
-perf_event_groups_next(struct perf_event *event)
+perf_event_groups_next(struct perf_event *event, struct pmu *pmu)
{
struct __group_key key = {
.cpu = event->cpu,
+ .pmu = pmu,
.cgroup = event_cgroup(event),
};
struct rb_node *next;
@@ -1752,6 +1743,10 @@ perf_event_groups_next(struct perf_event *event)
return NULL;
}
+#define perf_event_groups_for_cpu_pmu(event, groups, cpu, pmu) \
+ for (event = perf_event_groups_first(groups, cpu, pmu, NULL); \
+ event; event = perf_event_groups_next(event, pmu))
+
/*
* Iterate through the whole groups tree.
*/
@@ -1796,6 +1791,7 @@ list_add_event(struct perf_event *event, struct perf_event_context *ctx)
perf_cgroup_event_enable(event, ctx);
ctx->generation++;
+ event->pmu_ctx->nr_events++;
}
/*
@@ -1941,7 +1937,8 @@ static void perf_group_attach(struct perf_event *event)
lockdep_assert_held(&event->ctx->lock);
/*
- * We can have double attach due to group movement in perf_event_open.
+ * We can have double attach due to group movement (move_group) in
+ * perf_event_open().
*/
if (event->attach_state & PERF_ATTACH_GROUP)
return;
@@ -2006,6 +2003,7 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx)
}
ctx->generation++;
+ event->pmu_ctx->nr_events--;
}
static int
@@ -2022,13 +2020,11 @@ perf_aux_output_match(struct perf_event *event, struct perf_event *aux_event)
static void put_event(struct perf_event *event);
static void event_sched_out(struct perf_event *event,
- struct perf_cpu_context *cpuctx,
struct perf_event_context *ctx);
static void perf_put_aux_event(struct perf_event *event)
{
struct perf_event_context *ctx = event->ctx;
- struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
struct perf_event *iter;
/*
@@ -2057,7 +2053,7 @@ static void perf_put_aux_event(struct perf_event *event)
* state so that we don't try to schedule it again. Note
* that perf_event_enable() will clear the ERROR status.
*/
- event_sched_out(iter, cpuctx, ctx);
+ event_sched_out(iter, ctx);
perf_event_set_state(event, PERF_EVENT_STATE_ERROR);
}
}
@@ -2108,8 +2104,8 @@ static int perf_get_aux_event(struct perf_event *event,
static inline struct list_head *get_event_list(struct perf_event *event)
{
- struct perf_event_context *ctx = event->ctx;
- return event->attr.pinned ? &ctx->pinned_active : &ctx->flexible_active;
+ return event->attr.pinned ? &event->pmu_ctx->pinned_active :
+ &event->pmu_ctx->flexible_active;
}
/*
@@ -2120,10 +2116,7 @@ static inline struct list_head *get_event_list(struct perf_event *event)
*/
static inline void perf_remove_sibling_event(struct perf_event *event)
{
- struct perf_event_context *ctx = event->ctx;
- struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
-
- event_sched_out(event, cpuctx, ctx);
+ event_sched_out(event, event->ctx);
perf_event_set_state(event, PERF_EVENT_STATE_ERROR);
}
@@ -2212,53 +2205,22 @@ static bool is_orphaned_event(struct perf_event *event)
return event->state == PERF_EVENT_STATE_DEAD;
}
-static inline int __pmu_filter_match(struct perf_event *event)
-{
- struct pmu *pmu = event->pmu;
- return pmu->filter_match ? pmu->filter_match(event) : 1;
-}
-
-/*
- * Check whether we should attempt to schedule an event group based on
- * PMU-specific filtering. An event group can consist of HW and SW events,
- * potentially with a SW leader, so we must check all the filters, to
- * determine whether a group is schedulable:
- */
-static inline int pmu_filter_match(struct perf_event *event)
-{
- struct perf_event *sibling;
- unsigned long flags;
- int ret = 1;
-
- if (!__pmu_filter_match(event))
- return 0;
-
- local_irq_save(flags);
- for_each_sibling_event(sibling, event) {
- if (!__pmu_filter_match(sibling)) {
- ret = 0;
- break;
- }
- }
- local_irq_restore(flags);
-
- return ret;
-}
-
static inline int
event_filter_match(struct perf_event *event)
{
return (event->cpu == -1 || event->cpu == smp_processor_id()) &&
- perf_cgroup_match(event) && pmu_filter_match(event);
+ perf_cgroup_match(event);
}
static void
-event_sched_out(struct perf_event *event,
- struct perf_cpu_context *cpuctx,
- struct perf_event_context *ctx)
+event_sched_out(struct perf_event *event, struct perf_event_context *ctx)
{
+ struct perf_event_pmu_context *epc = event->pmu_ctx;
+ struct perf_cpu_pmu_context *cpc = this_cpu_ptr(epc->pmu->cpu_pmu_context);
enum perf_event_state state = PERF_EVENT_STATE_INACTIVE;
+ // XXX cpc serialization, probably per-cpu IRQ disabled
+
WARN_ON_ONCE(event->ctx != ctx);
lockdep_assert_held(&ctx->lock);
@@ -2301,38 +2263,32 @@ event_sched_out(struct perf_event *event,
perf_event_set_state(event, state);
if (!is_software_event(event))
- cpuctx->active_oncpu--;
- if (!--ctx->nr_active)
- perf_event_ctx_deactivate(ctx);
+ cpc->active_oncpu--;
if (event->attr.freq && event->attr.sample_freq)
ctx->nr_freq--;
- if (event->attr.exclusive || !cpuctx->active_oncpu)
- cpuctx->exclusive = 0;
+ if (event->attr.exclusive || !cpc->active_oncpu)
+ cpc->exclusive = 0;
perf_pmu_enable(event->pmu);
}
static void
-group_sched_out(struct perf_event *group_event,
- struct perf_cpu_context *cpuctx,
- struct perf_event_context *ctx)
+group_sched_out(struct perf_event *group_event, struct perf_event_context *ctx)
{
struct perf_event *event;
if (group_event->state != PERF_EVENT_STATE_ACTIVE)
return;
- perf_pmu_disable(ctx->pmu);
+ perf_assert_pmu_disabled(group_event->pmu_ctx->pmu);
- event_sched_out(group_event, cpuctx, ctx);
+ event_sched_out(group_event, ctx);
/*
* Schedule out siblings (if any):
*/
for_each_sibling_event(event, group_event)
- event_sched_out(event, cpuctx, ctx);
-
- perf_pmu_enable(ctx->pmu);
+ event_sched_out(event, ctx);
}
#define DETACH_GROUP 0x01UL
@@ -2351,6 +2307,7 @@ __perf_remove_from_context(struct perf_event *event,
struct perf_event_context *ctx,
void *info)
{
+ struct perf_event_pmu_context *pmu_ctx = event->pmu_ctx;
unsigned long flags = (unsigned long)info;
if (ctx->is_active & EVENT_TIME) {
@@ -2364,7 +2321,7 @@ __perf_remove_from_context(struct perf_event *event,
*/
if (flags & DETACH_DEAD)
event->pending_disable = 1;
- event_sched_out(event, cpuctx, ctx);
+ event_sched_out(event, ctx);
if (flags & DETACH_GROUP)
perf_group_detach(event);
if (flags & DETACH_CHILD)
@@ -2373,12 +2330,23 @@ __perf_remove_from_context(struct perf_event *event,
if (flags & DETACH_DEAD)
event->state = PERF_EVENT_STATE_DEAD;
+ if (!pmu_ctx->nr_events) {
+ pmu_ctx->rotate_necessary = 0;
+
+ if (ctx->task && ctx->is_active) {
+ struct perf_cpu_pmu_context *cpc;
+
+ cpc = this_cpu_ptr(pmu_ctx->pmu->cpu_pmu_context);
+ WARN_ON_ONCE(cpc->task_epc && cpc->task_epc != pmu_ctx);
+ cpc->task_epc = NULL;
+ }
+ }
+
if (!ctx->nr_events && ctx->is_active) {
if (ctx == &cpuctx->ctx)
update_cgrp_time_from_cpuctx(cpuctx, true);
ctx->is_active = 0;
- ctx->rotate_necessary = 0;
if (ctx->task) {
WARN_ON_ONCE(cpuctx->task_ctx != ctx);
cpuctx->task_ctx = NULL;
@@ -2408,12 +2376,8 @@ static void perf_remove_from_context(struct perf_event *event, unsigned long fla
* event_function_call() user.
*/
raw_spin_lock_irq(&ctx->lock);
- /*
- * Cgroup events are per-cpu events, and must IPI because of
- * cgrp_cpuctx_list.
- */
- if (!ctx->is_active && !is_cgroup_event(event)) {
- __perf_remove_from_context(event, __get_cpu_context(ctx),
+ if (!ctx->is_active) {
+ __perf_remove_from_context(event, this_cpu_ptr(&perf_cpu_context),
ctx, (void *)flags);
raw_spin_unlock_irq(&ctx->lock);
return;
@@ -2439,13 +2403,17 @@ static void __perf_event_disable(struct perf_event *event,
update_cgrp_time_from_event(event);
}
+ perf_pmu_disable(event->pmu_ctx->pmu);
+
if (event == event->group_leader)
- group_sched_out(event, cpuctx, ctx);
+ group_sched_out(event, ctx);
else
- event_sched_out(event, cpuctx, ctx);
+ event_sched_out(event, ctx);
perf_event_set_state(event, PERF_EVENT_STATE_OFF);
perf_cgroup_event_disable(event, ctx);
+
+ perf_pmu_enable(event->pmu_ctx->pmu);
}
/*
@@ -2507,10 +2475,10 @@ static void perf_log_throttle(struct perf_event *event, int enable);
static void perf_log_itrace_start(struct perf_event *event);
static int
-event_sched_in(struct perf_event *event,
- struct perf_cpu_context *cpuctx,
- struct perf_event_context *ctx)
+event_sched_in(struct perf_event *event, struct perf_event_context *ctx)
{
+ struct perf_event_pmu_context *epc = event->pmu_ctx;
+ struct perf_cpu_pmu_context *cpc = this_cpu_ptr(epc->pmu->cpu_pmu_context);
int ret = 0;
WARN_ON_ONCE(event->ctx != ctx);
@@ -2551,14 +2519,12 @@ event_sched_in(struct perf_event *event,
}
if (!is_software_event(event))
- cpuctx->active_oncpu++;
- if (!ctx->nr_active++)
- perf_event_ctx_activate(ctx);
+ cpc->active_oncpu++;
if (event->attr.freq && event->attr.sample_freq)
ctx->nr_freq++;
if (event->attr.exclusive)
- cpuctx->exclusive = 1;
+ cpc->exclusive = 1;
out:
perf_pmu_enable(event->pmu);
@@ -2567,26 +2533,24 @@ out:
}
static int
-group_sched_in(struct perf_event *group_event,
- struct perf_cpu_context *cpuctx,
- struct perf_event_context *ctx)
+group_sched_in(struct perf_event *group_event, struct perf_event_context *ctx)
{
struct perf_event *event, *partial_group = NULL;
- struct pmu *pmu = ctx->pmu;
+ struct pmu *pmu = group_event->pmu_ctx->pmu;
if (group_event->state == PERF_EVENT_STATE_OFF)
return 0;
pmu->start_txn(pmu, PERF_PMU_TXN_ADD);
- if (event_sched_in(group_event, cpuctx, ctx))
+ if (event_sched_in(group_event, ctx))
goto error;
/*
* Schedule in siblings as one group (if any):
*/
for_each_sibling_event(event, group_event) {
- if (event_sched_in(event, cpuctx, ctx)) {
+ if (event_sched_in(event, ctx)) {
partial_group = event;
goto group_error;
}
@@ -2605,9 +2569,9 @@ group_error:
if (event == partial_group)
break;
- event_sched_out(event, cpuctx, ctx);
+ event_sched_out(event, ctx);
}
- event_sched_out(group_event, cpuctx, ctx);
+ event_sched_out(group_event, ctx);
error:
pmu->cancel_txn(pmu);
@@ -2617,10 +2581,11 @@ error:
/*
* Work out whether we can put this event group on the CPU now.
*/
-static int group_can_go_on(struct perf_event *event,
- struct perf_cpu_context *cpuctx,
- int can_add_hw)
+static int group_can_go_on(struct perf_event *event, int can_add_hw)
{
+ struct perf_event_pmu_context *epc = event->pmu_ctx;
+ struct perf_cpu_pmu_context *cpc = this_cpu_ptr(epc->pmu->cpu_pmu_context);
+
/*
* Groups consisting entirely of software events can always go on.
*/
@@ -2630,7 +2595,7 @@ static int group_can_go_on(struct perf_event *event,
* If an exclusive group is already on, no other hardware
* events can go on.
*/
- if (cpuctx->exclusive)
+ if (cpc->exclusive)
return 0;
/*
* If this group is exclusive and there are already
@@ -2652,36 +2617,29 @@ static void add_event_to_ctx(struct perf_event *event,
perf_group_attach(event);
}
-static void ctx_sched_out(struct perf_event_context *ctx,
- struct perf_cpu_context *cpuctx,
- enum event_type_t event_type);
-static void
-ctx_sched_in(struct perf_event_context *ctx,
- struct perf_cpu_context *cpuctx,
- enum event_type_t event_type);
-
-static void task_ctx_sched_out(struct perf_cpu_context *cpuctx,
- struct perf_event_context *ctx,
- enum event_type_t event_type)
+static void task_ctx_sched_out(struct perf_event_context *ctx,
+ enum event_type_t event_type)
{
+ struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
+
if (!cpuctx->task_ctx)
return;
if (WARN_ON_ONCE(ctx != cpuctx->task_ctx))
return;
- ctx_sched_out(ctx, cpuctx, event_type);
+ ctx_sched_out(ctx, event_type);
}
static void perf_event_sched_in(struct perf_cpu_context *cpuctx,
struct perf_event_context *ctx)
{
- cpu_ctx_sched_in(cpuctx, EVENT_PINNED);
+ ctx_sched_in(&cpuctx->ctx, EVENT_PINNED);
if (ctx)
- ctx_sched_in(ctx, cpuctx, EVENT_PINNED);
- cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE);
+ ctx_sched_in(ctx, EVENT_PINNED);
+ ctx_sched_in(&cpuctx->ctx, EVENT_FLEXIBLE);
if (ctx)
- ctx_sched_in(ctx, cpuctx, EVENT_FLEXIBLE);
+ ctx_sched_in(ctx, EVENT_FLEXIBLE);
}
/*
@@ -2699,11 +2657,15 @@ static void perf_event_sched_in(struct perf_cpu_context *cpuctx,
* event_type is a bit mask of the types of events involved. For CPU events,
* event_type is only either EVENT_PINNED or EVENT_FLEXIBLE.
*/
+/*
+ * XXX: ctx_resched() reschedules the entire perf_event_context when adding a
+ * new event to the context or enabling an existing event in it. We can
+ * probably optimize this by rescheduling only the affected pmu_ctx.
+ */
static void ctx_resched(struct perf_cpu_context *cpuctx,
struct perf_event_context *task_ctx,
enum event_type_t event_type)
{
- enum event_type_t ctx_event_type;
bool cpu_event = !!(event_type & EVENT_CPU);
/*
@@ -2713,11 +2675,13 @@ static void ctx_resched(struct perf_cpu_context *cpuctx,
if (event_type & EVENT_PINNED)
event_type |= EVENT_FLEXIBLE;
- ctx_event_type = event_type & EVENT_ALL;
+ event_type &= EVENT_ALL;
- perf_pmu_disable(cpuctx->ctx.pmu);
- if (task_ctx)
- task_ctx_sched_out(cpuctx, task_ctx, event_type);
+ perf_ctx_disable(&cpuctx->ctx);
+ if (task_ctx) {
+ perf_ctx_disable(task_ctx);
+ task_ctx_sched_out(task_ctx, event_type);
+ }
/*
* Decide which cpu ctx groups to schedule out based on the types
@@ -2727,17 +2691,20 @@ static void ctx_resched(struct perf_cpu_context *cpuctx,
* - otherwise, do nothing more.
*/
if (cpu_event)
- cpu_ctx_sched_out(cpuctx, ctx_event_type);
- else if (ctx_event_type & EVENT_PINNED)
- cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
+ ctx_sched_out(&cpuctx->ctx, event_type);
+ else if (event_type & EVENT_PINNED)
+ ctx_sched_out(&cpuctx->ctx, EVENT_FLEXIBLE);
perf_event_sched_in(cpuctx, task_ctx);
- perf_pmu_enable(cpuctx->ctx.pmu);
+
+ perf_ctx_enable(&cpuctx->ctx);
+ if (task_ctx)
+ perf_ctx_enable(task_ctx);
}
void perf_pmu_resched(struct pmu *pmu)
{
- struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
+ struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
struct perf_event_context *task_ctx = cpuctx->task_ctx;
perf_ctx_lock(cpuctx, task_ctx);
@@ -2755,7 +2722,7 @@ static int __perf_install_in_context(void *info)
{
struct perf_event *event = info;
struct perf_event_context *ctx = event->ctx;
- struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
+ struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
struct perf_event_context *task_ctx = cpuctx->task_ctx;
bool reprogram = true;
int ret = 0;
@@ -2797,7 +2764,7 @@ static int __perf_install_in_context(void *info)
#endif
if (reprogram) {
- ctx_sched_out(ctx, cpuctx, EVENT_TIME);
+ ctx_sched_out(ctx, EVENT_TIME);
add_event_to_ctx(event, ctx);
ctx_resched(cpuctx, task_ctx, get_event_type(event));
} else {
@@ -2830,7 +2797,7 @@ perf_install_in_context(struct perf_event_context *ctx,
WARN_ON_ONCE(!exclusive_event_installable(event, ctx));
if (event->cpu != -1)
- event->cpu = cpu;
+ WARN_ON_ONCE(event->cpu != cpu);
/*
* Ensures that if we can observe event->ctx, both the event and ctx
@@ -2842,8 +2809,6 @@ perf_install_in_context(struct perf_event_context *ctx,
* perf_event_attr::disabled events will not run and can be initialized
* without IPI. Except when this is the first event for the context, in
* that case we need the magic of the IPI to set ctx->is_active.
- * Similarly, cgroup events for the context also needs the IPI to
- * manipulate the cgrp_cpuctx_list.
*
* The IOC_ENABLE that is sure to follow the creation of a disabled
* event will issue the IPI and reprogram the hardware.
@@ -2945,7 +2910,7 @@ static void __perf_event_enable(struct perf_event *event,
return;
if (ctx->is_active)
- ctx_sched_out(ctx, cpuctx, EVENT_TIME);
+ ctx_sched_out(ctx, EVENT_TIME);
perf_event_set_state(event, PERF_EVENT_STATE_INACTIVE);
perf_cgroup_event_enable(event, ctx);
@@ -2954,7 +2919,7 @@ static void __perf_event_enable(struct perf_event *event,
return;
if (!event_filter_match(event)) {
- ctx_sched_in(ctx, cpuctx, EVENT_TIME);
+ ctx_sched_in(ctx, EVENT_TIME);
return;
}
@@ -2963,7 +2928,7 @@ static void __perf_event_enable(struct perf_event *event,
* then don't put it on unless the group is on.
*/
if (leader != event && leader->state != PERF_EVENT_STATE_ACTIVE) {
- ctx_sched_in(ctx, cpuctx, EVENT_TIME);
+ ctx_sched_in(ctx, EVENT_TIME);
return;
}
@@ -3232,11 +3197,52 @@ out:
return err;
}
-static void ctx_sched_out(struct perf_event_context *ctx,
- struct perf_cpu_context *cpuctx,
- enum event_type_t event_type)
+static void __pmu_ctx_sched_out(struct perf_event_pmu_context *pmu_ctx,
+ enum event_type_t event_type)
{
+ struct perf_event_context *ctx = pmu_ctx->ctx;
struct perf_event *event, *tmp;
+ struct pmu *pmu = pmu_ctx->pmu;
+
+ if (ctx->task && !ctx->is_active) {
+ struct perf_cpu_pmu_context *cpc;
+
+ cpc = this_cpu_ptr(pmu->cpu_pmu_context);
+ WARN_ON_ONCE(cpc->task_epc && cpc->task_epc != pmu_ctx);
+ cpc->task_epc = NULL;
+ }
+
+ if (!event_type)
+ return;
+
+ perf_pmu_disable(pmu);
+ if (event_type & EVENT_PINNED) {
+ list_for_each_entry_safe(event, tmp,
+ &pmu_ctx->pinned_active,
+ active_list)
+ group_sched_out(event, ctx);
+ }
+
+ if (event_type & EVENT_FLEXIBLE) {
+ list_for_each_entry_safe(event, tmp,
+ &pmu_ctx->flexible_active,
+ active_list)
+ group_sched_out(event, ctx);
+ /*
+ * Since we cleared EVENT_FLEXIBLE, also clear
+ * rotate_necessary, it will be reset by
+ * ctx_flexible_sched_in() when needed.
+ */
+ pmu_ctx->rotate_necessary = 0;
+ }
+ perf_pmu_enable(pmu);
+}
+
+static void
+ctx_sched_out(struct perf_event_context *ctx, enum event_type_t event_type)
+{
+ struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
+ struct perf_event_pmu_context *pmu_ctx;
int is_active = ctx->is_active;
lockdep_assert_held(&ctx->lock);
@@ -3284,27 +3290,8 @@ static void ctx_sched_out(struct perf_event_context *ctx,
is_active ^= ctx->is_active; /* changed bits */
- if (!ctx->nr_active || !(is_active & EVENT_ALL))
- return;
-
- perf_pmu_disable(ctx->pmu);
- if (is_active & EVENT_PINNED) {
- list_for_each_entry_safe(event, tmp, &ctx->pinned_active, active_list)
- group_sched_out(event, cpuctx, ctx);
- }
-
- if (is_active & EVENT_FLEXIBLE) {
- list_for_each_entry_safe(event, tmp, &ctx->flexible_active, active_list)
- group_sched_out(event, cpuctx, ctx);
-
- /*
- * Since we cleared EVENT_FLEXIBLE, also clear
- * rotate_necessary, is will be reset by
- * ctx_flexible_sched_in() when needed.
- */
- ctx->rotate_necessary = 0;
- }
- perf_pmu_enable(ctx->pmu);
+ list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry)
+ __pmu_ctx_sched_out(pmu_ctx, is_active);
}
/*
@@ -3409,26 +3396,68 @@ static void perf_event_sync_stat(struct perf_event_context *ctx,
}
}
-static void perf_event_context_sched_out(struct task_struct *task, int ctxn,
- struct task_struct *next)
+#define double_list_for_each_entry(pos1, pos2, head1, head2, member) \
+ for (pos1 = list_first_entry(head1, typeof(*pos1), member), \
+ pos2 = list_first_entry(head2, typeof(*pos2), member); \
+ !list_entry_is_head(pos1, head1, member) && \
+ !list_entry_is_head(pos2, head2, member); \
+ pos1 = list_next_entry(pos1, member), \
+ pos2 = list_next_entry(pos2, member))
+
+static void perf_event_swap_task_ctx_data(struct perf_event_context *prev_ctx,
+ struct perf_event_context *next_ctx)
+{
+ struct perf_event_pmu_context *prev_epc, *next_epc;
+
+ if (!prev_ctx->nr_task_data)
+ return;
+
+ double_list_for_each_entry(prev_epc, next_epc,
+ &prev_ctx->pmu_ctx_list, &next_ctx->pmu_ctx_list,
+ pmu_ctx_entry) {
+
+ if (WARN_ON_ONCE(prev_epc->pmu != next_epc->pmu))
+ continue;
+
+ /*
+ * PMU specific parts of task perf context can require
+ * additional synchronization. As an example of such
+ * synchronization see implementation details of Intel
+ * LBR call stack data profiling;
+ */
+ if (prev_epc->pmu->swap_task_ctx)
+ prev_epc->pmu->swap_task_ctx(prev_epc, next_epc);
+ else
+ swap(prev_epc->task_ctx_data, next_epc->task_ctx_data);
+ }
+}
+
+static void perf_ctx_sched_task_cb(struct perf_event_context *ctx, bool sched_in)
{
- struct perf_event_context *ctx = task->perf_event_ctxp[ctxn];
+ struct perf_event_pmu_context *pmu_ctx;
+ struct perf_cpu_pmu_context *cpc;
+
+ list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry) {
+ cpc = this_cpu_ptr(pmu_ctx->pmu->cpu_pmu_context);
+
+ if (cpc->sched_cb_usage && pmu_ctx->pmu->sched_task)
+ pmu_ctx->pmu->sched_task(pmu_ctx, sched_in);
+ }
+}
+
+static void
+perf_event_context_sched_out(struct task_struct *task, struct task_struct *next)
+{
+ struct perf_event_context *ctx = task->perf_event_ctxp;
struct perf_event_context *next_ctx;
struct perf_event_context *parent, *next_parent;
- struct perf_cpu_context *cpuctx;
int do_switch = 1;
- struct pmu *pmu;
if (likely(!ctx))
return;
- pmu = ctx->pmu;
- cpuctx = __get_cpu_context(ctx);
- if (!cpuctx->task_ctx)
- return;
-
rcu_read_lock();
- next_ctx = next->perf_event_ctxp[ctxn];
+ next_ctx = rcu_dereference(next->perf_event_ctxp);
if (!next_ctx)
goto unlock;
@@ -3453,7 +3482,7 @@ static void perf_event_context_sched_out(struct task_struct *task, int ctxn,
raw_spin_lock_nested(&next_ctx->lock, SINGLE_DEPTH_NESTING);
if (context_equiv(ctx, next_ctx)) {
- perf_pmu_disable(pmu);
+ perf_ctx_disable(ctx);
/* PMIs are disabled; ctx->nr_pending is stable. */
if (local_read(&ctx->nr_pending) ||
@@ -3470,21 +3499,10 @@ static void perf_event_context_sched_out(struct task_struct *task, int ctxn,
WRITE_ONCE(ctx->task, next);
WRITE_ONCE(next_ctx->task, task);
- if (cpuctx->sched_cb_usage && pmu->sched_task)
- pmu->sched_task(ctx, false);
-
- /*
- * PMU specific parts of task perf context can require
- * additional synchronization. As an example of such
- * synchronization see implementation details of Intel
- * LBR call stack data profiling;
- */
- if (pmu->swap_task_ctx)
- pmu->swap_task_ctx(ctx, next_ctx);
- else
- swap(ctx->task_ctx_data, next_ctx->task_ctx_data);
+ perf_ctx_sched_task_cb(ctx, false);
+ perf_event_swap_task_ctx_data(ctx, next_ctx);
- perf_pmu_enable(pmu);
+ perf_ctx_enable(ctx);
/*
* RCU_INIT_POINTER here is safe because we've not
@@ -3493,8 +3511,8 @@ static void perf_event_context_sched_out(struct task_struct *task, int ctxn,
* since those values are always verified under
* ctx->lock which we're now holding.
*/
- RCU_INIT_POINTER(task->perf_event_ctxp[ctxn], next_ctx);
- RCU_INIT_POINTER(next->perf_event_ctxp[ctxn], ctx);
+ RCU_INIT_POINTER(task->perf_event_ctxp, next_ctx);
+ RCU_INIT_POINTER(next->perf_event_ctxp, ctx);
do_switch = 0;
@@ -3508,38 +3526,40 @@ unlock:
if (do_switch) {
raw_spin_lock(&ctx->lock);
- perf_pmu_disable(pmu);
+ perf_ctx_disable(ctx);
inside_switch:
- if (cpuctx->sched_cb_usage && pmu->sched_task)
- pmu->sched_task(ctx, false);
- task_ctx_sched_out(cpuctx, ctx, EVENT_ALL);
+ perf_ctx_sched_task_cb(ctx, false);
+ task_ctx_sched_out(ctx, EVENT_ALL);
- perf_pmu_enable(pmu);
+ perf_ctx_enable(ctx);
raw_spin_unlock(&ctx->lock);
}
}
static DEFINE_PER_CPU(struct list_head, sched_cb_list);
+static DEFINE_PER_CPU(int, perf_sched_cb_usages);
void perf_sched_cb_dec(struct pmu *pmu)
{
- struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
+ struct perf_cpu_pmu_context *cpc = this_cpu_ptr(pmu->cpu_pmu_context);
this_cpu_dec(perf_sched_cb_usages);
+ barrier();
- if (!--cpuctx->sched_cb_usage)
- list_del(&cpuctx->sched_cb_entry);
+ if (!--cpc->sched_cb_usage)
+ list_del(&cpc->sched_cb_entry);
}
void perf_sched_cb_inc(struct pmu *pmu)
{
- struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
+ struct perf_cpu_pmu_context *cpc = this_cpu_ptr(pmu->cpu_pmu_context);
- if (!cpuctx->sched_cb_usage++)
- list_add(&cpuctx->sched_cb_entry, this_cpu_ptr(&sched_cb_list));
+ if (!cpc->sched_cb_usage++)
+ list_add(&cpc->sched_cb_entry, this_cpu_ptr(&sched_cb_list));
+ barrier();
this_cpu_inc(perf_sched_cb_usages);
}
@@ -3551,19 +3571,21 @@ void perf_sched_cb_inc(struct pmu *pmu)
* PEBS requires this to provide PID/TID information. This requires we flush
* all queued PEBS records before we context switch to a new task.
*/
-static void __perf_pmu_sched_task(struct perf_cpu_context *cpuctx, bool sched_in)
+static void __perf_pmu_sched_task(struct perf_cpu_pmu_context *cpc, bool sched_in)
{
+ struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
struct pmu *pmu;
- pmu = cpuctx->ctx.pmu; /* software PMUs will not have sched_task */
+ pmu = cpc->epc.pmu;
+ /* software PMUs will not have sched_task */
if (WARN_ON_ONCE(!pmu->sched_task))
return;
perf_ctx_lock(cpuctx, cpuctx->task_ctx);
perf_pmu_disable(pmu);
- pmu->sched_task(cpuctx->task_ctx, sched_in);
+ pmu->sched_task(cpc->task_epc, sched_in);
perf_pmu_enable(pmu);
perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
@@ -3573,26 +3595,20 @@ static void perf_pmu_sched_task(struct task_struct *prev,
struct task_struct *next,
bool sched_in)
{
- struct perf_cpu_context *cpuctx;
+ struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
+ struct perf_cpu_pmu_context *cpc;
- if (prev == next)
+ /* cpuctx->task_ctx will be handled in perf_event_context_sched_in/out */
+ if (prev == next || cpuctx->task_ctx)
return;
- list_for_each_entry(cpuctx, this_cpu_ptr(&sched_cb_list), sched_cb_entry) {
- /* will be handled in perf_event_context_sched_in/out */
- if (cpuctx->task_ctx)
- continue;
-
- __perf_pmu_sched_task(cpuctx, sched_in);
- }
+ list_for_each_entry(cpc, this_cpu_ptr(&sched_cb_list), sched_cb_entry)
+ __perf_pmu_sched_task(cpc, sched_in);
}
static void perf_event_switch(struct task_struct *task,
struct task_struct *next_prev, bool sched_in);
-#define for_each_task_context_nr(ctxn) \
- for ((ctxn) = 0; (ctxn) < perf_nr_task_contexts; (ctxn)++)
-
/*
* Called from scheduler to remove the events of the current task,
* with interrupts disabled.
@@ -3607,33 +3623,20 @@ static void perf_event_switch(struct task_struct *task,
void __perf_event_task_sched_out(struct task_struct *task,
struct task_struct *next)
{
- int ctxn;
-
if (__this_cpu_read(perf_sched_cb_usages))
perf_pmu_sched_task(task, next, false);
if (atomic_read(&nr_switch_events))
perf_event_switch(task, next, false);
- for_each_task_context_nr(ctxn)
- perf_event_context_sched_out(task, ctxn, next);
+ perf_event_context_sched_out(task, next);
/*
* if cgroup events exist on this CPU, then we need
* to check if we have to switch out PMU state.
* cgroup event are system-wide mode only
*/
- if (atomic_read(this_cpu_ptr(&perf_cgroup_events)))
- perf_cgroup_switch(next);
-}
-
-/*
- * Called with IRQs disabled
- */
-static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx,
- enum event_type_t event_type)
-{
- ctx_sched_out(&cpuctx->ctx, cpuctx, event_type);
+ perf_cgroup_switch(next);
}
static bool perf_less_group_idx(const void *l, const void *r)
@@ -3667,21 +3670,39 @@ static void __heap_add(struct min_heap *heap, struct perf_event *event)
}
}
-static noinline int visit_groups_merge(struct perf_cpu_context *cpuctx,
+static void __link_epc(struct perf_event_pmu_context *pmu_ctx)
+{
+ struct perf_cpu_pmu_context *cpc;
+
+ if (!pmu_ctx->ctx->task)
+ return;
+
+ cpc = this_cpu_ptr(pmu_ctx->pmu->cpu_pmu_context);
+ WARN_ON_ONCE(cpc->task_epc && cpc->task_epc != pmu_ctx);
+ cpc->task_epc = pmu_ctx;
+}
+
+static noinline int visit_groups_merge(struct perf_event_context *ctx,
struct perf_event_groups *groups, int cpu,
+ struct pmu *pmu,
int (*func)(struct perf_event *, void *),
void *data)
{
#ifdef CONFIG_CGROUP_PERF
struct cgroup_subsys_state *css = NULL;
#endif
+ struct perf_cpu_context *cpuctx = NULL;
/* Space for per CPU and/or any CPU event iterators. */
struct perf_event *itrs[2];
struct min_heap event_heap;
struct perf_event **evt;
int ret;
- if (cpuctx) {
+ if (pmu->filter && pmu->filter(pmu, cpu))
+ return 0;
+
+ if (!ctx->task) {
+ cpuctx = this_cpu_ptr(&perf_cpu_context);
event_heap = (struct min_heap){
.data = cpuctx->heap,
.nr = 0,
@@ -3701,17 +3722,22 @@ static noinline int visit_groups_merge(struct perf_cpu_context *cpuctx,
.size = ARRAY_SIZE(itrs),
};
/* Events not within a CPU context may be on any CPU. */
- __heap_add(&event_heap, perf_event_groups_first(groups, -1, NULL));
+ __heap_add(&event_heap, perf_event_groups_first(groups, -1, pmu, NULL));
}
evt = event_heap.data;
- __heap_add(&event_heap, perf_event_groups_first(groups, cpu, NULL));
+ __heap_add(&event_heap, perf_event_groups_first(groups, cpu, pmu, NULL));
#ifdef CONFIG_CGROUP_PERF
for (; css; css = css->parent)
- __heap_add(&event_heap, perf_event_groups_first(groups, cpu, css->cgroup));
+ __heap_add(&event_heap, perf_event_groups_first(groups, cpu, pmu, css->cgroup));
#endif
+ if (event_heap.nr) {
+ __link_epc((*evt)->pmu_ctx);
+ perf_assert_pmu_disabled((*evt)->pmu_ctx->pmu);
+ }
+
min_heapify_all(&event_heap, &perf_min_heap);
while (event_heap.nr) {
@@ -3719,7 +3745,7 @@ static noinline int visit_groups_merge(struct perf_cpu_context *cpuctx,
if (ret)
return ret;
- *evt = perf_event_groups_next(*evt);
+ *evt = perf_event_groups_next(*evt, pmu);
if (*evt)
min_heapify(&event_heap, 0, &perf_min_heap);
else
@@ -3761,7 +3787,6 @@ static inline void group_update_userpage(struct perf_event *group_event)
static int merge_sched_in(struct perf_event *event, void *data)
{
struct perf_event_context *ctx = event->ctx;
- struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
int *can_add_hw = data;
if (event->state <= PERF_EVENT_STATE_OFF)
@@ -3770,8 +3795,8 @@ static int merge_sched_in(struct perf_event *event, void *data)
if (!event_filter_match(event))
return 0;
- if (group_can_go_on(event, cpuctx, *can_add_hw)) {
- if (!group_sched_in(event, cpuctx, ctx))
+ if (group_can_go_on(event, *can_add_hw)) {
+ if (!group_sched_in(event, ctx))
list_add_tail(&event->active_list, get_event_list(event));
}
@@ -3781,8 +3806,11 @@ static int merge_sched_in(struct perf_event *event, void *data)
perf_cgroup_event_disable(event, ctx);
perf_event_set_state(event, PERF_EVENT_STATE_ERROR);
} else {
- ctx->rotate_necessary = 1;
- perf_mux_hrtimer_restart(cpuctx);
+ struct perf_cpu_pmu_context *cpc;
+
+ event->pmu_ctx->rotate_necessary = 1;
+ cpc = this_cpu_ptr(event->pmu_ctx->pmu->cpu_pmu_context);
+ perf_mux_hrtimer_restart(cpc);
group_update_userpage(event);
}
}
@@ -3790,39 +3818,53 @@ static int merge_sched_in(struct perf_event *event, void *data)
return 0;
}
-static void
-ctx_pinned_sched_in(struct perf_event_context *ctx,
- struct perf_cpu_context *cpuctx)
+static void ctx_pinned_sched_in(struct perf_event_context *ctx, struct pmu *pmu)
{
+ struct perf_event_pmu_context *pmu_ctx;
int can_add_hw = 1;
- if (ctx != &cpuctx->ctx)
- cpuctx = NULL;
-
- visit_groups_merge(cpuctx, &ctx->pinned_groups,
- smp_processor_id(),
- merge_sched_in, &can_add_hw);
+ if (pmu) {
+ visit_groups_merge(ctx, &ctx->pinned_groups,
+ smp_processor_id(), pmu,
+ merge_sched_in, &can_add_hw);
+ } else {
+ list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry) {
+ can_add_hw = 1;
+ visit_groups_merge(ctx, &ctx->pinned_groups,
+ smp_processor_id(), pmu_ctx->pmu,
+ merge_sched_in, &can_add_hw);
+ }
+ }
}
-static void
-ctx_flexible_sched_in(struct perf_event_context *ctx,
- struct perf_cpu_context *cpuctx)
+static void ctx_flexible_sched_in(struct perf_event_context *ctx, struct pmu *pmu)
{
+ struct perf_event_pmu_context *pmu_ctx;
int can_add_hw = 1;
- if (ctx != &cpuctx->ctx)
- cpuctx = NULL;
+ if (pmu) {
+ visit_groups_merge(ctx, &ctx->flexible_groups,
+ smp_processor_id(), pmu,
+ merge_sched_in, &can_add_hw);
+ } else {
+ list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry) {
+ can_add_hw = 1;
+ visit_groups_merge(ctx, &ctx->flexible_groups,
+ smp_processor_id(), pmu_ctx->pmu,
+ merge_sched_in, &can_add_hw);
+ }
+ }
+}
- visit_groups_merge(cpuctx, &ctx->flexible_groups,
- smp_processor_id(),
- merge_sched_in, &can_add_hw);
+static void __pmu_ctx_sched_in(struct perf_event_context *ctx, struct pmu *pmu)
+{
+ ctx_flexible_sched_in(ctx, pmu);
}
static void
-ctx_sched_in(struct perf_event_context *ctx,
- struct perf_cpu_context *cpuctx,
- enum event_type_t event_type)
+ctx_sched_in(struct perf_event_context *ctx, enum event_type_t event_type)
{
+ struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
int is_active = ctx->is_active;
lockdep_assert_held(&ctx->lock);
@@ -3856,39 +3898,32 @@ ctx_sched_in(struct perf_event_context *ctx,
* in order to give them the best chance of going on.
*/
if (is_active & EVENT_PINNED)
- ctx_pinned_sched_in(ctx, cpuctx);
+ ctx_pinned_sched_in(ctx, NULL);
/* Then walk through the lower prio flexible groups */
if (is_active & EVENT_FLEXIBLE)
- ctx_flexible_sched_in(ctx, cpuctx);
+ ctx_flexible_sched_in(ctx, NULL);
}
-static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx,
- enum event_type_t event_type)
+static void perf_event_context_sched_in(struct task_struct *task)
{
- struct perf_event_context *ctx = &cpuctx->ctx;
-
- ctx_sched_in(ctx, cpuctx, event_type);
-}
+ struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
+ struct perf_event_context *ctx;
-static void perf_event_context_sched_in(struct perf_event_context *ctx,
- struct task_struct *task)
-{
- struct perf_cpu_context *cpuctx;
- struct pmu *pmu;
+ rcu_read_lock();
+ ctx = rcu_dereference(task->perf_event_ctxp);
+ if (!ctx)
+ goto rcu_unlock;
- cpuctx = __get_cpu_context(ctx);
+ if (cpuctx->task_ctx == ctx) {
+ perf_ctx_lock(cpuctx, ctx);
+ perf_ctx_disable(ctx);
- /*
- * HACK: for HETEROGENEOUS the task context might have switched to a
- * different PMU, force (re)set the context,
- */
- pmu = ctx->pmu = cpuctx->ctx.pmu;
+ perf_ctx_sched_task_cb(ctx, true);
- if (cpuctx->task_ctx == ctx) {
- if (cpuctx->sched_cb_usage)
- __perf_pmu_sched_task(cpuctx, true);
- return;
+ perf_ctx_enable(ctx);
+ perf_ctx_unlock(cpuctx, ctx);
+ goto rcu_unlock;
}
perf_ctx_lock(cpuctx, ctx);
@@ -3899,7 +3934,7 @@ static void perf_event_context_sched_in(struct perf_event_context *ctx,
if (!ctx->nr_events)
goto unlock;
- perf_pmu_disable(pmu);
+ perf_ctx_disable(ctx);
/*
* We want to keep the following priority order:
* cpu pinned (that don't need to move), task pinned,
@@ -3908,17 +3943,24 @@ static void perf_event_context_sched_in(struct perf_event_context *ctx,
* However, if task's ctx is not carrying any pinned
* events, no need to flip the cpuctx's events around.
*/
- if (!RB_EMPTY_ROOT(&ctx->pinned_groups.tree))
- cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
+ if (!RB_EMPTY_ROOT(&ctx->pinned_groups.tree)) {
+ perf_ctx_disable(&cpuctx->ctx);
+ ctx_sched_out(&cpuctx->ctx, EVENT_FLEXIBLE);
+ }
+
perf_event_sched_in(cpuctx, ctx);
- if (cpuctx->sched_cb_usage && pmu->sched_task)
- pmu->sched_task(cpuctx->task_ctx, true);
+ perf_ctx_sched_task_cb(cpuctx->task_ctx, true);
- perf_pmu_enable(pmu);
+ if (!RB_EMPTY_ROOT(&ctx->pinned_groups.tree))
+ perf_ctx_enable(&cpuctx->ctx);
+
+ perf_ctx_enable(ctx);
unlock:
perf_ctx_unlock(cpuctx, ctx);
+rcu_unlock:
+ rcu_read_unlock();
}
/*
@@ -3935,16 +3977,7 @@ unlock:
void __perf_event_task_sched_in(struct task_struct *prev,
struct task_struct *task)
{
- struct perf_event_context *ctx;
- int ctxn;
-
- for_each_task_context_nr(ctxn) {
- ctx = task->perf_event_ctxp[ctxn];
- if (likely(!ctx))
- continue;
-
- perf_event_context_sched_in(ctx, task);
- }
+ perf_event_context_sched_in(task);
if (atomic_read(&nr_switch_events))
perf_event_switch(task, prev, true);
@@ -4063,8 +4096,8 @@ static void perf_adjust_period(struct perf_event *event, u64 nsec, u64 count, bo
* events. At the same time, make sure, having freq events does not change
* the rate of unthrottling as that would introduce bias.
*/
-static void perf_adjust_freq_unthr_context(struct perf_event_context *ctx,
- int needs_unthr)
+static void
+perf_adjust_freq_unthr_context(struct perf_event_context *ctx, bool unthrottle)
{
struct perf_event *event;
struct hw_perf_event *hwc;
@@ -4076,16 +4109,16 @@ static void perf_adjust_freq_unthr_context(struct perf_event_context *ctx,
* - context has events in frequency mode (needs freq adjust)
* - there are events to unthrottle on this cpu
*/
- if (!(ctx->nr_freq || needs_unthr))
+ if (!(ctx->nr_freq || unthrottle))
return;
raw_spin_lock(&ctx->lock);
- perf_pmu_disable(ctx->pmu);
list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
if (event->state != PERF_EVENT_STATE_ACTIVE)
continue;
+ // XXX use visit thingy to avoid the -1,cpu match
if (!event_filter_match(event))
continue;
@@ -4126,7 +4159,6 @@ static void perf_adjust_freq_unthr_context(struct perf_event_context *ctx,
perf_pmu_enable(event->pmu);
}
- perf_pmu_enable(ctx->pmu);
raw_spin_unlock(&ctx->lock);
}
@@ -4148,72 +4180,109 @@ static void rotate_ctx(struct perf_event_context *ctx, struct perf_event *event)
/* pick an event from the flexible_groups to rotate */
static inline struct perf_event *
-ctx_event_to_rotate(struct perf_event_context *ctx)
+ctx_event_to_rotate(struct perf_event_pmu_context *pmu_ctx)
{
struct perf_event *event;
+ struct rb_node *node;
+ struct rb_root *tree;
+ struct __group_key key = {
+ .pmu = pmu_ctx->pmu,
+ };
/* pick the first active flexible event */
- event = list_first_entry_or_null(&ctx->flexible_active,
+ event = list_first_entry_or_null(&pmu_ctx->flexible_active,
struct perf_event, active_list);
+ if (event)
+ goto out;
/* if no active flexible event, pick the first event */
- if (!event) {
- event = rb_entry_safe(rb_first(&ctx->flexible_groups.tree),
- typeof(*event), group_node);
+ tree = &pmu_ctx->ctx->flexible_groups.tree;
+
+ if (!pmu_ctx->ctx->task) {
+ key.cpu = smp_processor_id();
+
+ node = rb_find_first(&key, tree, __group_cmp_ignore_cgroup);
+ if (node)
+ event = __node_2_pe(node);
+ goto out;
+ }
+
+ key.cpu = -1;
+ node = rb_find_first(&key, tree, __group_cmp_ignore_cgroup);
+ if (node) {
+ event = __node_2_pe(node);
+ goto out;
}
+ key.cpu = smp_processor_id();
+ node = rb_find_first(&key, tree, __group_cmp_ignore_cgroup);
+ if (node)
+ event = __node_2_pe(node);
+
+out:
/*
* Unconditionally clear rotate_necessary; if ctx_flexible_sched_in()
* finds there are unschedulable events, it will set it again.
*/
- ctx->rotate_necessary = 0;
+ pmu_ctx->rotate_necessary = 0;
return event;
}
-static bool perf_rotate_context(struct perf_cpu_context *cpuctx)
+static bool perf_rotate_context(struct perf_cpu_pmu_context *cpc)
{
+ struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
+ struct perf_event_pmu_context *cpu_epc, *task_epc = NULL;
struct perf_event *cpu_event = NULL, *task_event = NULL;
- struct perf_event_context *task_ctx = NULL;
int cpu_rotate, task_rotate;
+ struct pmu *pmu;
/*
* Since we run this from IRQ context, nobody can install new
* events, thus the event count values are stable.
*/
- cpu_rotate = cpuctx->ctx.rotate_necessary;
- task_ctx = cpuctx->task_ctx;
- task_rotate = task_ctx ? task_ctx->rotate_necessary : 0;
+ cpu_epc = &cpc->epc;
+ pmu = cpu_epc->pmu;
+ task_epc = cpc->task_epc;
+
+ cpu_rotate = cpu_epc->rotate_necessary;
+ task_rotate = task_epc ? task_epc->rotate_necessary : 0;
if (!(cpu_rotate || task_rotate))
return false;
perf_ctx_lock(cpuctx, cpuctx->task_ctx);
- perf_pmu_disable(cpuctx->ctx.pmu);
+ perf_pmu_disable(pmu);
if (task_rotate)
- task_event = ctx_event_to_rotate(task_ctx);
+ task_event = ctx_event_to_rotate(task_epc);
if (cpu_rotate)
- cpu_event = ctx_event_to_rotate(&cpuctx->ctx);
+ cpu_event = ctx_event_to_rotate(cpu_epc);
/*
* As per the order given at ctx_resched() first 'pop' task flexible
* and then, if needed CPU flexible.
*/
- if (task_event || (task_ctx && cpu_event))
- ctx_sched_out(task_ctx, cpuctx, EVENT_FLEXIBLE);
- if (cpu_event)
- cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
+ if (task_event || (task_epc && cpu_event)) {
+ update_context_time(task_epc->ctx);
+ __pmu_ctx_sched_out(task_epc, EVENT_FLEXIBLE);
+ }
- if (task_event)
- rotate_ctx(task_ctx, task_event);
- if (cpu_event)
+ if (cpu_event) {
+ update_context_time(&cpuctx->ctx);
+ __pmu_ctx_sched_out(cpu_epc, EVENT_FLEXIBLE);
rotate_ctx(&cpuctx->ctx, cpu_event);
+ __pmu_ctx_sched_in(&cpuctx->ctx, pmu);
+ }
- perf_event_sched_in(cpuctx, task_ctx);
+ if (task_event)
+ rotate_ctx(task_epc->ctx, task_event);
- perf_pmu_enable(cpuctx->ctx.pmu);
+ if (task_event || (task_epc && cpu_event))
+ __pmu_ctx_sched_in(task_epc->ctx, pmu);
+
+ perf_pmu_enable(pmu);
perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
return true;
@@ -4221,8 +4290,8 @@ static bool perf_rotate_context(struct perf_cpu_context *cpuctx)
void perf_event_task_tick(void)
{
- struct list_head *head = this_cpu_ptr(&active_ctx_list);
- struct perf_event_context *ctx, *tmp;
+ struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
+ struct perf_event_context *ctx;
int throttled;
lockdep_assert_irqs_disabled();
@@ -4231,8 +4300,13 @@ void perf_event_task_tick(void)
throttled = __this_cpu_xchg(perf_throttled_count, 0);
tick_dep_clear_cpu(smp_processor_id(), TICK_DEP_BIT_PERF_EVENTS);
- list_for_each_entry_safe(ctx, tmp, head, active_ctx_list)
- perf_adjust_freq_unthr_context(ctx, throttled);
+ perf_adjust_freq_unthr_context(&cpuctx->ctx, !!throttled);
+
+ rcu_read_lock();
+ ctx = rcu_dereference(current->perf_event_ctxp);
+ if (ctx)
+ perf_adjust_freq_unthr_context(ctx, !!throttled);
+ rcu_read_unlock();
}
static int event_enable_on_exec(struct perf_event *event,
@@ -4254,9 +4328,9 @@ static int event_enable_on_exec(struct perf_event *event,
* Enable all of a task's events that have been marked enable-on-exec.
* This expects task == current.
*/
-static void perf_event_enable_on_exec(int ctxn)
+static void perf_event_enable_on_exec(struct perf_event_context *ctx)
{
- struct perf_event_context *ctx, *clone_ctx = NULL;
+ struct perf_event_context *clone_ctx = NULL;
enum event_type_t event_type = 0;
struct perf_cpu_context *cpuctx;
struct perf_event *event;
@@ -4264,13 +4338,16 @@ static void perf_event_enable_on_exec(int ctxn)
int enabled = 0;
local_irq_save(flags);
- ctx = current->perf_event_ctxp[ctxn];
- if (!ctx || !ctx->nr_events)
+ if (WARN_ON_ONCE(current->perf_event_ctxp != ctx))
+ goto out;
+
+ if (!ctx->nr_events)
goto out;
- cpuctx = __get_cpu_context(ctx);
+ cpuctx = this_cpu_ptr(&perf_cpu_context);
perf_ctx_lock(cpuctx, ctx);
- ctx_sched_out(ctx, cpuctx, EVENT_TIME);
+ ctx_sched_out(ctx, EVENT_TIME);
+
list_for_each_entry(event, &ctx->event_list, event_entry) {
enabled |= event_enable_on_exec(event, ctx);
event_type |= get_event_type(event);
@@ -4283,7 +4360,7 @@ static void perf_event_enable_on_exec(int ctxn)
clone_ctx = unclone_ctx(ctx);
ctx_resched(cpuctx, ctx, event_type);
} else {
- ctx_sched_in(ctx, cpuctx, EVENT_TIME);
+ ctx_sched_in(ctx, EVENT_TIME);
}
perf_ctx_unlock(cpuctx, ctx);
@@ -4302,17 +4379,13 @@ static void perf_event_exit_event(struct perf_event *event,
* Removes all events from the current task that have been marked
* remove-on-exec, and feeds their values back to parent events.
*/
-static void perf_event_remove_on_exec(int ctxn)
+static void perf_event_remove_on_exec(struct perf_event_context *ctx)
{
- struct perf_event_context *ctx, *clone_ctx = NULL;
+ struct perf_event_context *clone_ctx = NULL;
struct perf_event *event, *next;
unsigned long flags;
bool modified = false;
- ctx = perf_pin_task_context(current, ctxn);
- if (!ctx)
- return;
-
mutex_lock(&ctx->mutex);
if (WARN_ON_ONCE(ctx->task != current))
@@ -4333,13 +4406,11 @@ static void perf_event_remove_on_exec(int ctxn)
raw_spin_lock_irqsave(&ctx->lock, flags);
if (modified)
clone_ctx = unclone_ctx(ctx);
- --ctx->pin_count;
raw_spin_unlock_irqrestore(&ctx->lock, flags);
unlock:
mutex_unlock(&ctx->mutex);
- put_ctx(ctx);
if (clone_ctx)
put_ctx(clone_ctx);
}
@@ -4375,7 +4446,7 @@ static void __perf_event_read(void *info)
struct perf_read_data *data = info;
struct perf_event *sub, *event = data->event;
struct perf_event_context *ctx = event->ctx;
- struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
+ struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
struct pmu *pmu = event->pmu;
/*
@@ -4601,17 +4672,25 @@ static void __perf_event_init_context(struct perf_event_context *ctx)
{
raw_spin_lock_init(&ctx->lock);
mutex_init(&ctx->mutex);
- INIT_LIST_HEAD(&ctx->active_ctx_list);
+ INIT_LIST_HEAD(&ctx->pmu_ctx_list);
perf_event_groups_init(&ctx->pinned_groups);
perf_event_groups_init(&ctx->flexible_groups);
INIT_LIST_HEAD(&ctx->event_list);
- INIT_LIST_HEAD(&ctx->pinned_active);
- INIT_LIST_HEAD(&ctx->flexible_active);
refcount_set(&ctx->refcount, 1);
}
+static void
+__perf_init_event_pmu_context(struct perf_event_pmu_context *epc, struct pmu *pmu)
+{
+ epc->pmu = pmu;
+ INIT_LIST_HEAD(&epc->pmu_ctx_entry);
+ INIT_LIST_HEAD(&epc->pinned_active);
+ INIT_LIST_HEAD(&epc->flexible_active);
+ atomic_set(&epc->refcount, 1);
+}
+
static struct perf_event_context *
-alloc_perf_context(struct pmu *pmu, struct task_struct *task)
+alloc_perf_context(struct task_struct *task)
{
struct perf_event_context *ctx;
@@ -4622,7 +4701,6 @@ alloc_perf_context(struct pmu *pmu, struct task_struct *task)
__perf_event_init_context(ctx);
if (task)
ctx->task = get_task_struct(task);
- ctx->pmu = pmu;
return ctx;
}
@@ -4651,15 +4729,12 @@ find_lively_task_by_vpid(pid_t vpid)
* Returns a matching context with refcount and pincount.
*/
static struct perf_event_context *
-find_get_context(struct pmu *pmu, struct task_struct *task,
- struct perf_event *event)
+find_get_context(struct task_struct *task, struct perf_event *event)
{
struct perf_event_context *ctx, *clone_ctx = NULL;
struct perf_cpu_context *cpuctx;
- void *task_ctx_data = NULL;
unsigned long flags;
- int ctxn, err;
- int cpu = event->cpu;
+ int err;
if (!task) {
/* Must be root to operate on a CPU event: */
@@ -4667,7 +4742,7 @@ find_get_context(struct pmu *pmu, struct task_struct *task,
if (err)
return ERR_PTR(err);
- cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
+ cpuctx = per_cpu_ptr(&perf_cpu_context, event->cpu);
ctx = &cpuctx->ctx;
get_ctx(ctx);
raw_spin_lock_irqsave(&ctx->lock, flags);
@@ -4678,43 +4753,22 @@ find_get_context(struct pmu *pmu, struct task_struct *task,
}
err = -EINVAL;
- ctxn = pmu->task_ctx_nr;
- if (ctxn < 0)
- goto errout;
-
- if (event->attach_state & PERF_ATTACH_TASK_DATA) {
- task_ctx_data = alloc_task_ctx_data(pmu);
- if (!task_ctx_data) {
- err = -ENOMEM;
- goto errout;
- }
- }
-
retry:
- ctx = perf_lock_task_context(task, ctxn, &flags);
+ ctx = perf_lock_task_context(task, &flags);
if (ctx) {
clone_ctx = unclone_ctx(ctx);
++ctx->pin_count;
- if (task_ctx_data && !ctx->task_ctx_data) {
- ctx->task_ctx_data = task_ctx_data;
- task_ctx_data = NULL;
- }
raw_spin_unlock_irqrestore(&ctx->lock, flags);
if (clone_ctx)
put_ctx(clone_ctx);
} else {
- ctx = alloc_perf_context(pmu, task);
+ ctx = alloc_perf_context(task);
err = -ENOMEM;
if (!ctx)
goto errout;
- if (task_ctx_data) {
- ctx->task_ctx_data = task_ctx_data;
- task_ctx_data = NULL;
- }
-
err = 0;
mutex_lock(&task->perf_event_mutex);
/*
@@ -4723,12 +4777,12 @@ retry:
*/
if (task->flags & PF_EXITING)
err = -ESRCH;
- else if (task->perf_event_ctxp[ctxn])
+ else if (task->perf_event_ctxp)
err = -EAGAIN;
else {
get_ctx(ctx);
++ctx->pin_count;
- rcu_assign_pointer(task->perf_event_ctxp[ctxn], ctx);
+ rcu_assign_pointer(task->perf_event_ctxp, ctx);
}
mutex_unlock(&task->perf_event_mutex);
@@ -4741,21 +4795,141 @@ retry:
}
}
- free_task_ctx_data(pmu, task_ctx_data);
return ctx;
errout:
- free_task_ctx_data(pmu, task_ctx_data);
return ERR_PTR(err);
}
+static struct perf_event_pmu_context *
+find_get_pmu_context(struct pmu *pmu, struct perf_event_context *ctx,
+ struct perf_event *event)
+{
+ struct perf_event_pmu_context *new = NULL, *epc;
+ void *task_ctx_data = NULL;
+
+ if (!ctx->task) {
+ struct perf_cpu_pmu_context *cpc;
+
+ cpc = per_cpu_ptr(pmu->cpu_pmu_context, event->cpu);
+ epc = &cpc->epc;
+ raw_spin_lock_irq(&ctx->lock);
+ if (!epc->ctx) {
+ atomic_set(&epc->refcount, 1);
+ epc->embedded = 1;
+ list_add(&epc->pmu_ctx_entry, &ctx->pmu_ctx_list);
+ epc->ctx = ctx;
+ } else {
+ WARN_ON_ONCE(epc->ctx != ctx);
+ atomic_inc(&epc->refcount);
+ }
+ raw_spin_unlock_irq(&ctx->lock);
+ return epc;
+ }
+
+ new = kzalloc(sizeof(*epc), GFP_KERNEL);
+ if (!new)
+ return ERR_PTR(-ENOMEM);
+
+ if (event->attach_state & PERF_ATTACH_TASK_DATA) {
+ task_ctx_data = alloc_task_ctx_data(pmu);
+ if (!task_ctx_data) {
+ kfree(new);
+ return ERR_PTR(-ENOMEM);
+ }
+ }
+
+ __perf_init_event_pmu_context(new, pmu);
+
+ /*
+ * XXX
+ *
+ * lockdep_assert_held(&ctx->mutex);
+ *
+ * can't because perf_event_init_task() doesn't actually hold the
+ * child_ctx->mutex.
+ */
+
+ raw_spin_lock_irq(&ctx->lock);
+ list_for_each_entry(epc, &ctx->pmu_ctx_list, pmu_ctx_entry) {
+ if (epc->pmu == pmu) {
+ WARN_ON_ONCE(epc->ctx != ctx);
+ atomic_inc(&epc->refcount);
+ goto found_epc;
+ }
+ }
+
+ epc = new;
+ new = NULL;
+
+ list_add(&epc->pmu_ctx_entry, &ctx->pmu_ctx_list);
+ epc->ctx = ctx;
+
+found_epc:
+ if (task_ctx_data && !epc->task_ctx_data) {
+ epc->task_ctx_data = task_ctx_data;
+ task_ctx_data = NULL;
+ ctx->nr_task_data++;
+ }
+ raw_spin_unlock_irq(&ctx->lock);
+
+ free_task_ctx_data(pmu, task_ctx_data);
+ kfree(new);
+
+ return epc;
+}
+
+static void get_pmu_ctx(struct perf_event_pmu_context *epc)
+{
+ WARN_ON_ONCE(!atomic_inc_not_zero(&epc->refcount));
+}
+
+static void free_epc_rcu(struct rcu_head *head)
+{
+ struct perf_event_pmu_context *epc = container_of(head, typeof(*epc), rcu_head);
+
+ kfree(epc->task_ctx_data);
+ kfree(epc);
+}
+
+static void put_pmu_ctx(struct perf_event_pmu_context *epc)
+{
+ struct perf_event_context *ctx = epc->ctx;
+ unsigned long flags;
+
+ /*
+ * XXX
+ *
+ * lockdep_assert_held(&ctx->mutex);
+ *
+ * can't because of the call-site in _free_event()/put_event()
+ * which isn't always called under ctx->mutex.
+ */
+ if (!atomic_dec_and_raw_lock_irqsave(&epc->refcount, &ctx->lock, flags))
+ return;
+
+ WARN_ON_ONCE(list_empty(&epc->pmu_ctx_entry));
+
+ list_del_init(&epc->pmu_ctx_entry);
+ epc->ctx = NULL;
+
+ WARN_ON_ONCE(!list_empty(&epc->pinned_active));
+ WARN_ON_ONCE(!list_empty(&epc->flexible_active));
+
+ raw_spin_unlock_irqrestore(&ctx->lock, flags);
+
+ if (epc->embedded)
+ return;
+
+ call_rcu(&epc->rcu_head, free_epc_rcu);
+}
+
static void perf_event_free_filter(struct perf_event *event);
static void free_event_rcu(struct rcu_head *head)
{
- struct perf_event *event;
+ struct perf_event *event = container_of(head, typeof(*event), rcu_head);
- event = container_of(head, struct perf_event, rcu_head);
if (event->ns)
put_pid_ns(event->ns);
perf_event_free_filter(event);
@@ -4799,15 +4973,6 @@ static void unaccount_pmu_sb_event(struct perf_event *event)
detach_sb_event(event);
}
-static void unaccount_event_cpu(struct perf_event *event, int cpu)
-{
- if (event->parent)
- return;
-
- if (is_cgroup_event(event))
- atomic_dec(&per_cpu(perf_cgroup_events, cpu));
-}
-
#ifdef CONFIG_NO_HZ_FULL
static DEFINE_SPINLOCK(nr_freq_lock);
#endif
@@ -4873,8 +5038,6 @@ static void unaccount_event(struct perf_event *event)
schedule_delayed_work(&perf_sched_work, HZ);
}
- unaccount_event_cpu(event, event->cpu);
-
unaccount_pmu_sb_event(event);
}
@@ -4893,7 +5056,7 @@ static void perf_sched_delayed(struct work_struct *work)
*
* 1) cpu-wide events in the presence of per-task events,
* 2) per-task events in the presence of cpu-wide events,
- * 3) two matching events on the same context.
+ * 3) two matching events on the same perf_event_context.
*
* The former two cases are handled in the allocation path (perf_event_alloc(),
* _free_event()), the latter -- before the first perf_install_in_context().
@@ -5017,6 +5180,9 @@ static void _free_event(struct perf_event *event)
if (event->hw.target)
put_task_struct(event->hw.target);
+ if (event->pmu_ctx)
+ put_pmu_ctx(event->pmu_ctx);
+
/*
* perf_event_free_task() relies on put_ctx() being 'last', in particular
* all task references must be cleaned up.
@@ -5117,8 +5283,8 @@ int perf_event_release_kernel(struct perf_event *event)
LIST_HEAD(free_list);
/*
- * If we got here through err_file: fput(event_file); we will not have
- * attached to a context yet.
+ * If we got here through err_alloc: free_event(event); we will not
+ * have attached to a context yet.
*/
if (!ctx) {
WARN_ON_ONCE(event->attach_state &
@@ -5550,7 +5716,7 @@ static void __perf_event_period(struct perf_event *event,
active = (event->state == PERF_EVENT_STATE_ACTIVE);
if (active) {
- perf_pmu_disable(ctx->pmu);
+ perf_pmu_disable(event->pmu);
/*
* We could be throttled; unthrottle now to avoid the tick
* trying to unthrottle while we already re-started the event.
@@ -5566,7 +5732,7 @@ static void __perf_event_period(struct perf_event *event,
if (active) {
event->pmu->start(event, PERF_EF_RELOAD);
- perf_pmu_enable(ctx->pmu);
+ perf_pmu_enable(event->pmu);
}
}
@@ -6875,13 +7041,20 @@ out_put:
ring_buffer_put(rb);
}
-static void __perf_event_header__init_id(struct perf_event_header *header,
- struct perf_sample_data *data,
+/*
+ * A set of common sample data types saved even for non-sample records
+ * when event->attr.sample_id_all is set.
+ */
+#define PERF_SAMPLE_ID_ALL (PERF_SAMPLE_TID | PERF_SAMPLE_TIME | \
+ PERF_SAMPLE_ID | PERF_SAMPLE_STREAM_ID | \
+ PERF_SAMPLE_CPU | PERF_SAMPLE_IDENTIFIER)
+
+static void __perf_event_header__init_id(struct perf_sample_data *data,
struct perf_event *event,
u64 sample_type)
{
data->type = event->attr.sample_type;
- header->size += event->id_header_size;
+ data->sample_flags |= data->type & PERF_SAMPLE_ID_ALL;
if (sample_type & PERF_SAMPLE_TID) {
/* namespace issues */
@@ -6908,8 +7081,10 @@ void perf_event_header__init_id(struct perf_event_header *header,
struct perf_sample_data *data,
struct perf_event *event)
{
- if (event->attr.sample_id_all)
- __perf_event_header__init_id(header, data, event, event->attr.sample_type);
+ if (event->attr.sample_id_all) {
+ header->size += event->id_header_size;
+ __perf_event_header__init_id(data, event, event->attr.sample_type);
+ }
}
static void __perf_event__output_id_sample(struct perf_output_handle *handle,
@@ -7139,7 +7314,7 @@ void perf_output_sample(struct perf_output_handle *handle,
}
if (sample_type & PERF_SAMPLE_BRANCH_STACK) {
- if (data->sample_flags & PERF_SAMPLE_BRANCH_STACK) {
+ if (data->br_stack) {
size_t size;
size = data->br_stack->nr
@@ -7315,7 +7490,7 @@ static u64 perf_get_pgtable_size(struct mm_struct *mm, unsigned long addr)
return pud_leaf_size(pud);
pmdp = pmd_offset_lockless(pudp, pud, addr);
- pmd = READ_ONCE(*pmdp);
+ pmd = pmdp_get_lockless(pmdp);
if (!pmd_present(pmd))
return 0;
@@ -7383,83 +7558,68 @@ perf_callchain(struct perf_event *event, struct pt_regs *regs)
return callchain ?: &__empty_callchain;
}
-void perf_prepare_sample(struct perf_event_header *header,
- struct perf_sample_data *data,
+static __always_inline u64 __cond_set(u64 flags, u64 s, u64 d)
+{
+ return d * !!(flags & s);
+}
+
+void perf_prepare_sample(struct perf_sample_data *data,
struct perf_event *event,
struct pt_regs *regs)
{
u64 sample_type = event->attr.sample_type;
u64 filtered_sample_type;
- header->type = PERF_RECORD_SAMPLE;
- header->size = sizeof(*header) + event->header_size;
-
- header->misc = 0;
- header->misc |= perf_misc_flags(regs);
-
/*
- * Clear the sample flags that have already been done by the
- * PMU driver.
+ * Add the sample flags that other requested flags depend on, and clear
+ * the sample flags that have already been filled in by the PMU driver.
*/
- filtered_sample_type = sample_type & ~data->sample_flags;
- __perf_event_header__init_id(header, data, event, filtered_sample_type);
-
- if (sample_type & (PERF_SAMPLE_IP | PERF_SAMPLE_CODE_PAGE_SIZE))
- data->ip = perf_instruction_pointer(regs);
-
- if (sample_type & PERF_SAMPLE_CALLCHAIN) {
- int size = 1;
+ filtered_sample_type = sample_type;
+ filtered_sample_type |= __cond_set(sample_type, PERF_SAMPLE_CODE_PAGE_SIZE,
+ PERF_SAMPLE_IP);
+ filtered_sample_type |= __cond_set(sample_type, PERF_SAMPLE_DATA_PAGE_SIZE |
+ PERF_SAMPLE_PHYS_ADDR, PERF_SAMPLE_ADDR);
+ filtered_sample_type |= __cond_set(sample_type, PERF_SAMPLE_STACK_USER,
+ PERF_SAMPLE_REGS_USER);
+ filtered_sample_type &= ~data->sample_flags;
- if (filtered_sample_type & PERF_SAMPLE_CALLCHAIN)
- data->callchain = perf_callchain(event, regs);
-
- size += data->callchain->nr;
-
- header->size += size * sizeof(u64);
+ if (filtered_sample_type == 0) {
+ /* Make sure it has the correct data->type for output */
+ data->type = event->attr.sample_type;
+ return;
}
- if (sample_type & PERF_SAMPLE_RAW) {
- struct perf_raw_record *raw = data->raw;
- int size;
-
- if (raw && (data->sample_flags & PERF_SAMPLE_RAW)) {
- struct perf_raw_frag *frag = &raw->frag;
- u32 sum = 0;
+ __perf_event_header__init_id(data, event, filtered_sample_type);
- do {
- sum += frag->size;
- if (perf_raw_frag_last(frag))
- break;
- frag = frag->next;
- } while (1);
+ if (filtered_sample_type & PERF_SAMPLE_IP) {
+ data->ip = perf_instruction_pointer(regs);
+ data->sample_flags |= PERF_SAMPLE_IP;
+ }
- size = round_up(sum + sizeof(u32), sizeof(u64));
- raw->size = size - sizeof(u32);
- frag->pad = raw->size - sum;
- } else {
- size = sizeof(u64);
- data->raw = NULL;
- }
+ if (filtered_sample_type & PERF_SAMPLE_CALLCHAIN)
+ perf_sample_save_callchain(data, event, regs);
- header->size += size;
+ if (filtered_sample_type & PERF_SAMPLE_RAW) {
+ data->raw = NULL;
+ data->dyn_size += sizeof(u64);
+ data->sample_flags |= PERF_SAMPLE_RAW;
}
- if (sample_type & PERF_SAMPLE_BRANCH_STACK) {
- int size = sizeof(u64); /* nr */
- if (data->sample_flags & PERF_SAMPLE_BRANCH_STACK) {
- if (branch_sample_hw_index(event))
- size += sizeof(u64);
-
- size += data->br_stack->nr
- * sizeof(struct perf_branch_entry);
- }
- header->size += size;
+ if (filtered_sample_type & PERF_SAMPLE_BRANCH_STACK) {
+ data->br_stack = NULL;
+ data->dyn_size += sizeof(u64);
+ data->sample_flags |= PERF_SAMPLE_BRANCH_STACK;
}
- if (sample_type & (PERF_SAMPLE_REGS_USER | PERF_SAMPLE_STACK_USER))
+ if (filtered_sample_type & PERF_SAMPLE_REGS_USER)
perf_sample_regs_user(&data->regs_user, regs);
- if (sample_type & PERF_SAMPLE_REGS_USER) {
+ /*
+ * We cannot use filtered_sample_type here, as REGS_USER may have been set
+ * on behalf of STACK_USER (via __cond_set() above) and we don't want to
+ * update dyn_size if REGS_USER was not requested by the user.
+ */
+ if ((sample_type & ~data->sample_flags) & PERF_SAMPLE_REGS_USER) {
/* regs dump ABI info */
int size = sizeof(u64);
@@ -7468,10 +7628,11 @@ void perf_prepare_sample(struct perf_event_header *header,
size += hweight64(mask) * sizeof(u64);
}
- header->size += size;
+ data->dyn_size += size;
+ data->sample_flags |= PERF_SAMPLE_REGS_USER;
}
- if (sample_type & PERF_SAMPLE_STACK_USER) {
+ if (filtered_sample_type & PERF_SAMPLE_STACK_USER) {
/*
* Either we need PERF_SAMPLE_STACK_USER bit to be always
* processed as the last one or have additional check added
@@ -7479,9 +7640,10 @@ void perf_prepare_sample(struct perf_event_header *header,
* up the rest of the sample size.
*/
u16 stack_size = event->attr.sample_stack_user;
+ u16 header_size = perf_sample_data_size(data, event);
u16 size = sizeof(u64);
- stack_size = perf_sample_ustack_size(stack_size, header->size,
+ stack_size = perf_sample_ustack_size(stack_size, header_size,
data->regs_user.regs);
/*
@@ -7493,24 +7655,31 @@ void perf_prepare_sample(struct perf_event_header *header,
size += sizeof(u64) + stack_size;
data->stack_user_size = stack_size;
- header->size += size;
+ data->dyn_size += size;
+ data->sample_flags |= PERF_SAMPLE_STACK_USER;
}
- if (filtered_sample_type & PERF_SAMPLE_WEIGHT_TYPE)
+ if (filtered_sample_type & PERF_SAMPLE_WEIGHT_TYPE) {
data->weight.full = 0;
+ data->sample_flags |= PERF_SAMPLE_WEIGHT_TYPE;
+ }
- if (filtered_sample_type & PERF_SAMPLE_DATA_SRC)
+ if (filtered_sample_type & PERF_SAMPLE_DATA_SRC) {
data->data_src.val = PERF_MEM_NA;
+ data->sample_flags |= PERF_SAMPLE_DATA_SRC;
+ }
- if (filtered_sample_type & PERF_SAMPLE_TRANSACTION)
+ if (filtered_sample_type & PERF_SAMPLE_TRANSACTION) {
data->txn = 0;
+ data->sample_flags |= PERF_SAMPLE_TRANSACTION;
+ }
- if (sample_type & (PERF_SAMPLE_ADDR | PERF_SAMPLE_PHYS_ADDR | PERF_SAMPLE_DATA_PAGE_SIZE)) {
- if (filtered_sample_type & PERF_SAMPLE_ADDR)
- data->addr = 0;
+ if (filtered_sample_type & PERF_SAMPLE_ADDR) {
+ data->addr = 0;
+ data->sample_flags |= PERF_SAMPLE_ADDR;
}
- if (sample_type & PERF_SAMPLE_REGS_INTR) {
+ if (filtered_sample_type & PERF_SAMPLE_REGS_INTR) {
/* regs dump ABI info */
int size = sizeof(u64);
@@ -7522,20 +7691,23 @@ void perf_prepare_sample(struct perf_event_header *header,
size += hweight64(mask) * sizeof(u64);
}
- header->size += size;
+ data->dyn_size += size;
+ data->sample_flags |= PERF_SAMPLE_REGS_INTR;
}
- if (sample_type & PERF_SAMPLE_PHYS_ADDR &&
- filtered_sample_type & PERF_SAMPLE_PHYS_ADDR)
+ if (filtered_sample_type & PERF_SAMPLE_PHYS_ADDR) {
data->phys_addr = perf_virt_to_phys(data->addr);
+ data->sample_flags |= PERF_SAMPLE_PHYS_ADDR;
+ }
#ifdef CONFIG_CGROUP_PERF
- if (sample_type & PERF_SAMPLE_CGROUP) {
+ if (filtered_sample_type & PERF_SAMPLE_CGROUP) {
struct cgroup *cgrp;
/* protected by RCU */
cgrp = task_css_check(current, perf_event_cgrp_id, 1)->cgroup;
data->cgroup = cgroup_id(cgrp);
+ data->sample_flags |= PERF_SAMPLE_CGROUP;
}
#endif
@@ -7544,16 +7716,21 @@ void perf_prepare_sample(struct perf_event_header *header,
* require PERF_SAMPLE_ADDR, kernel implicitly retrieve the data->addr,
* but the value will not dump to the userspace.
*/
- if (sample_type & PERF_SAMPLE_DATA_PAGE_SIZE)
+ if (filtered_sample_type & PERF_SAMPLE_DATA_PAGE_SIZE) {
data->data_page_size = perf_get_page_size(data->addr);
+ data->sample_flags |= PERF_SAMPLE_DATA_PAGE_SIZE;
+ }
- if (sample_type & PERF_SAMPLE_CODE_PAGE_SIZE)
+ if (filtered_sample_type & PERF_SAMPLE_CODE_PAGE_SIZE) {
data->code_page_size = perf_get_page_size(data->ip);
+ data->sample_flags |= PERF_SAMPLE_CODE_PAGE_SIZE;
+ }
- if (sample_type & PERF_SAMPLE_AUX) {
+ if (filtered_sample_type & PERF_SAMPLE_AUX) {
u64 size;
+ u16 header_size = perf_sample_data_size(data, event);
- header->size += sizeof(u64); /* size */
+ header_size += sizeof(u64); /* size */
/*
* Given the 16bit nature of header::size, an AUX sample can
@@ -7561,14 +7738,26 @@ void perf_prepare_sample(struct perf_event_header *header,
* Make sure this doesn't happen by using up to U16_MAX bytes
* per sample in total (rounded down to 8 byte boundary).
*/
- size = min_t(size_t, U16_MAX - header->size,
+ size = min_t(size_t, U16_MAX - header_size,
event->attr.aux_sample_size);
size = rounddown(size, 8);
size = perf_prepare_sample_aux(event, data, size);
- WARN_ON_ONCE(size + header->size > U16_MAX);
- header->size += size;
+ WARN_ON_ONCE(size + header_size > U16_MAX);
+ data->dyn_size += size + sizeof(u64); /* size above */
+ data->sample_flags |= PERF_SAMPLE_AUX;
}
+}
+
+void perf_prepare_header(struct perf_event_header *header,
+ struct perf_sample_data *data,
+ struct perf_event *event,
+ struct pt_regs *regs)
+{
+ header->type = PERF_RECORD_SAMPLE;
+ header->size = perf_sample_data_size(data, event);
+ header->misc = perf_misc_flags(regs);
+
/*
* If you're adding more sample types here, you likely need to do
* something about the overflowing header::size, like repurpose the
@@ -7596,7 +7785,8 @@ __perf_event_output(struct perf_event *event,
/* protect the callchain buffers */
rcu_read_lock();
- perf_prepare_sample(&header, data, event, regs);
+ perf_prepare_sample(data, event, regs);
+ perf_prepare_header(&header, data, event, regs);
err = output_begin(&handle, data, event, header.size);
if (err)
@@ -7729,7 +7919,6 @@ perf_iterate_sb(perf_iterate_f output, void *data,
struct perf_event_context *task_ctx)
{
struct perf_event_context *ctx;
- int ctxn;
rcu_read_lock();
preempt_disable();
@@ -7746,11 +7935,9 @@ perf_iterate_sb(perf_iterate_f output, void *data,
perf_iterate_sb_cpu(output, data);
- for_each_task_context_nr(ctxn) {
- ctx = rcu_dereference(current->perf_event_ctxp[ctxn]);
- if (ctx)
- perf_iterate_ctx(ctx, output, data, false);
- }
+ ctx = rcu_dereference(current->perf_event_ctxp);
+ if (ctx)
+ perf_iterate_ctx(ctx, output, data, false);
done:
preempt_enable();
rcu_read_unlock();
@@ -7792,20 +7979,17 @@ static void perf_event_addr_filters_exec(struct perf_event *event, void *data)
void perf_event_exec(void)
{
struct perf_event_context *ctx;
- int ctxn;
- for_each_task_context_nr(ctxn) {
- perf_event_enable_on_exec(ctxn);
- perf_event_remove_on_exec(ctxn);
+ ctx = perf_pin_task_context(current);
+ if (!ctx)
+ return;
- rcu_read_lock();
- ctx = rcu_dereference(current->perf_event_ctxp[ctxn]);
- if (ctx) {
- perf_iterate_ctx(ctx, perf_event_addr_filters_exec,
- NULL, true);
- }
- rcu_read_unlock();
- }
+ perf_event_enable_on_exec(ctx);
+ perf_event_remove_on_exec(ctx);
+ perf_iterate_ctx(ctx, perf_event_addr_filters_exec, NULL, true);
+
+ perf_unpin_context(ctx);
+ put_ctx(ctx);
}
struct remote_output {
@@ -7845,8 +8029,7 @@ static void __perf_event_output_stop(struct perf_event *event, void *data)
static int __perf_pmu_output_stop(void *info)
{
struct perf_event *event = info;
- struct pmu *pmu = event->ctx->pmu;
- struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
+ struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
struct remote_output ro = {
.rb = event->rb,
};
@@ -8635,7 +8818,6 @@ static void __perf_addr_filters_adjust(struct perf_event *event, void *data)
static void perf_addr_filters_adjust(struct vm_area_struct *vma)
{
struct perf_event_context *ctx;
- int ctxn;
/*
* Data tracing isn't supported yet and as such there is no need
@@ -8645,13 +8827,9 @@ static void perf_addr_filters_adjust(struct vm_area_struct *vma)
return;
rcu_read_lock();
- for_each_task_context_nr(ctxn) {
- ctx = rcu_dereference(current->perf_event_ctxp[ctxn]);
- if (!ctx)
- continue;
-
+ ctx = rcu_dereference(current->perf_event_ctxp);
+ if (ctx)
perf_iterate_ctx(ctx, __perf_addr_filters_adjust, vma, true);
- }
rcu_read_unlock();
}
@@ -9826,6 +10004,44 @@ static struct pmu perf_swevent = {
#ifdef CONFIG_EVENT_TRACING
+static void tp_perf_event_destroy(struct perf_event *event)
+{
+ perf_trace_destroy(event);
+}
+
+static int perf_tp_event_init(struct perf_event *event)
+{
+ int err;
+
+ if (event->attr.type != PERF_TYPE_TRACEPOINT)
+ return -ENOENT;
+
+ /*
+ * no branch sampling for tracepoint events
+ */
+ if (has_branch_stack(event))
+ return -EOPNOTSUPP;
+
+ err = perf_trace_init(event);
+ if (err)
+ return err;
+
+ event->destroy = tp_perf_event_destroy;
+
+ return 0;
+}
+
+static struct pmu perf_tracepoint = {
+ .task_ctx_nr = perf_sw_context,
+
+ .event_init = perf_tp_event_init,
+ .add = perf_trace_add,
+ .del = perf_trace_del,
+ .start = perf_swevent_start,
+ .stop = perf_swevent_stop,
+ .read = perf_swevent_read,
+};
+
static int perf_tp_filter_match(struct perf_event *event,
struct perf_sample_data *data)
{
@@ -9875,6 +10091,44 @@ void perf_trace_run_bpf_submit(void *raw_data, int size, int rctx,
}
EXPORT_SYMBOL_GPL(perf_trace_run_bpf_submit);
+static void __perf_tp_event_target_task(u64 count, void *record,
+ struct pt_regs *regs,
+ struct perf_sample_data *data,
+ struct perf_event *event)
+{
+ struct trace_entry *entry = record;
+
+ if (event->attr.config != entry->type)
+ return;
+ /* Cannot deliver synchronous signal to other task. */
+ if (event->attr.sigtrap)
+ return;
+ if (perf_tp_event_match(event, data, regs))
+ perf_swevent_event(event, count, data, regs);
+}
+
+static void perf_tp_event_target_task(u64 count, void *record,
+ struct pt_regs *regs,
+ struct perf_sample_data *data,
+ struct perf_event_context *ctx)
+{
+ unsigned int cpu = smp_processor_id();
+ struct pmu *pmu = &perf_tracepoint;
+ struct perf_event *event, *sibling;
+
+ perf_event_groups_for_cpu_pmu(event, &ctx->pinned_groups, cpu, pmu) {
+ __perf_tp_event_target_task(count, record, regs, data, event);
+ for_each_sibling_event(sibling, event)
+ __perf_tp_event_target_task(count, record, regs, data, sibling);
+ }
+
+ perf_event_groups_for_cpu_pmu(event, &ctx->flexible_groups, cpu, pmu) {
+ __perf_tp_event_target_task(count, record, regs, data, event);
+ for_each_sibling_event(sibling, event)
+ __perf_tp_event_target_task(count, record, regs, data, sibling);
+ }
+}
+
void perf_tp_event(u16 event_type, u64 count, void *record, int entry_size,
struct pt_regs *regs, struct hlist_head *head, int rctx,
struct task_struct *task)
@@ -9890,8 +10144,7 @@ void perf_tp_event(u16 event_type, u64 count, void *record, int entry_size,
};
perf_sample_data_init(&data, 0, 0);
- data.raw = &raw;
- data.sample_flags |= PERF_SAMPLE_RAW;
+ perf_sample_save_raw_data(&data, &raw);
perf_trace_buf_update(record, event_type);
@@ -9906,26 +10159,15 @@ void perf_tp_event(u16 event_type, u64 count, void *record, int entry_size,
*/
if (task && task != current) {
struct perf_event_context *ctx;
- struct trace_entry *entry = record;
rcu_read_lock();
- ctx = rcu_dereference(task->perf_event_ctxp[perf_sw_context]);
+ ctx = rcu_dereference(task->perf_event_ctxp);
if (!ctx)
goto unlock;
- list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
- if (event->cpu != smp_processor_id())
- continue;
- if (event->attr.type != PERF_TYPE_TRACEPOINT)
- continue;
- if (event->attr.config != entry->type)
- continue;
- /* Cannot deliver synchronous signal to other task. */
- if (event->attr.sigtrap)
- continue;
- if (perf_tp_event_match(event, &data, regs))
- perf_swevent_event(event, count, &data, regs);
- }
+ raw_spin_lock(&ctx->lock);
+ perf_tp_event_target_task(count, record, regs, &data, ctx);
+ raw_spin_unlock(&ctx->lock);
unlock:
rcu_read_unlock();
}
@@ -9934,44 +10176,6 @@ unlock:
}
EXPORT_SYMBOL_GPL(perf_tp_event);
-static void tp_perf_event_destroy(struct perf_event *event)
-{
- perf_trace_destroy(event);
-}
-
-static int perf_tp_event_init(struct perf_event *event)
-{
- int err;
-
- if (event->attr.type != PERF_TYPE_TRACEPOINT)
- return -ENOENT;
-
- /*
- * no branch sampling for tracepoint events
- */
- if (has_branch_stack(event))
- return -EOPNOTSUPP;
-
- err = perf_trace_init(event);
- if (err)
- return err;
-
- event->destroy = tp_perf_event_destroy;
-
- return 0;
-}
-
-static struct pmu perf_tracepoint = {
- .task_ctx_nr = perf_sw_context,
-
- .event_init = perf_tp_event_init,
- .add = perf_trace_add,
- .del = perf_trace_del,
- .start = perf_swevent_start,
- .stop = perf_swevent_stop,
- .read = perf_swevent_read,
-};
-
#if defined(CONFIG_KPROBE_EVENTS) || defined(CONFIG_UPROBE_EVENTS)
/*
* Flags in config, used by dynamic PMU kprobe and uprobe
@@ -10147,13 +10351,7 @@ static void bpf_overflow_handler(struct perf_event *event,
rcu_read_lock();
prog = READ_ONCE(event->prog);
if (prog) {
- if (prog->call_get_stack &&
- (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) &&
- !(data->sample_flags & PERF_SAMPLE_CALLCHAIN)) {
- data->callchain = perf_callchain(event, regs);
- data->sample_flags |= PERF_SAMPLE_CALLCHAIN;
- }
-
+ perf_prepare_sample(data, event, regs);
ret = bpf_prog_run(prog, &ctx);
}
rcu_read_unlock();
@@ -11058,36 +11256,9 @@ static int perf_event_idx_default(struct perf_event *event)
return 0;
}
-/*
- * Ensures all contexts with the same task_ctx_nr have the same
- * pmu_cpu_context too.
- */
-static struct perf_cpu_context __percpu *find_pmu_context(int ctxn)
-{
- struct pmu *pmu;
-
- if (ctxn < 0)
- return NULL;
-
- list_for_each_entry(pmu, &pmus, entry) {
- if (pmu->task_ctx_nr == ctxn)
- return pmu->pmu_cpu_context;
- }
-
- return NULL;
-}
-
static void free_pmu_context(struct pmu *pmu)
{
- /*
- * Static contexts such as perf_sw_context have a global lifetime
- * and may be shared between different PMUs. Avoid freeing them
- * when a single PMU is going away.
- */
- if (pmu->task_ctx_nr > perf_invalid_context)
- return;
-
- free_percpu(pmu->pmu_cpu_context);
+ free_percpu(pmu->cpu_pmu_context);
}
/*
@@ -11151,12 +11322,11 @@ perf_event_mux_interval_ms_store(struct device *dev,
/* update all cpuctx for this PMU */
cpus_read_lock();
for_each_online_cpu(cpu) {
- struct perf_cpu_context *cpuctx;
- cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
- cpuctx->hrtimer_interval = ns_to_ktime(NSEC_PER_MSEC * timer);
+ struct perf_cpu_pmu_context *cpc;
+ cpc = per_cpu_ptr(pmu->cpu_pmu_context, cpu);
+ cpc->hrtimer_interval = ns_to_ktime(NSEC_PER_MSEC * timer);
- cpu_function_call(cpu,
- (remote_function_f)perf_mux_hrtimer_restart, cpuctx);
+ cpu_function_call(cpu, perf_mux_hrtimer_restart_ipi, cpc);
}
cpus_read_unlock();
mutex_unlock(&mux_interval_mutex);
@@ -11193,13 +11363,15 @@ static int pmu_dev_alloc(struct pmu *pmu)
pmu->dev->groups = pmu->attr_groups;
device_initialize(pmu->dev);
- ret = dev_set_name(pmu->dev, "%s", pmu->name);
- if (ret)
- goto free_dev;
dev_set_drvdata(pmu->dev, pmu);
pmu->dev->bus = &pmu_bus;
pmu->dev->release = pmu_dev_release;
+
+ ret = dev_set_name(pmu->dev, "%s", pmu->name);
+ if (ret)
+ goto free_dev;
+
ret = device_add(pmu->dev);
if (ret)
goto free_dev;
@@ -11267,47 +11439,19 @@ int perf_pmu_register(struct pmu *pmu, const char *name, int type)
}
skip_type:
- if (pmu->task_ctx_nr == perf_hw_context) {
- static int hw_context_taken = 0;
-
- /*
- * Other than systems with heterogeneous CPUs, it never makes
- * sense for two PMUs to share perf_hw_context. PMUs which are
- * uncore must use perf_invalid_context.
- */
- if (WARN_ON_ONCE(hw_context_taken &&
- !(pmu->capabilities & PERF_PMU_CAP_HETEROGENEOUS_CPUS)))
- pmu->task_ctx_nr = perf_invalid_context;
-
- hw_context_taken = 1;
- }
-
- pmu->pmu_cpu_context = find_pmu_context(pmu->task_ctx_nr);
- if (pmu->pmu_cpu_context)
- goto got_cpu_context;
-
ret = -ENOMEM;
- pmu->pmu_cpu_context = alloc_percpu(struct perf_cpu_context);
- if (!pmu->pmu_cpu_context)
+ pmu->cpu_pmu_context = alloc_percpu(struct perf_cpu_pmu_context);
+ if (!pmu->cpu_pmu_context)
goto free_dev;
for_each_possible_cpu(cpu) {
- struct perf_cpu_context *cpuctx;
-
- cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
- __perf_event_init_context(&cpuctx->ctx);
- lockdep_set_class(&cpuctx->ctx.mutex, &cpuctx_mutex);
- lockdep_set_class(&cpuctx->ctx.lock, &cpuctx_lock);
- cpuctx->ctx.pmu = pmu;
- cpuctx->online = cpumask_test_cpu(cpu, perf_online_mask);
+ struct perf_cpu_pmu_context *cpc;
- __perf_mux_hrtimer_init(cpuctx, cpu);
-
- cpuctx->heap_size = ARRAY_SIZE(cpuctx->heap_default);
- cpuctx->heap = cpuctx->heap_default;
+ cpc = per_cpu_ptr(pmu->cpu_pmu_context, cpu);
+ __perf_init_event_pmu_context(&cpc->epc, pmu);
+ __perf_mux_hrtimer_init(cpc, cpu);
}
-got_cpu_context:
if (!pmu->start_txn) {
if (pmu->pmu_enable) {
/*
@@ -11540,15 +11684,6 @@ static void account_pmu_sb_event(struct perf_event *event)
attach_sb_event(event);
}
-static void account_event_cpu(struct perf_event *event, int cpu)
-{
- if (event->parent)
- return;
-
- if (is_cgroup_event(event))
- atomic_inc(&per_cpu(perf_cgroup_events, cpu));
-}
-
/* Freq events need the tick to stay alive (see perf_event_task_tick). */
static void account_freq_event_nohz(void)
{
@@ -11636,8 +11771,6 @@ static void account_event(struct perf_event *event)
}
enabled:
- account_event_cpu(event, event->cpu);
-
account_pmu_sb_event(event);
}
@@ -11786,10 +11919,11 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
}
/*
- * Disallow uncore-cgroup events, they don't make sense as the cgroup will
- * be different on other CPUs in the uncore mask.
+ * Disallow uncore-task events. Similarly, disallow uncore-cgroup
+ * events (they don't make sense as the cgroup will be different
+ * on other CPUs in the uncore mask).
*/
- if (pmu->task_ctx_nr == perf_invalid_context && cgroup_fd != -1) {
+ if (pmu->task_ctx_nr == perf_invalid_context && (task || cgroup_fd != -1)) {
err = -EINVAL;
goto err_pmu;
}
@@ -12136,37 +12270,6 @@ static int perf_event_set_clock(struct perf_event *event, clockid_t clk_id)
return 0;
}
-/*
- * Variation on perf_event_ctx_lock_nested(), except we take two context
- * mutexes.
- */
-static struct perf_event_context *
-__perf_event_ctx_lock_double(struct perf_event *group_leader,
- struct perf_event_context *ctx)
-{
- struct perf_event_context *gctx;
-
-again:
- rcu_read_lock();
- gctx = READ_ONCE(group_leader->ctx);
- if (!refcount_inc_not_zero(&gctx->refcount)) {
- rcu_read_unlock();
- goto again;
- }
- rcu_read_unlock();
-
- mutex_lock_double(&gctx->mutex, &ctx->mutex);
-
- if (group_leader->ctx != gctx) {
- mutex_unlock(&ctx->mutex);
- mutex_unlock(&gctx->mutex);
- put_ctx(gctx);
- goto again;
- }
-
- return gctx;
-}
-
static bool
perf_check_permission(struct perf_event_attr *attr, struct task_struct *task)
{
@@ -12212,9 +12315,10 @@ SYSCALL_DEFINE5(perf_event_open,
pid_t, pid, int, cpu, int, group_fd, unsigned long, flags)
{
struct perf_event *group_leader = NULL, *output_event = NULL;
+ struct perf_event_pmu_context *pmu_ctx;
struct perf_event *event, *sibling;
struct perf_event_attr attr;
- struct perf_event_context *ctx, *gctx;
+ struct perf_event_context *ctx;
struct file *event_file = NULL;
struct fd group = {NULL, 0};
struct task_struct *task = NULL;
@@ -12229,12 +12333,12 @@ SYSCALL_DEFINE5(perf_event_open,
if (flags & ~PERF_FLAG_ALL)
return -EINVAL;
- /* Do we allow access to perf_event_open(2) ? */
- err = security_perf_event_open(&attr, PERF_SECURITY_OPEN);
+ err = perf_copy_attr(attr_uptr, &attr);
if (err)
return err;
- err = perf_copy_attr(attr_uptr, &attr);
+ /* Do we allow access to perf_event_open(2) ? */
+ err = security_perf_event_open(&attr, PERF_SECURITY_OPEN);
if (err)
return err;
@@ -12344,42 +12448,53 @@ SYSCALL_DEFINE5(perf_event_open,
if (pmu->task_ctx_nr == perf_sw_context)
event->event_caps |= PERF_EV_CAP_SOFTWARE;
- if (group_leader) {
- if (is_software_event(event) &&
- !in_software_context(group_leader)) {
- /*
- * If the event is a sw event, but the group_leader
- * is on hw context.
- *
- * Allow the addition of software events to hw
- * groups, this is safe because software events
- * never fail to schedule.
- */
- pmu = group_leader->ctx->pmu;
- } else if (!is_software_event(event) &&
- is_software_event(group_leader) &&
- (group_leader->group_caps & PERF_EV_CAP_SOFTWARE)) {
- /*
- * In case the group is a pure software group, and we
- * try to add a hardware event, move the whole group to
- * the hardware context.
- */
- move_group = 1;
- }
+ if (task) {
+ err = down_read_interruptible(&task->signal->exec_update_lock);
+ if (err)
+ goto err_alloc;
+
+ /*
+ * We must hold exec_update_lock across this and any potential
+ * perf_install_in_context() call for this new event to
+ * serialize against exec() altering our credentials (and the
+ * perf_event_exit_task() that could imply).
+ */
+ err = -EACCES;
+ if (!perf_check_permission(&attr, task))
+ goto err_cred;
}
/*
* Get the target context (task or percpu):
*/
- ctx = find_get_context(pmu, task, event);
+ ctx = find_get_context(task, event);
if (IS_ERR(ctx)) {
err = PTR_ERR(ctx);
- goto err_alloc;
+ goto err_cred;
+ }
+
+ mutex_lock(&ctx->mutex);
+
+ if (ctx->task == TASK_TOMBSTONE) {
+ err = -ESRCH;
+ goto err_locked;
+ }
+
+ if (!task) {
+ /*
+ * Check if the @cpu we're creating an event for is online.
+ *
+ * We use the perf_cpu_context::ctx::mutex to serialize against
+ * the hotplug notifiers. See perf_event_{init,exit}_cpu().
+ */
+ struct perf_cpu_context *cpuctx = per_cpu_ptr(&perf_cpu_context, event->cpu);
+
+ if (!cpuctx->online) {
+ err = -ENODEV;
+ goto err_locked;
+ }
}
- /*
- * Look up the group leader (we will attach this event to it):
- */
if (group_leader) {
err = -EINVAL;
@@ -12388,11 +12503,11 @@ SYSCALL_DEFINE5(perf_event_open,
* becoming part of another group-sibling):
*/
if (group_leader->group_leader != group_leader)
- goto err_context;
+ goto err_locked;
/* All events in a group should have the same clock */
if (group_leader->clock != event->clock)
- goto err_context;
+ goto err_locked;
/*
* Make sure we're both events for the same CPU;
@@ -12400,145 +12515,76 @@ SYSCALL_DEFINE5(perf_event_open,
* you can never concurrently schedule them anyhow.
*/
if (group_leader->cpu != event->cpu)
- goto err_context;
+ goto err_locked;
/*
- * Make sure we're both on the same task, or both
- * per-CPU events.
+ * Make sure we're both on the same context; either task or cpu.
*/
- if (group_leader->ctx->task != ctx->task)
- goto err_context;
-
- /*
- * Do not allow to attach to a group in a different task
- * or CPU context. If we're moving SW events, we'll fix
- * this up later, so allow that.
- *
- * Racy, not holding group_leader->ctx->mutex, see comment with
- * perf_event_ctx_lock().
- */
- if (!move_group && group_leader->ctx != ctx)
- goto err_context;
+ if (group_leader->ctx != ctx)
+ goto err_locked;
/*
* Only a group leader can be exclusive or pinned
*/
if (attr.exclusive || attr.pinned)
- goto err_context;
- }
-
- if (output_event) {
- err = perf_event_set_output(event, output_event);
- if (err)
- goto err_context;
- }
-
- event_file = anon_inode_getfile("[perf_event]", &perf_fops, event,
- f_flags);
- if (IS_ERR(event_file)) {
- err = PTR_ERR(event_file);
- event_file = NULL;
- goto err_context;
- }
-
- if (task) {
- err = down_read_interruptible(&task->signal->exec_update_lock);
- if (err)
- goto err_file;
-
- /*
- * We must hold exec_update_lock across this and any potential
- * perf_install_in_context() call for this new event to
- * serialize against exec() altering our credentials (and the
- * perf_event_exit_task() that could imply).
- */
- err = -EACCES;
- if (!perf_check_permission(&attr, task))
- goto err_cred;
- }
-
- if (move_group) {
- gctx = __perf_event_ctx_lock_double(group_leader, ctx);
-
- if (gctx->task == TASK_TOMBSTONE) {
- err = -ESRCH;
goto err_locked;
- }
- /*
- * Check if we raced against another sys_perf_event_open() call
- * moving the software group underneath us.
- */
- if (!(group_leader->group_caps & PERF_EV_CAP_SOFTWARE)) {
+ if (is_software_event(event) &&
+ !in_software_context(group_leader)) {
/*
- * If someone moved the group out from under us, check
- * if this new event wound up on the same ctx, if so
- * its the regular !move_group case, otherwise fail.
+ * If the event is a sw event, but the group_leader
+ * is on hw context.
+ *
+ * Allow the addition of software events to hw
+ * groups, this is safe because software events
+ * never fail to schedule.
+ *
+ * Note the comment that goes with struct
+ * perf_event_pmu_context.
*/
- if (gctx != ctx) {
- err = -EINVAL;
- goto err_locked;
- } else {
- perf_event_ctx_unlock(group_leader, gctx);
- move_group = 0;
- goto not_move_group;
+ pmu = group_leader->pmu_ctx->pmu;
+ } else if (!is_software_event(event)) {
+ if (is_software_event(group_leader) &&
+ (group_leader->group_caps & PERF_EV_CAP_SOFTWARE)) {
+ /*
+ * In case the group is a pure software group, and we
+ * try to add a hardware event, move the whole group to
+ * the hardware context.
+ */
+ move_group = 1;
}
- }
- /*
- * Failure to create exclusive events returns -EBUSY.
- */
- err = -EBUSY;
- if (!exclusive_event_installable(group_leader, ctx))
- goto err_locked;
-
- for_each_sibling_event(sibling, group_leader) {
- if (!exclusive_event_installable(sibling, ctx))
+ /* Don't allow group of multiple hw events from different pmus */
+ if (!in_software_context(group_leader) &&
+ group_leader->pmu_ctx->pmu != pmu)
goto err_locked;
}
- } else {
- mutex_lock(&ctx->mutex);
-
- /*
- * Now that we hold ctx->lock, (re)validate group_leader->ctx == ctx,
- * see the group_leader && !move_group test earlier.
- */
- if (group_leader && group_leader->ctx != ctx) {
- err = -EINVAL;
- goto err_locked;
- }
}
-not_move_group:
- if (ctx->task == TASK_TOMBSTONE) {
- err = -ESRCH;
+ /*
+ * Now that we're certain of the pmu; find the pmu_ctx.
+ */
+ pmu_ctx = find_get_pmu_context(pmu, ctx, event);
+ if (IS_ERR(pmu_ctx)) {
+ err = PTR_ERR(pmu_ctx);
goto err_locked;
}
+ event->pmu_ctx = pmu_ctx;
- if (!perf_event_validate_size(event)) {
- err = -E2BIG;
- goto err_locked;
+ if (output_event) {
+ err = perf_event_set_output(event, output_event);
+ if (err)
+ goto err_context;
}
- if (!task) {
- /*
- * Check if the @cpu we're creating an event for is online.
- *
- * We use the perf_cpu_context::ctx::mutex to serialize against
- * the hotplug notifiers. See perf_event_{init,exit}_cpu().
- */
- struct perf_cpu_context *cpuctx =
- container_of(ctx, struct perf_cpu_context, ctx);
-
- if (!cpuctx->online) {
- err = -ENODEV;
- goto err_locked;
- }
+ if (!perf_event_validate_size(event)) {
+ err = -E2BIG;
+ goto err_context;
}
if (perf_need_aux_event(event) && !perf_get_aux_event(event, group_leader)) {
err = -EINVAL;
- goto err_locked;
+ goto err_context;
}
/*
@@ -12547,36 +12593,33 @@ not_move_group:
*/
if (!exclusive_event_installable(event, ctx)) {
err = -EBUSY;
- goto err_locked;
+ goto err_context;
}
WARN_ON_ONCE(ctx->parent_ctx);
+ event_file = anon_inode_getfile("[perf_event]", &perf_fops, event, f_flags);
+ if (IS_ERR(event_file)) {
+ err = PTR_ERR(event_file);
+ event_file = NULL;
+ goto err_context;
+ }
+
/*
* This is the point on no return; we cannot fail hereafter. This is
* where we start modifying current state.
*/
if (move_group) {
- /*
- * See perf_event_ctx_lock() for comments on the details
- * of swizzling perf_event::ctx.
- */
perf_remove_from_context(group_leader, 0);
- put_ctx(gctx);
+ put_pmu_ctx(group_leader->pmu_ctx);
for_each_sibling_event(sibling, group_leader) {
perf_remove_from_context(sibling, 0);
- put_ctx(gctx);
+ put_pmu_ctx(sibling->pmu_ctx);
}
/*
- * Wait for everybody to stop referencing the events through
- * the old lists, before installing it on new lists.
- */
- synchronize_rcu();
-
- /*
* Install the group siblings before the group leader.
*
* Because a group leader will try and install the entire group
@@ -12587,9 +12630,10 @@ not_move_group:
* reachable through the group lists.
*/
for_each_sibling_event(sibling, group_leader) {
+ sibling->pmu_ctx = pmu_ctx;
+ get_pmu_ctx(pmu_ctx);
perf_event__state_init(sibling);
perf_install_in_context(ctx, sibling, sibling->cpu);
- get_ctx(ctx);
}
/*
@@ -12597,9 +12641,10 @@ not_move_group:
* event. What we want here is event in the initial
* startup state, ready to be add into new context.
*/
+ group_leader->pmu_ctx = pmu_ctx;
+ get_pmu_ctx(pmu_ctx);
perf_event__state_init(group_leader);
perf_install_in_context(ctx, group_leader, group_leader->cpu);
- get_ctx(ctx);
}
/*
@@ -12616,8 +12661,6 @@ not_move_group:
perf_install_in_context(ctx, event, event->cpu);
perf_unpin_context(ctx);
- if (move_group)
- perf_event_ctx_unlock(group_leader, gctx);
mutex_unlock(&ctx->mutex);
if (task) {
@@ -12639,25 +12682,18 @@ not_move_group:
fd_install(event_fd, event_file);
return event_fd;
+err_context:
+ put_pmu_ctx(event->pmu_ctx);
+ event->pmu_ctx = NULL; /* _free_event() */
err_locked:
- if (move_group)
- perf_event_ctx_unlock(group_leader, gctx);
mutex_unlock(&ctx->mutex);
+ perf_unpin_context(ctx);
+ put_ctx(ctx);
err_cred:
if (task)
up_read(&task->signal->exec_update_lock);
-err_file:
- fput(event_file);
-err_context:
- perf_unpin_context(ctx);
- put_ctx(ctx);
err_alloc:
- /*
- * If event_file is set, the fput() above will have called ->release()
- * and that will take care of freeing the event.
- */
- if (!event_file)
- free_event(event);
+ free_event(event);
err_task:
if (task)
put_task_struct(task);
@@ -12683,8 +12719,10 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu,
perf_overflow_handler_t overflow_handler,
void *context)
{
+ struct perf_event_pmu_context *pmu_ctx;
struct perf_event_context *ctx;
struct perf_event *event;
+ struct pmu *pmu;
int err;
/*
@@ -12703,14 +12741,18 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu,
/* Mark owner so we could distinguish it from user events. */
event->owner = TASK_TOMBSTONE;
+ pmu = event->pmu;
+
+ if (pmu->task_ctx_nr == perf_sw_context)
+ event->event_caps |= PERF_EV_CAP_SOFTWARE;
/*
* Get the target context (task or percpu):
*/
- ctx = find_get_context(event->pmu, task, event);
+ ctx = find_get_context(task, event);
if (IS_ERR(ctx)) {
err = PTR_ERR(ctx);
- goto err_free;
+ goto err_alloc;
}
WARN_ON_ONCE(ctx->parent_ctx);
@@ -12720,6 +12762,13 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu,
goto err_unlock;
}
+ pmu_ctx = find_get_pmu_context(pmu, ctx, event);
+ if (IS_ERR(pmu_ctx)) {
+ err = PTR_ERR(pmu_ctx);
+ goto err_unlock;
+ }
+ event->pmu_ctx = pmu_ctx;
+
if (!task) {
/*
* Check if the @cpu we're creating an event for is online.
@@ -12731,13 +12780,13 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu,
container_of(ctx, struct perf_cpu_context, ctx);
if (!cpuctx->online) {
err = -ENODEV;
- goto err_unlock;
+ goto err_pmu_ctx;
}
}
if (!exclusive_event_installable(event, ctx)) {
err = -EBUSY;
- goto err_unlock;
+ goto err_pmu_ctx;
}
perf_install_in_context(ctx, event, event->cpu);
@@ -12746,44 +12795,59 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu,
return event;
+err_pmu_ctx:
+ put_pmu_ctx(pmu_ctx);
+ event->pmu_ctx = NULL; /* _free_event() */
err_unlock:
mutex_unlock(&ctx->mutex);
perf_unpin_context(ctx);
put_ctx(ctx);
-err_free:
+err_alloc:
free_event(event);
err:
return ERR_PTR(err);
}
EXPORT_SYMBOL_GPL(perf_event_create_kernel_counter);
-void perf_pmu_migrate_context(struct pmu *pmu, int src_cpu, int dst_cpu)
+static void __perf_pmu_remove(struct perf_event_context *ctx,
+ int cpu, struct pmu *pmu,
+ struct perf_event_groups *groups,
+ struct list_head *events)
{
- struct perf_event_context *src_ctx;
- struct perf_event_context *dst_ctx;
- struct perf_event *event, *tmp;
- LIST_HEAD(events);
-
- src_ctx = &per_cpu_ptr(pmu->pmu_cpu_context, src_cpu)->ctx;
- dst_ctx = &per_cpu_ptr(pmu->pmu_cpu_context, dst_cpu)->ctx;
+ struct perf_event *event, *sibling;
- /*
- * See perf_event_ctx_lock() for comments on the details
- * of swizzling perf_event::ctx.
- */
- mutex_lock_double(&src_ctx->mutex, &dst_ctx->mutex);
- list_for_each_entry_safe(event, tmp, &src_ctx->event_list,
- event_entry) {
+ perf_event_groups_for_cpu_pmu(event, groups, cpu, pmu) {
perf_remove_from_context(event, 0);
- unaccount_event_cpu(event, src_cpu);
- put_ctx(src_ctx);
- list_add(&event->migrate_entry, &events);
+ put_pmu_ctx(event->pmu_ctx);
+ list_add(&event->migrate_entry, events);
+
+ for_each_sibling_event(sibling, event) {
+ perf_remove_from_context(sibling, 0);
+ put_pmu_ctx(sibling->pmu_ctx);
+ list_add(&sibling->migrate_entry, events);
+ }
}
+}
- /*
- * Wait for the events to quiesce before re-instating them.
- */
- synchronize_rcu();
+static void __perf_pmu_install_event(struct pmu *pmu,
+ struct perf_event_context *ctx,
+ int cpu, struct perf_event *event)
+{
+ struct perf_event_pmu_context *epc;
+
+ event->cpu = cpu;
+ epc = find_get_pmu_context(pmu, ctx, event);
+ event->pmu_ctx = epc;
+
+ if (event->state >= PERF_EVENT_STATE_OFF)
+ event->state = PERF_EVENT_STATE_INACTIVE;
+ perf_install_in_context(ctx, event, cpu);
+}
+
+static void __perf_pmu_install(struct perf_event_context *ctx,
+ int cpu, struct pmu *pmu, struct list_head *events)
+{
+ struct perf_event *event, *tmp;
/*
* Re-instate events in 2 passes.
@@ -12793,30 +12857,48 @@ void perf_pmu_migrate_context(struct pmu *pmu, int src_cpu, int dst_cpu)
* leader will enable its siblings, even if those are still on the old
* context.
*/
- list_for_each_entry_safe(event, tmp, &events, migrate_entry) {
+ list_for_each_entry_safe(event, tmp, events, migrate_entry) {
if (event->group_leader == event)
continue;
list_del(&event->migrate_entry);
- if (event->state >= PERF_EVENT_STATE_OFF)
- event->state = PERF_EVENT_STATE_INACTIVE;
- account_event_cpu(event, dst_cpu);
- perf_install_in_context(dst_ctx, event, dst_cpu);
- get_ctx(dst_ctx);
+ __perf_pmu_install_event(pmu, ctx, cpu, event);
}
/*
* Once all the siblings are setup properly, install the group leaders
* to make it go.
*/
- list_for_each_entry_safe(event, tmp, &events, migrate_entry) {
+ list_for_each_entry_safe(event, tmp, events, migrate_entry) {
list_del(&event->migrate_entry);
- if (event->state >= PERF_EVENT_STATE_OFF)
- event->state = PERF_EVENT_STATE_INACTIVE;
- account_event_cpu(event, dst_cpu);
- perf_install_in_context(dst_ctx, event, dst_cpu);
- get_ctx(dst_ctx);
+ __perf_pmu_install_event(pmu, ctx, cpu, event);
}
+}
+
+void perf_pmu_migrate_context(struct pmu *pmu, int src_cpu, int dst_cpu)
+{
+ struct perf_event_context *src_ctx, *dst_ctx;
+ LIST_HEAD(events);
+
+ src_ctx = &per_cpu_ptr(&perf_cpu_context, src_cpu)->ctx;
+ dst_ctx = &per_cpu_ptr(&perf_cpu_context, dst_cpu)->ctx;
+
+ /*
+ * See perf_event_ctx_lock() for comments on the details
+ * of swizzling perf_event::ctx.
+ */
+ mutex_lock_double(&src_ctx->mutex, &dst_ctx->mutex);
+
+ __perf_pmu_remove(src_ctx, src_cpu, pmu, &src_ctx->pinned_groups, &events);
+ __perf_pmu_remove(src_ctx, src_cpu, pmu, &src_ctx->flexible_groups, &events);
+
+ /*
+ * Wait for the events to quiesce before re-instating them.
+ */
+ synchronize_rcu();
+
+ __perf_pmu_install(dst_ctx, dst_cpu, pmu, &events);
+
mutex_unlock(&dst_ctx->mutex);
mutex_unlock(&src_ctx->mutex);
}
@@ -12896,14 +12978,14 @@ perf_event_exit_event(struct perf_event *event, struct perf_event_context *ctx)
perf_event_wakeup(event);
}
-static void perf_event_exit_task_context(struct task_struct *child, int ctxn)
+static void perf_event_exit_task_context(struct task_struct *child)
{
struct perf_event_context *child_ctx, *clone_ctx = NULL;
struct perf_event *child_event, *next;
WARN_ON_ONCE(child != current);
- child_ctx = perf_pin_task_context(child, ctxn);
+ child_ctx = perf_pin_task_context(child);
if (!child_ctx)
return;
@@ -12925,13 +13007,13 @@ static void perf_event_exit_task_context(struct task_struct *child, int ctxn)
* in.
*/
raw_spin_lock_irq(&child_ctx->lock);
- task_ctx_sched_out(__get_cpu_context(child_ctx), child_ctx, EVENT_ALL);
+ task_ctx_sched_out(child_ctx, EVENT_ALL);
/*
* Now that the context is inactive, destroy the task <-> ctx relation
* and mark the context dead.
*/
- RCU_INIT_POINTER(child->perf_event_ctxp[ctxn], NULL);
+ RCU_INIT_POINTER(child->perf_event_ctxp, NULL);
put_ctx(child_ctx); /* cannot be last */
WRITE_ONCE(child_ctx->task, TASK_TOMBSTONE);
put_task_struct(current); /* cannot be last */
@@ -12966,7 +13048,6 @@ static void perf_event_exit_task_context(struct task_struct *child, int ctxn)
void perf_event_exit_task(struct task_struct *child)
{
struct perf_event *event, *tmp;
- int ctxn;
mutex_lock(&child->perf_event_mutex);
list_for_each_entry_safe(event, tmp, &child->perf_event_list,
@@ -12982,8 +13063,7 @@ void perf_event_exit_task(struct task_struct *child)
}
mutex_unlock(&child->perf_event_mutex);
- for_each_task_context_nr(ctxn)
- perf_event_exit_task_context(child, ctxn);
+ perf_event_exit_task_context(child);
/*
* The perf_event_exit_task_context calls perf_event_task
@@ -13026,56 +13106,51 @@ void perf_event_free_task(struct task_struct *task)
{
struct perf_event_context *ctx;
struct perf_event *event, *tmp;
- int ctxn;
- for_each_task_context_nr(ctxn) {
- ctx = task->perf_event_ctxp[ctxn];
- if (!ctx)
- continue;
+ ctx = rcu_access_pointer(task->perf_event_ctxp);
+ if (!ctx)
+ return;
- mutex_lock(&ctx->mutex);
- raw_spin_lock_irq(&ctx->lock);
- /*
- * Destroy the task <-> ctx relation and mark the context dead.
- *
- * This is important because even though the task hasn't been
- * exposed yet the context has been (through child_list).
- */
- RCU_INIT_POINTER(task->perf_event_ctxp[ctxn], NULL);
- WRITE_ONCE(ctx->task, TASK_TOMBSTONE);
- put_task_struct(task); /* cannot be last */
- raw_spin_unlock_irq(&ctx->lock);
+ mutex_lock(&ctx->mutex);
+ raw_spin_lock_irq(&ctx->lock);
+ /*
+ * Destroy the task <-> ctx relation and mark the context dead.
+ *
+ * This is important because even though the task hasn't been
+ * exposed yet the context has been (through child_list).
+ */
+ RCU_INIT_POINTER(task->perf_event_ctxp, NULL);
+ WRITE_ONCE(ctx->task, TASK_TOMBSTONE);
+ put_task_struct(task); /* cannot be last */
+ raw_spin_unlock_irq(&ctx->lock);
- list_for_each_entry_safe(event, tmp, &ctx->event_list, event_entry)
- perf_free_event(event, ctx);
- mutex_unlock(&ctx->mutex);
+ list_for_each_entry_safe(event, tmp, &ctx->event_list, event_entry)
+ perf_free_event(event, ctx);
- /*
- * perf_event_release_kernel() could've stolen some of our
- * child events and still have them on its free_list. In that
- * case we must wait for these events to have been freed (in
- * particular all their references to this task must've been
- * dropped).
- *
- * Without this copy_process() will unconditionally free this
- * task (irrespective of its reference count) and
- * _free_event()'s put_task_struct(event->hw.target) will be a
- * use-after-free.
- *
- * Wait for all events to drop their context reference.
- */
- wait_var_event(&ctx->refcount, refcount_read(&ctx->refcount) == 1);
- put_ctx(ctx); /* must be last */
- }
+ mutex_unlock(&ctx->mutex);
+
+ /*
+ * perf_event_release_kernel() could've stolen some of our
+ * child events and still have them on its free_list. In that
+ * case we must wait for these events to have been freed (in
+ * particular all their references to this task must've been
+ * dropped).
+ *
+ * Without this copy_process() will unconditionally free this
+ * task (irrespective of its reference count) and
+ * _free_event()'s put_task_struct(event->hw.target) will be a
+ * use-after-free.
+ *
+ * Wait for all events to drop their context reference.
+ */
+ wait_var_event(&ctx->refcount, refcount_read(&ctx->refcount) == 1);
+ put_ctx(ctx); /* must be last */
}
void perf_event_delayed_put(struct task_struct *task)
{
- int ctxn;
-
- for_each_task_context_nr(ctxn)
- WARN_ON_ONCE(task->perf_event_ctxp[ctxn]);
+ WARN_ON_ONCE(task->perf_event_ctxp);
}
struct file *perf_event_get(unsigned int fd)
@@ -13125,6 +13200,7 @@ inherit_event(struct perf_event *parent_event,
struct perf_event_context *child_ctx)
{
enum perf_event_state parent_state = parent_event->state;
+ struct perf_event_pmu_context *pmu_ctx;
struct perf_event *child_event;
unsigned long flags;
@@ -13145,17 +13221,12 @@ inherit_event(struct perf_event *parent_event,
if (IS_ERR(child_event))
return child_event;
-
- if ((child_event->attach_state & PERF_ATTACH_TASK_DATA) &&
- !child_ctx->task_ctx_data) {
- struct pmu *pmu = child_event->pmu;
-
- child_ctx->task_ctx_data = alloc_task_ctx_data(pmu);
- if (!child_ctx->task_ctx_data) {
- free_event(child_event);
- return ERR_PTR(-ENOMEM);
- }
+ pmu_ctx = find_get_pmu_context(child_event->pmu, child_ctx, child_event);
+ if (IS_ERR(pmu_ctx)) {
+ free_event(child_event);
+ return ERR_CAST(pmu_ctx);
}
+ child_event->pmu_ctx = pmu_ctx;
/*
* is_orphaned_event() and list_add_tail(&parent_event->child_list)
@@ -13278,11 +13349,11 @@ static int inherit_group(struct perf_event *parent_event,
static int
inherit_task_group(struct perf_event *event, struct task_struct *parent,
struct perf_event_context *parent_ctx,
- struct task_struct *child, int ctxn,
+ struct task_struct *child,
u64 clone_flags, int *inherited_all)
{
- int ret;
struct perf_event_context *child_ctx;
+ int ret;
if (!event->attr.inherit ||
(event->attr.inherit_thread && !(clone_flags & CLONE_THREAD)) ||
@@ -13292,7 +13363,7 @@ inherit_task_group(struct perf_event *event, struct task_struct *parent,
return 0;
}
- child_ctx = child->perf_event_ctxp[ctxn];
+ child_ctx = child->perf_event_ctxp;
if (!child_ctx) {
/*
* This is executed from the parent task context, so
@@ -13300,16 +13371,14 @@ inherit_task_group(struct perf_event *event, struct task_struct *parent,
* First allocate and initialize a context for the
* child.
*/
- child_ctx = alloc_perf_context(parent_ctx->pmu, child);
+ child_ctx = alloc_perf_context(child);
if (!child_ctx)
return -ENOMEM;
- child->perf_event_ctxp[ctxn] = child_ctx;
+ child->perf_event_ctxp = child_ctx;
}
- ret = inherit_group(event, parent, parent_ctx,
- child, child_ctx);
-
+ ret = inherit_group(event, parent, parent_ctx, child, child_ctx);
if (ret)
*inherited_all = 0;
@@ -13319,8 +13388,7 @@ inherit_task_group(struct perf_event *event, struct task_struct *parent,
/*
* Initialize the perf_event context in task_struct
*/
-static int perf_event_init_context(struct task_struct *child, int ctxn,
- u64 clone_flags)
+static int perf_event_init_context(struct task_struct *child, u64 clone_flags)
{
struct perf_event_context *child_ctx, *parent_ctx;
struct perf_event_context *cloned_ctx;
@@ -13330,14 +13398,14 @@ static int perf_event_init_context(struct task_struct *child, int ctxn,
unsigned long flags;
int ret = 0;
- if (likely(!parent->perf_event_ctxp[ctxn]))
+ if (likely(!parent->perf_event_ctxp))
return 0;
/*
* If the parent's context is a clone, pin it so it won't get
* swapped under us.
*/
- parent_ctx = perf_pin_task_context(parent, ctxn);
+ parent_ctx = perf_pin_task_context(parent);
if (!parent_ctx)
return 0;
@@ -13360,8 +13428,7 @@ static int perf_event_init_context(struct task_struct *child, int ctxn,
*/
perf_event_groups_for_each(event, &parent_ctx->pinned_groups) {
ret = inherit_task_group(event, parent, parent_ctx,
- child, ctxn, clone_flags,
- &inherited_all);
+ child, clone_flags, &inherited_all);
if (ret)
goto out_unlock;
}
@@ -13377,8 +13444,7 @@ static int perf_event_init_context(struct task_struct *child, int ctxn,
perf_event_groups_for_each(event, &parent_ctx->flexible_groups) {
ret = inherit_task_group(event, parent, parent_ctx,
- child, ctxn, clone_flags,
- &inherited_all);
+ child, clone_flags, &inherited_all);
if (ret)
goto out_unlock;
}
@@ -13386,7 +13452,7 @@ static int perf_event_init_context(struct task_struct *child, int ctxn,
raw_spin_lock_irqsave(&parent_ctx->lock, flags);
parent_ctx->rotate_disable = 0;
- child_ctx = child->perf_event_ctxp[ctxn];
+ child_ctx = child->perf_event_ctxp;
if (child_ctx && inherited_all) {
/*
@@ -13422,18 +13488,16 @@ out_unlock:
*/
int perf_event_init_task(struct task_struct *child, u64 clone_flags)
{
- int ctxn, ret;
+ int ret;
- memset(child->perf_event_ctxp, 0, sizeof(child->perf_event_ctxp));
+ child->perf_event_ctxp = NULL;
mutex_init(&child->perf_event_mutex);
INIT_LIST_HEAD(&child->perf_event_list);
- for_each_task_context_nr(ctxn) {
- ret = perf_event_init_context(child, ctxn, clone_flags);
- if (ret) {
- perf_event_free_task(child);
- return ret;
- }
+ ret = perf_event_init_context(child, clone_flags);
+ if (ret) {
+ perf_event_free_task(child);
+ return ret;
}
return 0;
@@ -13442,6 +13506,7 @@ int perf_event_init_task(struct task_struct *child, u64 clone_flags)
static void __init perf_event_init_all_cpus(void)
{
struct swevent_htable *swhash;
+ struct perf_cpu_context *cpuctx;
int cpu;
zalloc_cpumask_var(&perf_online_mask, GFP_KERNEL);
@@ -13449,15 +13514,19 @@ static void __init perf_event_init_all_cpus(void)
for_each_possible_cpu(cpu) {
swhash = &per_cpu(swevent_htable, cpu);
mutex_init(&swhash->hlist_mutex);
- INIT_LIST_HEAD(&per_cpu(active_ctx_list, cpu));
INIT_LIST_HEAD(&per_cpu(pmu_sb_events.list, cpu));
raw_spin_lock_init(&per_cpu(pmu_sb_events.lock, cpu));
-#ifdef CONFIG_CGROUP_PERF
- INIT_LIST_HEAD(&per_cpu(cgrp_cpuctx_list, cpu));
-#endif
INIT_LIST_HEAD(&per_cpu(sched_cb_list, cpu));
+
+ cpuctx = per_cpu_ptr(&perf_cpu_context, cpu);
+ __perf_event_init_context(&cpuctx->ctx);
+ lockdep_set_class(&cpuctx->ctx.mutex, &cpuctx_mutex);
+ lockdep_set_class(&cpuctx->ctx.lock, &cpuctx_lock);
+ cpuctx->online = cpumask_test_cpu(cpu, perf_online_mask);
+ cpuctx->heap_size = ARRAY_SIZE(cpuctx->heap_default);
+ cpuctx->heap = cpuctx->heap_default;
}
}
@@ -13479,12 +13548,12 @@ static void perf_swevent_init_cpu(unsigned int cpu)
#if defined CONFIG_HOTPLUG_CPU || defined CONFIG_KEXEC_CORE
static void __perf_event_exit_context(void *__info)
{
+ struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
struct perf_event_context *ctx = __info;
- struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
struct perf_event *event;
raw_spin_lock(&ctx->lock);
- ctx_sched_out(ctx, cpuctx, EVENT_TIME);
+ ctx_sched_out(ctx, EVENT_TIME);
list_for_each_entry(event, &ctx->event_list, event_entry)
__perf_remove_from_context(event, cpuctx, ctx, (void *)DETACH_GROUP);
raw_spin_unlock(&ctx->lock);
@@ -13494,18 +13563,16 @@ static void perf_event_exit_cpu_context(int cpu)
{
struct perf_cpu_context *cpuctx;
struct perf_event_context *ctx;
- struct pmu *pmu;
+ // XXX simplify cpuctx->online
mutex_lock(&pmus_lock);
- list_for_each_entry(pmu, &pmus, entry) {
- cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
- ctx = &cpuctx->ctx;
+ cpuctx = per_cpu_ptr(&perf_cpu_context, cpu);
+ ctx = &cpuctx->ctx;
- mutex_lock(&ctx->mutex);
- smp_call_function_single(cpu, __perf_event_exit_context, ctx, 1);
- cpuctx->online = 0;
- mutex_unlock(&ctx->mutex);
- }
+ mutex_lock(&ctx->mutex);
+ smp_call_function_single(cpu, __perf_event_exit_context, ctx, 1);
+ cpuctx->online = 0;
+ mutex_unlock(&ctx->mutex);
cpumask_clear_cpu(cpu, perf_online_mask);
mutex_unlock(&pmus_lock);
}
@@ -13519,20 +13586,17 @@ int perf_event_init_cpu(unsigned int cpu)
{
struct perf_cpu_context *cpuctx;
struct perf_event_context *ctx;
- struct pmu *pmu;
perf_swevent_init_cpu(cpu);
mutex_lock(&pmus_lock);
cpumask_set_cpu(cpu, perf_online_mask);
- list_for_each_entry(pmu, &pmus, entry) {
- cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
- ctx = &cpuctx->ctx;
+ cpuctx = per_cpu_ptr(&perf_cpu_context, cpu);
+ ctx = &cpuctx->ctx;
- mutex_lock(&ctx->mutex);
- cpuctx->online = 1;
- mutex_unlock(&ctx->mutex);
- }
+ mutex_lock(&ctx->mutex);
+ cpuctx->online = 1;
+ mutex_unlock(&ctx->mutex);
mutex_unlock(&pmus_lock);
return 0;
@@ -13669,9 +13733,11 @@ static int perf_cgroup_css_online(struct cgroup_subsys_state *css)
static int __perf_cgroup_move(void *info)
{
struct task_struct *task = info;
- rcu_read_lock();
+
+ preempt_disable();
perf_cgroup_switch(task);
- rcu_read_unlock();
+ preempt_enable();
+
return 0;
}
diff --git a/kernel/exit.c b/kernel/exit.c
index 35e0a31a0315..f2afdb0add7c 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -67,11 +67,58 @@
#include <linux/io_uring.h>
#include <linux/kprobes.h>
#include <linux/rethook.h>
+#include <linux/sysfs.h>
#include <linux/uaccess.h>
#include <asm/unistd.h>
#include <asm/mmu_context.h>
+/*
+ * The default value should be high enough to not crash a system that randomly
+ * crashes its kernel from time to time, but low enough to at least not permit
+ * overflowing 32-bit refcounts or the ldsem writer count.
+ */
+static unsigned int oops_limit = 10000;
+
+#ifdef CONFIG_SYSCTL
+static struct ctl_table kern_exit_table[] = {
+ {
+ .procname = "oops_limit",
+ .data = &oops_limit,
+ .maxlen = sizeof(oops_limit),
+ .mode = 0644,
+ .proc_handler = proc_douintvec,
+ },
+ { }
+};
+
+static __init int kernel_exit_sysctls_init(void)
+{
+ register_sysctl_init("kernel", kern_exit_table);
+ return 0;
+}
+late_initcall(kernel_exit_sysctls_init);
+#endif
+
+static atomic_t oops_count = ATOMIC_INIT(0);
+
+#ifdef CONFIG_SYSFS
+static ssize_t oops_count_show(struct kobject *kobj, struct kobj_attribute *attr,
+ char *page)
+{
+ return sysfs_emit(page, "%d\n", atomic_read(&oops_count));
+}
+
+static struct kobj_attribute oops_count_attr = __ATTR_RO(oops_count);
+
+static __init int kernel_exit_sysfs_init(void)
+{
+ sysfs_add_file_to_group(kernel_kobj, &oops_count_attr.attr, NULL);
+ return 0;
+}
+late_initcall(kernel_exit_sysfs_init);
+#endif
+
static void __unhash_process(struct task_struct *p, bool group_dead)
{
nr_threads--;
@@ -760,6 +807,8 @@ void __noreturn do_exit(long code)
struct task_struct *tsk = current;
int group_dead;
+ WARN_ON(irqs_disabled());
+
synchronize_group_exit(tsk, code);
WARN_ON(tsk->plug);
@@ -884,12 +933,18 @@ void __noreturn make_task_dead(int signr)
* Then do everything else.
*/
struct task_struct *tsk = current;
+ unsigned int limit;
if (unlikely(in_interrupt()))
panic("Aiee, killing interrupt handler!");
if (unlikely(!tsk->pid))
panic("Attempted to kill the idle task!");
+ if (unlikely(irqs_disabled())) {
+ pr_info("note: %s[%d] exited with irqs disabled\n",
+ current->comm, task_pid_nr(current));
+ local_irq_enable();
+ }
if (unlikely(in_atomic())) {
pr_info("note: %s[%d] exited with preempt_count %d\n",
current->comm, task_pid_nr(current),
@@ -898,6 +953,20 @@ void __noreturn make_task_dead(int signr)
}
/*
+ * Every time the system oopses, if the oops happens while a reference
+ * to an object was held, the reference leaks.
+ * If the oops doesn't also leak memory, repeated oopsing can cause
+ * reference counters to wrap around (if they're not using refcount_t).
+ * This means that repeated oopsing can make unexploitable-looking bugs
+ * exploitable through repeated oopsing.
+ * To make sure this can't happen, place an upper bound on how often the
+ * kernel may oops without panic().
+ */
+ limit = READ_ONCE(oops_limit);
+ if (atomic_inc_return(&oops_count) >= limit && limit)
+ panic("Oopsed too often (kernel.oops_limit is %d)", limit);
+
+ /*
* We're taking recursive faults here in make_task_dead. Safest is to just
* leave this task alone and wait for reboot.
*/
@@ -1836,7 +1905,14 @@ bool thread_group_exited(struct pid *pid)
}
EXPORT_SYMBOL(thread_group_exited);
-__weak void abort(void)
+/*
+ * This needs to be __function_aligned as GCC implicitly makes any
+ * implementation of abort() cold and drops alignment specified by
+ * -falign-functions=N.
+ *
+ * See https://gcc.gnu.org/bugzilla/show_bug.cgi?id=88345#c11
+ */
+__weak __function_aligned void abort(void)
{
BUG();
diff --git a/kernel/fork.c b/kernel/fork.c
index cfb09ca1b1bc..038b898dad52 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -75,7 +75,6 @@
#include <linux/freezer.h>
#include <linux/delayacct.h>
#include <linux/taskstats_kern.h>
-#include <linux/random.h>
#include <linux/tty.h>
#include <linux/fs_struct.h>
#include <linux/magic.h>
@@ -97,6 +96,7 @@
#include <linux/scs.h>
#include <linux/io_uring.h>
#include <linux/bpf.h>
+#include <linux/stackprotector.h>
#include <asm/pgalloc.h>
#include <linux/uaccess.h>
@@ -756,8 +756,13 @@ static void check_mm(struct mm_struct *mm)
"Please make sure 'struct resident_page_types[]' is updated as well");
for (i = 0; i < NR_MM_COUNTERS; i++) {
- long x = atomic_long_read(&mm->rss_stat.count[i]);
+ long x = percpu_counter_sum(&mm->rss_stat[i]);
+ if (likely(!x))
+ continue;
+
+ /* Making sure this is not due to race with CPU offlining. */
+ x = percpu_counter_sum_all(&mm->rss_stat[i]);
if (unlikely(x))
pr_alert("BUG: Bad rss-counter state mm:%p type:%s val:%ld\n",
mm, resident_page_types[i], x);
@@ -782,6 +787,8 @@ static void check_mm(struct mm_struct *mm)
*/
void __mmdrop(struct mm_struct *mm)
{
+ int i;
+
BUG_ON(mm == &init_mm);
WARN_ON_ONCE(mm == current->mm);
WARN_ON_ONCE(mm == current->active_mm);
@@ -791,6 +798,9 @@ void __mmdrop(struct mm_struct *mm)
check_mm(mm);
put_user_ns(mm->user_ns);
mm_pasid_drop(mm);
+
+ for (i = 0; i < NR_MM_COUNTERS; i++)
+ percpu_counter_destroy(&mm->rss_stat[i]);
free_mm(mm);
}
EXPORT_SYMBOL_GPL(__mmdrop);
@@ -1034,7 +1044,7 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
#endif
#ifdef CONFIG_BLK_CGROUP
- tsk->throttle_queue = NULL;
+ tsk->throttle_disk = NULL;
tsk->use_memdelay = 0;
#endif
@@ -1050,6 +1060,10 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
tsk->reported_split_lock = 0;
#endif
+#ifdef CONFIG_SCHED_MM_CID
+ tsk->mm_cid = -1;
+ tsk->mm_cid_active = 0;
+#endif
return tsk;
free_stack:
@@ -1110,6 +1124,8 @@ static void mm_init_uprobes_state(struct mm_struct *mm)
static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p,
struct user_namespace *user_ns)
{
+ int i;
+
mt_init_flags(&mm->mm_mt, MM_MT_FLAGS);
mt_set_external_lock(&mm->mm_mt, &mm->mmap_lock);
atomic_set(&mm->mm_users, 1);
@@ -1151,10 +1167,18 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p,
if (init_new_context(p, mm))
goto fail_nocontext;
+ for (i = 0; i < NR_MM_COUNTERS; i++)
+ if (percpu_counter_init(&mm->rss_stat[i], 0, GFP_KERNEL_ACCOUNT))
+ goto fail_pcpu;
+
mm->user_ns = get_user_ns(user_ns);
lru_gen_init_mm(mm);
+ mm_init_cid(mm);
return mm;
+fail_pcpu:
+ while (i > 0)
+ percpu_counter_destroy(&mm->rss_stat[--i]);
fail_nocontext:
mm_free_pgd(mm);
fail_nopgd:
@@ -1582,6 +1606,7 @@ static int copy_mm(unsigned long clone_flags, struct task_struct *tsk)
tsk->mm = mm;
tsk->active_mm = mm;
+ sched_mm_cid_fork(tsk);
return 0;
}
@@ -2588,11 +2613,6 @@ struct task_struct * __init fork_idle(int cpu)
return task;
}
-struct mm_struct *copy_init_mm(void)
-{
- return dup_mm(NULL, &init_mm);
-}
-
/*
* This is like kernel_clone(), but shaved down and tailored to just
* creating io_uring workers. It returns a created task, or an error pointer.
@@ -3011,10 +3031,27 @@ static void sighand_ctor(void *data)
init_waitqueue_head(&sighand->signalfd_wqh);
}
-void __init proc_caches_init(void)
+void __init mm_cache_init(void)
{
unsigned int mm_size;
+ /*
+ * The mm_cpumask is located at the end of mm_struct, and is
+ * dynamically sized based on the maximum CPU number this system
+ * can have, taking hotplug into account (nr_cpu_ids).
+ */
+ mm_size = sizeof(struct mm_struct) + cpumask_size() + mm_cid_size();
+
+ mm_cachep = kmem_cache_create_usercopy("mm_struct",
+ mm_size, ARCH_MIN_MMSTRUCT_ALIGN,
+ SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT,
+ offsetof(struct mm_struct, saved_auxv),
+ sizeof_field(struct mm_struct, saved_auxv),
+ NULL);
+}
+
+void __init proc_caches_init(void)
+{
sighand_cachep = kmem_cache_create("sighand_cache",
sizeof(struct sighand_struct), 0,
SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_TYPESAFE_BY_RCU|
@@ -3032,19 +3069,6 @@ void __init proc_caches_init(void)
SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT,
NULL);
- /*
- * The mm_cpumask is located at the end of mm_struct, and is
- * dynamically sized based on the maximum CPU number this system
- * can have, taking hotplug into account (nr_cpu_ids).
- */
- mm_size = sizeof(struct mm_struct) + cpumask_size();
-
- mm_cachep = kmem_cache_create_usercopy("mm_struct",
- mm_size, ARCH_MIN_MMSTRUCT_ALIGN,
- SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT,
- offsetof(struct mm_struct, saved_auxv),
- sizeof_field(struct mm_struct, saved_auxv),
- NULL);
vm_area_cachep = KMEM_CACHE(vm_area_struct, SLAB_PANIC|SLAB_ACCOUNT);
mmap_init();
nsproxy_cache_init();
diff --git a/kernel/futex/core.c b/kernel/futex/core.c
index b22ef1efe751..514e4582b863 100644
--- a/kernel/futex/core.c
+++ b/kernel/futex/core.c
@@ -638,6 +638,7 @@ static int handle_futex_death(u32 __user *uaddr, struct task_struct *curr,
bool pi, bool pending_op)
{
u32 uval, nval, mval;
+ pid_t owner;
int err;
/* Futex address must be 32bit aligned */
@@ -659,6 +660,10 @@ retry:
* 2. A woken up waiter is killed before it can acquire the
* futex in user space.
*
+ * In the second case, the wake up notification could be generated
+ * by the unlock path in user space after setting the futex value
+ * to zero or by the kernel after setting the OWNER_DIED bit below.
+ *
* In both cases the TID validation below prevents a wakeup of
* potential waiters which can cause these waiters to block
* forever.
@@ -667,24 +672,27 @@ retry:
*
* 1) task->robust_list->list_op_pending != NULL
* @pending_op == true
- * 2) User space futex value == 0
+ * 2) The owner part of user space futex value == 0
* 3) Regular futex: @pi == false
*
* If these conditions are met, it is safe to attempt waking up a
* potential waiter without touching the user space futex value and
- * trying to set the OWNER_DIED bit. The user space futex value is
- * uncontended and the rest of the user space mutex state is
- * consistent, so a woken waiter will just take over the
- * uncontended futex. Setting the OWNER_DIED bit would create
- * inconsistent state and malfunction of the user space owner died
- * handling.
+ * trying to set the OWNER_DIED bit. If the futex value is zero,
+ * the rest of the user space mutex state is consistent, so a woken
+ * waiter will just take over the uncontended futex. Setting the
+ * OWNER_DIED bit would create inconsistent state and malfunction
+ * of the user space owner died handling. Otherwise, the OWNER_DIED
+ * bit is already set, and the woken waiter is expected to deal with
+ * this.
*/
- if (pending_op && !pi && !uval) {
+ owner = uval & FUTEX_TID_MASK;
+
+ if (pending_op && !pi && !owner) {
futex_wake(uaddr, 1, 1, FUTEX_BITSET_MATCH_ANY);
return 0;
}
- if ((uval & FUTEX_TID_MASK) != task_pid_vnr(curr))
+ if (owner != task_pid_vnr(curr))
return 0;
/*
diff --git a/kernel/futex/syscalls.c b/kernel/futex/syscalls.c
index 086a22d1adb7..a8074079b09e 100644
--- a/kernel/futex/syscalls.c
+++ b/kernel/futex/syscalls.c
@@ -286,19 +286,22 @@ SYSCALL_DEFINE5(futex_waitv, struct futex_waitv __user *, waiters,
}
futexv = kcalloc(nr_futexes, sizeof(*futexv), GFP_KERNEL);
- if (!futexv)
- return -ENOMEM;
+ if (!futexv) {
+ ret = -ENOMEM;
+ goto destroy_timer;
+ }
ret = futex_parse_waitv(futexv, waiters, nr_futexes);
if (!ret)
ret = futex_wait_multiple(futexv, nr_futexes, timeout ? &to : NULL);
+ kfree(futexv);
+
+destroy_timer:
if (timeout) {
hrtimer_cancel(&to.timer);
destroy_hrtimer_on_stack(&to.timer);
}
-
- kfree(futexv);
return ret;
}
diff --git a/kernel/gcov/gcc_4_7.c b/kernel/gcov/gcc_4_7.c
index 7971e989e425..74a4ef1da9ad 100644
--- a/kernel/gcov/gcc_4_7.c
+++ b/kernel/gcov/gcc_4_7.c
@@ -82,6 +82,7 @@ struct gcov_fn_info {
* @version: gcov version magic indicating the gcc version used for compilation
* @next: list head for a singly-linked list
* @stamp: uniquifying time stamp
+ * @checksum: unique object checksum
* @filename: name of the associated gcov data file
* @merge: merge functions (null for unused counter type)
* @n_functions: number of instrumented functions
@@ -94,6 +95,10 @@ struct gcov_info {
unsigned int version;
struct gcov_info *next;
unsigned int stamp;
+ /* Since GCC 12.1 a checksum field is added. */
+#if (__GNUC__ >= 12)
+ unsigned int checksum;
+#endif
const char *filename;
void (*merge[GCOV_COUNTERS])(gcov_type *, unsigned int);
unsigned int n_functions;
diff --git a/kernel/gen_kheaders.sh b/kernel/gen_kheaders.sh
index 473036b43c83..81b97f0f6556 100755
--- a/kernel/gen_kheaders.sh
+++ b/kernel/gen_kheaders.sh
@@ -14,6 +14,8 @@ include/
arch/$SRCARCH/include/
"
+type cpio > /dev/null
+
# Support incremental builds by skipping archive generation
# if timestamps of files being archived are not changed.
diff --git a/kernel/irq/Kconfig b/kernel/irq/Kconfig
index db3d174c53d4..2531f3496ab6 100644
--- a/kernel/irq/Kconfig
+++ b/kernel/irq/Kconfig
@@ -86,15 +86,15 @@ config GENERIC_IRQ_IPI
depends on SMP
select IRQ_DOMAIN_HIERARCHY
-# Generic MSI interrupt support
-config GENERIC_MSI_IRQ
+# Generic IRQ IPI Mux support
+config GENERIC_IRQ_IPI_MUX
bool
+ depends on SMP
# Generic MSI hierarchical interrupt domain support
-config GENERIC_MSI_IRQ_DOMAIN
+config GENERIC_MSI_IRQ
bool
select IRQ_DOMAIN_HIERARCHY
- select GENERIC_MSI_IRQ
config IRQ_MSI_IOMMU
bool
diff --git a/kernel/irq/Makefile b/kernel/irq/Makefile
index b4f53717d143..f19d3080bf11 100644
--- a/kernel/irq/Makefile
+++ b/kernel/irq/Makefile
@@ -15,6 +15,7 @@ obj-$(CONFIG_GENERIC_IRQ_MIGRATION) += cpuhotplug.o
obj-$(CONFIG_PM_SLEEP) += pm.o
obj-$(CONFIG_GENERIC_MSI_IRQ) += msi.o
obj-$(CONFIG_GENERIC_IRQ_IPI) += ipi.o
+obj-$(CONFIG_GENERIC_IRQ_IPI_MUX) += ipi-mux.o
obj-$(CONFIG_SMP) += affinity.o
obj-$(CONFIG_GENERIC_IRQ_DEBUGFS) += debugfs.o
obj-$(CONFIG_GENERIC_IRQ_MATRIX_ALLOCATOR) += matrix.o
diff --git a/kernel/irq/affinity.c b/kernel/irq/affinity.c
index d9a5c1d65a79..44a4eba80315 100644
--- a/kernel/irq/affinity.c
+++ b/kernel/irq/affinity.c
@@ -7,398 +7,7 @@
#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/cpu.h>
-#include <linux/sort.h>
-
-static void irq_spread_init_one(struct cpumask *irqmsk, struct cpumask *nmsk,
- unsigned int cpus_per_vec)
-{
- const struct cpumask *siblmsk;
- int cpu, sibl;
-
- for ( ; cpus_per_vec > 0; ) {
- cpu = cpumask_first(nmsk);
-
- /* Should not happen, but I'm too lazy to think about it */
- if (cpu >= nr_cpu_ids)
- return;
-
- cpumask_clear_cpu(cpu, nmsk);
- cpumask_set_cpu(cpu, irqmsk);
- cpus_per_vec--;
-
- /* If the cpu has siblings, use them first */
- siblmsk = topology_sibling_cpumask(cpu);
- for (sibl = -1; cpus_per_vec > 0; ) {
- sibl = cpumask_next(sibl, siblmsk);
- if (sibl >= nr_cpu_ids)
- break;
- if (!cpumask_test_and_clear_cpu(sibl, nmsk))
- continue;
- cpumask_set_cpu(sibl, irqmsk);
- cpus_per_vec--;
- }
- }
-}
-
-static cpumask_var_t *alloc_node_to_cpumask(void)
-{
- cpumask_var_t *masks;
- int node;
-
- masks = kcalloc(nr_node_ids, sizeof(cpumask_var_t), GFP_KERNEL);
- if (!masks)
- return NULL;
-
- for (node = 0; node < nr_node_ids; node++) {
- if (!zalloc_cpumask_var(&masks[node], GFP_KERNEL))
- goto out_unwind;
- }
-
- return masks;
-
-out_unwind:
- while (--node >= 0)
- free_cpumask_var(masks[node]);
- kfree(masks);
- return NULL;
-}
-
-static void free_node_to_cpumask(cpumask_var_t *masks)
-{
- int node;
-
- for (node = 0; node < nr_node_ids; node++)
- free_cpumask_var(masks[node]);
- kfree(masks);
-}
-
-static void build_node_to_cpumask(cpumask_var_t *masks)
-{
- int cpu;
-
- for_each_possible_cpu(cpu)
- cpumask_set_cpu(cpu, masks[cpu_to_node(cpu)]);
-}
-
-static int get_nodes_in_cpumask(cpumask_var_t *node_to_cpumask,
- const struct cpumask *mask, nodemask_t *nodemsk)
-{
- int n, nodes = 0;
-
- /* Calculate the number of nodes in the supplied affinity mask */
- for_each_node(n) {
- if (cpumask_intersects(mask, node_to_cpumask[n])) {
- node_set(n, *nodemsk);
- nodes++;
- }
- }
- return nodes;
-}
-
-struct node_vectors {
- unsigned id;
-
- union {
- unsigned nvectors;
- unsigned ncpus;
- };
-};
-
-static int ncpus_cmp_func(const void *l, const void *r)
-{
- const struct node_vectors *ln = l;
- const struct node_vectors *rn = r;
-
- return ln->ncpus - rn->ncpus;
-}
-
-/*
- * Allocate vector number for each node, so that for each node:
- *
- * 1) the allocated number is >= 1
- *
- * 2) the allocated numbver is <= active CPU number of this node
- *
- * The actual allocated total vectors may be less than @numvecs when
- * active total CPU number is less than @numvecs.
- *
- * Active CPUs means the CPUs in '@cpu_mask AND @node_to_cpumask[]'
- * for each node.
- */
-static void alloc_nodes_vectors(unsigned int numvecs,
- cpumask_var_t *node_to_cpumask,
- const struct cpumask *cpu_mask,
- const nodemask_t nodemsk,
- struct cpumask *nmsk,
- struct node_vectors *node_vectors)
-{
- unsigned n, remaining_ncpus = 0;
-
- for (n = 0; n < nr_node_ids; n++) {
- node_vectors[n].id = n;
- node_vectors[n].ncpus = UINT_MAX;
- }
-
- for_each_node_mask(n, nodemsk) {
- unsigned ncpus;
-
- cpumask_and(nmsk, cpu_mask, node_to_cpumask[n]);
- ncpus = cpumask_weight(nmsk);
-
- if (!ncpus)
- continue;
- remaining_ncpus += ncpus;
- node_vectors[n].ncpus = ncpus;
- }
-
- numvecs = min_t(unsigned, remaining_ncpus, numvecs);
-
- sort(node_vectors, nr_node_ids, sizeof(node_vectors[0]),
- ncpus_cmp_func, NULL);
-
- /*
- * Allocate vectors for each node according to the ratio of this
- * node's nr_cpus to remaining un-assigned ncpus. 'numvecs' is
- * bigger than number of active numa nodes. Always start the
- * allocation from the node with minimized nr_cpus.
- *
- * This way guarantees that each active node gets allocated at
- * least one vector, and the theory is simple: over-allocation
- * is only done when this node is assigned by one vector, so
- * other nodes will be allocated >= 1 vector, since 'numvecs' is
- * bigger than number of numa nodes.
- *
- * One perfect invariant is that number of allocated vectors for
- * each node is <= CPU count of this node:
- *
- * 1) suppose there are two nodes: A and B
- * ncpu(X) is CPU count of node X
- * vecs(X) is the vector count allocated to node X via this
- * algorithm
- *
- * ncpu(A) <= ncpu(B)
- * ncpu(A) + ncpu(B) = N
- * vecs(A) + vecs(B) = V
- *
- * vecs(A) = max(1, round_down(V * ncpu(A) / N))
- * vecs(B) = V - vecs(A)
- *
- * both N and V are integer, and 2 <= V <= N, suppose
- * V = N - delta, and 0 <= delta <= N - 2
- *
- * 2) obviously vecs(A) <= ncpu(A) because:
- *
- * if vecs(A) is 1, then vecs(A) <= ncpu(A) given
- * ncpu(A) >= 1
- *
- * otherwise,
- * vecs(A) <= V * ncpu(A) / N <= ncpu(A), given V <= N
- *
- * 3) prove how vecs(B) <= ncpu(B):
- *
- * if round_down(V * ncpu(A) / N) == 0, vecs(B) won't be
- * over-allocated, so vecs(B) <= ncpu(B),
- *
- * otherwise:
- *
- * vecs(A) =
- * round_down(V * ncpu(A) / N) =
- * round_down((N - delta) * ncpu(A) / N) =
- * round_down((N * ncpu(A) - delta * ncpu(A)) / N) >=
- * round_down((N * ncpu(A) - delta * N) / N) =
- * cpu(A) - delta
- *
- * then:
- *
- * vecs(A) - V >= ncpu(A) - delta - V
- * =>
- * V - vecs(A) <= V + delta - ncpu(A)
- * =>
- * vecs(B) <= N - ncpu(A)
- * =>
- * vecs(B) <= cpu(B)
- *
- * For nodes >= 3, it can be thought as one node and another big
- * node given that is exactly what this algorithm is implemented,
- * and we always re-calculate 'remaining_ncpus' & 'numvecs', and
- * finally for each node X: vecs(X) <= ncpu(X).
- *
- */
- for (n = 0; n < nr_node_ids; n++) {
- unsigned nvectors, ncpus;
-
- if (node_vectors[n].ncpus == UINT_MAX)
- continue;
-
- WARN_ON_ONCE(numvecs == 0);
-
- ncpus = node_vectors[n].ncpus;
- nvectors = max_t(unsigned, 1,
- numvecs * ncpus / remaining_ncpus);
- WARN_ON_ONCE(nvectors > ncpus);
-
- node_vectors[n].nvectors = nvectors;
-
- remaining_ncpus -= ncpus;
- numvecs -= nvectors;
- }
-}
-
-static int __irq_build_affinity_masks(unsigned int startvec,
- unsigned int numvecs,
- unsigned int firstvec,
- cpumask_var_t *node_to_cpumask,
- const struct cpumask *cpu_mask,
- struct cpumask *nmsk,
- struct irq_affinity_desc *masks)
-{
- unsigned int i, n, nodes, cpus_per_vec, extra_vecs, done = 0;
- unsigned int last_affv = firstvec + numvecs;
- unsigned int curvec = startvec;
- nodemask_t nodemsk = NODE_MASK_NONE;
- struct node_vectors *node_vectors;
-
- if (cpumask_empty(cpu_mask))
- return 0;
-
- nodes = get_nodes_in_cpumask(node_to_cpumask, cpu_mask, &nodemsk);
-
- /*
- * If the number of nodes in the mask is greater than or equal the
- * number of vectors we just spread the vectors across the nodes.
- */
- if (numvecs <= nodes) {
- for_each_node_mask(n, nodemsk) {
- /* Ensure that only CPUs which are in both masks are set */
- cpumask_and(nmsk, cpu_mask, node_to_cpumask[n]);
- cpumask_or(&masks[curvec].mask, &masks[curvec].mask, nmsk);
- if (++curvec == last_affv)
- curvec = firstvec;
- }
- return numvecs;
- }
-
- node_vectors = kcalloc(nr_node_ids,
- sizeof(struct node_vectors),
- GFP_KERNEL);
- if (!node_vectors)
- return -ENOMEM;
-
- /* allocate vector number for each node */
- alloc_nodes_vectors(numvecs, node_to_cpumask, cpu_mask,
- nodemsk, nmsk, node_vectors);
-
- for (i = 0; i < nr_node_ids; i++) {
- unsigned int ncpus, v;
- struct node_vectors *nv = &node_vectors[i];
-
- if (nv->nvectors == UINT_MAX)
- continue;
-
- /* Get the cpus on this node which are in the mask */
- cpumask_and(nmsk, cpu_mask, node_to_cpumask[nv->id]);
- ncpus = cpumask_weight(nmsk);
- if (!ncpus)
- continue;
-
- WARN_ON_ONCE(nv->nvectors > ncpus);
-
- /* Account for rounding errors */
- extra_vecs = ncpus - nv->nvectors * (ncpus / nv->nvectors);
-
- /* Spread allocated vectors on CPUs of the current node */
- for (v = 0; v < nv->nvectors; v++, curvec++) {
- cpus_per_vec = ncpus / nv->nvectors;
-
- /* Account for extra vectors to compensate rounding errors */
- if (extra_vecs) {
- cpus_per_vec++;
- --extra_vecs;
- }
-
- /*
- * wrapping has to be considered given 'startvec'
- * may start anywhere
- */
- if (curvec >= last_affv)
- curvec = firstvec;
- irq_spread_init_one(&masks[curvec].mask, nmsk,
- cpus_per_vec);
- }
- done += nv->nvectors;
- }
- kfree(node_vectors);
- return done;
-}
-
-/*
- * build affinity in two stages:
- * 1) spread present CPU on these vectors
- * 2) spread other possible CPUs on these vectors
- */
-static int irq_build_affinity_masks(unsigned int startvec, unsigned int numvecs,
- unsigned int firstvec,
- struct irq_affinity_desc *masks)
-{
- unsigned int curvec = startvec, nr_present = 0, nr_others = 0;
- cpumask_var_t *node_to_cpumask;
- cpumask_var_t nmsk, npresmsk;
- int ret = -ENOMEM;
-
- if (!zalloc_cpumask_var(&nmsk, GFP_KERNEL))
- return ret;
-
- if (!zalloc_cpumask_var(&npresmsk, GFP_KERNEL))
- goto fail_nmsk;
-
- node_to_cpumask = alloc_node_to_cpumask();
- if (!node_to_cpumask)
- goto fail_npresmsk;
-
- /* Stabilize the cpumasks */
- cpus_read_lock();
- build_node_to_cpumask(node_to_cpumask);
-
- /* Spread on present CPUs starting from affd->pre_vectors */
- ret = __irq_build_affinity_masks(curvec, numvecs, firstvec,
- node_to_cpumask, cpu_present_mask,
- nmsk, masks);
- if (ret < 0)
- goto fail_build_affinity;
- nr_present = ret;
-
- /*
- * Spread on non present CPUs starting from the next vector to be
- * handled. If the spreading of present CPUs already exhausted the
- * vector space, assign the non present CPUs to the already spread
- * out vectors.
- */
- if (nr_present >= numvecs)
- curvec = firstvec;
- else
- curvec = firstvec + nr_present;
- cpumask_andnot(npresmsk, cpu_possible_mask, cpu_present_mask);
- ret = __irq_build_affinity_masks(curvec, numvecs, firstvec,
- node_to_cpumask, npresmsk, nmsk,
- masks);
- if (ret >= 0)
- nr_others = ret;
-
- fail_build_affinity:
- cpus_read_unlock();
-
- if (ret >= 0)
- WARN_ON(nr_present + nr_others < numvecs);
-
- free_node_to_cpumask(node_to_cpumask);
-
- fail_npresmsk:
- free_cpumask_var(npresmsk);
-
- fail_nmsk:
- free_cpumask_var(nmsk);
- return ret < 0 ? ret : 0;
-}
+#include <linux/group_cpus.h>
static void default_calc_sets(struct irq_affinity *affd, unsigned int affvecs)
{
@@ -461,14 +70,18 @@ irq_create_affinity_masks(unsigned int nvecs, struct irq_affinity *affd)
*/
for (i = 0, usedvecs = 0; i < affd->nr_sets; i++) {
unsigned int this_vecs = affd->set_size[i];
- int ret;
+ int j;
+ struct cpumask *result = group_cpus_evenly(this_vecs);
- ret = irq_build_affinity_masks(curvec, this_vecs,
- curvec, masks);
- if (ret) {
+ if (!result) {
kfree(masks);
return NULL;
}
+
+ for (j = 0; j < this_vecs; j++)
+ cpumask_copy(&masks[curvec + j].mask, &result[j]);
+ kfree(result);
+
curvec += this_vecs;
usedvecs += this_vecs;
}
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index 8ac37e8e738a..49e7bc871fec 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -1561,10 +1561,10 @@ int irq_chip_compose_msi_msg(struct irq_data *data, struct msi_msg *msg)
return 0;
}
-static struct device *irq_get_parent_device(struct irq_data *data)
+static struct device *irq_get_pm_device(struct irq_data *data)
{
if (data->domain)
- return data->domain->dev;
+ return data->domain->pm_dev;
return NULL;
}
@@ -1578,7 +1578,7 @@ static struct device *irq_get_parent_device(struct irq_data *data)
*/
int irq_chip_pm_get(struct irq_data *data)
{
- struct device *dev = irq_get_parent_device(data);
+ struct device *dev = irq_get_pm_device(data);
int retval = 0;
if (IS_ENABLED(CONFIG_PM) && dev)
@@ -1597,7 +1597,7 @@ int irq_chip_pm_get(struct irq_data *data)
*/
int irq_chip_pm_put(struct irq_data *data)
{
- struct device *dev = irq_get_parent_device(data);
+ struct device *dev = irq_get_pm_device(data);
int retval = 0;
if (IS_ENABLED(CONFIG_PM) && dev)
diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h
index f09c60393e55..5fdc0b557579 100644
--- a/kernel/irq/internals.h
+++ b/kernel/irq/internals.h
@@ -52,6 +52,7 @@ enum {
* IRQS_PENDING - irq is pending and replayed later
* IRQS_SUSPENDED - irq is suspended
* IRQS_NMI - irq line is used to deliver NMIs
+ * IRQS_SYSFS - descriptor has been added to sysfs
*/
enum {
IRQS_AUTODETECT = 0x00000001,
@@ -64,6 +65,7 @@ enum {
IRQS_SUSPENDED = 0x00000800,
IRQS_TIMINGS = 0x00001000,
IRQS_NMI = 0x00002000,
+ IRQS_SYSFS = 0x00004000,
};
#include "debug.h"
diff --git a/kernel/irq/ipi-mux.c b/kernel/irq/ipi-mux.c
new file mode 100644
index 000000000000..fa4fc18c6131
--- /dev/null
+++ b/kernel/irq/ipi-mux.c
@@ -0,0 +1,206 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Multiplex several virtual IPIs over a single HW IPI.
+ *
+ * Copyright The Asahi Linux Contributors
+ * Copyright (c) 2022 Ventana Micro Systems Inc.
+ */
+
+#define pr_fmt(fmt) "ipi-mux: " fmt
+#include <linux/cpu.h>
+#include <linux/init.h>
+#include <linux/irq.h>
+#include <linux/irqchip.h>
+#include <linux/irqchip/chained_irq.h>
+#include <linux/irqdomain.h>
+#include <linux/jump_label.h>
+#include <linux/percpu.h>
+#include <linux/smp.h>
+
+struct ipi_mux_cpu {
+ atomic_t enable;
+ atomic_t bits;
+};
+
+static struct ipi_mux_cpu __percpu *ipi_mux_pcpu;
+static struct irq_domain *ipi_mux_domain;
+static void (*ipi_mux_send)(unsigned int cpu);
+
+static void ipi_mux_mask(struct irq_data *d)
+{
+ struct ipi_mux_cpu *icpu = this_cpu_ptr(ipi_mux_pcpu);
+
+ atomic_andnot(BIT(irqd_to_hwirq(d)), &icpu->enable);
+}
+
+static void ipi_mux_unmask(struct irq_data *d)
+{
+ struct ipi_mux_cpu *icpu = this_cpu_ptr(ipi_mux_pcpu);
+ u32 ibit = BIT(irqd_to_hwirq(d));
+
+ atomic_or(ibit, &icpu->enable);
+
+ /*
+ * The atomic_or() above must complete before the atomic_read()
+ * below to avoid racing ipi_mux_send_mask().
+ */
+ smp_mb__after_atomic();
+
+ /* If a pending IPI was unmasked, raise a parent IPI immediately. */
+ if (atomic_read(&icpu->bits) & ibit)
+ ipi_mux_send(smp_processor_id());
+}
+
+static void ipi_mux_send_mask(struct irq_data *d, const struct cpumask *mask)
+{
+ struct ipi_mux_cpu *icpu = this_cpu_ptr(ipi_mux_pcpu);
+ u32 ibit = BIT(irqd_to_hwirq(d));
+ unsigned long pending;
+ int cpu;
+
+ for_each_cpu(cpu, mask) {
+ icpu = per_cpu_ptr(ipi_mux_pcpu, cpu);
+
+ /*
+ * This sequence is the mirror of the one in ipi_mux_unmask();
+ * see the comment there. Additionally, release semantics
+ * ensure that the vIPI flag set is ordered after any shared
+ * memory accesses that precede it. This therefore also pairs
+ * with the atomic_fetch_andnot in ipi_mux_process().
+ */
+ pending = atomic_fetch_or_release(ibit, &icpu->bits);
+
+ /*
+ * The atomic_fetch_or_release() above must complete
+ * before the atomic_read() below to avoid racing with
+ * ipi_mux_unmask().
+ */
+ smp_mb__after_atomic();
+
+ /*
+ * The flag writes must complete before the physical IPI is
+ * issued to another CPU. This is implied by the control
+ * dependency on the result of atomic_read() below, which is
+ * itself already ordered after the vIPI flag write.
+ */
+ if (!(pending & ibit) && (atomic_read(&icpu->enable) & ibit))
+ ipi_mux_send(cpu);
+ }
+}
+
+static const struct irq_chip ipi_mux_chip = {
+ .name = "IPI Mux",
+ .irq_mask = ipi_mux_mask,
+ .irq_unmask = ipi_mux_unmask,
+ .ipi_send_mask = ipi_mux_send_mask,
+};
+
+static int ipi_mux_domain_alloc(struct irq_domain *d, unsigned int virq,
+ unsigned int nr_irqs, void *arg)
+{
+ int i;
+
+ for (i = 0; i < nr_irqs; i++) {
+ irq_set_percpu_devid(virq + i);
+ irq_domain_set_info(d, virq + i, i, &ipi_mux_chip, NULL,
+ handle_percpu_devid_irq, NULL, NULL);
+ }
+
+ return 0;
+}
+
+static const struct irq_domain_ops ipi_mux_domain_ops = {
+ .alloc = ipi_mux_domain_alloc,
+ .free = irq_domain_free_irqs_top,
+};
+
+/**
+ * ipi_mux_process - Process multiplexed virtual IPIs
+ */
+void ipi_mux_process(void)
+{
+ struct ipi_mux_cpu *icpu = this_cpu_ptr(ipi_mux_pcpu);
+ irq_hw_number_t hwirq;
+ unsigned long ipis;
+ unsigned int en;
+
+ /*
+ * Reading enable mask does not need to be ordered as long as
+ * this function is called from an interrupt handler because only
+ * the CPU itself can change its own enable mask.
+ */
+ en = atomic_read(&icpu->enable);
+
+ /*
+ * Clear the IPIs we are about to handle. This pairs with the
+ * atomic_fetch_or_release() in ipi_mux_send_mask().
+ */
+ ipis = atomic_fetch_andnot(en, &icpu->bits) & en;
+
+ for_each_set_bit(hwirq, &ipis, BITS_PER_TYPE(int))
+ generic_handle_domain_irq(ipi_mux_domain, hwirq);
+}
+
+/**
+ * ipi_mux_create - Create virtual IPIs multiplexed on top of a single
+ * parent IPI.
+ * @nr_ipi: number of virtual IPIs to create. This should
+ * be <= BITS_PER_TYPE(int)
+ * @mux_send: callback to trigger parent IPI for a particular CPU
+ *
+ * Returns first virq of the newly created virtual IPIs upon success
+ * or <=0 upon failure
+ */
+int ipi_mux_create(unsigned int nr_ipi, void (*mux_send)(unsigned int cpu))
+{
+ struct fwnode_handle *fwnode;
+ struct irq_domain *domain;
+ int rc;
+
+ if (ipi_mux_domain)
+ return -EEXIST;
+
+ if (BITS_PER_TYPE(int) < nr_ipi || !mux_send)
+ return -EINVAL;
+
+ ipi_mux_pcpu = alloc_percpu(typeof(*ipi_mux_pcpu));
+ if (!ipi_mux_pcpu)
+ return -ENOMEM;
+
+ fwnode = irq_domain_alloc_named_fwnode("IPI-Mux");
+ if (!fwnode) {
+ pr_err("unable to create IPI Mux fwnode\n");
+ rc = -ENOMEM;
+ goto fail_free_cpu;
+ }
+
+ domain = irq_domain_create_linear(fwnode, nr_ipi,
+ &ipi_mux_domain_ops, NULL);
+ if (!domain) {
+ pr_err("unable to add IPI Mux domain\n");
+ rc = -ENOMEM;
+ goto fail_free_fwnode;
+ }
+
+ domain->flags |= IRQ_DOMAIN_FLAG_IPI_SINGLE;
+ irq_domain_update_bus_token(domain, DOMAIN_BUS_IPI);
+
+ rc = irq_domain_alloc_irqs(domain, nr_ipi, NUMA_NO_NODE, NULL);
+ if (rc <= 0) {
+ pr_err("unable to alloc IRQs from IPI Mux domain\n");
+ goto fail_free_domain;
+ }
+
+ ipi_mux_domain = domain;
+ ipi_mux_send = mux_send;
+
+ return rc;
+
+fail_free_domain:
+ irq_domain_remove(domain);
+fail_free_fwnode:
+ irq_domain_free_fwnode(fwnode);
+fail_free_cpu:
+ free_percpu(ipi_mux_pcpu);
+ return rc;
+}
diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c
index a91f9001103c..fd0996274401 100644
--- a/kernel/irq/irqdesc.c
+++ b/kernel/irq/irqdesc.c
@@ -288,22 +288,25 @@ static void irq_sysfs_add(int irq, struct irq_desc *desc)
if (irq_kobj_base) {
/*
* Continue even in case of failure as this is nothing
- * crucial.
+ * crucial and failures in the late irq_sysfs_init()
+ * cannot be rolled back.
*/
if (kobject_add(&desc->kobj, irq_kobj_base, "%d", irq))
pr_warn("Failed to add kobject for irq %d\n", irq);
+ else
+ desc->istate |= IRQS_SYSFS;
}
}
static void irq_sysfs_del(struct irq_desc *desc)
{
/*
- * If irq_sysfs_init() has not yet been invoked (early boot), then
- * irq_kobj_base is NULL and the descriptor was never added.
- * kobject_del() complains about a object with no parent, so make
- * it conditional.
+ * Only invoke kobject_del() when kobject_add() was successfully
+ * invoked for the descriptor. This covers both early boot, where
+ * sysfs is not initialized yet, and the case of a failed
+ * kobject_add() invocation.
*/
- if (irq_kobj_base)
+ if (desc->istate & IRQS_SYSFS)
kobject_del(&desc->kobj);
}
diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c
index 8fe1da9614ee..aa5b7eeeceb8 100644
--- a/kernel/irq/irqdomain.c
+++ b/kernel/irq/irqdomain.c
@@ -25,6 +25,9 @@ static DEFINE_MUTEX(irq_domain_mutex);
static struct irq_domain *irq_default_domain;
+static int irq_domain_alloc_irqs_locked(struct irq_domain *domain, int irq_base,
+ unsigned int nr_irqs, int node, void *arg,
+ bool realloc, const struct irq_affinity_desc *affinity);
static void irq_domain_check_hierarchy(struct irq_domain *domain);
struct irqchip_fwid {
@@ -114,7 +117,7 @@ void irq_domain_free_fwnode(struct fwnode_handle *fwnode)
{
struct irqchip_fwid *fwid;
- if (WARN_ON(!is_fwnode_irqchip(fwnode)))
+ if (!fwnode || WARN_ON(!is_fwnode_irqchip(fwnode)))
return;
fwid = container_of(fwnode, struct irqchip_fwid, fwnode);
@@ -123,23 +126,12 @@ void irq_domain_free_fwnode(struct fwnode_handle *fwnode)
}
EXPORT_SYMBOL_GPL(irq_domain_free_fwnode);
-/**
- * __irq_domain_add() - Allocate a new irq_domain data structure
- * @fwnode: firmware node for the interrupt controller
- * @size: Size of linear map; 0 for radix mapping only
- * @hwirq_max: Maximum number of interrupts supported by controller
- * @direct_max: Maximum value of direct maps; Use ~0 for no limit; 0 for no
- * direct mapping
- * @ops: domain callbacks
- * @host_data: Controller private data pointer
- *
- * Allocates and initializes an irq_domain structure.
- * Returns pointer to IRQ domain, or NULL on failure.
- */
-struct irq_domain *__irq_domain_add(struct fwnode_handle *fwnode, unsigned int size,
- irq_hw_number_t hwirq_max, int direct_max,
- const struct irq_domain_ops *ops,
- void *host_data)
+static struct irq_domain *__irq_domain_create(struct fwnode_handle *fwnode,
+ unsigned int size,
+ irq_hw_number_t hwirq_max,
+ int direct_max,
+ const struct irq_domain_ops *ops,
+ void *host_data)
{
struct irqchip_fwid *fwid;
struct irq_domain *domain;
@@ -214,25 +206,66 @@ struct irq_domain *__irq_domain_add(struct fwnode_handle *fwnode, unsigned int s
/* Fill structure */
INIT_RADIX_TREE(&domain->revmap_tree, GFP_KERNEL);
- mutex_init(&domain->revmap_mutex);
domain->ops = ops;
domain->host_data = host_data;
domain->hwirq_max = hwirq_max;
- if (direct_max) {
+ if (direct_max)
domain->flags |= IRQ_DOMAIN_FLAG_NO_MAP;
- }
domain->revmap_size = size;
+ /*
+ * Hierarchical domains use the domain lock of the root domain
+ * (innermost domain).
+ *
+ * For non-hierarchical domains (as for root domains), the root
+ * pointer is set to the domain itself so that &domain->root->mutex
+ * always points to the right lock.
+ */
+ mutex_init(&domain->mutex);
+ domain->root = domain;
+
irq_domain_check_hierarchy(domain);
+ return domain;
+}
+
+static void __irq_domain_publish(struct irq_domain *domain)
+{
mutex_lock(&irq_domain_mutex);
debugfs_add_domain_dir(domain);
list_add(&domain->link, &irq_domain_list);
mutex_unlock(&irq_domain_mutex);
pr_debug("Added domain %s\n", domain->name);
+}
+
+/**
+ * __irq_domain_add() - Allocate a new irq_domain data structure
+ * @fwnode: firmware node for the interrupt controller
+ * @size: Size of linear map; 0 for radix mapping only
+ * @hwirq_max: Maximum number of interrupts supported by controller
+ * @direct_max: Maximum value of direct maps; Use ~0 for no limit; 0 for no
+ * direct mapping
+ * @ops: domain callbacks
+ * @host_data: Controller private data pointer
+ *
+ * Allocates and initializes an irq_domain structure.
+ * Returns pointer to IRQ domain, or NULL on failure.
+ */
+struct irq_domain *__irq_domain_add(struct fwnode_handle *fwnode, unsigned int size,
+ irq_hw_number_t hwirq_max, int direct_max,
+ const struct irq_domain_ops *ops,
+ void *host_data)
+{
+ struct irq_domain *domain;
+
+ domain = __irq_domain_create(fwnode, size, hwirq_max, direct_max,
+ ops, host_data);
+ if (domain)
+ __irq_domain_publish(domain);
+
return domain;
}
EXPORT_SYMBOL_GPL(__irq_domain_add);
@@ -502,30 +535,34 @@ static bool irq_domain_is_nomap(struct irq_domain *domain)
static void irq_domain_clear_mapping(struct irq_domain *domain,
irq_hw_number_t hwirq)
{
+ lockdep_assert_held(&domain->root->mutex);
+
if (irq_domain_is_nomap(domain))
return;
- mutex_lock(&domain->revmap_mutex);
if (hwirq < domain->revmap_size)
rcu_assign_pointer(domain->revmap[hwirq], NULL);
else
radix_tree_delete(&domain->revmap_tree, hwirq);
- mutex_unlock(&domain->revmap_mutex);
}
static void irq_domain_set_mapping(struct irq_domain *domain,
irq_hw_number_t hwirq,
struct irq_data *irq_data)
{
+ /*
+ * This also makes sure that all domains point to the same root when
+ * called from irq_domain_insert_irq() for each domain in a hierarchy.
+ */
+ lockdep_assert_held(&domain->root->mutex);
+
if (irq_domain_is_nomap(domain))
return;
- mutex_lock(&domain->revmap_mutex);
if (hwirq < domain->revmap_size)
rcu_assign_pointer(domain->revmap[hwirq], irq_data);
else
radix_tree_insert(&domain->revmap_tree, hwirq, irq_data);
- mutex_unlock(&domain->revmap_mutex);
}
static void irq_domain_disassociate(struct irq_domain *domain, unsigned int irq)
@@ -538,6 +575,9 @@ static void irq_domain_disassociate(struct irq_domain *domain, unsigned int irq)
return;
hwirq = irq_data->hwirq;
+
+ mutex_lock(&domain->root->mutex);
+
irq_set_status_flags(irq, IRQ_NOREQUEST);
/* remove chip and handler */
@@ -557,10 +597,12 @@ static void irq_domain_disassociate(struct irq_domain *domain, unsigned int irq)
/* Clear reverse map for this hwirq */
irq_domain_clear_mapping(domain, hwirq);
+
+ mutex_unlock(&domain->root->mutex);
}
-int irq_domain_associate(struct irq_domain *domain, unsigned int virq,
- irq_hw_number_t hwirq)
+static int irq_domain_associate_locked(struct irq_domain *domain, unsigned int virq,
+ irq_hw_number_t hwirq)
{
struct irq_data *irq_data = irq_get_irq_data(virq);
int ret;
@@ -573,7 +615,6 @@ int irq_domain_associate(struct irq_domain *domain, unsigned int virq,
if (WARN(irq_data->domain, "error: virq%i is already associated", virq))
return -EINVAL;
- mutex_lock(&irq_domain_mutex);
irq_data->hwirq = hwirq;
irq_data->domain = domain;
if (domain->ops->map) {
@@ -590,23 +631,29 @@ int irq_domain_associate(struct irq_domain *domain, unsigned int virq,
}
irq_data->domain = NULL;
irq_data->hwirq = 0;
- mutex_unlock(&irq_domain_mutex);
return ret;
}
-
- /* If not already assigned, give the domain the chip's name */
- if (!domain->name && irq_data->chip)
- domain->name = irq_data->chip->name;
}
domain->mapcount++;
irq_domain_set_mapping(domain, hwirq, irq_data);
- mutex_unlock(&irq_domain_mutex);
irq_clear_status_flags(virq, IRQ_NOREQUEST);
return 0;
}
+
+int irq_domain_associate(struct irq_domain *domain, unsigned int virq,
+ irq_hw_number_t hwirq)
+{
+ int ret;
+
+ mutex_lock(&domain->root->mutex);
+ ret = irq_domain_associate_locked(domain, virq, hwirq);
+ mutex_unlock(&domain->root->mutex);
+
+ return ret;
+}
EXPORT_SYMBOL_GPL(irq_domain_associate);
void irq_domain_associate_many(struct irq_domain *domain, unsigned int irq_base,
@@ -619,9 +666,8 @@ void irq_domain_associate_many(struct irq_domain *domain, unsigned int irq_base,
pr_debug("%s(%s, irqbase=%i, hwbase=%i, count=%i)\n", __func__,
of_node_full_name(of_node), irq_base, (int)hwirq_base, count);
- for (i = 0; i < count; i++) {
+ for (i = 0; i < count; i++)
irq_domain_associate(domain, irq_base + i, hwirq_base + i);
- }
}
EXPORT_SYMBOL_GPL(irq_domain_associate_many);
@@ -668,6 +714,34 @@ unsigned int irq_create_direct_mapping(struct irq_domain *domain)
EXPORT_SYMBOL_GPL(irq_create_direct_mapping);
#endif
+static unsigned int irq_create_mapping_affinity_locked(struct irq_domain *domain,
+ irq_hw_number_t hwirq,
+ const struct irq_affinity_desc *affinity)
+{
+ struct device_node *of_node = irq_domain_get_of_node(domain);
+ int virq;
+
+ pr_debug("irq_create_mapping(0x%p, 0x%lx)\n", domain, hwirq);
+
+ /* Allocate a virtual interrupt number */
+ virq = irq_domain_alloc_descs(-1, 1, hwirq, of_node_to_nid(of_node),
+ affinity);
+ if (virq <= 0) {
+ pr_debug("-> virq allocation failed\n");
+ return 0;
+ }
+
+ if (irq_domain_associate_locked(domain, virq, hwirq)) {
+ irq_free_desc(virq);
+ return 0;
+ }
+
+ pr_debug("irq %lu on domain %s mapped to virtual irq %u\n",
+ hwirq, of_node_full_name(of_node), virq);
+
+ return virq;
+}
+
/**
* irq_create_mapping_affinity() - Map a hardware interrupt into linux irq space
* @domain: domain owning this hardware interrupt or NULL for default domain
@@ -680,14 +754,11 @@ EXPORT_SYMBOL_GPL(irq_create_direct_mapping);
* on the number returned from that call.
*/
unsigned int irq_create_mapping_affinity(struct irq_domain *domain,
- irq_hw_number_t hwirq,
- const struct irq_affinity_desc *affinity)
+ irq_hw_number_t hwirq,
+ const struct irq_affinity_desc *affinity)
{
- struct device_node *of_node;
int virq;
- pr_debug("irq_create_mapping(0x%p, 0x%lx)\n", domain, hwirq);
-
/* Look for default domain if necessary */
if (domain == NULL)
domain = irq_default_domain;
@@ -695,32 +766,19 @@ unsigned int irq_create_mapping_affinity(struct irq_domain *domain,
WARN(1, "%s(, %lx) called with NULL domain\n", __func__, hwirq);
return 0;
}
- pr_debug("-> using domain @%p\n", domain);
- of_node = irq_domain_get_of_node(domain);
+ mutex_lock(&domain->root->mutex);
/* Check if mapping already exists */
virq = irq_find_mapping(domain, hwirq);
if (virq) {
- pr_debug("-> existing mapping on virq %d\n", virq);
- return virq;
- }
-
- /* Allocate a virtual interrupt number */
- virq = irq_domain_alloc_descs(-1, 1, hwirq, of_node_to_nid(of_node),
- affinity);
- if (virq <= 0) {
- pr_debug("-> virq allocation failed\n");
- return 0;
- }
-
- if (irq_domain_associate(domain, virq, hwirq)) {
- irq_free_desc(virq);
- return 0;
+ pr_debug("existing mapping on virq %d\n", virq);
+ goto out;
}
- pr_debug("irq %lu on domain %s mapped to virtual irq %u\n",
- hwirq, of_node_full_name(of_node), virq);
+ virq = irq_create_mapping_affinity_locked(domain, hwirq, affinity);
+out:
+ mutex_unlock(&domain->root->mutex);
return virq;
}
@@ -789,6 +847,8 @@ unsigned int irq_create_fwspec_mapping(struct irq_fwspec *fwspec)
if (WARN_ON(type & ~IRQ_TYPE_SENSE_MASK))
type &= IRQ_TYPE_SENSE_MASK;
+ mutex_lock(&domain->root->mutex);
+
/*
* If we've already configured this interrupt,
* don't do it again, or hell will break loose.
@@ -801,7 +861,7 @@ unsigned int irq_create_fwspec_mapping(struct irq_fwspec *fwspec)
* interrupt number.
*/
if (type == IRQ_TYPE_NONE || type == irq_get_trigger_type(virq))
- return virq;
+ goto out;
/*
* If the trigger type has not been set yet, then set
@@ -809,40 +869,45 @@ unsigned int irq_create_fwspec_mapping(struct irq_fwspec *fwspec)
*/
if (irq_get_trigger_type(virq) == IRQ_TYPE_NONE) {
irq_data = irq_get_irq_data(virq);
- if (!irq_data)
- return 0;
+ if (!irq_data) {
+ virq = 0;
+ goto out;
+ }
irqd_set_trigger_type(irq_data, type);
- return virq;
+ goto out;
}
pr_warn("type mismatch, failed to map hwirq-%lu for %s!\n",
hwirq, of_node_full_name(to_of_node(fwspec->fwnode)));
- return 0;
+ virq = 0;
+ goto out;
}
if (irq_domain_is_hierarchy(domain)) {
- virq = irq_domain_alloc_irqs(domain, 1, NUMA_NO_NODE, fwspec);
- if (virq <= 0)
- return 0;
+ virq = irq_domain_alloc_irqs_locked(domain, -1, 1, NUMA_NO_NODE,
+ fwspec, false, NULL);
+ if (virq <= 0) {
+ virq = 0;
+ goto out;
+ }
} else {
/* Create mapping */
- virq = irq_create_mapping(domain, hwirq);
+ virq = irq_create_mapping_affinity_locked(domain, hwirq, NULL);
if (!virq)
- return virq;
+ goto out;
}
irq_data = irq_get_irq_data(virq);
- if (!irq_data) {
- if (irq_domain_is_hierarchy(domain))
- irq_domain_free_irqs(virq, 1);
- else
- irq_dispose_mapping(virq);
- return 0;
+ if (WARN_ON(!irq_data)) {
+ virq = 0;
+ goto out;
}
/* Store trigger type */
irqd_set_trigger_type(irq_data, type);
+out:
+ mutex_unlock(&domain->root->mutex);
return virq;
}
@@ -1102,12 +1167,16 @@ struct irq_domain *irq_domain_create_hierarchy(struct irq_domain *parent,
struct irq_domain *domain;
if (size)
- domain = irq_domain_create_linear(fwnode, size, ops, host_data);
+ domain = __irq_domain_create(fwnode, size, size, 0, ops, host_data);
else
- domain = irq_domain_create_tree(fwnode, ops, host_data);
+ domain = __irq_domain_create(fwnode, 0, ~0, 0, ops, host_data);
+
if (domain) {
+ domain->root = parent->root;
domain->parent = parent;
domain->flags |= flags;
+
+ __irq_domain_publish(domain);
}
return domain;
@@ -1123,10 +1192,6 @@ static void irq_domain_insert_irq(int virq)
domain->mapcount++;
irq_domain_set_mapping(domain, data->hwirq, data);
-
- /* If not already assigned, give the domain the chip's name */
- if (!domain->name && data->chip)
- domain->name = data->chip->name;
}
irq_clear_status_flags(virq, IRQ_NOREQUEST);
@@ -1426,40 +1491,12 @@ int irq_domain_alloc_irqs_hierarchy(struct irq_domain *domain,
return domain->ops->alloc(domain, irq_base, nr_irqs, arg);
}
-/**
- * __irq_domain_alloc_irqs - Allocate IRQs from domain
- * @domain: domain to allocate from
- * @irq_base: allocate specified IRQ number if irq_base >= 0
- * @nr_irqs: number of IRQs to allocate
- * @node: NUMA node id for memory allocation
- * @arg: domain specific argument
- * @realloc: IRQ descriptors have already been allocated if true
- * @affinity: Optional irq affinity mask for multiqueue devices
- *
- * Allocate IRQ numbers and initialized all data structures to support
- * hierarchy IRQ domains.
- * Parameter @realloc is mainly to support legacy IRQs.
- * Returns error code or allocated IRQ number
- *
- * The whole process to setup an IRQ has been split into two steps.
- * The first step, __irq_domain_alloc_irqs(), is to allocate IRQ
- * descriptor and required hardware resources. The second step,
- * irq_domain_activate_irq(), is to program the hardware with preallocated
- * resources. In this way, it's easier to rollback when failing to
- * allocate resources.
- */
-int __irq_domain_alloc_irqs(struct irq_domain *domain, int irq_base,
- unsigned int nr_irqs, int node, void *arg,
- bool realloc, const struct irq_affinity_desc *affinity)
+static int irq_domain_alloc_irqs_locked(struct irq_domain *domain, int irq_base,
+ unsigned int nr_irqs, int node, void *arg,
+ bool realloc, const struct irq_affinity_desc *affinity)
{
int i, ret, virq;
- if (domain == NULL) {
- domain = irq_default_domain;
- if (WARN(!domain, "domain is NULL; cannot allocate IRQ\n"))
- return -EINVAL;
- }
-
if (realloc && irq_base >= 0) {
virq = irq_base;
} else {
@@ -1478,24 +1515,18 @@ int __irq_domain_alloc_irqs(struct irq_domain *domain, int irq_base,
goto out_free_desc;
}
- mutex_lock(&irq_domain_mutex);
ret = irq_domain_alloc_irqs_hierarchy(domain, virq, nr_irqs, arg);
- if (ret < 0) {
- mutex_unlock(&irq_domain_mutex);
+ if (ret < 0)
goto out_free_irq_data;
- }
for (i = 0; i < nr_irqs; i++) {
ret = irq_domain_trim_hierarchy(virq + i);
- if (ret) {
- mutex_unlock(&irq_domain_mutex);
+ if (ret)
goto out_free_irq_data;
- }
}
-
+
for (i = 0; i < nr_irqs; i++)
irq_domain_insert_irq(virq + i);
- mutex_unlock(&irq_domain_mutex);
return virq;
@@ -1505,6 +1536,48 @@ out_free_desc:
irq_free_descs(virq, nr_irqs);
return ret;
}
+
+/**
+ * __irq_domain_alloc_irqs - Allocate IRQs from domain
+ * @domain: domain to allocate from
+ * @irq_base: allocate specified IRQ number if irq_base >= 0
+ * @nr_irqs: number of IRQs to allocate
+ * @node: NUMA node id for memory allocation
+ * @arg: domain specific argument
+ * @realloc: IRQ descriptors have already been allocated if true
+ * @affinity: Optional irq affinity mask for multiqueue devices
+ *
+ * Allocate IRQ numbers and initialize all data structures to support
+ * hierarchy IRQ domains.
+ * Parameter @realloc is mainly to support legacy IRQs.
+ * Returns error code or allocated IRQ number
+ *
+ * The whole process to setup an IRQ has been split into two steps.
+ * The first step, __irq_domain_alloc_irqs(), is to allocate IRQ
+ * descriptor and required hardware resources. The second step,
+ * irq_domain_activate_irq(), is to program the hardware with preallocated
+ * resources. In this way, it's easier to rollback when failing to
+ * allocate resources.
+ */
+int __irq_domain_alloc_irqs(struct irq_domain *domain, int irq_base,
+ unsigned int nr_irqs, int node, void *arg,
+ bool realloc, const struct irq_affinity_desc *affinity)
+{
+ int ret;
+
+ if (domain == NULL) {
+ domain = irq_default_domain;
+ if (WARN(!domain, "domain is NULL; cannot allocate IRQ\n"))
+ return -EINVAL;
+ }
+
+ mutex_lock(&domain->root->mutex);
+ ret = irq_domain_alloc_irqs_locked(domain, irq_base, nr_irqs, node, arg,
+ realloc, affinity);
+ mutex_unlock(&domain->root->mutex);
+
+ return ret;
+}
EXPORT_SYMBOL_GPL(__irq_domain_alloc_irqs);
/* The irq_data was moved, fix the revmap to refer to the new location */
@@ -1512,11 +1585,12 @@ static void irq_domain_fix_revmap(struct irq_data *d)
{
void __rcu **slot;
+ lockdep_assert_held(&d->domain->root->mutex);
+
if (irq_domain_is_nomap(d->domain))
return;
/* Fix up the revmap. */
- mutex_lock(&d->domain->revmap_mutex);
if (d->hwirq < d->domain->revmap_size) {
/* Not using radix tree */
rcu_assign_pointer(d->domain->revmap[d->hwirq], d);
@@ -1525,7 +1599,6 @@ static void irq_domain_fix_revmap(struct irq_data *d)
if (slot)
radix_tree_replace_slot(&d->domain->revmap_tree, slot, d);
}
- mutex_unlock(&d->domain->revmap_mutex);
}
/**
@@ -1541,8 +1614,8 @@ static void irq_domain_fix_revmap(struct irq_data *d)
*/
int irq_domain_push_irq(struct irq_domain *domain, int virq, void *arg)
{
- struct irq_data *child_irq_data;
- struct irq_data *root_irq_data = irq_get_irq_data(virq);
+ struct irq_data *irq_data = irq_get_irq_data(virq);
+ struct irq_data *parent_irq_data;
struct irq_desc *desc;
int rv = 0;
@@ -1567,47 +1640,46 @@ int irq_domain_push_irq(struct irq_domain *domain, int virq, void *arg)
if (WARN_ON(!irq_domain_is_hierarchy(domain)))
return -EINVAL;
- if (!root_irq_data)
+ if (!irq_data)
return -EINVAL;
- if (domain->parent != root_irq_data->domain)
+ if (domain->parent != irq_data->domain)
return -EINVAL;
- child_irq_data = kzalloc_node(sizeof(*child_irq_data), GFP_KERNEL,
- irq_data_get_node(root_irq_data));
- if (!child_irq_data)
+ parent_irq_data = kzalloc_node(sizeof(*parent_irq_data), GFP_KERNEL,
+ irq_data_get_node(irq_data));
+ if (!parent_irq_data)
return -ENOMEM;
- mutex_lock(&irq_domain_mutex);
+ mutex_lock(&domain->root->mutex);
/* Copy the original irq_data. */
- *child_irq_data = *root_irq_data;
+ *parent_irq_data = *irq_data;
/*
- * Overwrite the root_irq_data, which is embedded in struct
- * irq_desc, with values for this domain.
+ * Overwrite the irq_data, which is embedded in struct irq_desc, with
+ * values for this domain.
*/
- root_irq_data->parent_data = child_irq_data;
- root_irq_data->domain = domain;
- root_irq_data->mask = 0;
- root_irq_data->hwirq = 0;
- root_irq_data->chip = NULL;
- root_irq_data->chip_data = NULL;
+ irq_data->parent_data = parent_irq_data;
+ irq_data->domain = domain;
+ irq_data->mask = 0;
+ irq_data->hwirq = 0;
+ irq_data->chip = NULL;
+ irq_data->chip_data = NULL;
/* May (probably does) set hwirq, chip, etc. */
rv = irq_domain_alloc_irqs_hierarchy(domain, virq, 1, arg);
if (rv) {
/* Restore the original irq_data. */
- *root_irq_data = *child_irq_data;
- kfree(child_irq_data);
+ *irq_data = *parent_irq_data;
+ kfree(parent_irq_data);
goto error;
}
- irq_domain_fix_revmap(child_irq_data);
- irq_domain_set_mapping(domain, root_irq_data->hwirq, root_irq_data);
-
+ irq_domain_fix_revmap(parent_irq_data);
+ irq_domain_set_mapping(domain, irq_data->hwirq, irq_data);
error:
- mutex_unlock(&irq_domain_mutex);
+ mutex_unlock(&domain->root->mutex);
return rv;
}
@@ -1623,8 +1695,8 @@ EXPORT_SYMBOL_GPL(irq_domain_push_irq);
*/
int irq_domain_pop_irq(struct irq_domain *domain, int virq)
{
- struct irq_data *root_irq_data = irq_get_irq_data(virq);
- struct irq_data *child_irq_data;
+ struct irq_data *irq_data = irq_get_irq_data(virq);
+ struct irq_data *parent_irq_data;
struct irq_data *tmp_irq_data;
struct irq_desc *desc;
@@ -1646,37 +1718,37 @@ int irq_domain_pop_irq(struct irq_domain *domain, int virq)
if (domain == NULL)
return -EINVAL;
- if (!root_irq_data)
+ if (!irq_data)
return -EINVAL;
tmp_irq_data = irq_domain_get_irq_data(domain, virq);
/* We can only "pop" if this domain is at the top of the list */
- if (WARN_ON(root_irq_data != tmp_irq_data))
+ if (WARN_ON(irq_data != tmp_irq_data))
return -EINVAL;
- if (WARN_ON(root_irq_data->domain != domain))
+ if (WARN_ON(irq_data->domain != domain))
return -EINVAL;
- child_irq_data = root_irq_data->parent_data;
- if (WARN_ON(!child_irq_data))
+ parent_irq_data = irq_data->parent_data;
+ if (WARN_ON(!parent_irq_data))
return -EINVAL;
- mutex_lock(&irq_domain_mutex);
+ mutex_lock(&domain->root->mutex);
- root_irq_data->parent_data = NULL;
+ irq_data->parent_data = NULL;
- irq_domain_clear_mapping(domain, root_irq_data->hwirq);
+ irq_domain_clear_mapping(domain, irq_data->hwirq);
irq_domain_free_irqs_hierarchy(domain, virq, 1);
/* Restore the original irq_data. */
- *root_irq_data = *child_irq_data;
+ *irq_data = *parent_irq_data;
- irq_domain_fix_revmap(root_irq_data);
+ irq_domain_fix_revmap(irq_data);
- mutex_unlock(&irq_domain_mutex);
+ mutex_unlock(&domain->root->mutex);
- kfree(child_irq_data);
+ kfree(parent_irq_data);
return 0;
}
@@ -1690,17 +1762,20 @@ EXPORT_SYMBOL_GPL(irq_domain_pop_irq);
void irq_domain_free_irqs(unsigned int virq, unsigned int nr_irqs)
{
struct irq_data *data = irq_get_irq_data(virq);
+ struct irq_domain *domain;
int i;
if (WARN(!data || !data->domain || !data->domain->ops->free,
"NULL pointer, cannot free irq\n"))
return;
- mutex_lock(&irq_domain_mutex);
+ domain = data->domain;
+
+ mutex_lock(&domain->root->mutex);
for (i = 0; i < nr_irqs; i++)
irq_domain_remove_irq(virq + i);
- irq_domain_free_irqs_hierarchy(data->domain, virq, nr_irqs);
- mutex_unlock(&irq_domain_mutex);
+ irq_domain_free_irqs_hierarchy(domain, virq, nr_irqs);
+ mutex_unlock(&domain->root->mutex);
irq_domain_free_irq_data(virq, nr_irqs);
irq_free_descs(virq, nr_irqs);
@@ -1865,6 +1940,13 @@ void irq_domain_set_info(struct irq_domain *domain, unsigned int virq,
irq_set_handler_data(virq, handler_data);
}
+static int irq_domain_alloc_irqs_locked(struct irq_domain *domain, int irq_base,
+ unsigned int nr_irqs, int node, void *arg,
+ bool realloc, const struct irq_affinity_desc *affinity)
+{
+ return -EINVAL;
+}
+
static void irq_domain_check_hierarchy(struct irq_domain *domain)
{
}
@@ -1915,7 +1997,7 @@ static void debugfs_add_domain_dir(struct irq_domain *d)
static void debugfs_remove_domain_dir(struct irq_domain *d)
{
- debugfs_remove(debugfs_lookup(d->name, domain_dir));
+ debugfs_lookup_and_remove(d->name, domain_dir);
}
void __init irq_domain_debugfs_init(struct dentry *root)
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 40fe7806cc8c..8ce75495e04f 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -321,7 +321,7 @@ static int irq_try_set_affinity(struct irq_data *data,
}
static bool irq_set_affinity_deactivated(struct irq_data *data,
- const struct cpumask *mask, bool force)
+ const struct cpumask *mask)
{
struct irq_desc *desc = irq_data_to_desc(data);
@@ -354,7 +354,7 @@ int irq_set_affinity_locked(struct irq_data *data, const struct cpumask *mask,
if (!chip || !chip->irq_set_affinity)
return -EINVAL;
- if (irq_set_affinity_deactivated(data, mask, force))
+ if (irq_set_affinity_deactivated(data, mask))
return 0;
if (irq_can_move_pcntxt(data) && !irqd_is_setaffinity_pending(data)) {
@@ -723,10 +723,13 @@ EXPORT_SYMBOL(disable_irq_nosync);
* to complete before returning. If you use this function while
* holding a resource the IRQ handler may need you will deadlock.
*
- * This function may be called - with care - from IRQ context.
+ * Can only be called from preemptible code as it might sleep when
+ * an interrupt thread is associated to @irq.
+ *
*/
void disable_irq(unsigned int irq)
{
+ might_sleep();
if (!__disable_irq_nosync(irq))
synchronize_irq(irq);
}
diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c
index a9ee535293eb..783a3e6a0b10 100644
--- a/kernel/irq/msi.c
+++ b/kernel/irq/msi.c
@@ -19,8 +19,31 @@
#include "internals.h"
+/**
+ * struct msi_ctrl - MSI internal management control structure
+ * @domid: ID of the domain on which management operations should be done
+ * @first: First (hardware) slot index to operate on
+ * @last: Last (hardware) slot index to operate on
+ * @nirqs: The number of Linux interrupts to allocate. Can be larger
+ * than the range due to PCI/multi-MSI.
+ */
+struct msi_ctrl {
+ unsigned int domid;
+ unsigned int first;
+ unsigned int last;
+ unsigned int nirqs;
+};
+
+/* Invalid Xarray index which is outside of any searchable range */
+#define MSI_XA_MAX_INDEX (ULONG_MAX - 1)
+/* The maximum domain size */
+#define MSI_XA_DOMAIN_SIZE (MSI_MAX_INDEX + 1)
+
+static void msi_domain_free_locked(struct device *dev, struct msi_ctrl *ctrl);
+static unsigned int msi_domain_get_hwsize(struct device *dev, unsigned int domid);
static inline int msi_sysfs_create_group(struct device *dev);
+
/**
* msi_alloc_desc - Allocate an initialized msi_desc
* @dev: Pointer to the device for which this is allocated
@@ -33,7 +56,7 @@ static inline int msi_sysfs_create_group(struct device *dev);
* Return: pointer to allocated &msi_desc on success or %NULL on failure
*/
static struct msi_desc *msi_alloc_desc(struct device *dev, int nvec,
- const struct irq_affinity_desc *affinity)
+ const struct irq_affinity_desc *affinity)
{
struct msi_desc *desc = kzalloc(sizeof(*desc), GFP_KERNEL);
@@ -58,25 +81,56 @@ static void msi_free_desc(struct msi_desc *desc)
kfree(desc);
}
-static int msi_insert_desc(struct msi_device_data *md, struct msi_desc *desc, unsigned int index)
+static int msi_insert_desc(struct device *dev, struct msi_desc *desc,
+ unsigned int domid, unsigned int index)
{
+ struct msi_device_data *md = dev->msi.data;
+ struct xarray *xa = &md->__domains[domid].store;
+ unsigned int hwsize;
int ret;
- desc->msi_index = index;
- ret = xa_insert(&md->__store, index, desc, GFP_KERNEL);
- if (ret)
- msi_free_desc(desc);
+ hwsize = msi_domain_get_hwsize(dev, domid);
+
+ if (index == MSI_ANY_INDEX) {
+ struct xa_limit limit = { .min = 0, .max = hwsize - 1 };
+ unsigned int index;
+
+ /* Let the xarray allocate a free index within the limit */
+ ret = xa_alloc(xa, &index, desc, limit, GFP_KERNEL);
+ if (ret)
+ goto fail;
+
+ desc->msi_index = index;
+ return 0;
+ } else {
+ if (index >= hwsize) {
+ ret = -ERANGE;
+ goto fail;
+ }
+
+ desc->msi_index = index;
+ ret = xa_insert(xa, index, desc, GFP_KERNEL);
+ if (ret)
+ goto fail;
+ return 0;
+ }
+fail:
+ msi_free_desc(desc);
return ret;
}
/**
- * msi_add_msi_desc - Allocate and initialize a MSI descriptor
+ * msi_domain_insert_msi_desc - Allocate and initialize a MSI descriptor and
+ * insert it at @init_desc->msi_index
+ *
* @dev: Pointer to the device for which the descriptor is allocated
+ * @domid: The id of the interrupt domain to which the descriptor is added
* @init_desc: Pointer to an MSI descriptor to initialize the new descriptor
*
* Return: 0 on success or an appropriate failure code.
*/
-int msi_add_msi_desc(struct device *dev, struct msi_desc *init_desc)
+int msi_domain_insert_msi_desc(struct device *dev, unsigned int domid,
+ struct msi_desc *init_desc)
{
struct msi_desc *desc;
@@ -88,40 +142,8 @@ int msi_add_msi_desc(struct device *dev, struct msi_desc *init_desc)
/* Copy type specific data to the new descriptor. */
desc->pci = init_desc->pci;
- return msi_insert_desc(dev->msi.data, desc, init_desc->msi_index);
-}
-/**
- * msi_add_simple_msi_descs - Allocate and initialize MSI descriptors
- * @dev: Pointer to the device for which the descriptors are allocated
- * @index: Index for the first MSI descriptor
- * @ndesc: Number of descriptors to allocate
- *
- * Return: 0 on success or an appropriate failure code.
- */
-static int msi_add_simple_msi_descs(struct device *dev, unsigned int index, unsigned int ndesc)
-{
- unsigned int idx, last = index + ndesc - 1;
- struct msi_desc *desc;
- int ret;
-
- lockdep_assert_held(&dev->msi.data->mutex);
-
- for (idx = index; idx <= last; idx++) {
- desc = msi_alloc_desc(dev, 1, NULL);
- if (!desc)
- goto fail_mem;
- ret = msi_insert_desc(dev->msi.data, desc, idx);
- if (ret)
- goto fail;
- }
- return 0;
-
-fail_mem:
- ret = -ENOMEM;
-fail:
- msi_free_msi_descs_range(dev, MSI_DESC_NOTASSOCIATED, index, last);
- return ret;
+ return msi_insert_desc(dev, desc, domid, init_desc->msi_index);
}
static bool msi_desc_match(struct msi_desc *desc, enum msi_desc_filter filter)
@@ -138,28 +160,97 @@ static bool msi_desc_match(struct msi_desc *desc, enum msi_desc_filter filter)
return false;
}
+static bool msi_ctrl_valid(struct device *dev, struct msi_ctrl *ctrl)
+{
+ unsigned int hwsize;
+
+ if (WARN_ON_ONCE(ctrl->domid >= MSI_MAX_DEVICE_IRQDOMAINS ||
+ (dev->msi.domain &&
+ !dev->msi.data->__domains[ctrl->domid].domain)))
+ return false;
+
+ hwsize = msi_domain_get_hwsize(dev, ctrl->domid);
+ if (WARN_ON_ONCE(ctrl->first > ctrl->last ||
+ ctrl->first >= hwsize ||
+ ctrl->last >= hwsize))
+ return false;
+ return true;
+}
+
+static void msi_domain_free_descs(struct device *dev, struct msi_ctrl *ctrl)
+{
+ struct msi_desc *desc;
+ struct xarray *xa;
+ unsigned long idx;
+
+ lockdep_assert_held(&dev->msi.data->mutex);
+
+ if (!msi_ctrl_valid(dev, ctrl))
+ return;
+
+ xa = &dev->msi.data->__domains[ctrl->domid].store;
+ xa_for_each_range(xa, idx, desc, ctrl->first, ctrl->last) {
+ xa_erase(xa, idx);
+
+ /* Leak the descriptor when it is still referenced */
+ if (WARN_ON_ONCE(msi_desc_match(desc, MSI_DESC_ASSOCIATED)))
+ continue;
+ msi_free_desc(desc);
+ }
+}
+
/**
- * msi_free_msi_descs_range - Free MSI descriptors of a device
- * @dev: Device to free the descriptors
- * @filter: Descriptor state filter
- * @first_index: Index to start freeing from
- * @last_index: Last index to be freed
+ * msi_domain_free_msi_descs_range - Free a range of MSI descriptors of a device in an irqdomain
+ * @dev: Device for which to free the descriptors
+ * @domid: Id of the domain to operate on
+ * @first: Index to start freeing from (inclusive)
+ * @last: Last index to be freed (inclusive)
*/
-void msi_free_msi_descs_range(struct device *dev, enum msi_desc_filter filter,
- unsigned int first_index, unsigned int last_index)
+void msi_domain_free_msi_descs_range(struct device *dev, unsigned int domid,
+ unsigned int first, unsigned int last)
+{
+ struct msi_ctrl ctrl = {
+ .domid = domid,
+ .first = first,
+ .last = last,
+ };
+
+ msi_domain_free_descs(dev, &ctrl);
+}
+
+/**
+ * msi_domain_add_simple_msi_descs - Allocate and initialize MSI descriptors
+ * @dev: Pointer to the device for which the descriptors are allocated
+ * @ctrl: Allocation control struct
+ *
+ * Return: 0 on success or an appropriate failure code.
+ */
+static int msi_domain_add_simple_msi_descs(struct device *dev, struct msi_ctrl *ctrl)
{
- struct xarray *xa = &dev->msi.data->__store;
struct msi_desc *desc;
- unsigned long idx;
+ unsigned int idx;
+ int ret;
lockdep_assert_held(&dev->msi.data->mutex);
- xa_for_each_range(xa, idx, desc, first_index, last_index) {
- if (msi_desc_match(desc, filter)) {
- xa_erase(xa, idx);
- msi_free_desc(desc);
- }
+ if (!msi_ctrl_valid(dev, ctrl))
+ return -EINVAL;
+
+ for (idx = ctrl->first; idx <= ctrl->last; idx++) {
+ desc = msi_alloc_desc(dev, 1, NULL);
+ if (!desc)
+ goto fail_mem;
+ ret = msi_insert_desc(dev, desc, ctrl->domid, idx);
+ if (ret)
+ goto fail;
}
+ return 0;
+
+fail_mem:
+ ret = -ENOMEM;
+fail:
+ msi_domain_free_descs(dev, ctrl);
+ return ret;
}
void __get_cached_msi_msg(struct msi_desc *entry, struct msi_msg *msg)
@@ -178,9 +269,13 @@ EXPORT_SYMBOL_GPL(get_cached_msi_msg);
static void msi_device_data_release(struct device *dev, void *res)
{
struct msi_device_data *md = res;
+ int i;
- WARN_ON_ONCE(!xa_empty(&md->__store));
- xa_destroy(&md->__store);
+ for (i = 0; i < MSI_MAX_DEVICE_IRQDOMAINS; i++) {
+ msi_remove_device_irq_domain(dev, i);
+ WARN_ON_ONCE(!xa_empty(&md->__domains[i].store));
+ xa_destroy(&md->__domains[i].store);
+ }
dev->msi.data = NULL;
}
@@ -197,7 +292,7 @@ static void msi_device_data_release(struct device *dev, void *res)
int msi_setup_device_data(struct device *dev)
{
struct msi_device_data *md;
- int ret;
+ int ret, i;
if (dev->msi.data)
return 0;
@@ -212,7 +307,18 @@ int msi_setup_device_data(struct device *dev)
return ret;
}
- xa_init(&md->__store);
+ for (i = 0; i < MSI_MAX_DEVICE_IRQDOMAINS; i++)
+ xa_init_flags(&md->__domains[i].store, XA_FLAGS_ALLOC);
+
+ /*
+ * If @dev::msi::domain is set and is a global MSI domain, copy the
+ * pointer into the domain array so all code can operate on domain
+ * ids. The NULL pointer check is required to keep the legacy
+ * architecture specific PCI/MSI support working.
+ */
+ if (dev->msi.domain && !irq_domain_is_msi_parent(dev->msi.domain))
+ md->__domains[MSI_DEFAULT_DOMAIN].domain = dev->msi.domain;
+
mutex_init(&md->mutex);
dev->msi.data = md;
devres_add(dev, md);
@@ -235,27 +341,30 @@ EXPORT_SYMBOL_GPL(msi_lock_descs);
*/
void msi_unlock_descs(struct device *dev)
{
- /* Invalidate the index wich was cached by the iterator */
- dev->msi.data->__iter_idx = MSI_MAX_INDEX;
+ /* Invalidate the index which was cached by the iterator */
+ dev->msi.data->__iter_idx = MSI_XA_MAX_INDEX;
mutex_unlock(&dev->msi.data->mutex);
}
EXPORT_SYMBOL_GPL(msi_unlock_descs);
-static struct msi_desc *msi_find_desc(struct msi_device_data *md, enum msi_desc_filter filter)
+static struct msi_desc *msi_find_desc(struct msi_device_data *md, unsigned int domid,
+ enum msi_desc_filter filter)
{
+ struct xarray *xa = &md->__domains[domid].store;
struct msi_desc *desc;
- xa_for_each_start(&md->__store, md->__iter_idx, desc, md->__iter_idx) {
+ xa_for_each_start(xa, md->__iter_idx, desc, md->__iter_idx) {
if (msi_desc_match(desc, filter))
return desc;
}
- md->__iter_idx = MSI_MAX_INDEX;
+ md->__iter_idx = MSI_XA_MAX_INDEX;
return NULL;
}
/**
- * msi_first_desc - Get the first MSI descriptor of a device
+ * msi_domain_first_desc - Get the first MSI descriptor of an irqdomain associated to a device
* @dev: Device to operate on
+ * @domid: The id of the interrupt domain which should be walked.
* @filter: Descriptor state filter
*
* Must be called with the MSI descriptor mutex held, i.e. msi_lock_descs()
@@ -264,23 +373,26 @@ static struct msi_desc *msi_find_desc(struct msi_device_data *md, enum msi_desc_
* Return: Pointer to the first MSI descriptor matching the search
* criteria, NULL if none found.
*/
-struct msi_desc *msi_first_desc(struct device *dev, enum msi_desc_filter filter)
+struct msi_desc *msi_domain_first_desc(struct device *dev, unsigned int domid,
+ enum msi_desc_filter filter)
{
struct msi_device_data *md = dev->msi.data;
- if (WARN_ON_ONCE(!md))
+ if (WARN_ON_ONCE(!md || domid >= MSI_MAX_DEVICE_IRQDOMAINS))
return NULL;
lockdep_assert_held(&md->mutex);
md->__iter_idx = 0;
- return msi_find_desc(md, filter);
+ return msi_find_desc(md, domid, filter);
}
-EXPORT_SYMBOL_GPL(msi_first_desc);
+EXPORT_SYMBOL_GPL(msi_domain_first_desc);
/**
* msi_next_desc - Get the next MSI descriptor of a device
* @dev: Device to operate on
+ * @domid: The id of the interrupt domain which should be walked.
+ * @filter: Descriptor state filter
*
* The first invocation of msi_next_desc() has to be preceeded by a
* successful invocation of __msi_first_desc(). Consecutive invocations are
@@ -290,11 +402,12 @@ EXPORT_SYMBOL_GPL(msi_first_desc);
* Return: Pointer to the next MSI descriptor matching the search
* criteria, NULL if none found.
*/
-struct msi_desc *msi_next_desc(struct device *dev, enum msi_desc_filter filter)
+struct msi_desc *msi_next_desc(struct device *dev, unsigned int domid,
+ enum msi_desc_filter filter)
{
struct msi_device_data *md = dev->msi.data;
- if (WARN_ON_ONCE(!md))
+ if (WARN_ON_ONCE(!md || domid >= MSI_MAX_DEVICE_IRQDOMAINS))
return NULL;
lockdep_assert_held(&md->mutex);
@@ -303,30 +416,38 @@ struct msi_desc *msi_next_desc(struct device *dev, enum msi_desc_filter filter)
return NULL;
md->__iter_idx++;
- return msi_find_desc(md, filter);
+ return msi_find_desc(md, domid, filter);
}
EXPORT_SYMBOL_GPL(msi_next_desc);
/**
- * msi_get_virq - Return Linux interrupt number of a MSI interrupt
+ * msi_domain_get_virq - Lookup the Linux interrupt number for a MSI index on an interrupt domain
* @dev: Device to operate on
+ * @domid: Domain ID of the interrupt domain associated to the device
* @index: MSI interrupt index to look for (0-based)
*
* Return: The Linux interrupt number on success (> 0), 0 if not found
*/
-unsigned int msi_get_virq(struct device *dev, unsigned int index)
+unsigned int msi_domain_get_virq(struct device *dev, unsigned int domid, unsigned int index)
{
struct msi_desc *desc;
unsigned int ret = 0;
- bool pcimsi;
+ bool pcimsi = false;
+ struct xarray *xa;
if (!dev->msi.data)
return 0;
- pcimsi = dev_is_pci(dev) ? to_pci_dev(dev)->msi_enabled : false;
+ if (WARN_ON_ONCE(index > MSI_MAX_INDEX || domid >= MSI_MAX_DEVICE_IRQDOMAINS))
+ return 0;
+
+ /* This check is only valid for the PCI default MSI domain */
+ if (dev_is_pci(dev) && domid == MSI_DEFAULT_DOMAIN)
+ pcimsi = to_pci_dev(dev)->msi_enabled;
msi_lock_descs(dev);
- desc = xa_load(&dev->msi.data->__store, pcimsi ? 0 : index);
+ xa = &dev->msi.data->__domains[domid].store;
+ desc = xa_load(xa, pcimsi ? 0 : index);
if (desc && desc->irq) {
/*
* PCI-MSI has only one descriptor for multiple interrupts.
@@ -340,10 +461,11 @@ unsigned int msi_get_virq(struct device *dev, unsigned int index)
ret = desc->irq;
}
}
+
msi_unlock_descs(dev);
return ret;
}
-EXPORT_SYMBOL_GPL(msi_get_virq);
+EXPORT_SYMBOL_GPL(msi_domain_get_virq);
#ifdef CONFIG_SYSFS
static struct attribute *msi_dev_attrs[] = {
@@ -459,7 +581,39 @@ static inline int msi_sysfs_populate_desc(struct device *dev, struct msi_desc *d
static inline void msi_sysfs_remove_desc(struct device *dev, struct msi_desc *desc) { }
#endif /* !CONFIG_SYSFS */
-#ifdef CONFIG_GENERIC_MSI_IRQ_DOMAIN
+static struct irq_domain *msi_get_device_domain(struct device *dev, unsigned int domid)
+{
+ struct irq_domain *domain;
+
+ lockdep_assert_held(&dev->msi.data->mutex);
+
+ if (WARN_ON_ONCE(domid >= MSI_MAX_DEVICE_IRQDOMAINS))
+ return NULL;
+
+ domain = dev->msi.data->__domains[domid].domain;
+ if (!domain)
+ return NULL;
+
+ if (WARN_ON_ONCE(irq_domain_is_msi_parent(domain)))
+ return NULL;
+
+ return domain;
+}
+
+static unsigned int msi_domain_get_hwsize(struct device *dev, unsigned int domid)
+{
+ struct msi_domain_info *info;
+ struct irq_domain *domain;
+
+ domain = msi_get_device_domain(dev, domid);
+ if (domain) {
+ info = domain->host_data;
+ return info->hwsize;
+ }
+ /* No domain, default to MSI_XA_DOMAIN_SIZE */
+ return MSI_XA_DOMAIN_SIZE;
+}
+
static inline void irq_chip_write_msi_msg(struct irq_data *data,
struct msi_msg *msg)
{
@@ -613,21 +767,11 @@ static int msi_domain_ops_init(struct irq_domain *domain,
return 0;
}
-static int msi_domain_ops_check(struct irq_domain *domain,
- struct msi_domain_info *info,
- struct device *dev)
-{
- return 0;
-}
-
static struct msi_domain_ops msi_domain_ops_default = {
.get_hwirq = msi_domain_ops_get_hwirq,
.msi_init = msi_domain_ops_init,
- .msi_check = msi_domain_ops_check,
.msi_prepare = msi_domain_ops_prepare,
.set_desc = msi_domain_ops_set_desc,
- .domain_alloc_irqs = __msi_domain_alloc_irqs,
- .domain_free_irqs = __msi_domain_free_irqs,
};
static void msi_domain_update_dom_ops(struct msi_domain_info *info)
@@ -639,11 +783,6 @@ static void msi_domain_update_dom_ops(struct msi_domain_info *info)
return;
}
- if (ops->domain_alloc_irqs == NULL)
- ops->domain_alloc_irqs = msi_domain_ops_default.domain_alloc_irqs;
- if (ops->domain_free_irqs == NULL)
- ops->domain_free_irqs = msi_domain_ops_default.domain_free_irqs;
-
if (!(info->flags & MSI_FLAG_USE_DEF_DOM_OPS))
return;
@@ -651,8 +790,6 @@ static void msi_domain_update_dom_ops(struct msi_domain_info *info)
ops->get_hwirq = msi_domain_ops_default.get_hwirq;
if (ops->msi_init == NULL)
ops->msi_init = msi_domain_ops_default.msi_init;
- if (ops->msi_check == NULL)
- ops->msi_check = msi_domain_ops_default.msi_check;
if (ops->msi_prepare == NULL)
ops->msi_prepare = msi_domain_ops_default.msi_prepare;
if (ops->set_desc == NULL)
@@ -668,6 +805,40 @@ static void msi_domain_update_chip_ops(struct msi_domain_info *info)
chip->irq_set_affinity = msi_domain_set_affinity;
}
+static struct irq_domain *__msi_create_irq_domain(struct fwnode_handle *fwnode,
+ struct msi_domain_info *info,
+ unsigned int flags,
+ struct irq_domain *parent)
+{
+ struct irq_domain *domain;
+
+ if (info->hwsize > MSI_XA_DOMAIN_SIZE)
+ return NULL;
+
+ /*
+ * Hardware size 0 is valid for backwards compatibility and for
+ * domains which are not backed by a hardware table. Grant the
+ * maximum index space.
+ */
+ if (!info->hwsize)
+ info->hwsize = MSI_XA_DOMAIN_SIZE;
+
+ msi_domain_update_dom_ops(info);
+ if (info->flags & MSI_FLAG_USE_DEF_CHIP_OPS)
+ msi_domain_update_chip_ops(info);
+
+ domain = irq_domain_create_hierarchy(parent, flags | IRQ_DOMAIN_FLAG_MSI, 0,
+ fwnode, &msi_domain_ops, info);
+
+ if (domain) {
+ if (!domain->name && info->chip)
+ domain->name = info->chip->name;
+ irq_domain_update_bus_token(domain, info->bus_token);
+ }
+
+ return domain;
+}
+
/**
* msi_create_irq_domain - Create an MSI interrupt domain
* @fwnode: Optional fwnode of the interrupt controller
@@ -680,19 +851,214 @@ struct irq_domain *msi_create_irq_domain(struct fwnode_handle *fwnode,
struct msi_domain_info *info,
struct irq_domain *parent)
{
+ return __msi_create_irq_domain(fwnode, info, 0, parent);
+}
+
+/**
+ * msi_parent_init_dev_msi_info - Delegate initialization of device MSI info down
+ * in the domain hierarchy
+ * @dev: The device for which the domain should be created
+ * @domain: The domain in the hierarchy this op is being called on
+ * @msi_parent_domain: The IRQ_DOMAIN_FLAG_MSI_PARENT domain for the child to
+ * be created
+ * @msi_child_info: The MSI domain info of the IRQ_DOMAIN_FLAG_MSI_DEVICE
+ * domain to be created
+ *
+ * Return: true on success, false otherwise
+ *
+ * This is the most complex problem of per device MSI domains and the
+ * underlying interrupt domain hierarchy:
+ *
+ * The device domain to be initialized requests the broadest feature set
+ * possible and the underlying domain hierarchy puts restrictions on it.
+ *
+ * That's trivial for a simple parent->child relationship, but it gets
+ * interesting with an intermediate domain: root->parent->child. The
+ * intermediate 'parent' can expand the capabilities which the 'root'
+ * domain is providing. So that creates a classic chicken-and-egg problem:
+ * Which entity is doing the restrictions/expansions?
+ *
+ * One solution is to let the root domain handle the initialization; that's
+ * why there are the @domain and the @msi_parent_domain pointers.
+ */
+bool msi_parent_init_dev_msi_info(struct device *dev, struct irq_domain *domain,
+ struct irq_domain *msi_parent_domain,
+ struct msi_domain_info *msi_child_info)
+{
+ struct irq_domain *parent = domain->parent;
+
+ if (WARN_ON_ONCE(!parent || !parent->msi_parent_ops ||
+ !parent->msi_parent_ops->init_dev_msi_info))
+ return false;
+
+ return parent->msi_parent_ops->init_dev_msi_info(dev, parent, msi_parent_domain,
+ msi_child_info);
+}
+
+/**
+ * msi_create_device_irq_domain - Create a device MSI interrupt domain
+ * @dev: Pointer to the device
+ * @domid: Domain id
+ * @template: MSI domain info bundle used as template
+ * @hwsize: Maximum number of MSI table entries (0 if unknown or unlimited)
+ * @domain_data: Optional pointer to domain specific data which is set in
+ * msi_domain_info::data
+ * @chip_data: Optional pointer to chip specific data which is set in
+ * msi_domain_info::chip_data
+ *
+ * Return: True on success, false otherwise
+ *
+ * There is no firmware node required for this interface because the per
+ * device domains are software constructs which are actually closer to the
+ * hardware reality than any firmware can describe them.
+ *
+ * The domain name and the irq chip name for a MSI device domain are
+ * composed by: "$(PREFIX)$(CHIPNAME)-$(DEVNAME)"
+ *
+ * $PREFIX: Optional prefix provided by the underlying MSI parent domain
+ * via msi_parent_ops::prefix. If that pointer is NULL the prefix
+ * is empty.
+ * $CHIPNAME: The name of the irq_chip in @template
+ * $DEVNAME: The name of the device
+ *
+ * This results in understandable chip names and hardware interrupt numbers
+ * in e.g. /proc/interrupts
+ *
+ * PCI-MSI-0000:00:1c.0 0-edge Parent domain has no prefix
+ * IR-PCI-MSI-0000:00:1c.4 0-edge Same with interrupt remapping prefix 'IR-'
+ *
+ * IR-PCI-MSIX-0000:3d:00.0 0-edge Hardware interrupt numbers reflect
+ * IR-PCI-MSIX-0000:3d:00.0 1-edge the real MSI-X index on that device
+ * IR-PCI-MSIX-0000:3d:00.0 2-edge
+ *
+ * On IMS domains the hardware interrupt number is either a table entry
+ * index or a purely software managed index but it is guaranteed to be
+ * unique.
+ *
+ * The domain pointer is stored in @dev::msi::data::__domains[]. All
+ * subsequent operations on the domain depend on the domain id.
+ *
+ * The domain is automatically freed when the device is removed via devres
+ * in the context of @dev::msi::data freeing, but it can also be
+ * independently removed via msi_remove_device_irq_domain().
+ */
+bool msi_create_device_irq_domain(struct device *dev, unsigned int domid,
+ const struct msi_domain_template *template,
+ unsigned int hwsize, void *domain_data,
+ void *chip_data)
+{
+ struct irq_domain *domain, *parent = dev->msi.domain;
+ const struct msi_parent_ops *pops;
+ struct msi_domain_template *bundle;
+ struct fwnode_handle *fwnode;
+
+ if (!irq_domain_is_msi_parent(parent))
+ return false;
+
+ if (domid >= MSI_MAX_DEVICE_IRQDOMAINS)
+ return false;
+
+ bundle = kmemdup(template, sizeof(*bundle), GFP_KERNEL);
+ if (!bundle)
+ return false;
+
+ bundle->info.hwsize = hwsize;
+ bundle->info.chip = &bundle->chip;
+ bundle->info.ops = &bundle->ops;
+ bundle->info.data = domain_data;
+ bundle->info.chip_data = chip_data;
+
+ pops = parent->msi_parent_ops;
+ snprintf(bundle->name, sizeof(bundle->name), "%s%s-%s",
+ pops->prefix ? : "", bundle->chip.name, dev_name(dev));
+ bundle->chip.name = bundle->name;
+
+ fwnode = irq_domain_alloc_named_fwnode(bundle->name);
+ if (!fwnode)
+ goto free_bundle;
+
+ if (msi_setup_device_data(dev))
+ goto free_fwnode;
+
+ msi_lock_descs(dev);
+
+ if (WARN_ON_ONCE(msi_get_device_domain(dev, domid)))
+ goto fail;
+
+ if (!pops->init_dev_msi_info(dev, parent, parent, &bundle->info))
+ goto fail;
+
+ domain = __msi_create_irq_domain(fwnode, &bundle->info, IRQ_DOMAIN_FLAG_MSI_DEVICE, parent);
+ if (!domain)
+ goto fail;
+
+ domain->dev = dev;
+ dev->msi.data->__domains[domid].domain = domain;
+ msi_unlock_descs(dev);
+ return true;
+
+fail:
+ msi_unlock_descs(dev);
+free_fwnode:
+ irq_domain_free_fwnode(fwnode);
+free_bundle:
+ kfree(bundle);
+ return false;
+}
+
+/**
+ * msi_remove_device_irq_domain - Free a device MSI interrupt domain
+ * @dev: Pointer to the device
+ * @domid: Domain id
+ */
+void msi_remove_device_irq_domain(struct device *dev, unsigned int domid)
+{
+ struct fwnode_handle *fwnode = NULL;
+ struct msi_domain_info *info;
struct irq_domain *domain;
- msi_domain_update_dom_ops(info);
- if (info->flags & MSI_FLAG_USE_DEF_CHIP_OPS)
- msi_domain_update_chip_ops(info);
+ msi_lock_descs(dev);
- domain = irq_domain_create_hierarchy(parent, IRQ_DOMAIN_FLAG_MSI, 0,
- fwnode, &msi_domain_ops, info);
+ domain = msi_get_device_domain(dev, domid);
- if (domain && !domain->name && info->chip)
- domain->name = info->chip->name;
+ if (!domain || !irq_domain_is_msi_device(domain))
+ goto unlock;
- return domain;
+ dev->msi.data->__domains[domid].domain = NULL;
+ info = domain->host_data;
+ if (irq_domain_is_msi_device(domain))
+ fwnode = domain->fwnode;
+ irq_domain_remove(domain);
+ irq_domain_free_fwnode(fwnode);
+ kfree(container_of(info, struct msi_domain_template, info));
+
+unlock:
+ msi_unlock_descs(dev);
+}
+
+/**
+ * msi_match_device_irq_domain - Match a device irq domain against a bus token
+ * @dev: Pointer to the device
+ * @domid: Domain id
+ * @bus_token: Bus token to match against the domain bus token
+ *
+ * Return: True if device domain exists and bus tokens match.
+ */
+bool msi_match_device_irq_domain(struct device *dev, unsigned int domid,
+ enum irq_domain_bus_token bus_token)
+{
+ struct msi_domain_info *info;
+ struct irq_domain *domain;
+ bool ret = false;
+
+ msi_lock_descs(dev);
+ domain = msi_get_device_domain(dev, domid);
+ if (domain && irq_domain_is_msi_device(domain)) {
+ info = domain->host_data;
+ ret = info->bus_token == bus_token;
+ }
+ msi_unlock_descs(dev);
+ return ret;
}
int msi_domain_prepare_irqs(struct irq_domain *domain, struct device *dev,
@@ -700,13 +1066,8 @@ int msi_domain_prepare_irqs(struct irq_domain *domain, struct device *dev,
{
struct msi_domain_info *info = domain->host_data;
struct msi_domain_ops *ops = info->ops;
- int ret;
-
- ret = ops->msi_check(domain, info, dev);
- if (ret == 0)
- ret = ops->msi_prepare(domain, dev, nvec, arg);
- return ret;
+ return ops->msi_prepare(domain, dev, nvec, arg);
}
int msi_domain_populate_irqs(struct irq_domain *domain, struct device *dev,
@@ -714,16 +1075,27 @@ int msi_domain_populate_irqs(struct irq_domain *domain, struct device *dev,
{
struct msi_domain_info *info = domain->host_data;
struct msi_domain_ops *ops = info->ops;
+ struct msi_ctrl ctrl = {
+ .domid = MSI_DEFAULT_DOMAIN,
+ .first = virq_base,
+ .last = virq_base + nvec - 1,
+ };
struct msi_desc *desc;
+ struct xarray *xa;
int ret, virq;
+ if (!msi_ctrl_valid(dev, &ctrl))
+ return -EINVAL;
+
msi_lock_descs(dev);
- ret = msi_add_simple_msi_descs(dev, virq_base, nvec);
+ ret = msi_domain_add_simple_msi_descs(dev, &ctrl);
if (ret)
goto unlock;
+ xa = &dev->msi.data->__domains[ctrl.domid].store;
+
for (virq = virq_base; virq < virq_base + nvec; virq++) {
- desc = xa_load(&dev->msi.data->__store, virq);
+ desc = xa_load(xa, virq);
desc->irq = virq;
ops->set_desc(arg, desc);
@@ -739,7 +1111,7 @@ int msi_domain_populate_irqs(struct irq_domain *domain, struct device *dev,
fail:
for (--virq; virq >= virq_base; virq--)
irq_domain_free_irqs_common(domain, virq, 1);
- msi_free_msi_descs_range(dev, MSI_DESC_ALL, virq_base, virq_base + nvec - 1);
+ msi_domain_free_descs(dev, &ctrl);
unlock:
msi_unlock_descs(dev);
return ret;
@@ -764,6 +1136,8 @@ static bool msi_check_reservation_mode(struct irq_domain *domain,
switch(domain->bus_token) {
case DOMAIN_BUS_PCI_MSI:
+ case DOMAIN_BUS_PCI_DEVICE_MSI:
+ case DOMAIN_BUS_PCI_DEVICE_MSIX:
case DOMAIN_BUS_VMD_MSI:
break;
default:
@@ -789,6 +1163,8 @@ static int msi_handle_pci_fail(struct irq_domain *domain, struct msi_desc *desc,
{
switch(domain->bus_token) {
case DOMAIN_BUS_PCI_MSI:
+ case DOMAIN_BUS_PCI_DEVICE_MSI:
+ case DOMAIN_BUS_PCI_DEVICE_MSIX:
case DOMAIN_BUS_VMD_MSI:
if (IS_ENABLED(CONFIG_PCI_MSI))
break;
@@ -850,18 +1226,19 @@ static int msi_init_virq(struct irq_domain *domain, int virq, unsigned int vflag
return 0;
}
-int __msi_domain_alloc_irqs(struct irq_domain *domain, struct device *dev,
- int nvec)
+static int __msi_domain_alloc_irqs(struct device *dev, struct irq_domain *domain,
+ struct msi_ctrl *ctrl)
{
+ struct xarray *xa = &dev->msi.data->__domains[ctrl->domid].store;
struct msi_domain_info *info = domain->host_data;
struct msi_domain_ops *ops = info->ops;
+ unsigned int vflags = 0, allocated = 0;
msi_alloc_info_t arg = { };
- unsigned int vflags = 0;
struct msi_desc *desc;
- int allocated = 0;
+ unsigned long idx;
int i, ret, virq;
- ret = msi_domain_prepare_irqs(domain, dev, nvec, &arg);
+ ret = msi_domain_prepare_irqs(domain, dev, ctrl->nirqs, &arg);
if (ret)
return ret;
@@ -883,11 +1260,21 @@ int __msi_domain_alloc_irqs(struct irq_domain *domain, struct device *dev,
* MSI affinity setting requires a special quirk (X86) when
* reservation mode is active.
*/
- if (domain->flags & IRQ_DOMAIN_MSI_NOMASK_QUIRK)
+ if (info->flags & MSI_FLAG_NOMASK_QUIRK)
vflags |= VIRQ_NOMASK_QUIRK;
}
- msi_for_each_desc(desc, dev, MSI_DESC_NOTASSOCIATED) {
+ xa_for_each_range(xa, idx, desc, ctrl->first, ctrl->last) {
+ if (!msi_desc_match(desc, MSI_DESC_NOTASSOCIATED))
+ continue;
+
+ /* This should return -ECONFUSED... */
+ if (WARN_ON_ONCE(allocated >= ctrl->nirqs))
+ return -EINVAL;
+
+ if (ops->prepare_desc)
+ ops->prepare_desc(domain, &arg, desc);
+
ops->set_desc(&arg, desc);
virq = __irq_domain_alloc_irqs(domain, -1, desc->nvec_used,
@@ -913,76 +1300,213 @@ int __msi_domain_alloc_irqs(struct irq_domain *domain, struct device *dev,
return 0;
}
-static int msi_domain_add_simple_msi_descs(struct msi_domain_info *info,
- struct device *dev,
- unsigned int num_descs)
+static int msi_domain_alloc_simple_msi_descs(struct device *dev,
+ struct msi_domain_info *info,
+ struct msi_ctrl *ctrl)
{
if (!(info->flags & MSI_FLAG_ALLOC_SIMPLE_MSI_DESCS))
return 0;
- return msi_add_simple_msi_descs(dev, 0, num_descs);
+ return msi_domain_add_simple_msi_descs(dev, ctrl);
+}
+
+static int __msi_domain_alloc_locked(struct device *dev, struct msi_ctrl *ctrl)
+{
+ struct msi_domain_info *info;
+ struct msi_domain_ops *ops;
+ struct irq_domain *domain;
+ int ret;
+
+ if (!msi_ctrl_valid(dev, ctrl))
+ return -EINVAL;
+
+ domain = msi_get_device_domain(dev, ctrl->domid);
+ if (!domain)
+ return -ENODEV;
+
+ info = domain->host_data;
+
+ ret = msi_domain_alloc_simple_msi_descs(dev, info, ctrl);
+ if (ret)
+ return ret;
+
+ ops = info->ops;
+ if (ops->domain_alloc_irqs)
+ return ops->domain_alloc_irqs(domain, dev, ctrl->nirqs);
+
+ return __msi_domain_alloc_irqs(dev, domain, ctrl);
+}
+
+static int msi_domain_alloc_locked(struct device *dev, struct msi_ctrl *ctrl)
+{
+ int ret = __msi_domain_alloc_locked(dev, ctrl);
+
+ if (ret)
+ msi_domain_free_locked(dev, ctrl);
+ return ret;
}
/**
- * msi_domain_alloc_irqs_descs_locked - Allocate interrupts from a MSI interrupt domain
- * @domain: The domain to allocate from
+ * msi_domain_alloc_irqs_range_locked - Allocate interrupts from a MSI interrupt domain
* @dev: Pointer to device struct of the device for which the interrupts
* are allocated
- * @nvec: The number of interrupts to allocate
+ * @domid: Id of the interrupt domain to operate on
+ * @first: First index to allocate (inclusive)
+ * @last: Last index to allocate (inclusive)
*
* Must be invoked from within a msi_lock_descs() / msi_unlock_descs()
- * pair. Use this for MSI irqdomains which implement their own vector
+ * pair. Use this for MSI irqdomains which implement their own descriptor
* allocation/free.
*
* Return: %0 on success or an error code.
*/
-int msi_domain_alloc_irqs_descs_locked(struct irq_domain *domain, struct device *dev,
- int nvec)
+int msi_domain_alloc_irqs_range_locked(struct device *dev, unsigned int domid,
+ unsigned int first, unsigned int last)
{
- struct msi_domain_info *info = domain->host_data;
- struct msi_domain_ops *ops = info->ops;
- int ret;
-
- lockdep_assert_held(&dev->msi.data->mutex);
+ struct msi_ctrl ctrl = {
+ .domid = domid,
+ .first = first,
+ .last = last,
+ .nirqs = last + 1 - first,
+ };
+
+ return msi_domain_alloc_locked(dev, &ctrl);
+}
- ret = msi_domain_add_simple_msi_descs(info, dev, nvec);
- if (ret)
- return ret;
+/**
+ * msi_domain_alloc_irqs_range - Allocate interrupts from a MSI interrupt domain
+ * @dev: Pointer to device struct of the device for which the interrupts
+ * are allocated
+ * @domid: Id of the interrupt domain to operate on
+ * @first: First index to allocate (inclusive)
+ * @last: Last index to allocate (inclusive)
+ *
+ * Return: %0 on success or an error code.
+ */
+int msi_domain_alloc_irqs_range(struct device *dev, unsigned int domid,
+ unsigned int first, unsigned int last)
+{
+ int ret;
- ret = ops->domain_alloc_irqs(domain, dev, nvec);
- if (ret)
- msi_domain_free_irqs_descs_locked(domain, dev);
+ msi_lock_descs(dev);
+ ret = msi_domain_alloc_irqs_range_locked(dev, domid, first, last);
+ msi_unlock_descs(dev);
return ret;
}
/**
- * msi_domain_alloc_irqs - Allocate interrupts from a MSI interrupt domain
- * @domain: The domain to allocate from
+ * msi_domain_alloc_irqs_all_locked - Allocate all interrupts from a MSI interrupt domain
+ *
* @dev: Pointer to device struct of the device for which the interrupts
* are allocated
- * @nvec: The number of interrupts to allocate
+ * @domid: Id of the interrupt domain to operate on
+ * @nirqs: The number of interrupts to allocate
+ *
+ * This function scans all MSI descriptors of the MSI domain and allocates interrupts
+ * for all unassigned ones. It is intended for MSI domains where the descriptor
+ * allocation is handled at the call site, e.g. PCI/MSI[X].
*
* Return: %0 on success or an error code.
*/
-int msi_domain_alloc_irqs(struct irq_domain *domain, struct device *dev, int nvec)
+int msi_domain_alloc_irqs_all_locked(struct device *dev, unsigned int domid, int nirqs)
{
+ struct msi_ctrl ctrl = {
+ .domid = domid,
+ .first = 0,
+ .last = msi_domain_get_hwsize(dev, domid) - 1,
+ .nirqs = nirqs,
+ };
+
+ return msi_domain_alloc_locked(dev, &ctrl);
+}
+
+/**
+ * msi_domain_alloc_irq_at - Allocate an interrupt from a MSI interrupt domain at
+ * a given index - or at the next free index
+ *
+ * @dev: Pointer to device struct of the device for which the interrupts
+ * are allocated
+ * @domid: Id of the interrupt domain to operate on
+ * @index: Index for allocation. If @index == %MSI_ANY_INDEX the allocation
+ * uses the next free index.
+ * @affdesc: Optional pointer to an interrupt affinity descriptor structure
+ * @icookie: Optional pointer to a domain specific per instance cookie. If
+ * non-NULL the content of the cookie is stored in msi_desc::data.
+ * Must be NULL for MSI-X allocations
+ *
+ * This requires a MSI interrupt domain which lets the core code manage the
+ * MSI descriptors.
+ *
+ * Return: struct msi_map
+ *
+ * On success msi_map::index contains the allocated index number and
+ * msi_map::virq the corresponding Linux interrupt number
+ *
+ * On failure msi_map::index contains the error code and msi_map::virq
+ * is %0.
+ */
+struct msi_map msi_domain_alloc_irq_at(struct device *dev, unsigned int domid, unsigned int index,
+ const struct irq_affinity_desc *affdesc,
+ union msi_instance_cookie *icookie)
+{
+ struct msi_ctrl ctrl = { .domid = domid, .nirqs = 1, };
+ struct irq_domain *domain;
+ struct msi_map map = { };
+ struct msi_desc *desc;
int ret;
msi_lock_descs(dev);
- ret = msi_domain_alloc_irqs_descs_locked(domain, dev, nvec);
+ domain = msi_get_device_domain(dev, domid);
+ if (!domain) {
+ map.index = -ENODEV;
+ goto unlock;
+ }
+
+ desc = msi_alloc_desc(dev, 1, affdesc);
+ if (!desc) {
+ map.index = -ENOMEM;
+ goto unlock;
+ }
+
+ if (icookie)
+ desc->data.icookie = *icookie;
+
+ ret = msi_insert_desc(dev, desc, domid, index);
+ if (ret) {
+ map.index = ret;
+ goto unlock;
+ }
+
+ ctrl.first = ctrl.last = desc->msi_index;
+
+ ret = __msi_domain_alloc_irqs(dev, domain, &ctrl);
+ if (ret) {
+ map.index = ret;
+ msi_domain_free_locked(dev, &ctrl);
+ } else {
+ map.index = desc->msi_index;
+ map.virq = desc->irq;
+ }
+unlock:
msi_unlock_descs(dev);
- return ret;
+ return map;
}
-void __msi_domain_free_irqs(struct irq_domain *domain, struct device *dev)
+static void __msi_domain_free_irqs(struct device *dev, struct irq_domain *domain,
+ struct msi_ctrl *ctrl)
{
+ struct xarray *xa = &dev->msi.data->__domains[ctrl->domid].store;
struct msi_domain_info *info = domain->host_data;
struct irq_data *irqd;
struct msi_desc *desc;
+ unsigned long idx;
int i;
- /* Only handle MSI entries which have an interrupt associated */
- msi_for_each_desc(desc, dev, MSI_DESC_ASSOCIATED) {
+ xa_for_each_range(xa, idx, desc, ctrl->first, ctrl->last) {
+ /* Only handle MSI entries which have an interrupt associated */
+ if (!msi_desc_match(desc, MSI_DESC_ASSOCIATED))
+ continue;
+
/* Make sure all interrupts are deactivated */
for (i = 0; i < desc->nvec_used; i++) {
irqd = irq_domain_get_irq_data(domain, desc->irq + i);
@@ -997,44 +1521,99 @@ void __msi_domain_free_irqs(struct irq_domain *domain, struct device *dev)
}
}
-static void msi_domain_free_msi_descs(struct msi_domain_info *info,
- struct device *dev)
+static void msi_domain_free_locked(struct device *dev, struct msi_ctrl *ctrl)
{
+ struct msi_domain_info *info;
+ struct msi_domain_ops *ops;
+ struct irq_domain *domain;
+
+ if (!msi_ctrl_valid(dev, ctrl))
+ return;
+
+ domain = msi_get_device_domain(dev, ctrl->domid);
+ if (!domain)
+ return;
+
+ info = domain->host_data;
+ ops = info->ops;
+
+ if (ops->domain_free_irqs)
+ ops->domain_free_irqs(domain, dev);
+ else
+ __msi_domain_free_irqs(dev, domain, ctrl);
+
+ if (ops->msi_post_free)
+ ops->msi_post_free(domain, dev);
+
if (info->flags & MSI_FLAG_FREE_MSI_DESCS)
- msi_free_msi_descs(dev);
+ msi_domain_free_descs(dev, ctrl);
}
/**
- * msi_domain_free_irqs_descs_locked - Free interrupts from a MSI interrupt @domain associated to @dev
- * @domain: The domain to managing the interrupts
+ * msi_domain_free_irqs_range_locked - Free a range of interrupts from a MSI interrupt domain
+ * associated to @dev with msi_lock held
* @dev: Pointer to device struct of the device for which the interrupts
- * are free
+ * are freed
+ * @domid: Id of the interrupt domain to operate on
+ * @first: First index to free (inclusive)
+ * @last: Last index to free (inclusive)
+ */
+void msi_domain_free_irqs_range_locked(struct device *dev, unsigned int domid,
+ unsigned int first, unsigned int last)
+{
+ struct msi_ctrl ctrl = {
+ .domid = domid,
+ .first = first,
+ .last = last,
+ };
+ msi_domain_free_locked(dev, &ctrl);
+}
+
+/**
+ * msi_domain_free_irqs_range - Free a range of interrupts from a MSI interrupt domain
+ * associated to @dev
+ * @dev: Pointer to device struct of the device for which the interrupts
+ * are freed
+ * @domid: Id of the interrupt domain to operate on
+ * @first: First index to free (inclusive)
+ * @last: Last index to free (inclusive)
+ */
+void msi_domain_free_irqs_range(struct device *dev, unsigned int domid,
+ unsigned int first, unsigned int last)
+{
+ msi_lock_descs(dev);
+ msi_domain_free_irqs_range_locked(dev, domid, first, last);
+ msi_unlock_descs(dev);
+}
+
+/**
+ * msi_domain_free_irqs_all_locked - Free all interrupts from a MSI interrupt domain
+ * associated to a device
+ * @dev: Pointer to device struct of the device for which the interrupts
+ * are freed
+ * @domid: The id of the domain to operate on
*
* Must be invoked from within a msi_lock_descs() / msi_unlock_descs()
* pair. Use this for MSI irqdomains which implement their own vector
* allocation.
*/
-void msi_domain_free_irqs_descs_locked(struct irq_domain *domain, struct device *dev)
+void msi_domain_free_irqs_all_locked(struct device *dev, unsigned int domid)
{
- struct msi_domain_info *info = domain->host_data;
- struct msi_domain_ops *ops = info->ops;
-
- lockdep_assert_held(&dev->msi.data->mutex);
-
- ops->domain_free_irqs(domain, dev);
- msi_domain_free_msi_descs(info, dev);
+ msi_domain_free_irqs_range_locked(dev, domid, 0,
+ msi_domain_get_hwsize(dev, domid) - 1);
}
/**
- * msi_domain_free_irqs - Free interrupts from a MSI interrupt @domain associated to @dev
- * @domain: The domain to managing the interrupts
+ * msi_domain_free_irqs_all - Free all interrupts from a MSI interrupt domain
+ * associated to a device
* @dev: Pointer to device struct of the device for which the interrupts
- * are free
+ * are freed
+ * @domid: The id of the domain to operate on
*/
-void msi_domain_free_irqs(struct irq_domain *domain, struct device *dev)
+void msi_domain_free_irqs_all(struct device *dev, unsigned int domid)
{
msi_lock_descs(dev);
- msi_domain_free_irqs_descs_locked(domain, dev);
+ msi_domain_free_irqs_all_locked(dev, domid);
msi_unlock_descs(dev);
}
@@ -1048,5 +1627,3 @@ struct msi_domain_info *msi_get_domain_info(struct irq_domain *domain)
{
return (struct msi_domain_info *)domain->host_data;
}
-
-#endif /* CONFIG_GENERIC_MSI_IRQ_DOMAIN */
diff --git a/kernel/jump_label.c b/kernel/jump_label.c
index 714ac4c3b556..d9c822bbffb8 100644
--- a/kernel/jump_label.c
+++ b/kernel/jump_label.c
@@ -113,11 +113,40 @@ int static_key_count(struct static_key *key)
}
EXPORT_SYMBOL_GPL(static_key_count);
-void static_key_slow_inc_cpuslocked(struct static_key *key)
+/*
+ * static_key_fast_inc_not_disabled - adds a user for a static key
+ * @key: static key that must be already enabled
+ *
+ * The caller must make sure that the static key can't get disabled while
+ * in this function. It doesn't patch jump labels, only adds a user to
+ * an already enabled static key.
+ *
+ * Returns true if the increment was done. Unlike refcount_t the ref counter
+ * is not saturated, but will fail to increment on overflow.
+ */
+bool static_key_fast_inc_not_disabled(struct static_key *key)
{
- int v, v1;
+ int v;
STATIC_KEY_CHECK_USE(key);
+ /*
+ * Negative key->enabled has a special meaning: it sends
+ * static_key_slow_inc() down the slow path, and it is non-zero
+ * so it counts as "enabled" in jump_label_update(). Note that
+ * atomic_inc_unless_negative() checks >= 0, so roll our own.
+ */
+ v = atomic_read(&key->enabled);
+ do {
+ if (v <= 0 || (v + 1) < 0)
+ return false;
+ } while (!likely(atomic_try_cmpxchg(&key->enabled, &v, v + 1)));
+
+ return true;
+}
+EXPORT_SYMBOL_GPL(static_key_fast_inc_not_disabled);
+
+bool static_key_slow_inc_cpuslocked(struct static_key *key)
+{
lockdep_assert_cpus_held();
/*
@@ -126,17 +155,9 @@ void static_key_slow_inc_cpuslocked(struct static_key *key)
* jump_label_update() process. At the same time, however,
* the jump_label_update() call below wants to see
* static_key_enabled(&key) for jumps to be updated properly.
- *
- * So give a special meaning to negative key->enabled: it sends
- * static_key_slow_inc() down the slow path, and it is non-zero
- * so it counts as "enabled" in jump_label_update(). Note that
- * atomic_inc_unless_negative() checks >= 0, so roll our own.
*/
- for (v = atomic_read(&key->enabled); v > 0; v = v1) {
- v1 = atomic_cmpxchg(&key->enabled, v, v + 1);
- if (likely(v1 == v))
- return;
- }
+ if (static_key_fast_inc_not_disabled(key))
+ return true;
jump_label_lock();
if (atomic_read(&key->enabled) == 0) {
@@ -148,16 +169,23 @@ void static_key_slow_inc_cpuslocked(struct static_key *key)
*/
atomic_set_release(&key->enabled, 1);
} else {
- atomic_inc(&key->enabled);
+ if (WARN_ON_ONCE(!static_key_fast_inc_not_disabled(key))) {
+ jump_label_unlock();
+ return false;
+ }
}
jump_label_unlock();
+ return true;
}
-void static_key_slow_inc(struct static_key *key)
+bool static_key_slow_inc(struct static_key *key)
{
+ bool ret;
+
cpus_read_lock();
- static_key_slow_inc_cpuslocked(key);
+ ret = static_key_slow_inc_cpuslocked(key);
cpus_read_unlock();
+ return ret;
}
EXPORT_SYMBOL_GPL(static_key_slow_inc);
diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c
index 60c20f301a6b..83f499182c9a 100644
--- a/kernel/kallsyms.c
+++ b/kernel/kallsyms.c
@@ -146,7 +146,7 @@ static unsigned int get_symbol_offset(unsigned long pos)
return name - kallsyms_names;
}
-static unsigned long kallsyms_sym_address(int idx)
+unsigned long kallsyms_sym_address(int idx)
{
if (!IS_ENABLED(CONFIG_KALLSYMS_BASE_RELATIVE))
return kallsyms_addresses[idx];
@@ -187,26 +187,100 @@ static bool cleanup_symbol_name(char *s)
return false;
}
+static int compare_symbol_name(const char *name, char *namebuf)
+{
+ int ret;
+
+ ret = strcmp(name, namebuf);
+ if (!ret)
+ return ret;
+
+ if (cleanup_symbol_name(namebuf) && !strcmp(name, namebuf))
+ return 0;
+
+ return ret;
+}
+
+static unsigned int get_symbol_seq(int index)
+{
+ unsigned int i, seq = 0;
+
+ for (i = 0; i < 3; i++)
+ seq = (seq << 8) | kallsyms_seqs_of_names[3 * index + i];
+
+ return seq;
+}
+
+static int kallsyms_lookup_names(const char *name,
+ unsigned int *start,
+ unsigned int *end)
+{
+ int ret;
+ int low, mid, high;
+ unsigned int seq, off;
+ char namebuf[KSYM_NAME_LEN];
+
+ low = 0;
+ high = kallsyms_num_syms - 1;
+
+ while (low <= high) {
+ mid = low + (high - low) / 2;
+ seq = get_symbol_seq(mid);
+ off = get_symbol_offset(seq);
+ kallsyms_expand_symbol(off, namebuf, ARRAY_SIZE(namebuf));
+ ret = compare_symbol_name(name, namebuf);
+ if (ret > 0)
+ low = mid + 1;
+ else if (ret < 0)
+ high = mid - 1;
+ else
+ break;
+ }
+
+ if (low > high)
+ return -ESRCH;
+
+ low = mid;
+ while (low) {
+ seq = get_symbol_seq(low - 1);
+ off = get_symbol_offset(seq);
+ kallsyms_expand_symbol(off, namebuf, ARRAY_SIZE(namebuf));
+ if (compare_symbol_name(name, namebuf))
+ break;
+ low--;
+ }
+ *start = low;
+
+ if (end) {
+ high = mid;
+ while (high < kallsyms_num_syms - 1) {
+ seq = get_symbol_seq(high + 1);
+ off = get_symbol_offset(seq);
+ kallsyms_expand_symbol(off, namebuf, ARRAY_SIZE(namebuf));
+ if (compare_symbol_name(name, namebuf))
+ break;
+ high++;
+ }
+ *end = high;
+ }
+
+ return 0;
+}
+
/* Lookup the address for this symbol. Returns 0 if not found. */
unsigned long kallsyms_lookup_name(const char *name)
{
- char namebuf[KSYM_NAME_LEN];
- unsigned long i;
- unsigned int off;
+ int ret;
+ unsigned int i;
/* Skip the search for empty string. */
if (!*name)
return 0;
- for (i = 0, off = 0; i < kallsyms_num_syms; i++) {
- off = kallsyms_expand_symbol(off, namebuf, ARRAY_SIZE(namebuf));
-
- if (strcmp(namebuf, name) == 0)
- return kallsyms_sym_address(i);
+ ret = kallsyms_lookup_names(name, &i, NULL);
+ if (!ret)
+ return kallsyms_sym_address(get_symbol_seq(i));
- if (cleanup_symbol_name(namebuf) && strcmp(namebuf, name) == 0)
- return kallsyms_sym_address(i);
- }
return module_kallsyms_lookup_name(name);
}
@@ -233,6 +307,24 @@ int kallsyms_on_each_symbol(int (*fn)(void *, const char *, struct module *,
return 0;
}
+int kallsyms_on_each_match_symbol(int (*fn)(void *, unsigned long),
+ const char *name, void *data)
+{
+ int ret;
+ unsigned int i, start, end;
+
+ ret = kallsyms_lookup_names(name, &start, &end);
+ if (ret)
+ return 0;
+
+ for (i = start; !ret && i <= end; i++) {
+ ret = fn(data, kallsyms_sym_address(get_symbol_seq(i)));
+ cond_resched();
+ }
+
+ return ret;
+}
+
static unsigned long get_symbol_pos(unsigned long addr,
unsigned long *symbolsize,
unsigned long *offset)
diff --git a/kernel/kallsyms_internal.h b/kernel/kallsyms_internal.h
index 2d0c6f2f0243..27fabdcc40f5 100644
--- a/kernel/kallsyms_internal.h
+++ b/kernel/kallsyms_internal.h
@@ -26,5 +26,6 @@ extern const char kallsyms_token_table[] __weak;
extern const u16 kallsyms_token_index[] __weak;
extern const unsigned int kallsyms_markers[] __weak;
+extern const u8 kallsyms_seqs_of_names[] __weak;
#endif // LINUX_KALLSYMS_INTERNAL_H_
diff --git a/kernel/kallsyms_selftest.c b/kernel/kallsyms_selftest.c
new file mode 100644
index 000000000000..bfbc12da3326
--- /dev/null
+++ b/kernel/kallsyms_selftest.c
@@ -0,0 +1,476 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Test the function and performance of kallsyms
+ *
+ * Copyright (C) Huawei Technologies Co., Ltd., 2022
+ *
+ * Authors: Zhen Lei <thunder.leizhen@huawei.com> Huawei
+ */
+
+#define pr_fmt(fmt) "kallsyms_selftest: " fmt
+
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/kallsyms.h>
+#include <linux/random.h>
+#include <linux/sched/clock.h>
+#include <linux/kthread.h>
+#include <linux/vmalloc.h>
+
+#include "kallsyms_internal.h"
+#include "kallsyms_selftest.h"
+
+
+#define MAX_NUM_OF_RECORDS 64
+
+struct test_stat {
+ int min;
+ int max;
+ int save_cnt;
+ int real_cnt;
+ int perf;
+ u64 sum;
+ char *name;
+ unsigned long addr;
+ unsigned long addrs[MAX_NUM_OF_RECORDS];
+};
+
+struct test_item {
+ char *name;
+ unsigned long addr;
+};
+
+#define ITEM_FUNC(s) \
+ { \
+ .name = #s, \
+ .addr = (unsigned long)s, \
+ }
+
+#define ITEM_DATA(s) \
+ { \
+ .name = #s, \
+ .addr = (unsigned long)&s, \
+ }
+
+
+static int kallsyms_test_var_bss_static;
+static int kallsyms_test_var_data_static = 1;
+int kallsyms_test_var_bss;
+int kallsyms_test_var_data = 1;
+
+static int kallsyms_test_func_static(void)
+{
+ kallsyms_test_var_bss_static++;
+ kallsyms_test_var_data_static++;
+
+ return 0;
+}
+
+int kallsyms_test_func(void)
+{
+ return kallsyms_test_func_static();
+}
+
+__weak int kallsyms_test_func_weak(void)
+{
+ kallsyms_test_var_bss++;
+ kallsyms_test_var_data++;
+ return 0;
+}
+
+static struct test_item test_items[] = {
+ ITEM_FUNC(kallsyms_test_func_static),
+ ITEM_FUNC(kallsyms_test_func),
+ ITEM_FUNC(kallsyms_test_func_weak),
+ ITEM_FUNC(vmalloc),
+ ITEM_FUNC(vfree),
+#ifdef CONFIG_KALLSYMS_ALL
+ ITEM_DATA(kallsyms_test_var_bss_static),
+ ITEM_DATA(kallsyms_test_var_data_static),
+ ITEM_DATA(kallsyms_test_var_bss),
+ ITEM_DATA(kallsyms_test_var_data),
+ ITEM_DATA(vmap_area_list),
+#endif
+};
+
+static char stub_name[KSYM_NAME_LEN];
+
+static int stat_symbol_len(void *data, const char *name, struct module *mod, unsigned long addr)
+{
+ *(u32 *)data += strlen(name);
+
+ return 0;
+}
+
+static void test_kallsyms_compression_ratio(void)
+{
+ u32 pos, off, len, num;
+ u32 ratio, total_size, total_len = 0;
+
+ kallsyms_on_each_symbol(stat_symbol_len, &total_len);
+
+ /*
+ * A symbol name cannot start with a number. This stub name helps us
+ * traverse the entire symbol table without finding a match. It's used
+ * for subsequent performance tests, and its length is the average
+ * length of all symbol names.
+ */
+ memset(stub_name, '4', sizeof(stub_name));
+ pos = total_len / kallsyms_num_syms;
+ stub_name[pos] = 0;
+
+ pos = 0;
+ num = 0;
+ off = 0;
+ while (pos < kallsyms_num_syms) {
+ len = kallsyms_names[off];
+ num++;
+ off++;
+ pos++;
+ if ((len & 0x80) != 0) {
+ len = (len & 0x7f) | (kallsyms_names[off] << 7);
+ num++;
+ off++;
+ }
+ off += len;
+ }
+
+ /*
+ * 1. The length fields is not counted
+ * 2. The memory occupied by array kallsyms_token_table[] and
+ * kallsyms_token_index[] needs to be counted.
+ */
+ total_size = off - num;
+ pos = kallsyms_token_index[0xff];
+ total_size += pos + strlen(&kallsyms_token_table[pos]) + 1;
+ total_size += 0x100 * sizeof(u16);
+
+ pr_info(" ---------------------------------------------------------\n");
+ pr_info("| nr_symbols | compressed size | original size | ratio(%%) |\n");
+ pr_info("|---------------------------------------------------------|\n");
+ ratio = (u32)div_u64(10000ULL * total_size, total_len);
+ pr_info("| %10d | %10d | %10d | %2d.%-2d |\n",
+ kallsyms_num_syms, total_size, total_len, ratio / 100, ratio % 100);
+ pr_info(" ---------------------------------------------------------\n");
+}
+
+static int lookup_name(void *data, const char *name, struct module *mod, unsigned long addr)
+{
+ u64 t0, t1, t;
+ struct test_stat *stat = (struct test_stat *)data;
+
+ t0 = ktime_get_ns();
+ (void)kallsyms_lookup_name(name);
+ t1 = ktime_get_ns();
+
+ t = t1 - t0;
+ if (t < stat->min)
+ stat->min = t;
+
+ if (t > stat->max)
+ stat->max = t;
+
+ stat->real_cnt++;
+ stat->sum += t;
+
+ return 0;
+}
+
+static void test_perf_kallsyms_lookup_name(void)
+{
+ struct test_stat stat;
+
+ memset(&stat, 0, sizeof(stat));
+ stat.min = INT_MAX;
+ kallsyms_on_each_symbol(lookup_name, &stat);
+ pr_info("kallsyms_lookup_name() looked up %d symbols\n", stat.real_cnt);
+ pr_info("The time spent on each symbol is (ns): min=%d, max=%d, avg=%lld\n",
+ stat.min, stat.max, div_u64(stat.sum, stat.real_cnt));
+}
+
+static bool match_cleanup_name(const char *s, const char *name)
+{
+ char *p;
+ int len;
+
+ if (!IS_ENABLED(CONFIG_LTO_CLANG))
+ return false;
+
+ p = strchr(s, '.');
+ if (!p)
+ return false;
+
+ len = strlen(name);
+ if (p - s != len)
+ return false;
+
+ return !strncmp(s, name, len);
+}
+
+static int find_symbol(void *data, const char *name, struct module *mod, unsigned long addr)
+{
+ struct test_stat *stat = (struct test_stat *)data;
+
+ if (strcmp(name, stat->name) == 0 ||
+ (!stat->perf && match_cleanup_name(name, stat->name))) {
+ stat->real_cnt++;
+ stat->addr = addr;
+
+ if (stat->save_cnt < MAX_NUM_OF_RECORDS) {
+ stat->addrs[stat->save_cnt] = addr;
+ stat->save_cnt++;
+ }
+
+ if (stat->real_cnt == stat->max)
+ return 1;
+ }
+
+ return 0;
+}
+
+static void test_perf_kallsyms_on_each_symbol(void)
+{
+ u64 t0, t1;
+ struct test_stat stat;
+
+ memset(&stat, 0, sizeof(stat));
+ stat.max = INT_MAX;
+ stat.name = stub_name;
+ stat.perf = 1;
+ t0 = ktime_get_ns();
+ kallsyms_on_each_symbol(find_symbol, &stat);
+ t1 = ktime_get_ns();
+ pr_info("kallsyms_on_each_symbol() traverse all: %lld ns\n", t1 - t0);
+}
+
+static int match_symbol(void *data, unsigned long addr)
+{
+ struct test_stat *stat = (struct test_stat *)data;
+
+ stat->real_cnt++;
+ stat->addr = addr;
+
+ if (stat->save_cnt < MAX_NUM_OF_RECORDS) {
+ stat->addrs[stat->save_cnt] = addr;
+ stat->save_cnt++;
+ }
+
+ if (stat->real_cnt == stat->max)
+ return 1;
+
+ return 0;
+}
+
+static void test_perf_kallsyms_on_each_match_symbol(void)
+{
+ u64 t0, t1;
+ struct test_stat stat;
+
+ memset(&stat, 0, sizeof(stat));
+ stat.max = INT_MAX;
+ stat.name = stub_name;
+ t0 = ktime_get_ns();
+ kallsyms_on_each_match_symbol(match_symbol, stat.name, &stat);
+ t1 = ktime_get_ns();
+ pr_info("kallsyms_on_each_match_symbol() traverse all: %lld ns\n", t1 - t0);
+}
+
+static int test_kallsyms_basic_function(void)
+{
+ int i, j, ret;
+ int next = 0, nr_failed = 0;
+ char *prefix;
+ unsigned short rand;
+ unsigned long addr, lookup_addr;
+ char namebuf[KSYM_NAME_LEN];
+ struct test_stat *stat, *stat2;
+
+ stat = kmalloc(sizeof(*stat) * 2, GFP_KERNEL);
+ if (!stat)
+ return -ENOMEM;
+ stat2 = stat + 1;
+
+ prefix = "kallsyms_lookup_name() for";
+ for (i = 0; i < ARRAY_SIZE(test_items); i++) {
+ addr = kallsyms_lookup_name(test_items[i].name);
+ if (addr != test_items[i].addr) {
+ nr_failed++;
+ pr_info("%s %s failed: addr=%lx, expect %lx\n",
+ prefix, test_items[i].name, addr, test_items[i].addr);
+ }
+ }
+
+ prefix = "kallsyms_on_each_symbol() for";
+ for (i = 0; i < ARRAY_SIZE(test_items); i++) {
+ memset(stat, 0, sizeof(*stat));
+ stat->max = INT_MAX;
+ stat->name = test_items[i].name;
+ kallsyms_on_each_symbol(find_symbol, stat);
+ if (stat->addr != test_items[i].addr || stat->real_cnt != 1) {
+ nr_failed++;
+ pr_info("%s %s failed: count=%d, addr=%lx, expect %lx\n",
+ prefix, test_items[i].name,
+ stat->real_cnt, stat->addr, test_items[i].addr);
+ }
+ }
+
+ prefix = "kallsyms_on_each_match_symbol() for";
+ for (i = 0; i < ARRAY_SIZE(test_items); i++) {
+ memset(stat, 0, sizeof(*stat));
+ stat->max = INT_MAX;
+ stat->name = test_items[i].name;
+ kallsyms_on_each_match_symbol(match_symbol, test_items[i].name, stat);
+ if (stat->addr != test_items[i].addr || stat->real_cnt != 1) {
+ nr_failed++;
+ pr_info("%s %s failed: count=%d, addr=%lx, expect %lx\n",
+ prefix, test_items[i].name,
+ stat->real_cnt, stat->addr, test_items[i].addr);
+ }
+ }
+
+ if (nr_failed) {
+ kfree(stat);
+ return -ESRCH;
+ }
+
+ for (i = 0; i < kallsyms_num_syms; i++) {
+ addr = kallsyms_sym_address(i);
+ if (!is_ksym_addr(addr))
+ continue;
+
+ ret = lookup_symbol_name(addr, namebuf);
+ if (unlikely(ret)) {
+ namebuf[0] = 0;
+ goto failed;
+ }
+
+ /*
+ * The first '.' may be the initial letter, in which case the
+ * entire symbol name will be truncated to an empty string in
+ * cleanup_symbol_name(). Do not test these symbols.
+ *
+ * For example:
+ * cat /proc/kallsyms | awk '{print $3}' | grep -E "^\." | head
+ * .E_read_words
+ * .E_leading_bytes
+ * .E_trailing_bytes
+ * .E_write_words
+ * .E_copy
+ * .str.292.llvm.12122243386960820698
+ * .str.24.llvm.12122243386960820698
+ * .str.29.llvm.12122243386960820698
+ * .str.75.llvm.12122243386960820698
+ * .str.99.llvm.12122243386960820698
+ */
+ if (IS_ENABLED(CONFIG_LTO_CLANG) && !namebuf[0])
+ continue;
+
+ lookup_addr = kallsyms_lookup_name(namebuf);
+
+ memset(stat, 0, sizeof(*stat));
+ stat->max = INT_MAX;
+ kallsyms_on_each_match_symbol(match_symbol, namebuf, stat);
+
+ /*
+ * kallsyms_on_each_symbol() is too slow, randomly select some
+ * symbols for test.
+ */
+ if (i >= next) {
+ memset(stat2, 0, sizeof(*stat2));
+ stat2->max = INT_MAX;
+ stat2->name = namebuf;
+ kallsyms_on_each_symbol(find_symbol, stat2);
+
+ /*
+ * kallsyms_on_each_symbol() and kallsyms_on_each_match_symbol()
+ * need to get the same traversal result.
+ */
+ if (stat->addr != stat2->addr ||
+ stat->real_cnt != stat2->real_cnt ||
+ memcmp(stat->addrs, stat2->addrs,
+ stat->save_cnt * sizeof(stat->addrs[0])))
+ goto failed;
+
+ /*
+ * The average of random increments is 128, that is, one of
+ * them is tested every 128 symbols.
+ */
+ get_random_bytes(&rand, sizeof(rand));
+ next = i + (rand & 0xff) + 1;
+ }
+
+ /* Need to be found at least once */
+ if (!stat->real_cnt)
+ goto failed;
+
+ /*
+ * kallsyms_lookup_name() returns the address of the first
+ * symbol found and cannot be NULL.
+ */
+ if (!lookup_addr || lookup_addr != stat->addrs[0])
+ goto failed;
+
+ /*
+ * If the addresses of all matching symbols are recorded, the
+ * target address needs to be exist.
+ */
+ if (stat->real_cnt <= MAX_NUM_OF_RECORDS) {
+ for (j = 0; j < stat->save_cnt; j++) {
+ if (stat->addrs[j] == addr)
+ break;
+ }
+
+ if (j == stat->save_cnt)
+ goto failed;
+ }
+ }
+
+ kfree(stat);
+
+ return 0;
+
+failed:
+ pr_info("Test for %dth symbol failed: (%s) addr=%lx", i, namebuf, addr);
+ kfree(stat);
+ return -ESRCH;
+}
+
+static int test_entry(void *p)
+{
+ int ret;
+
+ do {
+ schedule_timeout(5 * HZ);
+ } while (system_state != SYSTEM_RUNNING);
+
+ pr_info("start\n");
+ ret = test_kallsyms_basic_function();
+ if (ret) {
+ pr_info("abort\n");
+ return 0;
+ }
+
+ test_kallsyms_compression_ratio();
+ test_perf_kallsyms_lookup_name();
+ test_perf_kallsyms_on_each_symbol();
+ test_perf_kallsyms_on_each_match_symbol();
+ pr_info("finish\n");
+
+ return 0;
+}
+
+static int __init kallsyms_test_init(void)
+{
+ struct task_struct *t;
+
+ t = kthread_create(test_entry, NULL, "kallsyms_test");
+ if (IS_ERR(t)) {
+ pr_info("Create kallsyms selftest task failed\n");
+ return PTR_ERR(t);
+ }
+ kthread_bind(t, 0);
+ wake_up_process(t);
+
+ return 0;
+}
+late_initcall(kallsyms_test_init);
diff --git a/kernel/kallsyms_selftest.h b/kernel/kallsyms_selftest.h
new file mode 100644
index 000000000000..c0ca548e2a22
--- /dev/null
+++ b/kernel/kallsyms_selftest.h
@@ -0,0 +1,13 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+#ifndef LINUX_KALLSYMS_SELFTEST_H_
+#define LINUX_KALLSYMS_SELFTEST_H_
+
+#include <linux/types.h>
+
+extern int kallsyms_test_var_bss;
+extern int kallsyms_test_var_data;
+
+extern int kallsyms_test_func(void);
+extern int kallsyms_test_func_weak(void);
+
+#endif // LINUX_KALLSYMS_SELFTEST_H_
diff --git a/kernel/kcsan/Makefile b/kernel/kcsan/Makefile
index 4f35d1bced6a..8cf70f068d92 100644
--- a/kernel/kcsan/Makefile
+++ b/kernel/kcsan/Makefile
@@ -17,4 +17,5 @@ KCSAN_INSTRUMENT_BARRIERS_selftest.o := y
obj-$(CONFIG_KCSAN_SELFTEST) += selftest.o
CFLAGS_kcsan_test.o := $(CFLAGS_KCSAN) -g -fno-omit-frame-pointer
+CFLAGS_kcsan_test.o += $(DISABLE_STRUCTLEAK_PLUGIN)
obj-$(CONFIG_KCSAN_KUNIT_TEST) += kcsan_test.o
diff --git a/kernel/kcsan/kcsan_test.c b/kernel/kcsan/kcsan_test.c
index dcec1b743c69..a60c561724be 100644
--- a/kernel/kcsan/kcsan_test.c
+++ b/kernel/kcsan/kcsan_test.c
@@ -159,7 +159,7 @@ static bool __report_matches(const struct expect_report *r)
const bool is_assert = (r->access[0].type | r->access[1].type) & KCSAN_ACCESS_ASSERT;
bool ret = false;
unsigned long flags;
- typeof(observed.lines) expect;
+ typeof(*observed.lines) *expect;
const char *end;
char *cur;
int i;
@@ -168,6 +168,10 @@ static bool __report_matches(const struct expect_report *r)
if (!report_available())
return false;
+ expect = kmalloc(sizeof(observed.lines), GFP_KERNEL);
+ if (WARN_ON(!expect))
+ return false;
+
/* Generate expected report contents. */
/* Title */
@@ -253,6 +257,7 @@ static bool __report_matches(const struct expect_report *r)
strstr(observed.lines[2], expect[1])));
out:
spin_unlock_irqrestore(&observed.lock, flags);
+ kfree(expect);
return ret;
}
diff --git a/kernel/kcsan/report.c b/kernel/kcsan/report.c
index 67794404042a..e95ce7d7a76e 100644
--- a/kernel/kcsan/report.c
+++ b/kernel/kcsan/report.c
@@ -492,8 +492,7 @@ static void print_report(enum kcsan_value_change value_change,
dump_stack_print_info(KERN_DEFAULT);
pr_err("==================================================================\n");
- if (panic_on_warn)
- panic("panic_on_warn set ...\n");
+ check_panic_on_warn("KCSAN");
}
static void release_report(unsigned long *flags, struct other_info *other_info)
diff --git a/kernel/kcsan/selftest.c b/kernel/kcsan/selftest.c
index 00cdf8fa5693..8679322450f2 100644
--- a/kernel/kcsan/selftest.c
+++ b/kernel/kcsan/selftest.c
@@ -22,13 +22,6 @@
#define ITERS_PER_TEST 2000
-/* Test requirements. */
-static bool __init test_requires(void)
-{
- /* random should be initialized for the below tests */
- return get_random_u32() + get_random_u32() != 0;
-}
-
/*
* Test watchpoint encode and decode: check that encoding some access's info,
* and then subsequent decode preserves the access's info.
@@ -38,8 +31,8 @@ static bool __init test_encode_decode(void)
int i;
for (i = 0; i < ITERS_PER_TEST; ++i) {
- size_t size = prandom_u32_max(MAX_ENCODABLE_SIZE) + 1;
- bool is_write = !!prandom_u32_max(2);
+ size_t size = get_random_u32_inclusive(1, MAX_ENCODABLE_SIZE);
+ bool is_write = !!get_random_u32_below(2);
unsigned long verif_masked_addr;
long encoded_watchpoint;
bool verif_is_write;
@@ -259,7 +252,6 @@ static int __init kcsan_selftest(void)
pr_err("selftest: " #do_test " failed"); \
} while (0)
- RUN_TEST(test_requires);
RUN_TEST(test_encode_decode);
RUN_TEST(test_matching_access);
RUN_TEST(test_barrier);
diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c
index ca2743f9c634..b1cf259854ca 100644
--- a/kernel/kexec_core.c
+++ b/kernel/kexec_core.c
@@ -6,6 +6,7 @@
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+#include <linux/btf.h>
#include <linux/capability.h>
#include <linux/mm.h>
#include <linux/file.h>
@@ -561,23 +562,17 @@ static int kimage_add_entry(struct kimage *image, kimage_entry_t entry)
static int kimage_set_destination(struct kimage *image,
unsigned long destination)
{
- int result;
-
destination &= PAGE_MASK;
- result = kimage_add_entry(image, destination | IND_DESTINATION);
- return result;
+ return kimage_add_entry(image, destination | IND_DESTINATION);
}
static int kimage_add_page(struct kimage *image, unsigned long page)
{
- int result;
-
page &= PAGE_MASK;
- result = kimage_add_entry(image, page | IND_SOURCE);
- return result;
+ return kimage_add_entry(image, page | IND_SOURCE);
}
@@ -981,7 +976,7 @@ void __noclone __crash_kexec(struct pt_regs *regs)
}
STACK_FRAME_NON_STANDARD(__crash_kexec);
-void crash_kexec(struct pt_regs *regs)
+__bpf_kfunc void crash_kexec(struct pt_regs *regs)
{
int old_cpu, this_cpu;
diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c
index 45637511e0de..dd5983010b7b 100644
--- a/kernel/kexec_file.c
+++ b/kernel/kexec_file.c
@@ -1141,7 +1141,7 @@ int crash_exclude_mem_range(struct crash_mem *mem,
{
int i, j;
unsigned long long start, end, p_start, p_end;
- struct crash_mem_range temp_range = {0, 0};
+ struct range temp_range = {0, 0};
for (i = 0; i < mem->nr_ranges; i++) {
start = mem->ranges[i].start;
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index 3050631e528d..00e177de91cc 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -458,7 +458,7 @@ static inline int kprobe_optready(struct kprobe *p)
}
/* Return true if the kprobe is disarmed. Note: p must be on hash list */
-static inline bool kprobe_disarmed(struct kprobe *p)
+bool kprobe_disarmed(struct kprobe *p)
{
struct optimized_kprobe *op;
@@ -555,17 +555,15 @@ static void do_unoptimize_kprobes(void)
/* See comment in do_optimize_kprobes() */
lockdep_assert_cpus_held();
- /* Unoptimization must be done anytime */
- if (list_empty(&unoptimizing_list))
- return;
+ if (!list_empty(&unoptimizing_list))
+ arch_unoptimize_kprobes(&unoptimizing_list, &freeing_list);
- arch_unoptimize_kprobes(&unoptimizing_list, &freeing_list);
- /* Loop on 'freeing_list' for disarming */
+ /* Loop on 'freeing_list' for disarming and removing from kprobe hash list */
list_for_each_entry_safe(op, tmp, &freeing_list, list) {
/* Switching from detour code to origin */
op->kp.flags &= ~KPROBE_FLAG_OPTIMIZED;
- /* Disarm probes if marked disabled */
- if (kprobe_disabled(&op->kp))
+ /* Disarm probes if marked disabled and not gone */
+ if (kprobe_disabled(&op->kp) && !kprobe_gone(&op->kp))
arch_disarm_kprobe(&op->kp);
if (kprobe_unused(&op->kp)) {
/*
@@ -662,7 +660,7 @@ void wait_for_kprobe_optimizer(void)
mutex_unlock(&kprobe_mutex);
}
-static bool optprobe_queued_unopt(struct optimized_kprobe *op)
+bool optprobe_queued_unopt(struct optimized_kprobe *op)
{
struct optimized_kprobe *_op;
@@ -797,14 +795,13 @@ static void kill_optimized_kprobe(struct kprobe *p)
op->kp.flags &= ~KPROBE_FLAG_OPTIMIZED;
if (kprobe_unused(p)) {
- /* Enqueue if it is unused */
- list_add(&op->list, &freeing_list);
/*
- * Remove unused probes from the hash list. After waiting
- * for synchronization, this probe is reclaimed.
- * (reclaiming is done by do_free_cleaned_kprobes().)
+ * Unused kprobe is on unoptimizing or freeing list. We move it
+ * to freeing_list and let the kprobe_optimizer() remove it from
+ * the kprobe hash list and free it.
*/
- hlist_del_rcu(&op->kp.hlist);
+ if (optprobe_queued_unopt(op))
+ list_move(&op->list, &freeing_list);
}
/* Don't touch the code, because it is already freed. */
@@ -2213,13 +2210,9 @@ int register_kretprobe(struct kretprobe *rp)
rp->kp.post_handler = NULL;
/* Pre-allocate memory for max kretprobe instances */
- if (rp->maxactive <= 0) {
-#ifdef CONFIG_PREEMPTION
+ if (rp->maxactive <= 0)
rp->maxactive = max_t(unsigned int, 10, 2*num_possible_cpus());
-#else
- rp->maxactive = num_possible_cpus();
-#endif
- }
+
#ifdef CONFIG_KRETPROBE_ON_RETHOOK
rp->rh = rethook_alloc((void *)rp, kretprobe_rethook_handler);
if (!rp->rh)
@@ -2364,6 +2357,14 @@ static void kill_kprobe(struct kprobe *p)
lockdep_assert_held(&kprobe_mutex);
+ /*
+ * The module is going away. We should disarm the kprobe which
+ * is using ftrace, because ftrace framework is still available at
+ * 'MODULE_STATE_GOING' notification.
+ */
+ if (kprobe_ftrace(p) && !kprobe_disabled(p) && !kprobes_all_disarmed)
+ disarm_kprobe_ftrace(p);
+
p->flags |= KPROBE_FLAG_GONE;
if (kprobe_aggrprobe(p)) {
/*
@@ -2380,14 +2381,6 @@ static void kill_kprobe(struct kprobe *p)
* the original probed function (which will be freed soon) any more.
*/
arch_remove_kprobe(p);
-
- /*
- * The module is going away. We should disarm the kprobe which
- * is using ftrace, because ftrace framework is still available at
- * 'MODULE_STATE_GOING' notification.
- */
- if (kprobe_ftrace(p) && !kprobe_disabled(p) && !kprobes_all_disarmed)
- disarm_kprobe_ftrace(p);
}
/* Disable one kprobe */
diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c
index 65dba9076f31..2df00b789b90 100644
--- a/kernel/ksysfs.c
+++ b/kernel/ksysfs.c
@@ -6,6 +6,7 @@
* Copyright (C) 2004 Kay Sievers <kay.sievers@vrfy.org>
*/
+#include <asm/byteorder.h>
#include <linux/kobject.h>
#include <linux/string.h>
#include <linux/sysfs.h>
@@ -20,6 +21,14 @@
#include <linux/rcupdate.h> /* rcu_expedited and rcu_normal */
+#if defined(__LITTLE_ENDIAN)
+#define CPU_BYTEORDER_STRING "little"
+#elif defined(__BIG_ENDIAN)
+#define CPU_BYTEORDER_STRING "big"
+#else
+#error Unknown byteorder
+#endif
+
#define KERNEL_ATTR_RO(_name) \
static struct kobj_attribute _name##_attr = __ATTR_RO(_name)
@@ -34,6 +43,14 @@ static ssize_t uevent_seqnum_show(struct kobject *kobj,
}
KERNEL_ATTR_RO(uevent_seqnum);
+/* cpu byteorder */
+static ssize_t cpu_byteorder_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ return sysfs_emit(buf, "%s\n", CPU_BYTEORDER_STRING);
+}
+KERNEL_ATTR_RO(cpu_byteorder);
+
#ifdef CONFIG_UEVENT_HELPER
/* uevent helper program, used during early boot */
static ssize_t uevent_helper_show(struct kobject *kobj,
@@ -215,6 +232,7 @@ EXPORT_SYMBOL_GPL(kernel_kobj);
static struct attribute * kernel_attrs[] = {
&fscaps_attr.attr,
&uevent_seqnum_attr.attr,
+ &cpu_byteorder_attr.attr,
#ifdef CONFIG_UEVENT_HELPER
&uevent_helper_attr.attr,
#endif
diff --git a/kernel/livepatch/core.c b/kernel/livepatch/core.c
index 9ada0bc5247b..c973ed9e42f8 100644
--- a/kernel/livepatch/core.c
+++ b/kernel/livepatch/core.c
@@ -118,27 +118,16 @@ static struct klp_object *klp_find_object(struct klp_patch *patch,
}
struct klp_find_arg {
- const char *objname;
const char *name;
unsigned long addr;
unsigned long count;
unsigned long pos;
};
-static int klp_find_callback(void *data, const char *name,
- struct module *mod, unsigned long addr)
+static int klp_match_callback(void *data, unsigned long addr)
{
struct klp_find_arg *args = data;
- if ((mod && !args->objname) || (!mod && args->objname))
- return 0;
-
- if (strcmp(args->name, name))
- return 0;
-
- if (args->objname && strcmp(args->objname, mod->name))
- return 0;
-
args->addr = addr;
args->count++;
@@ -153,11 +142,21 @@ static int klp_find_callback(void *data, const char *name,
return 0;
}
+static int klp_find_callback(void *data, const char *name,
+ struct module *mod, unsigned long addr)
+{
+ struct klp_find_arg *args = data;
+
+ if (strcmp(args->name, name))
+ return 0;
+
+ return klp_match_callback(data, addr);
+}
+
static int klp_find_object_symbol(const char *objname, const char *name,
unsigned long sympos, unsigned long *addr)
{
struct klp_find_arg args = {
- .objname = objname,
.name = name,
.addr = 0,
.count = 0,
@@ -165,9 +164,9 @@ static int klp_find_object_symbol(const char *objname, const char *name,
};
if (objname)
- module_kallsyms_on_each_symbol(klp_find_callback, &args);
+ module_kallsyms_on_each_symbol(objname, klp_find_callback, &args);
else
- kallsyms_on_each_symbol(klp_find_callback, &args);
+ kallsyms_on_each_match_symbol(klp_match_callback, name, &args);
/*
* Ensure an address was found. If sympos is 0, ensure symbol is unique;
diff --git a/kernel/livepatch/patch.c b/kernel/livepatch/patch.c
index 4c4f5a776d80..4152c71507e2 100644
--- a/kernel/livepatch/patch.c
+++ b/kernel/livepatch/patch.c
@@ -118,7 +118,7 @@ static void notrace klp_ftrace_handler(unsigned long ip,
if (func->nop)
goto unlock;
- ftrace_instruction_pointer_set(fregs, (unsigned long)func->new_func);
+ ftrace_regs_set_instruction_pointer(fregs, (unsigned long)func->new_func);
unlock:
ftrace_test_recursion_unlock(bit);
diff --git a/kernel/livepatch/transition.c b/kernel/livepatch/transition.c
index 30187b1d8275..f1b25ec581e0 100644
--- a/kernel/livepatch/transition.c
+++ b/kernel/livepatch/transition.c
@@ -196,36 +196,36 @@ static int klp_check_stack_func(struct klp_func *func, unsigned long *entries,
struct klp_ops *ops;
int i;
- for (i = 0; i < nr_entries; i++) {
- address = entries[i];
+ if (klp_target_state == KLP_UNPATCHED) {
+ /*
+ * Check for the to-be-unpatched function
+ * (the func itself).
+ */
+ func_addr = (unsigned long)func->new_func;
+ func_size = func->new_size;
+ } else {
+ /*
+ * Check for the to-be-patched function
+ * (the previous func).
+ */
+ ops = klp_find_ops(func->old_func);
- if (klp_target_state == KLP_UNPATCHED) {
- /*
- * Check for the to-be-unpatched function
- * (the func itself).
- */
- func_addr = (unsigned long)func->new_func;
- func_size = func->new_size;
+ if (list_is_singular(&ops->func_stack)) {
+ /* original function */
+ func_addr = (unsigned long)func->old_func;
+ func_size = func->old_size;
} else {
- /*
- * Check for the to-be-patched function
- * (the previous func).
- */
- ops = klp_find_ops(func->old_func);
-
- if (list_is_singular(&ops->func_stack)) {
- /* original function */
- func_addr = (unsigned long)func->old_func;
- func_size = func->old_size;
- } else {
- /* previously patched function */
- struct klp_func *prev;
-
- prev = list_next_entry(func, stack_node);
- func_addr = (unsigned long)prev->new_func;
- func_size = prev->new_size;
- }
+ /* previously patched function */
+ struct klp_func *prev;
+
+ prev = list_next_entry(func, stack_node);
+ func_addr = (unsigned long)prev->new_func;
+ func_size = prev->new_size;
}
+ }
+
+ for (i = 0; i < nr_entries; i++) {
+ address = entries[i];
if (address >= func_addr && address < func_addr + func_size)
return -EAGAIN;
diff --git a/kernel/locking/Makefile b/kernel/locking/Makefile
index ea925731fa40..0db4093d17b8 100644
--- a/kernel/locking/Makefile
+++ b/kernel/locking/Makefile
@@ -7,7 +7,6 @@ obj-y += mutex.o semaphore.o rwsem.o percpu-rwsem.o
# Avoid recursion lockdep -> sanitizer -> ... -> lockdep.
KCSAN_SANITIZE_lockdep.o := n
-KMSAN_SANITIZE_lockdep.o := n
ifdef CONFIG_FUNCTION_TRACER
CFLAGS_REMOVE_lockdep.o = $(CC_FLAGS_FTRACE)
diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c
index e3375bc40dad..50d4863974e7 100644
--- a/kernel/locking/lockdep.c
+++ b/kernel/locking/lockdep.c
@@ -55,6 +55,7 @@
#include <linux/rcupdate.h>
#include <linux/kprobes.h>
#include <linux/lockdep.h>
+#include <linux/context_tracking.h>
#include <asm/sections.h>
@@ -6555,6 +6556,7 @@ void lockdep_rcu_suspicious(const char *file, const int line, const char *s)
{
struct task_struct *curr = current;
int dl = READ_ONCE(debug_locks);
+ bool rcu = warn_rcu_enter();
/* Note: the following can be executed concurrently, so be careful. */
pr_warn("\n");
@@ -6595,5 +6597,6 @@ void lockdep_rcu_suspicious(const char *file, const int line, const char *s)
lockdep_print_held_locks(curr);
pr_warn("\nstack backtrace:\n");
dump_stack();
+ warn_rcu_exit(rcu);
}
EXPORT_SYMBOL_GPL(lockdep_rcu_suspicious);
diff --git a/kernel/locking/locktorture.c b/kernel/locking/locktorture.c
index 9c2fb613a55d..f04b1978899d 100644
--- a/kernel/locking/locktorture.c
+++ b/kernel/locking/locktorture.c
@@ -46,6 +46,9 @@ torture_param(int, shutdown_secs, 0, "Shutdown time (j), <= zero to disable.");
torture_param(int, stat_interval, 60,
"Number of seconds between stats printk()s");
torture_param(int, stutter, 5, "Number of jiffies to run/halt test, 0=disable");
+torture_param(int, rt_boost, 2,
+ "Do periodic rt-boost. 0=Disable, 1=Only for rt_mutex, 2=For all lock types.");
+torture_param(int, rt_boost_factor, 50, "A factor determining how often rt-boost happens.");
torture_param(int, verbose, 1,
"Enable verbose debugging printk()s");
@@ -127,15 +130,50 @@ static void torture_lock_busted_write_unlock(int tid __maybe_unused)
/* BUGGY, do not use in real life!!! */
}
-static void torture_boost_dummy(struct torture_random_state *trsp)
+static void __torture_rt_boost(struct torture_random_state *trsp)
{
- /* Only rtmutexes care about priority */
+ const unsigned int factor = rt_boost_factor;
+
+ if (!rt_task(current)) {
+ /*
+ * Boost priority once every rt_boost_factor operations. When
+ * the task tries to take the lock, the rtmutex it will account
+ * for the new priority, and do any corresponding pi-dance.
+ */
+ if (trsp && !(torture_random(trsp) %
+ (cxt.nrealwriters_stress * factor))) {
+ sched_set_fifo(current);
+ } else /* common case, do nothing */
+ return;
+ } else {
+ /*
+ * The task will remain boosted for another 10 * rt_boost_factor
+ * operations, then restored back to its original prio, and so
+ * forth.
+ *
+ * When @trsp is nil, we want to force-reset the task for
+ * stopping the kthread.
+ */
+ if (!trsp || !(torture_random(trsp) %
+ (cxt.nrealwriters_stress * factor * 2))) {
+ sched_set_normal(current, 0);
+ } else /* common case, do nothing */
+ return;
+ }
+}
+
+static void torture_rt_boost(struct torture_random_state *trsp)
+{
+ if (rt_boost != 2)
+ return;
+
+ __torture_rt_boost(trsp);
}
static struct lock_torture_ops lock_busted_ops = {
.writelock = torture_lock_busted_write_lock,
.write_delay = torture_lock_busted_write_delay,
- .task_boost = torture_boost_dummy,
+ .task_boost = torture_rt_boost,
.writeunlock = torture_lock_busted_write_unlock,
.readlock = NULL,
.read_delay = NULL,
@@ -179,7 +217,7 @@ __releases(torture_spinlock)
static struct lock_torture_ops spin_lock_ops = {
.writelock = torture_spin_lock_write_lock,
.write_delay = torture_spin_lock_write_delay,
- .task_boost = torture_boost_dummy,
+ .task_boost = torture_rt_boost,
.writeunlock = torture_spin_lock_write_unlock,
.readlock = NULL,
.read_delay = NULL,
@@ -206,7 +244,7 @@ __releases(torture_spinlock)
static struct lock_torture_ops spin_lock_irq_ops = {
.writelock = torture_spin_lock_write_lock_irq,
.write_delay = torture_spin_lock_write_delay,
- .task_boost = torture_boost_dummy,
+ .task_boost = torture_rt_boost,
.writeunlock = torture_lock_spin_write_unlock_irq,
.readlock = NULL,
.read_delay = NULL,
@@ -275,7 +313,7 @@ __releases(torture_rwlock)
static struct lock_torture_ops rw_lock_ops = {
.writelock = torture_rwlock_write_lock,
.write_delay = torture_rwlock_write_delay,
- .task_boost = torture_boost_dummy,
+ .task_boost = torture_rt_boost,
.writeunlock = torture_rwlock_write_unlock,
.readlock = torture_rwlock_read_lock,
.read_delay = torture_rwlock_read_delay,
@@ -318,7 +356,7 @@ __releases(torture_rwlock)
static struct lock_torture_ops rw_lock_irq_ops = {
.writelock = torture_rwlock_write_lock_irq,
.write_delay = torture_rwlock_write_delay,
- .task_boost = torture_boost_dummy,
+ .task_boost = torture_rt_boost,
.writeunlock = torture_rwlock_write_unlock_irq,
.readlock = torture_rwlock_read_lock_irq,
.read_delay = torture_rwlock_read_delay,
@@ -358,7 +396,7 @@ __releases(torture_mutex)
static struct lock_torture_ops mutex_lock_ops = {
.writelock = torture_mutex_lock,
.write_delay = torture_mutex_delay,
- .task_boost = torture_boost_dummy,
+ .task_boost = torture_rt_boost,
.writeunlock = torture_mutex_unlock,
.readlock = NULL,
.read_delay = NULL,
@@ -456,7 +494,7 @@ static struct lock_torture_ops ww_mutex_lock_ops = {
.exit = torture_ww_mutex_exit,
.writelock = torture_ww_mutex_lock,
.write_delay = torture_mutex_delay,
- .task_boost = torture_boost_dummy,
+ .task_boost = torture_rt_boost,
.writeunlock = torture_ww_mutex_unlock,
.readlock = NULL,
.read_delay = NULL,
@@ -474,37 +512,6 @@ __acquires(torture_rtmutex)
return 0;
}
-static void torture_rtmutex_boost(struct torture_random_state *trsp)
-{
- const unsigned int factor = 50000; /* yes, quite arbitrary */
-
- if (!rt_task(current)) {
- /*
- * Boost priority once every ~50k operations. When the
- * task tries to take the lock, the rtmutex it will account
- * for the new priority, and do any corresponding pi-dance.
- */
- if (trsp && !(torture_random(trsp) %
- (cxt.nrealwriters_stress * factor))) {
- sched_set_fifo(current);
- } else /* common case, do nothing */
- return;
- } else {
- /*
- * The task will remain boosted for another ~500k operations,
- * then restored back to its original prio, and so forth.
- *
- * When @trsp is nil, we want to force-reset the task for
- * stopping the kthread.
- */
- if (!trsp || !(torture_random(trsp) %
- (cxt.nrealwriters_stress * factor * 2))) {
- sched_set_normal(current, 0);
- } else /* common case, do nothing */
- return;
- }
-}
-
static void torture_rtmutex_delay(struct torture_random_state *trsp)
{
const unsigned long shortdelay_us = 2;
@@ -530,10 +537,18 @@ __releases(torture_rtmutex)
rt_mutex_unlock(&torture_rtmutex);
}
+static void torture_rt_boost_rtmutex(struct torture_random_state *trsp)
+{
+ if (!rt_boost)
+ return;
+
+ __torture_rt_boost(trsp);
+}
+
static struct lock_torture_ops rtmutex_lock_ops = {
.writelock = torture_rtmutex_lock,
.write_delay = torture_rtmutex_delay,
- .task_boost = torture_rtmutex_boost,
+ .task_boost = torture_rt_boost_rtmutex,
.writeunlock = torture_rtmutex_unlock,
.readlock = NULL,
.read_delay = NULL,
@@ -600,7 +615,7 @@ __releases(torture_rwsem)
static struct lock_torture_ops rwsem_lock_ops = {
.writelock = torture_rwsem_down_write,
.write_delay = torture_rwsem_write_delay,
- .task_boost = torture_boost_dummy,
+ .task_boost = torture_rt_boost,
.writeunlock = torture_rwsem_up_write,
.readlock = torture_rwsem_down_read,
.read_delay = torture_rwsem_read_delay,
@@ -652,7 +667,7 @@ static struct lock_torture_ops percpu_rwsem_lock_ops = {
.exit = torture_percpu_rwsem_exit,
.writelock = torture_percpu_rwsem_down_write,
.write_delay = torture_rwsem_write_delay,
- .task_boost = torture_boost_dummy,
+ .task_boost = torture_rt_boost,
.writeunlock = torture_percpu_rwsem_up_write,
.readlock = torture_percpu_rwsem_down_read,
.read_delay = torture_rwsem_read_delay,
diff --git a/kernel/locking/qspinlock.c b/kernel/locking/qspinlock.c
index 2b23378775fe..ebe6b8ec7cb3 100644
--- a/kernel/locking/qspinlock.c
+++ b/kernel/locking/qspinlock.c
@@ -371,7 +371,7 @@ void __lockfunc queued_spin_lock_slowpath(struct qspinlock *lock, u32 val)
/*
* We're pending, wait for the owner to go away.
*
- * 0,1,1 -> 0,1,0
+ * 0,1,1 -> *,1,0
*
* this wait loop must be a load-acquire such that we match the
* store-release that clears the locked bit and create lock
@@ -380,7 +380,7 @@ void __lockfunc queued_spin_lock_slowpath(struct qspinlock *lock, u32 val)
* barriers.
*/
if (val & _Q_LOCKED_MASK)
- atomic_cond_read_acquire(&lock->val, !(VAL & _Q_LOCKED_MASK));
+ smp_cond_load_acquire(&lock->locked, !VAL);
/*
* take ownership and clear the pending bit.
diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c
index 7779ee8abc2a..728f434de2bb 100644
--- a/kernel/locking/rtmutex.c
+++ b/kernel/locking/rtmutex.c
@@ -89,15 +89,31 @@ static inline int __ww_mutex_check_kill(struct rt_mutex *lock,
* set this bit before looking at the lock.
*/
-static __always_inline void
-rt_mutex_set_owner(struct rt_mutex_base *lock, struct task_struct *owner)
+static __always_inline struct task_struct *
+rt_mutex_owner_encode(struct rt_mutex_base *lock, struct task_struct *owner)
{
unsigned long val = (unsigned long)owner;
if (rt_mutex_has_waiters(lock))
val |= RT_MUTEX_HAS_WAITERS;
- WRITE_ONCE(lock->owner, (struct task_struct *)val);
+ return (struct task_struct *)val;
+}
+
+static __always_inline void
+rt_mutex_set_owner(struct rt_mutex_base *lock, struct task_struct *owner)
+{
+ /*
+ * lock->wait_lock is held but explicit acquire semantics are needed
+ * for a new lock owner so WRITE_ONCE is insufficient.
+ */
+ xchg_acquire(&lock->owner, rt_mutex_owner_encode(lock, owner));
+}
+
+static __always_inline void rt_mutex_clear_owner(struct rt_mutex_base *lock)
+{
+ /* lock->wait_lock is held so the unlock provides release semantics. */
+ WRITE_ONCE(lock->owner, rt_mutex_owner_encode(lock, NULL));
}
static __always_inline void clear_rt_mutex_waiters(struct rt_mutex_base *lock)
@@ -106,7 +122,8 @@ static __always_inline void clear_rt_mutex_waiters(struct rt_mutex_base *lock)
((unsigned long)lock->owner & ~RT_MUTEX_HAS_WAITERS);
}
-static __always_inline void fixup_rt_mutex_waiters(struct rt_mutex_base *lock)
+static __always_inline void
+fixup_rt_mutex_waiters(struct rt_mutex_base *lock, bool acquire_lock)
{
unsigned long owner, *p = (unsigned long *) &lock->owner;
@@ -172,8 +189,21 @@ static __always_inline void fixup_rt_mutex_waiters(struct rt_mutex_base *lock)
* still set.
*/
owner = READ_ONCE(*p);
- if (owner & RT_MUTEX_HAS_WAITERS)
- WRITE_ONCE(*p, owner & ~RT_MUTEX_HAS_WAITERS);
+ if (owner & RT_MUTEX_HAS_WAITERS) {
+ /*
+ * See rt_mutex_set_owner() and rt_mutex_clear_owner() on
+ * why xchg_acquire() is used for updating owner for
+ * locking and WRITE_ONCE() for unlocking.
+ *
+ * WRITE_ONCE() would work for the acquire case too, but
+ * in case that the lock acquisition failed it might
+ * force other lockers into the slow path unnecessarily.
+ */
+ if (acquire_lock)
+ xchg_acquire(p, owner & ~RT_MUTEX_HAS_WAITERS);
+ else
+ WRITE_ONCE(*p, owner & ~RT_MUTEX_HAS_WAITERS);
+ }
}
/*
@@ -208,6 +238,13 @@ static __always_inline void mark_rt_mutex_waiters(struct rt_mutex_base *lock)
owner = *p;
} while (cmpxchg_relaxed(p, owner,
owner | RT_MUTEX_HAS_WAITERS) != owner);
+
+ /*
+ * The cmpxchg loop above is relaxed to avoid back-to-back ACQUIRE
+ * operations in the event of contention. Ensure the successful
+ * cmpxchg is visible.
+ */
+ smp_mb__after_atomic();
}
/*
@@ -864,8 +901,9 @@ static int __sched rt_mutex_adjust_prio_chain(struct task_struct *task,
* then we need to wake the new top waiter up to try
* to get the lock.
*/
- if (prerequeue_top_waiter != rt_mutex_top_waiter(lock))
- wake_up_state(waiter->task, waiter->wake_state);
+ top_waiter = rt_mutex_top_waiter(lock);
+ if (prerequeue_top_waiter != top_waiter)
+ wake_up_state(top_waiter->task, top_waiter->wake_state);
raw_spin_unlock_irq(&lock->wait_lock);
return 0;
}
@@ -1243,7 +1281,7 @@ static int __sched __rt_mutex_slowtrylock(struct rt_mutex_base *lock)
* try_to_take_rt_mutex() sets the lock waiters bit
* unconditionally. Clean this up.
*/
- fixup_rt_mutex_waiters(lock);
+ fixup_rt_mutex_waiters(lock, true);
return ret;
}
@@ -1604,7 +1642,7 @@ static int __sched __rt_mutex_slowlock(struct rt_mutex_base *lock,
* try_to_take_rt_mutex() sets the waiter bit
* unconditionally. We might have to fix that up.
*/
- fixup_rt_mutex_waiters(lock);
+ fixup_rt_mutex_waiters(lock, true);
trace_contention_end(lock, ret);
@@ -1719,7 +1757,7 @@ static void __sched rtlock_slowlock_locked(struct rt_mutex_base *lock)
* try_to_take_rt_mutex() sets the waiter bit unconditionally.
* We might have to fix that up:
*/
- fixup_rt_mutex_waiters(lock);
+ fixup_rt_mutex_waiters(lock, true);
debug_rt_mutex_free_waiter(&waiter);
trace_contention_end(lock, 0);
diff --git a/kernel/locking/rtmutex_api.c b/kernel/locking/rtmutex_api.c
index 900220941caa..cb9fdff76a8a 100644
--- a/kernel/locking/rtmutex_api.c
+++ b/kernel/locking/rtmutex_api.c
@@ -267,7 +267,7 @@ void __sched rt_mutex_init_proxy_locked(struct rt_mutex_base *lock,
void __sched rt_mutex_proxy_unlock(struct rt_mutex_base *lock)
{
debug_rt_mutex_proxy_unlock(lock);
- rt_mutex_set_owner(lock, NULL);
+ rt_mutex_clear_owner(lock);
}
/**
@@ -382,7 +382,7 @@ int __sched rt_mutex_wait_proxy_lock(struct rt_mutex_base *lock,
* try_to_take_rt_mutex() sets the waiter bit unconditionally. We might
* have to fix that up.
*/
- fixup_rt_mutex_waiters(lock);
+ fixup_rt_mutex_waiters(lock, true);
raw_spin_unlock_irq(&lock->wait_lock);
return ret;
@@ -438,7 +438,7 @@ bool __sched rt_mutex_cleanup_proxy_lock(struct rt_mutex_base *lock,
* try_to_take_rt_mutex() sets the waiter bit unconditionally. We might
* have to fix that up.
*/
- fixup_rt_mutex_waiters(lock);
+ fixup_rt_mutex_waiters(lock, false);
raw_spin_unlock_irq(&lock->wait_lock);
diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c
index 44873594de03..acb5a50309a1 100644
--- a/kernel/locking/rwsem.c
+++ b/kernel/locking/rwsem.c
@@ -256,16 +256,13 @@ static inline bool rwsem_read_trylock(struct rw_semaphore *sem, long *cntp)
static inline bool rwsem_write_trylock(struct rw_semaphore *sem)
{
long tmp = RWSEM_UNLOCKED_VALUE;
- bool ret = false;
- preempt_disable();
if (atomic_long_try_cmpxchg_acquire(&sem->count, &tmp, RWSEM_WRITER_LOCKED)) {
rwsem_set_owner(sem);
- ret = true;
+ return true;
}
- preempt_enable();
- return ret;
+ return false;
}
/*
@@ -624,18 +621,16 @@ static inline bool rwsem_try_write_lock(struct rw_semaphore *sem,
*/
if (first->handoff_set && (waiter != first))
return false;
-
- /*
- * First waiter can inherit a previously set handoff
- * bit and spin on rwsem if lock acquisition fails.
- */
- if (waiter == first)
- waiter->handoff_set = true;
}
new = count;
if (count & RWSEM_LOCK_MASK) {
+ /*
+ * A waiter (first or not) can set the handoff bit
+ * if it is an RT task or wait in the wait queue
+ * for too long.
+ */
if (has_handoff || (!rt_task(waiter->task) &&
!time_after(jiffies, waiter->timeout)))
return false;
@@ -651,11 +646,12 @@ static inline bool rwsem_try_write_lock(struct rw_semaphore *sem,
} while (!atomic_long_try_cmpxchg_acquire(&sem->count, &count, new));
/*
- * We have either acquired the lock with handoff bit cleared or
- * set the handoff bit.
+ * We have either acquired the lock with handoff bit cleared or set
+ * the handoff bit. Only the first waiter can have its handoff_set
+ * set here to enable optimistic spinning in slowpath loop.
*/
if (new & RWSEM_FLAG_HANDOFF) {
- waiter->handoff_set = true;
+ first->handoff_set = true;
lockevent_inc(rwsem_wlock_handoff);
return false;
}
@@ -717,7 +713,6 @@ static inline bool rwsem_can_spin_on_owner(struct rw_semaphore *sem)
return false;
}
- preempt_disable();
/*
* Disable preemption is equal to the RCU read-side crital section,
* thus the task_strcut structure won't go away.
@@ -729,7 +724,6 @@ static inline bool rwsem_can_spin_on_owner(struct rw_semaphore *sem)
if ((flags & RWSEM_NONSPINNABLE) ||
(owner && !(flags & RWSEM_READER_OWNED) && !owner_on_cpu(owner)))
ret = false;
- preempt_enable();
lockevent_cond_inc(rwsem_opt_fail, !ret);
return ret;
@@ -829,8 +823,6 @@ static bool rwsem_optimistic_spin(struct rw_semaphore *sem)
int loop = 0;
u64 rspin_threshold = 0;
- preempt_disable();
-
/* sem->wait_lock should not be held when doing optimistic spinning */
if (!osq_lock(&sem->osq))
goto done;
@@ -938,7 +930,6 @@ static bool rwsem_optimistic_spin(struct rw_semaphore *sem)
}
osq_unlock(&sem->osq);
done:
- preempt_enable();
lockevent_cond_inc(rwsem_opt_fail, !taken);
return taken;
}
@@ -1092,7 +1083,7 @@ queue:
/* Ordered by sem->wait_lock against rwsem_mark_wake(). */
break;
}
- schedule();
+ schedule_preempt_disabled();
lockevent_inc(rwsem_sleep_reader);
}
@@ -1179,15 +1170,12 @@ rwsem_down_write_slowpath(struct rw_semaphore *sem, int state)
if (waiter.handoff_set) {
enum owner_state owner_state;
- preempt_disable();
owner_state = rwsem_spin_on_owner(sem);
- preempt_enable();
-
if (owner_state == OWNER_NULL)
goto trylock_again;
}
- schedule();
+ schedule_preempt_disabled();
lockevent_inc(rwsem_sleep_writer);
set_current_state(state);
trylock_again:
@@ -1254,14 +1242,20 @@ static struct rw_semaphore *rwsem_downgrade_wake(struct rw_semaphore *sem)
*/
static inline int __down_read_common(struct rw_semaphore *sem, int state)
{
+ int ret = 0;
long count;
+ preempt_disable();
if (!rwsem_read_trylock(sem, &count)) {
- if (IS_ERR(rwsem_down_read_slowpath(sem, count, state)))
- return -EINTR;
+ if (IS_ERR(rwsem_down_read_slowpath(sem, count, state))) {
+ ret = -EINTR;
+ goto out;
+ }
DEBUG_RWSEMS_WARN_ON(!is_rwsem_reader_owned(sem), sem);
}
- return 0;
+out:
+ preempt_enable();
+ return ret;
}
static inline void __down_read(struct rw_semaphore *sem)
@@ -1281,19 +1275,23 @@ static inline int __down_read_killable(struct rw_semaphore *sem)
static inline int __down_read_trylock(struct rw_semaphore *sem)
{
+ int ret = 0;
long tmp;
DEBUG_RWSEMS_WARN_ON(sem->magic != sem, sem);
+ preempt_disable();
tmp = atomic_long_read(&sem->count);
while (!(tmp & RWSEM_READ_FAILED_MASK)) {
if (atomic_long_try_cmpxchg_acquire(&sem->count, &tmp,
tmp + RWSEM_READER_BIAS)) {
rwsem_set_reader_owned(sem);
- return 1;
+ ret = 1;
+ break;
}
}
- return 0;
+ preempt_enable();
+ return ret;
}
/*
@@ -1301,12 +1299,15 @@ static inline int __down_read_trylock(struct rw_semaphore *sem)
*/
static inline int __down_write_common(struct rw_semaphore *sem, int state)
{
+ int ret = 0;
+
+ preempt_disable();
if (unlikely(!rwsem_write_trylock(sem))) {
if (IS_ERR(rwsem_down_write_slowpath(sem, state)))
- return -EINTR;
+ ret = -EINTR;
}
-
- return 0;
+ preempt_enable();
+ return ret;
}
static inline void __down_write(struct rw_semaphore *sem)
@@ -1321,8 +1322,14 @@ static inline int __down_write_killable(struct rw_semaphore *sem)
static inline int __down_write_trylock(struct rw_semaphore *sem)
{
+ int ret;
+
+ preempt_disable();
DEBUG_RWSEMS_WARN_ON(sem->magic != sem, sem);
- return rwsem_write_trylock(sem);
+ ret = rwsem_write_trylock(sem);
+ preempt_enable();
+
+ return ret;
}
/*
@@ -1335,6 +1342,7 @@ static inline void __up_read(struct rw_semaphore *sem)
DEBUG_RWSEMS_WARN_ON(sem->magic != sem, sem);
DEBUG_RWSEMS_WARN_ON(!is_rwsem_reader_owned(sem), sem);
+ preempt_disable();
rwsem_clear_reader_owned(sem);
tmp = atomic_long_add_return_release(-RWSEM_READER_BIAS, &sem->count);
DEBUG_RWSEMS_WARN_ON(tmp < 0, sem);
@@ -1343,6 +1351,7 @@ static inline void __up_read(struct rw_semaphore *sem)
clear_nonspinnable(sem);
rwsem_wake(sem);
}
+ preempt_enable();
}
/*
@@ -1363,9 +1372,9 @@ static inline void __up_write(struct rw_semaphore *sem)
preempt_disable();
rwsem_clear_owner(sem);
tmp = atomic_long_fetch_add_release(-RWSEM_WRITER_LOCKED, &sem->count);
- preempt_enable();
if (unlikely(tmp & RWSEM_FLAG_WAITERS))
rwsem_wake(sem);
+ preempt_enable();
}
/*
@@ -1383,11 +1392,13 @@ static inline void __downgrade_write(struct rw_semaphore *sem)
* write side. As such, rely on RELEASE semantics.
*/
DEBUG_RWSEMS_WARN_ON(rwsem_owner(sem) != current, sem);
+ preempt_disable();
tmp = atomic_long_fetch_add_release(
-RWSEM_WRITER_LOCKED+RWSEM_READER_BIAS, &sem->count);
rwsem_set_reader_owned(sem);
if (tmp & RWSEM_FLAG_WAITERS)
rwsem_downgrade_wake(sem);
+ preempt_enable();
}
#else /* !CONFIG_PREEMPT_RT */
@@ -1662,6 +1673,12 @@ void down_read_non_owner(struct rw_semaphore *sem)
{
might_sleep();
__down_read(sem);
+ /*
+ * The owner value for a reader-owned lock is mostly for debugging
+ * purpose only and is not critical to the correct functioning of
+ * rwsem. So it is perfectly fine to set it in a preempt-enabled
+ * context here.
+ */
__rwsem_set_reader_owned(sem, NULL);
}
EXPORT_SYMBOL(down_read_non_owner);
diff --git a/kernel/locking/test-ww_mutex.c b/kernel/locking/test-ww_mutex.c
index 43efb2a04160..29dc253d03af 100644
--- a/kernel/locking/test-ww_mutex.c
+++ b/kernel/locking/test-ww_mutex.c
@@ -399,7 +399,7 @@ static int *get_random_order(int count)
order[n] = n;
for (n = count - 1; n > 1; n--) {
- r = prandom_u32_max(n + 1);
+ r = get_random_u32_below(n + 1);
if (r != n) {
tmp = order[n];
order[n] = order[r];
@@ -538,7 +538,7 @@ static void stress_one_work(struct work_struct *work)
{
struct stress *stress = container_of(work, typeof(*stress), work);
const int nlocks = stress->nlocks;
- struct ww_mutex *lock = stress->locks + prandom_u32_max(nlocks);
+ struct ww_mutex *lock = stress->locks + get_random_u32_below(nlocks);
int err;
do {
diff --git a/kernel/module/Kconfig b/kernel/module/Kconfig
index 26ea5d04f56c..424b3bc58f3f 100644
--- a/kernel/module/Kconfig
+++ b/kernel/module/Kconfig
@@ -221,9 +221,10 @@ endchoice
config MODULE_DECOMPRESS
bool "Support in-kernel module decompression"
- depends on MODULE_COMPRESS_GZIP || MODULE_COMPRESS_XZ
+ depends on MODULE_COMPRESS_GZIP || MODULE_COMPRESS_XZ || MODULE_COMPRESS_ZSTD
select ZLIB_INFLATE if MODULE_COMPRESS_GZIP
select XZ_DEC if MODULE_COMPRESS_XZ
+ select ZSTD_DECOMPRESS if MODULE_COMPRESS_ZSTD
help
Support for decompressing kernel modules by the kernel itself
diff --git a/kernel/module/decompress.c b/kernel/module/decompress.c
index c033572d83f0..bb79ac1a6d8f 100644
--- a/kernel/module/decompress.c
+++ b/kernel/module/decompress.c
@@ -50,7 +50,7 @@ static struct page *module_get_next_page(struct load_info *info)
return page;
}
-#ifdef CONFIG_MODULE_COMPRESS_GZIP
+#if defined(CONFIG_MODULE_COMPRESS_GZIP)
#include <linux/zlib.h>
#define MODULE_COMPRESSION gzip
#define MODULE_DECOMPRESS_FN module_gzip_decompress
@@ -114,8 +114,8 @@ static ssize_t module_gzip_decompress(struct load_info *info,
do {
struct page *page = module_get_next_page(info);
- if (!page) {
- retval = -ENOMEM;
+ if (IS_ERR(page)) {
+ retval = PTR_ERR(page);
goto out_inflate_end;
}
@@ -141,7 +141,7 @@ out:
kfree(s.workspace);
return retval;
}
-#elif CONFIG_MODULE_COMPRESS_XZ
+#elif defined(CONFIG_MODULE_COMPRESS_XZ)
#include <linux/xz.h>
#define MODULE_COMPRESSION xz
#define MODULE_DECOMPRESS_FN module_xz_decompress
@@ -173,8 +173,8 @@ static ssize_t module_xz_decompress(struct load_info *info,
do {
struct page *page = module_get_next_page(info);
- if (!page) {
- retval = -ENOMEM;
+ if (IS_ERR(page)) {
+ retval = PTR_ERR(page);
goto out;
}
@@ -199,6 +199,94 @@ static ssize_t module_xz_decompress(struct load_info *info,
xz_dec_end(xz_dec);
return retval;
}
+#elif defined(CONFIG_MODULE_COMPRESS_ZSTD)
+#include <linux/zstd.h>
+#define MODULE_COMPRESSION zstd
+#define MODULE_DECOMPRESS_FN module_zstd_decompress
+
+/*
+ * Decompress the zstd-compressed module image in @buf (@size bytes), one
+ * page at a time, into pages obtained from module_get_next_page(@info).
+ *
+ * Returns the decompressed size in bytes on success, or a negative errno
+ * (-EINVAL for malformed input, -ENOMEM or the page allocation error
+ * otherwise).
+ */
+static ssize_t module_zstd_decompress(struct load_info *info,
+ const void *buf, size_t size)
+{
+ static const u8 signature[] = { 0x28, 0xb5, 0x2f, 0xfd };
+ ZSTD_outBuffer zstd_dec;
+ ZSTD_inBuffer zstd_buf;
+ zstd_frame_header header;
+ size_t wksp_size;
+ void *wksp = NULL;
+ ZSTD_DStream *dstream;
+ size_t ret;
+ size_t new_size = 0;
+ int retval;
+
+ if (size < sizeof(signature) ||
+ memcmp(buf, signature, sizeof(signature))) {
+ pr_err("not a zstd compressed module\n");
+ return -EINVAL;
+ }
+
+ zstd_buf.src = buf;
+ zstd_buf.pos = 0;
+ zstd_buf.size = size;
+
+ ret = zstd_get_frame_header(&header, zstd_buf.src, zstd_buf.size);
+ if (ret != 0) {
+ pr_err("ZSTD-compressed data has an incomplete frame header\n");
+ retval = -EINVAL;
+ goto out;
+ }
+ if (header.windowSize > (1 << ZSTD_WINDOWLOG_MAX)) {
+ pr_err("ZSTD-compressed data has too large a window size\n");
+ retval = -EINVAL;
+ goto out;
+ }
+
+ wksp_size = zstd_dstream_workspace_bound(header.windowSize);
+ wksp = kmalloc(wksp_size, GFP_KERNEL);
+ if (!wksp) {
+ retval = -ENOMEM;
+ goto out;
+ }
+
+ dstream = zstd_init_dstream(header.windowSize, wksp, wksp_size);
+ if (!dstream) {
+ pr_err("Can't initialize ZSTD stream\n");
+ retval = -ENOMEM;
+ goto out;
+ }
+
+ do {
+ struct page *page = module_get_next_page(info);
+
+ /* module_get_next_page() reports failure via ERR_PTR(), not NULL. */
+ if (IS_ERR(page)) {
+ retval = PTR_ERR(page);
+ goto out;
+ }
+
+ zstd_dec.dst = kmap_local_page(page);
+ zstd_dec.pos = 0;
+ zstd_dec.size = PAGE_SIZE;
+
+ ret = zstd_decompress_stream(dstream, &zstd_dec, &zstd_buf);
+ /* Pair with kmap_local_page(): unmap by the mapped address. */
+ kunmap_local(zstd_dec.dst);
+ retval = zstd_get_error_code(ret);
+ if (retval)
+ break;
+
+ new_size += zstd_dec.pos;
+ } while (zstd_dec.pos == PAGE_SIZE && ret != 0);
+
+ if (retval) {
+ pr_err("ZSTD-decompression failed with status %d\n", retval);
+ retval = -EINVAL;
+ goto out;
+ }
+
+ retval = new_size;
+
+ out:
+ kfree(wksp);
+ return retval;
+}
#else
#error "Unexpected configuration for CONFIG_MODULE_DECOMPRESS"
#endif
diff --git a/kernel/module/kallsyms.c b/kernel/module/kallsyms.c
index f5c5c9175333..ab2376a1be88 100644
--- a/kernel/module/kallsyms.c
+++ b/kernel/module/kallsyms.c
@@ -494,8 +494,8 @@ unsigned long module_kallsyms_lookup_name(const char *name)
return ret;
}
-#ifdef CONFIG_LIVEPATCH
-int module_kallsyms_on_each_symbol(int (*fn)(void *, const char *,
+int module_kallsyms_on_each_symbol(const char *modname,
+ int (*fn)(void *, const char *,
struct module *, unsigned long),
void *data)
{
@@ -510,6 +510,9 @@ int module_kallsyms_on_each_symbol(int (*fn)(void *, const char *,
if (mod->state == MODULE_STATE_UNFORMED)
continue;
+ if (modname && strcmp(modname, mod->name))
+ continue;
+
/* Use rcu_dereference_sched() to remain compliant with the sparse tool */
preempt_disable();
kallsyms = rcu_dereference_sched(mod->kallsyms);
@@ -526,9 +529,15 @@ int module_kallsyms_on_each_symbol(int (*fn)(void *, const char *,
if (ret != 0)
goto out;
}
+
+ /*
+ * The given module is found, the subsequent modules do not
+ * need to be compared.
+ */
+ if (modname)
+ break;
}
out:
mutex_unlock(&module_mutex);
return ret;
}
-#endif /* CONFIG_LIVEPATCH */
diff --git a/kernel/module/main.c b/kernel/module/main.c
index d02d39c7174e..4ac3fe43e6c8 100644
--- a/kernel/module/main.c
+++ b/kernel/module/main.c
@@ -85,9 +85,6 @@ struct mod_tree_root mod_data_tree __cacheline_aligned = {
};
#endif
-#define module_addr_min mod_tree.addr_min
-#define module_addr_max mod_tree.addr_max
-
struct symsearch {
const struct kernel_symbol *start, *stop;
const s32 *crcs;
@@ -1674,6 +1671,11 @@ static int elf_validity_check(struct load_info *info)
info->hdr->e_machine);
goto no_exec;
}
+ if (!module_elf_check_arch(info->hdr)) {
+ pr_err("Invalid module architecture in ELF header: %u\n",
+ info->hdr->e_machine);
+ goto no_exec;
+ }
if (info->hdr->e_shentsize != sizeof(Elf_Shdr)) {
pr_err("Invalid ELF section header size\n");
goto no_exec;
@@ -2247,6 +2249,11 @@ static void flush_module_icache(const struct module *mod)
(unsigned long)mod->core_layout.base + mod->core_layout.size);
}
+bool __weak module_elf_check_arch(Elf_Ehdr *hdr)
+{
+ return true;
+}
+
int __weak module_frob_arch_sections(Elf_Ehdr *hdr,
Elf_Shdr *sechdrs,
char *secstrings,
@@ -2386,7 +2393,8 @@ static bool finished_loading(const char *name)
sched_annotate_sleep();
mutex_lock(&module_mutex);
mod = find_module_all(name, strlen(name), true);
- ret = !mod || mod->state == MODULE_STATE_LIVE;
+ ret = !mod || mod->state == MODULE_STATE_LIVE
+ || mod->state == MODULE_STATE_GOING;
mutex_unlock(&module_mutex);
return ret;
@@ -2562,20 +2570,35 @@ static int add_unformed_module(struct module *mod)
mod->state = MODULE_STATE_UNFORMED;
-again:
mutex_lock(&module_mutex);
old = find_module_all(mod->name, strlen(mod->name), true);
if (old != NULL) {
- if (old->state != MODULE_STATE_LIVE) {
+ if (old->state == MODULE_STATE_COMING
+ || old->state == MODULE_STATE_UNFORMED) {
/* Wait in case it fails to load. */
mutex_unlock(&module_mutex);
err = wait_event_interruptible(module_wq,
finished_loading(mod->name));
if (err)
goto out_unlocked;
- goto again;
+
+ /* The module might have gone in the meantime. */
+ mutex_lock(&module_mutex);
+ old = find_module_all(mod->name, strlen(mod->name),
+ true);
}
- err = -EEXIST;
+
+ /*
+ * We are here only when the same module was being loaded. Do
+ * not try to load it again right now. It prevents long delays
+ * caused by serialized module load failures. It might happen
+ * when more devices of the same type trigger load of
+ * a particular module.
+ */
+ if (old && old->state == MODULE_STATE_LIVE)
+ err = -EEXIST;
+ else
+ err = -EBUSY;
goto out;
}
mod_update_bounds(mod);
diff --git a/kernel/module/sysfs.c b/kernel/module/sysfs.c
index ce68f821dcd1..c921bf044050 100644
--- a/kernel/module/sysfs.c
+++ b/kernel/module/sysfs.c
@@ -340,7 +340,7 @@ static int mod_sysfs_init(struct module *mod)
int err;
struct kobject *kobj;
- if (!module_sysfs_initialized) {
+ if (!module_kset) {
pr_err("%s: module sysfs not initialized\n", mod->name);
err = -EINVAL;
goto out;
diff --git a/kernel/notifier.c b/kernel/notifier.c
index 0d5bd62c480e..d353e4b5402d 100644
--- a/kernel/notifier.c
+++ b/kernel/notifier.c
@@ -62,7 +62,7 @@ static int notifier_chain_unregister(struct notifier_block **nl,
* value of this parameter is -1.
* @nr_calls: Records the number of notifications sent. Don't care
* value of this field is NULL.
- * @returns: notifier_call_chain returns the value returned by the
+ * Return: notifier_call_chain returns the value returned by the
* last notifier function called.
*/
static int notifier_call_chain(struct notifier_block **nl,
@@ -105,13 +105,13 @@ NOKPROBE_SYMBOL(notifier_call_chain);
* @val_up: Value passed unmodified to the notifier function
* @val_down: Value passed unmodified to the notifier function when recovering
* from an error on @val_up
- * @v Pointer passed unmodified to the notifier function
+ * @v: Pointer passed unmodified to the notifier function
*
* NOTE: It is important the @nl chain doesn't change between the two
* invocations of notifier_call_chain() such that we visit the
* exact same notifier callbacks; this rules out any RCU usage.
*
- * Returns: the return value of the @val_up call.
+ * Return: the return value of the @val_up call.
*/
static int notifier_call_chain_robust(struct notifier_block **nl,
unsigned long val_up, unsigned long val_down,
@@ -456,7 +456,6 @@ int raw_notifier_call_chain(struct raw_notifier_head *nh,
}
EXPORT_SYMBOL_GPL(raw_notifier_call_chain);
-#ifdef CONFIG_SRCU
/*
* SRCU notifier chain routines. Registration and unregistration
* use a mutex, and call_chain is synchronized by SRCU (no locks).
@@ -573,8 +572,6 @@ void srcu_init_notifier_head(struct srcu_notifier_head *nh)
}
EXPORT_SYMBOL_GPL(srcu_init_notifier_head);
-#endif /* CONFIG_SRCU */
-
static ATOMIC_NOTIFIER_HEAD(die_chain);
int notrace notify_die(enum die_val val, const char *str,
diff --git a/kernel/padata.c b/kernel/padata.c
index e5819bb8bd1d..e007b8a4b738 100644
--- a/kernel/padata.c
+++ b/kernel/padata.c
@@ -83,8 +83,16 @@ static struct padata_work *padata_work_alloc(void)
return pw;
}
-static void padata_work_init(struct padata_work *pw, work_func_t work_fn,
- void *data, int flags)
+/*
+ * This function is marked __ref because this function may be optimized in such
+ * a way that it directly refers to work_fn's address, which causes modpost to
+ * complain when work_fn is marked __init. This scenario was observed with clang
+ * LTO, where padata_work_init() was optimized to refer directly to
+ * padata_mt_helper() because the calls to padata_work_init() with other work_fn
+ * values were eliminated or inlined.
+ */
+static void __ref padata_work_init(struct padata_work *pw, work_func_t work_fn,
+ void *data, int flags)
{
if (flags & PADATA_WORK_ONSTACK)
INIT_WORK_ONSTACK(&pw->pw_work, work_fn);
@@ -207,14 +215,16 @@ int padata_do_parallel(struct padata_shell *ps,
pw = padata_work_alloc();
spin_unlock(&padata_works_lock);
+ if (!pw) {
+ /* Maximum works limit exceeded, run in the current task. */
+ padata->parallel(padata);
+ }
+
rcu_read_unlock_bh();
if (pw) {
padata_work_init(pw, padata_parallel_worker, padata, 0);
queue_work(pinst->parallel_wq, &pw->pw_work);
- } else {
- /* Maximum works limit exceeded, run in the current task. */
- padata->parallel(padata);
}
return 0;
@@ -388,13 +398,16 @@ void padata_do_serial(struct padata_priv *padata)
int hashed_cpu = padata_cpu_hash(pd, padata->seq_nr);
struct padata_list *reorder = per_cpu_ptr(pd->reorder_list, hashed_cpu);
struct padata_priv *cur;
+ struct list_head *pos;
spin_lock(&reorder->lock);
/* Sort in ascending order of sequence number. */
- list_for_each_entry_reverse(cur, &reorder->list, list)
+ list_for_each_prev(pos, &reorder->list) {
+ cur = list_entry(pos, struct padata_priv, list);
if (cur->seq_nr < padata->seq_nr)
break;
- list_add(&padata->list, &cur->list);
+ }
+ list_add(&padata->list, pos);
spin_unlock(&reorder->lock);
/*
diff --git a/kernel/panic.c b/kernel/panic.c
index da323209f583..487f5b03bf83 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -25,6 +25,7 @@
#include <linux/kexec.h>
#include <linux/panic_notifier.h>
#include <linux/sched.h>
+#include <linux/string_helpers.h>
#include <linux/sysrq.h>
#include <linux/init.h>
#include <linux/nmi.h>
@@ -32,6 +33,8 @@
#include <linux/bug.h>
#include <linux/ratelimit.h>
#include <linux/debugfs.h>
+#include <linux/sysfs.h>
+#include <linux/context_tracking.h>
#include <trace/events/error_report.h>
#include <asm/sections.h>
@@ -58,6 +61,7 @@ bool crash_kexec_post_notifiers;
int panic_on_warn __read_mostly;
unsigned long panic_on_taint;
bool panic_on_taint_nousertaint = false;
+static unsigned int warn_limit __read_mostly;
int panic_timeout = CONFIG_PANIC_TIMEOUT;
EXPORT_SYMBOL_GPL(panic_timeout);
@@ -75,8 +79,9 @@ ATOMIC_NOTIFIER_HEAD(panic_notifier_list);
EXPORT_SYMBOL(panic_notifier_list);
-#if defined(CONFIG_SMP) && defined(CONFIG_SYSCTL)
+#ifdef CONFIG_SYSCTL
static struct ctl_table kern_panic_table[] = {
+#ifdef CONFIG_SMP
{
.procname = "oops_all_cpu_backtrace",
.data = &sysctl_oops_all_cpu_backtrace,
@@ -86,6 +91,14 @@ static struct ctl_table kern_panic_table[] = {
.extra1 = SYSCTL_ZERO,
.extra2 = SYSCTL_ONE,
},
+#endif
+ {
+ .procname = "warn_limit",
+ .data = &warn_limit,
+ .maxlen = sizeof(warn_limit),
+ .mode = 0644,
+ .proc_handler = proc_douintvec,
+ },
{ }
};
@@ -97,6 +110,25 @@ static __init int kernel_panic_sysctls_init(void)
late_initcall(kernel_panic_sysctls_init);
#endif
+static atomic_t warn_count = ATOMIC_INIT(0);
+
+#ifdef CONFIG_SYSFS
+static ssize_t warn_count_show(struct kobject *kobj, struct kobj_attribute *attr,
+ char *page)
+{
+ return sysfs_emit(page, "%d\n", atomic_read(&warn_count));
+}
+
+static struct kobj_attribute warn_count_attr = __ATTR_RO(warn_count);
+
+static __init int kernel_panic_sysfs_init(void)
+{
+ sysfs_add_file_to_group(kernel_kobj, &warn_count_attr.attr, NULL);
+ return 0;
+}
+late_initcall(kernel_panic_sysfs_init);
+#endif
+
static long no_blink(int state)
{
return 0;
@@ -199,6 +231,19 @@ static void panic_print_sys_info(bool console_flush)
ftrace_dump(DUMP_ALL);
}
+void check_panic_on_warn(const char *origin)
+{
+ unsigned int limit;
+
+ if (panic_on_warn)
+ panic("%s: panic_on_warn set ...\n", origin);
+
+ limit = READ_ONCE(warn_limit);
+ if (atomic_inc_return(&warn_count) >= limit && limit)
+ panic("%s: system warned too often (kernel.warn_limit is %d)",
+ origin, limit);
+}
+
/**
* panic - halt the system
* @fmt: The text string to print
@@ -617,8 +662,7 @@ void __warn(const char *file, int line, void *caller, unsigned taint,
if (regs)
show_regs(regs);
- if (panic_on_warn)
- panic("panic_on_warn set ...\n");
+ check_panic_on_warn("kernel");
if (!regs)
dump_stack();
@@ -636,6 +680,7 @@ void __warn(const char *file, int line, void *caller, unsigned taint,
void warn_slowpath_fmt(const char *file, int line, unsigned taint,
const char *fmt, ...)
{
+ bool rcu = warn_rcu_enter();
struct warn_args args;
pr_warn(CUT_HERE);
@@ -650,11 +695,13 @@ void warn_slowpath_fmt(const char *file, int line, unsigned taint,
va_start(args.args, fmt);
__warn(file, line, __builtin_return_address(0), taint, NULL, &args);
va_end(args.args);
+ warn_rcu_exit(rcu);
}
EXPORT_SYMBOL(warn_slowpath_fmt);
#else
void __warn_printk(const char *fmt, ...)
{
+ bool rcu = warn_rcu_enter();
va_list args;
pr_warn(CUT_HERE);
@@ -662,6 +709,7 @@ void __warn_printk(const char *fmt, ...)
va_start(args, fmt);
vprintk(fmt, args);
va_end(args);
+ warn_rcu_exit(rcu);
}
EXPORT_SYMBOL(__warn_printk);
#endif
@@ -744,8 +792,8 @@ static int __init panic_on_taint_setup(char *s)
if (s && !strcmp(s, "nousertaint"))
panic_on_taint_nousertaint = true;
- pr_info("panic_on_taint: bitmask=0x%lx nousertaint_mode=%sabled\n",
- panic_on_taint, panic_on_taint_nousertaint ? "en" : "dis");
+ pr_info("panic_on_taint: bitmask=0x%lx nousertaint_mode=%s\n",
+ panic_on_taint, str_enabled_disabled(panic_on_taint_nousertaint));
return 0;
}
diff --git a/kernel/params.c b/kernel/params.c
index 5b92310425c5..14d66070757b 100644
--- a/kernel/params.c
+++ b/kernel/params.c
@@ -926,7 +926,7 @@ static const struct sysfs_ops module_sysfs_ops = {
.store = module_attr_store,
};
-static int uevent_filter(struct kobject *kobj)
+static int uevent_filter(const struct kobject *kobj)
{
const struct kobj_type *ktype = get_ktype(kobj);
@@ -940,7 +940,6 @@ static const struct kset_uevent_ops module_uevent_ops = {
};
struct kset *module_kset;
-int module_sysfs_initialized;
static void module_kobj_release(struct kobject *kobj)
{
@@ -954,7 +953,11 @@ struct kobj_type module_ktype = {
};
/*
- * param_sysfs_init - wrapper for built-in params support
+ * param_sysfs_init - create "module" kset
+ *
+ * This must be done before the initramfs is unpacked and
+ * request_module() thus becomes possible, because otherwise the
+ * module load would fail in mod_sysfs_init.
*/
static int __init param_sysfs_init(void)
{
@@ -964,13 +967,25 @@ static int __init param_sysfs_init(void)
__FILE__, __LINE__);
return -ENOMEM;
}
- module_sysfs_initialized = 1;
+
+ return 0;
+}
+subsys_initcall(param_sysfs_init);
+
+/*
+ * param_sysfs_builtin_init - add sysfs version and parameter
+ * attributes for built-in modules
+ */
+static int __init param_sysfs_builtin_init(void)
+{
+ if (!module_kset)
+ return -ENOMEM;
version_sysfs_builtin();
param_sysfs_builtin();
return 0;
}
-subsys_initcall(param_sysfs_init);
+late_initcall(param_sysfs_builtin_init);
#endif /* CONFIG_SYSFS */
diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c
index f4f8cb0435b4..fc21c5d5fd5d 100644
--- a/kernel/pid_namespace.c
+++ b/kernel/pid_namespace.c
@@ -244,7 +244,24 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns)
set_current_state(TASK_INTERRUPTIBLE);
if (pid_ns->pid_allocated == init_pids)
break;
+ /*
+ * Release tasks_rcu_exit_srcu to avoid following deadlock:
+ *
+ * 1) TASK A unshare(CLONE_NEWPID)
+ * 2) TASK A fork() twice -> TASK B (child reaper for new ns)
+ * and TASK C
+ * 3) TASK B exits, kills TASK C, waits for TASK A to reap it
+ * 4) TASK A calls synchronize_rcu_tasks()
+ * -> synchronize_srcu(tasks_rcu_exit_srcu)
+ * 5) *DEADLOCK*
+ *
+ * It is considered safe to release tasks_rcu_exit_srcu here
+ * because we assume the current task can not be concurrently
+ * reaped at this point.
+ */
+ exit_tasks_rcu_stop();
schedule();
+ exit_tasks_rcu_start();
}
__set_current_state(TASK_RUNNING);
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig
index 60a1d3051cc7..4b31629c5be4 100644
--- a/kernel/power/Kconfig
+++ b/kernel/power/Kconfig
@@ -118,7 +118,6 @@ config PM_SLEEP
def_bool y
depends on SUSPEND || HIBERNATE_CALLBACKS
select PM
- select SRCU
config PM_SLEEP_SMP
def_bool y
diff --git a/kernel/power/energy_model.c b/kernel/power/energy_model.c
index f82111837b8d..7b44f5b89fa1 100644
--- a/kernel/power/energy_model.c
+++ b/kernel/power/energy_model.c
@@ -87,10 +87,7 @@ static void em_debug_create_pd(struct device *dev)
static void em_debug_remove_pd(struct device *dev)
{
- struct dentry *debug_dir;
-
- debug_dir = debugfs_lookup(dev_name(dev), rootdir);
- debugfs_remove_recursive(debug_dir);
+ debugfs_lookup_and_remove(dev_name(dev), rootdir);
}
static int __init em_debug_init(void)
diff --git a/kernel/power/process.c b/kernel/power/process.c
index ddd9988327fe..6c1c7e566d35 100644
--- a/kernel/power/process.c
+++ b/kernel/power/process.c
@@ -27,6 +27,8 @@ unsigned int __read_mostly freeze_timeout_msecs = 20 * MSEC_PER_SEC;
static int try_to_freeze_tasks(bool user_only)
{
+ const char *what = user_only ? "user space processes" :
+ "remaining freezable tasks";
struct task_struct *g, *p;
unsigned long end_time;
unsigned int todo;
@@ -36,6 +38,8 @@ static int try_to_freeze_tasks(bool user_only)
bool wakeup = false;
int sleep_usecs = USEC_PER_MSEC;
+ pr_info("Freezing %s\n", what);
+
start = ktime_get_boottime();
end_time = jiffies + msecs_to_jiffies(freeze_timeout_msecs);
@@ -82,9 +86,8 @@ static int try_to_freeze_tasks(bool user_only)
elapsed_msecs = ktime_to_ms(elapsed);
if (todo) {
- pr_cont("\n");
- pr_err("Freezing of tasks %s after %d.%03d seconds "
- "(%d tasks refusing to freeze, wq_busy=%d):\n",
+ pr_err("Freezing %s %s after %d.%03d seconds "
+ "(%d tasks refusing to freeze, wq_busy=%d):\n", what,
wakeup ? "aborted" : "failed",
elapsed_msecs / 1000, elapsed_msecs % 1000,
todo - wq_busy, wq_busy);
@@ -101,8 +104,8 @@ static int try_to_freeze_tasks(bool user_only)
read_unlock(&tasklist_lock);
}
} else {
- pr_cont("(elapsed %d.%03d seconds) ", elapsed_msecs / 1000,
- elapsed_msecs % 1000);
+ pr_info("Freezing %s completed (elapsed %d.%03d seconds)\n",
+ what, elapsed_msecs / 1000, elapsed_msecs % 1000);
}
return todo ? -EBUSY : 0;
@@ -130,14 +133,11 @@ int freeze_processes(void)
static_branch_inc(&freezer_active);
pm_wakeup_clear(0);
- pr_info("Freezing user space processes ... ");
pm_freezing = true;
error = try_to_freeze_tasks(true);
- if (!error) {
+ if (!error)
__usermodehelper_set_disable_depth(UMH_DISABLED);
- pr_cont("done.");
- }
- pr_cont("\n");
+
BUG_ON(in_atomic());
/*
@@ -166,14 +166,9 @@ int freeze_kernel_threads(void)
{
int error;
- pr_info("Freezing remaining freezable tasks ... ");
-
pm_nosig_freezing = true;
error = try_to_freeze_tasks(false);
- if (!error)
- pr_cont("done.");
- pr_cont("\n");
BUG_ON(in_atomic());
if (error)
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index 2a406753af90..cd8b7b35f1e8 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -1723,8 +1723,8 @@ static unsigned long minimum_image_size(unsigned long saveable)
* /sys/power/reserved_size, respectively). To make this happen, we compute the
* total number of available page frames and allocate at least
*
- * ([page frames total] + PAGES_FOR_IO + [metadata pages]) / 2
- * + 2 * DIV_ROUND_UP(reserved_size, PAGE_SIZE)
+ * ([page frames total] - PAGES_FOR_IO - [metadata pages]) / 2
+ * - 2 * DIV_ROUND_UP(reserved_size, PAGE_SIZE)
*
* of them, which corresponds to the maximum size of a hibernation image.
*
@@ -2259,10 +2259,14 @@ static int unpack_orig_pfns(unsigned long *buf, struct memory_bitmap *bm)
if (unlikely(buf[j] == BM_END_OF_MAP))
break;
- if (pfn_valid(buf[j]) && memory_bm_pfn_present(bm, buf[j]))
+ if (pfn_valid(buf[j]) && memory_bm_pfn_present(bm, buf[j])) {
memory_bm_set_bit(bm, buf[j]);
- else
+ } else {
+ if (!pfn_valid(buf[j]))
+ pr_err(FW_BUG "Memory map mismatch at 0x%llx after hibernation\n",
+ (unsigned long long)PFN_PHYS(buf[j]));
return -EFAULT;
+ }
}
return 0;
diff --git a/kernel/power/swap.c b/kernel/power/swap.c
index 277434b6c0bf..36a1df48280c 100644
--- a/kernel/power/swap.c
+++ b/kernel/power/swap.c
@@ -581,7 +581,7 @@ static int save_image(struct swap_map_handle *handle,
return ret;
}
-/**
+/*
* Structure used for CRC32.
*/
struct crc_data {
@@ -596,7 +596,7 @@ struct crc_data {
unsigned char *unc[LZO_THREADS]; /* uncompressed data */
};
-/**
+/*
* CRC32 update function that runs in its own thread.
*/
static int crc32_threadfn(void *data)
@@ -623,7 +623,7 @@ static int crc32_threadfn(void *data)
}
return 0;
}
-/**
+/*
* Structure used for LZO data compression.
*/
struct cmp_data {
@@ -640,7 +640,7 @@ struct cmp_data {
unsigned char wrk[LZO1X_1_MEM_COMPRESS]; /* compression workspace */
};
-/**
+/*
* Compression function that runs in its own thread.
*/
static int lzo_compress_threadfn(void *data)
@@ -948,9 +948,9 @@ out_finish:
return error;
}
-/**
+/*
* The following functions allow us to read data using a swap map
- * in a file-alike way
+ * in a file-like way.
*/
static void release_swap_reader(struct swap_map_handle *handle)
@@ -1107,7 +1107,7 @@ static int load_image(struct swap_map_handle *handle,
return ret;
}
-/**
+/*
* Structure used for LZO data decompression.
*/
struct dec_data {
@@ -1123,7 +1123,7 @@ struct dec_data {
unsigned char cmp[LZO_CMP_SIZE]; /* compressed buffer */
};
-/**
+/*
* Decompression function that runs in its own thread.
*/
static int lzo_decompress_threadfn(void *data)
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index a9b7dade0d2e..fd0c9f913940 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -123,6 +123,7 @@ bool console_srcu_read_lock_is_held(void)
{
return srcu_read_lock_held(&console_srcu);
}
+EXPORT_SYMBOL(console_srcu_read_lock_is_held);
#endif
enum devkmsg_log_bits {
@@ -1869,6 +1870,7 @@ static void console_lock_spinning_enable(void)
/**
* console_lock_spinning_disable_and_check - mark end of code where another
* thread was able to busy wait and check if there is a waiter
+ * @cookie: cookie returned from console_srcu_read_lock()
*
* This is called at the end of the section where spinning is allowed.
* It has two functions. First, it is a signal that it is no longer
@@ -2151,7 +2153,7 @@ static u16 printk_sprint(char *text, u16 size, int facility,
}
}
- trace_console_rcuidle(text, text_len);
+ trace_console(text, text_len);
return text_len;
}
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 54482193e1ed..0786450074c1 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -813,7 +813,7 @@ static long ptrace_get_rseq_configuration(struct task_struct *task,
{
struct ptrace_rseq_configuration conf = {
.rseq_abi_pointer = (u64)(uintptr_t)task->rseq,
- .rseq_abi_size = sizeof(*task->rseq),
+ .rseq_abi_size = task->rseq_len,
.signature = task->rseq_sig,
.flags = 0,
};
diff --git a/kernel/rcu/Kconfig.debug b/kernel/rcu/Kconfig.debug
index 232e29fe3e5e..2984de629f74 100644
--- a/kernel/rcu/Kconfig.debug
+++ b/kernel/rcu/Kconfig.debug
@@ -82,7 +82,7 @@ config RCU_CPU_STALL_TIMEOUT
config RCU_EXP_CPU_STALL_TIMEOUT
int "Expedited RCU CPU stall timeout in milliseconds"
depends on RCU_STALL_COMMON
- range 0 21000
+ range 0 300000
default 0
help
If a given expedited RCU grace period extends more than the
@@ -92,6 +92,19 @@ config RCU_EXP_CPU_STALL_TIMEOUT
says to use the RCU_CPU_STALL_TIMEOUT value converted from
seconds to milliseconds.
+config RCU_CPU_STALL_CPUTIME
+ bool "Provide additional RCU stall debug information"
+ depends on RCU_STALL_COMMON
+ default n
+ help
+ Collect statistics during the sampling period, such as the number of
+ hard interrupts, soft interrupts, and task switches, and the cputime
+ of hard interrupts, soft interrupts, and kernel tasks, and add them to
+ the RCU stall report. For multiple consecutive RCU stalls, all sampling
+ periods begin at half of the first RCU stall timeout.
+ The boot option rcupdate.rcu_cpu_stall_cputime has the same function
+ as this option, and overrides it if specified.
+
config RCU_TRACE
bool "Enable tracing for RCU"
depends on DEBUG_KERNEL
diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h
index c5aa934de59b..115616ac3bfa 100644
--- a/kernel/rcu/rcu.h
+++ b/kernel/rcu/rcu.h
@@ -224,6 +224,8 @@ extern int rcu_cpu_stall_ftrace_dump;
extern int rcu_cpu_stall_suppress;
extern int rcu_cpu_stall_timeout;
extern int rcu_exp_cpu_stall_timeout;
+extern int rcu_cpu_stall_cputime;
+extern bool rcu_exp_stall_task_details __read_mostly;
int rcu_jiffies_till_stall_check(void);
int rcu_exp_jiffies_till_stall_check(void);
@@ -447,14 +449,20 @@ do { \
/* Tiny RCU doesn't expedite, as its purpose in life is instead to be tiny. */
static inline bool rcu_gp_is_normal(void) { return true; }
static inline bool rcu_gp_is_expedited(void) { return false; }
+static inline bool rcu_async_should_hurry(void) { return false; }
static inline void rcu_expedite_gp(void) { }
static inline void rcu_unexpedite_gp(void) { }
+static inline void rcu_async_hurry(void) { }
+static inline void rcu_async_relax(void) { }
static inline void rcu_request_urgent_qs_task(struct task_struct *t) { }
#else /* #ifdef CONFIG_TINY_RCU */
bool rcu_gp_is_normal(void); /* Internal RCU use. */
bool rcu_gp_is_expedited(void); /* Internal RCU use. */
+bool rcu_async_should_hurry(void); /* Internal RCU use. */
void rcu_expedite_gp(void);
void rcu_unexpedite_gp(void);
+void rcu_async_hurry(void);
+void rcu_async_relax(void);
void rcupdate_announce_bootup_oddness(void);
#ifdef CONFIG_TASKS_RCU_GENERIC
void show_rcu_tasks_gp_kthreads(void);
diff --git a/kernel/rcu/rcu_segcblist.c b/kernel/rcu/rcu_segcblist.c
index c54ea2b6a36b..f71fac422c8f 100644
--- a/kernel/rcu/rcu_segcblist.c
+++ b/kernel/rcu/rcu_segcblist.c
@@ -89,7 +89,7 @@ static void rcu_segcblist_set_len(struct rcu_segcblist *rsclp, long v)
}
/* Get the length of a segment of the rcu_segcblist structure. */
-static long rcu_segcblist_get_seglen(struct rcu_segcblist *rsclp, int seg)
+long rcu_segcblist_get_seglen(struct rcu_segcblist *rsclp, int seg)
{
return READ_ONCE(rsclp->seglen[seg]);
}
diff --git a/kernel/rcu/rcu_segcblist.h b/kernel/rcu/rcu_segcblist.h
index 431cee212467..4fe877f5f654 100644
--- a/kernel/rcu/rcu_segcblist.h
+++ b/kernel/rcu/rcu_segcblist.h
@@ -15,6 +15,8 @@ static inline long rcu_cblist_n_cbs(struct rcu_cblist *rclp)
return READ_ONCE(rclp->len);
}
+long rcu_segcblist_get_seglen(struct rcu_segcblist *rsclp, int seg);
+
/* Return number of callbacks in segmented callback list by summing seglen. */
long rcu_segcblist_n_segment_cbs(struct rcu_segcblist *rsclp);
diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c
index 634df26a2c27..8e6c023212cb 100644
--- a/kernel/rcu/rcutorture.c
+++ b/kernel/rcu/rcutorture.c
@@ -399,7 +399,7 @@ static int torture_readlock_not_held(void)
return rcu_read_lock_bh_held() || rcu_read_lock_sched_held();
}
-static int rcu_torture_read_lock(void) __acquires(RCU)
+static int rcu_torture_read_lock(void)
{
rcu_read_lock();
return 0;
@@ -441,7 +441,7 @@ rcu_read_delay(struct torture_random_state *rrsp, struct rt_read_seg *rtrsp)
}
}
-static void rcu_torture_read_unlock(int idx) __releases(RCU)
+static void rcu_torture_read_unlock(int idx)
{
rcu_read_unlock();
}
@@ -625,7 +625,7 @@ static struct srcu_struct srcu_ctld;
static struct srcu_struct *srcu_ctlp = &srcu_ctl;
static struct rcu_torture_ops srcud_ops;
-static int srcu_torture_read_lock(void) __acquires(srcu_ctlp)
+static int srcu_torture_read_lock(void)
{
if (cur_ops == &srcud_ops)
return srcu_read_lock_nmisafe(srcu_ctlp);
@@ -652,7 +652,7 @@ srcu_read_delay(struct torture_random_state *rrsp, struct rt_read_seg *rtrsp)
}
}
-static void srcu_torture_read_unlock(int idx) __releases(srcu_ctlp)
+static void srcu_torture_read_unlock(int idx)
{
if (cur_ops == &srcud_ops)
srcu_read_unlock_nmisafe(srcu_ctlp, idx);
@@ -814,13 +814,13 @@ static void synchronize_rcu_trivial(void)
}
}
-static int rcu_torture_read_lock_trivial(void) __acquires(RCU)
+static int rcu_torture_read_lock_trivial(void)
{
preempt_disable();
return 0;
}
-static void rcu_torture_read_unlock_trivial(int idx) __releases(RCU)
+static void rcu_torture_read_unlock_trivial(int idx)
{
preempt_enable();
}
diff --git a/kernel/rcu/refscale.c b/kernel/rcu/refscale.c
index 435c884c02b5..afa3e1a2f690 100644
--- a/kernel/rcu/refscale.c
+++ b/kernel/rcu/refscale.c
@@ -76,6 +76,8 @@ torture_param(int, verbose_batched, 0, "Batch verbose debugging printk()s");
// Wait until there are multiple CPUs before starting test.
torture_param(int, holdoff, IS_BUILTIN(CONFIG_RCU_REF_SCALE_TEST) ? 10 : 0,
"Holdoff time before test start (s)");
+// Number of typesafe_lookup structures, that is, the degree of concurrency.
+torture_param(long, lookup_instances, 0, "Number of typesafe_lookup structures.");
// Number of loops per experiment, all readers execute operations concurrently.
torture_param(long, loops, 10000, "Number of loops per experiment.");
// Number of readers, with -1 defaulting to about 75% of the CPUs.
@@ -124,7 +126,7 @@ static int exp_idx;
// Operations vector for selecting different types of tests.
struct ref_scale_ops {
- void (*init)(void);
+ bool (*init)(void);
void (*cleanup)(void);
void (*readsection)(const int nloops);
void (*delaysection)(const int nloops, const int udl, const int ndl);
@@ -162,8 +164,9 @@ static void ref_rcu_delay_section(const int nloops, const int udl, const int ndl
}
}
-static void rcu_sync_scale_init(void)
+static bool rcu_sync_scale_init(void)
{
+ return true;
}
static struct ref_scale_ops rcu_ops = {
@@ -315,9 +318,10 @@ static struct ref_scale_ops refcnt_ops = {
// Definitions for rwlock
static rwlock_t test_rwlock;
-static void ref_rwlock_init(void)
+static bool ref_rwlock_init(void)
{
rwlock_init(&test_rwlock);
+ return true;
}
static void ref_rwlock_section(const int nloops)
@@ -351,9 +355,10 @@ static struct ref_scale_ops rwlock_ops = {
// Definitions for rwsem
static struct rw_semaphore test_rwsem;
-static void ref_rwsem_init(void)
+static bool ref_rwsem_init(void)
{
init_rwsem(&test_rwsem);
+ return true;
}
static void ref_rwsem_section(const int nloops)
@@ -523,6 +528,237 @@ static struct ref_scale_ops clock_ops = {
.name = "clock"
};
+////////////////////////////////////////////////////////////////////////
+//
+// Methods leveraging SLAB_TYPESAFE_BY_RCU.
+//
+
+// Item to look up in a typesafe manner. Array of pointers to these.
+struct refscale_typesafe {
+ atomic_t rts_refctr; // Used by all flavors
+ spinlock_t rts_lock;
+ seqlock_t rts_seqlock;
+ unsigned int a;
+ unsigned int b;
+};
+
+static struct kmem_cache *typesafe_kmem_cachep;
+static struct refscale_typesafe **rtsarray;
+static long rtsarray_size;
+static DEFINE_TORTURE_RANDOM_PERCPU(refscale_rand);
+static bool (*rts_acquire)(struct refscale_typesafe *rtsp, unsigned int *start);
+static bool (*rts_release)(struct refscale_typesafe *rtsp, unsigned int start);
+
+// Conditionally acquire an explicit in-structure reference count.
+static bool typesafe_ref_acquire(struct refscale_typesafe *rtsp, unsigned int *start)
+{
+ return atomic_inc_not_zero(&rtsp->rts_refctr);
+}
+
+// Unconditionally release an explicit in-structure reference count.
+static bool typesafe_ref_release(struct refscale_typesafe *rtsp, unsigned int start)
+{
+ if (!atomic_dec_return(&rtsp->rts_refctr)) {
+ WRITE_ONCE(rtsp->a, rtsp->a + 1);
+ kmem_cache_free(typesafe_kmem_cachep, rtsp);
+ }
+ return true;
+}
+
+// Unconditionally acquire an explicit in-structure spinlock.
+static bool typesafe_lock_acquire(struct refscale_typesafe *rtsp, unsigned int *start)
+{
+ spin_lock(&rtsp->rts_lock);
+ return true;
+}
+
+// Unconditionally release an explicit in-structure spinlock.
+static bool typesafe_lock_release(struct refscale_typesafe *rtsp, unsigned int start)
+{
+ spin_unlock(&rtsp->rts_lock);
+ return true;
+}
+
+// Unconditionally acquire an explicit in-structure sequence lock.
+static bool typesafe_seqlock_acquire(struct refscale_typesafe *rtsp, unsigned int *start)
+{
+ *start = read_seqbegin(&rtsp->rts_seqlock);
+ return true;
+}
+
+// Conditionally release an explicit in-structure sequence lock. Return
+// true if this release was successful, that is, if no retry is required.
+static bool typesafe_seqlock_release(struct refscale_typesafe *rtsp, unsigned int start)
+{
+ return !read_seqretry(&rtsp->rts_seqlock, start);
+}
+
+// Do a read-side critical section with the specified delay in
+// microseconds and nanoseconds inserted so as to increase probability
+// of failure.
+static void typesafe_delay_section(const int nloops, const int udl, const int ndl)
+{
+ unsigned int a;
+ unsigned int b;
+ int i;
+ long idx;
+ struct refscale_typesafe *rtsp;
+ unsigned int start;
+
+ for (i = nloops; i >= 0; i--) {
+ preempt_disable();
+ idx = torture_random(this_cpu_ptr(&refscale_rand)) % rtsarray_size;
+ preempt_enable();
+retry:
+ rcu_read_lock();
+ rtsp = rcu_dereference(rtsarray[idx]);
+ a = READ_ONCE(rtsp->a);
+ if (!rts_acquire(rtsp, &start)) {
+ rcu_read_unlock();
+ goto retry;
+ }
+ if (a != READ_ONCE(rtsp->a)) {
+ (void)rts_release(rtsp, start);
+ rcu_read_unlock();
+ goto retry;
+ }
+ un_delay(udl, ndl);
+ // Remember, seqlock read-side release can fail.
+ if (!rts_release(rtsp, start)) {
+ rcu_read_unlock();
+ goto retry;
+ }
+ b = READ_ONCE(rtsp->a);
+ WARN_ONCE(a != b, "Re-read of ->a changed from %u to %u.\n", a, b);
+ b = rtsp->b;
+ rcu_read_unlock();
+ WARN_ON_ONCE(a * a != b);
+ }
+}
+
+// Because the acquisition and release methods are expensive, there
+// is no point in optimizing away the un_delay() function's two checks.
+// Thus simply define typesafe_read_section() as a simple wrapper around
+// typesafe_delay_section().
+static void typesafe_read_section(const int nloops)
+{
+ typesafe_delay_section(nloops, 0, 0);
+}
+
+// Allocate and initialize one refscale_typesafe structure.
+static struct refscale_typesafe *typesafe_alloc_one(void)
+{
+ struct refscale_typesafe *rtsp;
+
+ rtsp = kmem_cache_alloc(typesafe_kmem_cachep, GFP_KERNEL);
+ if (!rtsp)
+ return NULL;
+ atomic_set(&rtsp->rts_refctr, 1);
+ WRITE_ONCE(rtsp->a, rtsp->a + 1);
+ WRITE_ONCE(rtsp->b, rtsp->a * rtsp->a);
+ return rtsp;
+}
+
+// Slab-allocator constructor for refscale_typesafe structures created
+// out of a new slab of system memory.
+static void refscale_typesafe_ctor(void *rtsp_in)
+{
+ struct refscale_typesafe *rtsp = rtsp_in;
+
+ spin_lock_init(&rtsp->rts_lock);
+ seqlock_init(&rtsp->rts_seqlock);
+ preempt_disable();
+ rtsp->a = torture_random(this_cpu_ptr(&refscale_rand));
+ preempt_enable();
+}
+
+static struct ref_scale_ops typesafe_ref_ops;
+static struct ref_scale_ops typesafe_lock_ops;
+static struct ref_scale_ops typesafe_seqlock_ops;
+
+// Initialize for a typesafe test.
+static bool typesafe_init(void)
+{
+ long idx;
+ long si = lookup_instances;
+
+ typesafe_kmem_cachep = kmem_cache_create("refscale_typesafe",
+ sizeof(struct refscale_typesafe), sizeof(void *),
+ SLAB_TYPESAFE_BY_RCU, refscale_typesafe_ctor);
+ if (!typesafe_kmem_cachep)
+ return false;
+ if (si < 0)
+ si = -si * nr_cpu_ids;
+ else if (si == 0)
+ si = nr_cpu_ids;
+ rtsarray_size = si;
+ rtsarray = kcalloc(si, sizeof(*rtsarray), GFP_KERNEL);
+ if (!rtsarray)
+ return false;
+ for (idx = 0; idx < rtsarray_size; idx++) {
+ rtsarray[idx] = typesafe_alloc_one();
+ if (!rtsarray[idx])
+ return false;
+ }
+ if (cur_ops == &typesafe_ref_ops) {
+ rts_acquire = typesafe_ref_acquire;
+ rts_release = typesafe_ref_release;
+ } else if (cur_ops == &typesafe_lock_ops) {
+ rts_acquire = typesafe_lock_acquire;
+ rts_release = typesafe_lock_release;
+ } else if (cur_ops == &typesafe_seqlock_ops) {
+ rts_acquire = typesafe_seqlock_acquire;
+ rts_release = typesafe_seqlock_release;
+ } else {
+ WARN_ON_ONCE(1);
+ return false;
+ }
+ return true;
+}
+
+// Clean up after a typesafe test.
+static void typesafe_cleanup(void)
+{
+ long idx;
+
+ if (rtsarray) {
+ for (idx = 0; idx < rtsarray_size; idx++)
+ kmem_cache_free(typesafe_kmem_cachep, rtsarray[idx]);
+ kfree(rtsarray);
+ rtsarray = NULL;
+ rtsarray_size = 0;
+ }
+ kmem_cache_destroy(typesafe_kmem_cachep);
+ typesafe_kmem_cachep = NULL;
+ rts_acquire = NULL;
+ rts_release = NULL;
+}
+
+// The typesafe_init() function distinguishes these structures by address.
+static struct ref_scale_ops typesafe_ref_ops = {
+ .init = typesafe_init,
+ .cleanup = typesafe_cleanup,
+ .readsection = typesafe_read_section,
+ .delaysection = typesafe_delay_section,
+ .name = "typesafe_ref"
+};
+
+static struct ref_scale_ops typesafe_lock_ops = {
+ .init = typesafe_init,
+ .cleanup = typesafe_cleanup,
+ .readsection = typesafe_read_section,
+ .delaysection = typesafe_delay_section,
+ .name = "typesafe_lock"
+};
+
+static struct ref_scale_ops typesafe_seqlock_ops = {
+ .init = typesafe_init,
+ .cleanup = typesafe_cleanup,
+ .readsection = typesafe_read_section,
+ .delaysection = typesafe_delay_section,
+ .name = "typesafe_seqlock"
+};
+
static void rcu_scale_one_reader(void)
{
if (readdelay <= 0)
@@ -812,6 +1048,7 @@ ref_scale_init(void)
static struct ref_scale_ops *scale_ops[] = {
&rcu_ops, &srcu_ops, RCU_TRACE_OPS RCU_TASKS_OPS &refcnt_ops, &rwlock_ops,
&rwsem_ops, &lock_ops, &lock_irq_ops, &acqrel_ops, &clock_ops,
+ &typesafe_ref_ops, &typesafe_lock_ops, &typesafe_seqlock_ops,
};
if (!torture_init_begin(scale_type, verbose))
@@ -833,7 +1070,10 @@ ref_scale_init(void)
goto unwind;
}
if (cur_ops->init)
- cur_ops->init();
+ if (!cur_ops->init()) {
+ firsterr = -EUCLEAN;
+ goto unwind;
+ }
ref_scale_print_module_parms(cur_ops, "Start of test");
diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c
index ca4b5dcec675..ab4ee58af84b 100644
--- a/kernel/rcu/srcutree.c
+++ b/kernel/rcu/srcutree.c
@@ -154,7 +154,7 @@ static void init_srcu_struct_data(struct srcu_struct *ssp)
*/
static inline bool srcu_invl_snp_seq(unsigned long s)
{
- return rcu_seq_state(s) == SRCU_SNP_INIT_SEQ;
+ return s == SRCU_SNP_INIT_SEQ;
}
/*
@@ -469,24 +469,59 @@ static bool srcu_readers_active_idx_check(struct srcu_struct *ssp, int idx)
/*
* If the locks are the same as the unlocks, then there must have
- * been no readers on this index at some time in between. This does
- * not mean that there are no more readers, as one could have read
- * the current index but not have incremented the lock counter yet.
+ * been no readers on this index at some point in this function.
+ * But there might be more readers, as a task might have read
+ * the current ->srcu_idx but not yet have incremented its CPU's
+ * ->srcu_lock_count[idx] counter. In fact, it is possible
+ * that most of the tasks have been preempted between fetching
+ * ->srcu_idx and incrementing ->srcu_lock_count[idx]. And there
+ * could be almost (ULONG_MAX / sizeof(struct task_struct)) tasks
+ * in a system whose address space was fully populated with memory.
+ * Call this quantity Nt.
*
- * So suppose that the updater is preempted here for so long
- * that more than ULONG_MAX non-nested readers come and go in
- * the meantime. It turns out that this cannot result in overflow
- * because if a reader modifies its unlock count after we read it
- * above, then that reader's next load of ->srcu_idx is guaranteed
- * to get the new value, which will cause it to operate on the
- * other bank of counters, where it cannot contribute to the
- * overflow of these counters. This means that there is a maximum
- * of 2*NR_CPUS increments, which cannot overflow given current
- * systems, especially not on 64-bit systems.
+ * So suppose that the updater is preempted at this point in the
+ * code for a long time. That now-preempted updater has already
+ * flipped ->srcu_idx (possibly during the preceding grace period),
+ * done an smp_mb() (again, possibly during the preceding grace
+ * period), and summed up the ->srcu_unlock_count[idx] counters.
+ * How many times can a given one of the aforementioned Nt tasks
+ * increment the old ->srcu_idx value's ->srcu_lock_count[idx]
+ * counter, in the absence of nesting?
*
- * OK, how about nesting? This does impose a limit on nesting
- * of floor(ULONG_MAX/NR_CPUS/2), which should be sufficient,
- * especially on 64-bit systems.
+ * It can clearly do so once, given that it has already fetched
+ * the old value of ->srcu_idx and is just about to use that value
+ * to index its increment of ->srcu_lock_count[idx]. But as soon as
+ * it leaves that SRCU read-side critical section, it will increment
+ * ->srcu_unlock_count[idx], which must follow the updater's above
+ * read from that same value. Thus, as soon as the reading task does
+ * an smp_mb() and a later fetch from ->srcu_idx, that task will be
+ * guaranteed to get the new index. Except that the increment of
+ * ->srcu_unlock_count[idx] in __srcu_read_unlock() is after the
+ * smp_mb(), and the fetch from ->srcu_idx in __srcu_read_lock()
+ * is before the smp_mb(). Thus, that task might not see the new
+ * value of ->srcu_idx until the -second- __srcu_read_lock(),
+ * which in turn means that this task might well increment
+ * ->srcu_lock_count[idx] for the old value of ->srcu_idx twice,
+ * not just once.
+ *
+ * However, it is important to note that a given smp_mb() takes
+ * effect not just for the task executing it, but also for any
+ * later task running on that same CPU.
+ *
+ * That is, there can be almost Nt + Nc further increments of
+ * ->srcu_lock_count[idx] for the old index, where Nc is the number
+ * of CPUs. But this is OK because the size of the task_struct
+ * structure limits the value of Nt and current systems limit Nc
+ * to a few thousand.
+ *
+ * OK, but what about nesting? This does impose a limit on
+ * nesting of half of the size of the task_struct structure
+ * (measured in bytes), which should be sufficient. A late 2022
+ * TREE01 rcutorture run reported this size to be no less than
+ * 9408 bytes, allowing up to 4704 levels of nesting, which is
+ * comfortably beyond excessive. Especially on 64-bit systems,
+ * which are unlikely to be configured with an address space fully
+ * populated with memory, at least not anytime soon.
*/
return srcu_readers_lock_idx(ssp, idx) == unlocks;
}
@@ -726,7 +761,7 @@ static void srcu_gp_start(struct srcu_struct *ssp)
int state;
if (smp_load_acquire(&ssp->srcu_size_state) < SRCU_SIZE_WAIT_BARRIER)
- sdp = per_cpu_ptr(ssp->sda, 0);
+ sdp = per_cpu_ptr(ssp->sda, get_boot_cpu_id());
else
sdp = this_cpu_ptr(ssp->sda);
lockdep_assert_held(&ACCESS_PRIVATE(ssp, lock));
@@ -837,7 +872,8 @@ static void srcu_gp_end(struct srcu_struct *ssp)
/* Initiate callback invocation as needed. */
ss_state = smp_load_acquire(&ssp->srcu_size_state);
if (ss_state < SRCU_SIZE_WAIT_BARRIER) {
- srcu_schedule_cbs_sdp(per_cpu_ptr(ssp->sda, 0), cbdelay);
+ srcu_schedule_cbs_sdp(per_cpu_ptr(ssp->sda, get_boot_cpu_id()),
+ cbdelay);
} else {
idx = rcu_seq_ctr(gpseq) % ARRAY_SIZE(snp->srcu_have_cbs);
srcu_for_each_node_breadth_first(ssp, snp) {
@@ -914,7 +950,7 @@ static void srcu_funnel_exp_start(struct srcu_struct *ssp, struct srcu_node *snp
if (snp)
for (; snp != NULL; snp = snp->srcu_parent) {
sgsne = READ_ONCE(snp->srcu_gp_seq_needed_exp);
- if (rcu_seq_done(&ssp->srcu_gp_seq, s) ||
+ if (WARN_ON_ONCE(rcu_seq_done(&ssp->srcu_gp_seq, s)) ||
(!srcu_invl_snp_seq(sgsne) && ULONG_CMP_GE(sgsne, s)))
return;
spin_lock_irqsave_rcu_node(snp, flags);
@@ -941,6 +977,9 @@ static void srcu_funnel_exp_start(struct srcu_struct *ssp, struct srcu_node *snp
*
* Note that this function also does the work of srcu_funnel_exp_start(),
* in some cases by directly invoking it.
+ *
+ * The SRCU read lock should be held around this function, and s is a
+ * sequence snapshot taken after acquiring that lock.
*/
static void srcu_funnel_gp_start(struct srcu_struct *ssp, struct srcu_data *sdp,
unsigned long s, bool do_norm)
@@ -961,7 +1000,7 @@ static void srcu_funnel_gp_start(struct srcu_struct *ssp, struct srcu_data *sdp,
if (snp_leaf)
/* Each pass through the loop does one level of the srcu_node tree. */
for (snp = snp_leaf; snp != NULL; snp = snp->srcu_parent) {
- if (rcu_seq_done(&ssp->srcu_gp_seq, s) && snp != snp_leaf)
+ if (WARN_ON_ONCE(rcu_seq_done(&ssp->srcu_gp_seq, s)) && snp != snp_leaf)
return; /* GP already done and CBs recorded. */
spin_lock_irqsave_rcu_node(snp, flags);
snp_seq = snp->srcu_have_cbs[idx];
@@ -998,8 +1037,8 @@ static void srcu_funnel_gp_start(struct srcu_struct *ssp, struct srcu_data *sdp,
if (!do_norm && ULONG_CMP_LT(ssp->srcu_gp_seq_needed_exp, s))
WRITE_ONCE(ssp->srcu_gp_seq_needed_exp, s);
- /* If grace period not already done and none in progress, start it. */
- if (!rcu_seq_done(&ssp->srcu_gp_seq, s) &&
+ /* If grace period not already in progress, start it. */
+ if (!WARN_ON_ONCE(rcu_seq_done(&ssp->srcu_gp_seq, s)) &&
rcu_seq_state(ssp->srcu_gp_seq) == SRCU_STATE_IDLE) {
WARN_ON_ONCE(ULONG_CMP_GE(ssp->srcu_gp_seq, ssp->srcu_gp_seq_needed));
srcu_gp_start(ssp);
@@ -1059,10 +1098,11 @@ static void srcu_flip(struct srcu_struct *ssp)
/*
* Ensure that if the updater misses an __srcu_read_unlock()
- * increment, that task's next __srcu_read_lock() will see the
- * above counter update. Note that both this memory barrier
- * and the one in srcu_readers_active_idx_check() provide the
- * guarantee for __srcu_read_lock().
+ * increment, that task's __srcu_read_lock() following its next
+ * __srcu_read_lock() or __srcu_read_unlock() will see the above
+ * counter update. Note that both this memory barrier and the
+ * one in srcu_readers_active_idx_check() provide the guarantee
+ * for __srcu_read_lock().
*/
smp_mb(); /* D */ /* Pairs with C. */
}
@@ -1161,7 +1201,7 @@ static unsigned long srcu_gp_start_if_needed(struct srcu_struct *ssp,
idx = __srcu_read_lock_nmisafe(ssp);
ss_state = smp_load_acquire(&ssp->srcu_size_state);
if (ss_state < SRCU_SIZE_WAIT_CALL)
- sdp = per_cpu_ptr(ssp->sda, 0);
+ sdp = per_cpu_ptr(ssp->sda, get_boot_cpu_id());
else
sdp = raw_cpu_ptr(ssp->sda);
spin_lock_irqsave_sdp_contention(sdp, &flags);
@@ -1497,7 +1537,7 @@ void srcu_barrier(struct srcu_struct *ssp)
idx = __srcu_read_lock_nmisafe(ssp);
if (smp_load_acquire(&ssp->srcu_size_state) < SRCU_SIZE_WAIT_BARRIER)
- srcu_barrier_one_cpu(ssp, per_cpu_ptr(ssp->sda, 0));
+ srcu_barrier_one_cpu(ssp, per_cpu_ptr(ssp->sda, get_boot_cpu_id()));
else
for_each_possible_cpu(cpu)
srcu_barrier_one_cpu(ssp, per_cpu_ptr(ssp->sda, cpu));
diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h
index b0b885e071fa..bfb5e1549f2b 100644
--- a/kernel/rcu/tasks.h
+++ b/kernel/rcu/tasks.h
@@ -384,6 +384,7 @@ static int rcu_tasks_need_gpcb(struct rcu_tasks *rtp)
{
int cpu;
unsigned long flags;
+ bool gpdone = poll_state_synchronize_rcu(rtp->percpu_dequeue_gpseq);
long n;
long ncbs = 0;
long ncbsnz = 0;
@@ -425,21 +426,23 @@ static int rcu_tasks_need_gpcb(struct rcu_tasks *rtp)
WRITE_ONCE(rtp->percpu_enqueue_shift, order_base_2(nr_cpu_ids));
smp_store_release(&rtp->percpu_enqueue_lim, 1);
rtp->percpu_dequeue_gpseq = get_state_synchronize_rcu();
+ gpdone = false;
pr_info("Starting switch %s to CPU-0 callback queuing.\n", rtp->name);
}
raw_spin_unlock_irqrestore(&rtp->cbs_gbl_lock, flags);
}
- if (rcu_task_cb_adjust && !ncbsnz &&
- poll_state_synchronize_rcu(rtp->percpu_dequeue_gpseq)) {
+ if (rcu_task_cb_adjust && !ncbsnz && gpdone) {
raw_spin_lock_irqsave(&rtp->cbs_gbl_lock, flags);
if (rtp->percpu_enqueue_lim < rtp->percpu_dequeue_lim) {
WRITE_ONCE(rtp->percpu_dequeue_lim, 1);
pr_info("Completing switch %s to CPU-0 callback queuing.\n", rtp->name);
}
- for (cpu = rtp->percpu_dequeue_lim; cpu < nr_cpu_ids; cpu++) {
- struct rcu_tasks_percpu *rtpcp = per_cpu_ptr(rtp->rtpcpu, cpu);
+ if (rtp->percpu_dequeue_lim == 1) {
+ for (cpu = rtp->percpu_dequeue_lim; cpu < nr_cpu_ids; cpu++) {
+ struct rcu_tasks_percpu *rtpcp = per_cpu_ptr(rtp->rtpcpu, cpu);
- WARN_ON_ONCE(rcu_segcblist_n_cbs(&rtpcp->cblist));
+ WARN_ON_ONCE(rcu_segcblist_n_cbs(&rtpcp->cblist));
+ }
}
raw_spin_unlock_irqrestore(&rtp->cbs_gbl_lock, flags);
}
@@ -560,8 +563,9 @@ static int __noreturn rcu_tasks_kthread(void *arg)
static void synchronize_rcu_tasks_generic(struct rcu_tasks *rtp)
{
/* Complain if the scheduler has not started. */
- WARN_ONCE(rcu_scheduler_active == RCU_SCHEDULER_INACTIVE,
- "synchronize_rcu_tasks called too soon");
+ if (WARN_ONCE(rcu_scheduler_active == RCU_SCHEDULER_INACTIVE,
+ "synchronize_%s() called too soon", rtp->name))
+ return;
// If the grace-period kthread is running, use it.
if (READ_ONCE(rtp->kthread_ptr)) {
@@ -827,11 +831,21 @@ static void rcu_tasks_pertask(struct task_struct *t, struct list_head *hop)
static void rcu_tasks_postscan(struct list_head *hop)
{
/*
- * Wait for tasks that are in the process of exiting. This
- * does only part of the job, ensuring that all tasks that were
- * previously exiting reach the point where they have disabled
- * preemption, allowing the later synchronize_rcu() to finish
- * the job.
+ * Exiting tasks may escape the tasklist scan. Those are vulnerable
+ * until their final schedule() with TASK_DEAD state. To cope with
+ * this, divide the fragile exit path part in two intersecting
+ * read side critical sections:
+ *
+ * 1) An _SRCU_ read side starting before calling exit_notify(),
+ * which may remove the task from the tasklist, and ending after
+ * the final preempt_disable() call in do_exit().
+ *
+ * 2) An _RCU_ read side starting with the final preempt_disable()
+ * call in do_exit() and ending with the final call to schedule()
+ * with TASK_DEAD state.
+ *
+ * This handles the part 1). And postgp will handle part 2) with a
+ * call to synchronize_rcu().
*/
synchronize_srcu(&tasks_rcu_exit_srcu);
}
@@ -898,7 +912,10 @@ static void rcu_tasks_postgp(struct rcu_tasks *rtp)
*
* In addition, this synchronize_rcu() waits for exiting tasks
* to complete their final preempt_disable() region of execution,
- * cleaning up after the synchronize_srcu() above.
+ * cleaning up after synchronize_srcu(&tasks_rcu_exit_srcu),
+ * enforcing the whole region before tasklist removal until
+ * the final schedule() with TASK_DEAD state to be an RCU TASKS
+ * read side critical section.
*/
synchronize_rcu();
}
@@ -988,27 +1005,42 @@ void show_rcu_tasks_classic_gp_kthread(void)
EXPORT_SYMBOL_GPL(show_rcu_tasks_classic_gp_kthread);
#endif // !defined(CONFIG_TINY_RCU)
-/* Do the srcu_read_lock() for the above synchronize_srcu(). */
+/*
+ * Contribute to protect against tasklist scan blind spot while the
+ * task is exiting and may be removed from the tasklist. See
+ * corresponding synchronize_srcu() for further details.
+ */
void exit_tasks_rcu_start(void) __acquires(&tasks_rcu_exit_srcu)
{
- preempt_disable();
current->rcu_tasks_idx = __srcu_read_lock(&tasks_rcu_exit_srcu);
- preempt_enable();
}
-/* Do the srcu_read_unlock() for the above synchronize_srcu(). */
-void exit_tasks_rcu_finish(void) __releases(&tasks_rcu_exit_srcu)
+/*
+ * Contribute to protect against tasklist scan blind spot while the
+ * task is exiting and may be removed from the tasklist. See
+ * corresponding synchronize_srcu() for further details.
+ */
+void exit_tasks_rcu_stop(void) __releases(&tasks_rcu_exit_srcu)
{
struct task_struct *t = current;
- preempt_disable();
__srcu_read_unlock(&tasks_rcu_exit_srcu, t->rcu_tasks_idx);
- preempt_enable();
- exit_tasks_rcu_finish_trace(t);
+}
+
+/*
+ * Contribute to protect against tasklist scan blind spot while the
+ * task is exiting and may be removed from the tasklist. See
+ * corresponding synchronize_srcu() for further details.
+ */
+void exit_tasks_rcu_finish(void)
+{
+ exit_tasks_rcu_stop();
+ exit_tasks_rcu_finish_trace(current);
}
#else /* #ifdef CONFIG_TASKS_RCU */
void exit_tasks_rcu_start(void) { }
+void exit_tasks_rcu_stop(void) { }
void exit_tasks_rcu_finish(void) { exit_tasks_rcu_finish_trace(current); }
#endif /* #else #ifdef CONFIG_TASKS_RCU */
@@ -1036,9 +1068,6 @@ static void rcu_tasks_be_rude(struct work_struct *work)
// Wait for one rude RCU-tasks grace period.
static void rcu_tasks_rude_wait_gp(struct rcu_tasks *rtp)
{
- if (num_online_cpus() <= 1)
- return; // Fastpath for only one CPU.
-
rtp->n_ipis += cpumask_weight(cpu_online_mask);
schedule_on_each_cpu(rcu_tasks_be_rude);
}
@@ -1535,6 +1564,8 @@ static void rcu_tasks_trace_postscan(struct list_head *hop)
{
// Wait for late-stage exiting tasks to finish exiting.
// These might have passed the call to exit_tasks_rcu_finish().
+
+ // If you remove the following line, update rcu_trace_implies_rcu_gp()!!!
synchronize_rcu();
// Any tasks that exit after this point will set
// TRC_NEED_QS_CHECKED in ->trc_reader_special.b.need_qs.
@@ -1813,23 +1844,21 @@ static void test_rcu_tasks_callback(struct rcu_head *rhp)
static void rcu_tasks_initiate_self_tests(void)
{
- unsigned long j = jiffies;
-
pr_info("Running RCU-tasks wait API self tests\n");
#ifdef CONFIG_TASKS_RCU
- tests[0].runstart = j;
+ tests[0].runstart = jiffies;
synchronize_rcu_tasks();
call_rcu_tasks(&tests[0].rh, test_rcu_tasks_callback);
#endif
#ifdef CONFIG_TASKS_RUDE_RCU
- tests[1].runstart = j;
+ tests[1].runstart = jiffies;
synchronize_rcu_tasks_rude();
call_rcu_tasks_rude(&tests[1].rh, test_rcu_tasks_callback);
#endif
#ifdef CONFIG_TASKS_TRACE_RCU
- tests[2].runstart = j;
+ tests[2].runstart = jiffies;
synchronize_rcu_tasks_trace();
call_rcu_tasks_trace(&tests[2].rh, test_rcu_tasks_callback);
#endif
diff --git a/kernel/rcu/tiny.c b/kernel/rcu/tiny.c
index 72913ce21258..42f7589e51e0 100644
--- a/kernel/rcu/tiny.c
+++ b/kernel/rcu/tiny.c
@@ -246,15 +246,12 @@ bool poll_state_synchronize_rcu(unsigned long oldstate)
EXPORT_SYMBOL_GPL(poll_state_synchronize_rcu);
#ifdef CONFIG_KASAN_GENERIC
-void kvfree_call_rcu(struct rcu_head *head, rcu_callback_t func)
+void kvfree_call_rcu(struct rcu_head *head, void *ptr)
{
- if (head) {
- void *ptr = (void *) head - (unsigned long) func;
-
+ if (head)
kasan_record_aux_stack_noalloc(ptr);
- }
- __kvfree_call_rcu(head, func);
+ __kvfree_call_rcu(head, ptr);
}
EXPORT_SYMBOL_GPL(kvfree_call_rcu);
#endif
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index d04f2192f02c..8e880c09ab59 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -144,14 +144,16 @@ static int rcu_scheduler_fully_active __read_mostly;
static void rcu_report_qs_rnp(unsigned long mask, struct rcu_node *rnp,
unsigned long gps, unsigned long flags);
-static void rcu_init_new_rnp(struct rcu_node *rnp_leaf);
-static void rcu_cleanup_dead_rnp(struct rcu_node *rnp_leaf);
static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu);
static void invoke_rcu_core(void);
static void rcu_report_exp_rdp(struct rcu_data *rdp);
static void sync_sched_exp_online_cleanup(int cpu);
static void check_cb_ovld_locked(struct rcu_data *rdp, struct rcu_node *rnp);
static bool rcu_rdp_is_offloaded(struct rcu_data *rdp);
+static bool rcu_rdp_cpu_online(struct rcu_data *rdp);
+static bool rcu_init_invoked(void);
+static void rcu_cleanup_dead_rnp(struct rcu_node *rnp_leaf);
+static void rcu_init_new_rnp(struct rcu_node *rnp_leaf);
/*
* rcuc/rcub/rcuop kthread realtime priority. The "rcuop"
@@ -215,27 +217,6 @@ EXPORT_SYMBOL_GPL(rcu_get_gp_kthreads_prio);
#define PER_RCU_NODE_PERIOD 3 /* Number of grace periods between delays for debugging. */
/*
- * Compute the mask of online CPUs for the specified rcu_node structure.
- * This will not be stable unless the rcu_node structure's ->lock is
- * held, but the bit corresponding to the current CPU will be stable
- * in most contexts.
- */
-static unsigned long rcu_rnp_online_cpus(struct rcu_node *rnp)
-{
- return READ_ONCE(rnp->qsmaskinitnext);
-}
-
-/*
- * Is the CPU corresponding to the specified rcu_data structure online
- * from RCU's perspective? This perspective is given by that structure's
- * ->qsmaskinitnext field rather than by the global cpu_online_mask.
- */
-static bool rcu_rdp_cpu_online(struct rcu_data *rdp)
-{
- return !!(rdp->grpmask & rcu_rnp_online_cpus(rdp->mynode));
-}
-
-/*
* Return true if an RCU grace period is in progress. The READ_ONCE()s
* permit this function to be invoked without holding the root rcu_node
* structure's ->lock, but of course results can be subject to change.
@@ -734,46 +715,6 @@ void rcu_request_urgent_qs_task(struct task_struct *t)
smp_store_release(per_cpu_ptr(&rcu_data.rcu_urgent_qs, cpu), true);
}
-#if defined(CONFIG_PROVE_RCU) && defined(CONFIG_HOTPLUG_CPU)
-
-/*
- * Is the current CPU online as far as RCU is concerned?
- *
- * Disable preemption to avoid false positives that could otherwise
- * happen due to the current CPU number being sampled, this task being
- * preempted, its old CPU being taken offline, resuming on some other CPU,
- * then determining that its old CPU is now offline.
- *
- * Disable checking if in an NMI handler because we cannot safely
- * report errors from NMI handlers anyway. In addition, it is OK to use
- * RCU on an offline processor during initial boot, hence the check for
- * rcu_scheduler_fully_active.
- */
-bool rcu_lockdep_current_cpu_online(void)
-{
- struct rcu_data *rdp;
- bool ret = false;
-
- if (in_nmi() || !rcu_scheduler_fully_active)
- return true;
- preempt_disable_notrace();
- rdp = this_cpu_ptr(&rcu_data);
- /*
- * Strictly, we care here about the case where the current CPU is
- * in rcu_cpu_starting() and thus has an excuse for rdp->grpmask
- * not being up to date. So arch_spin_is_locked() might have a
- * false positive if it's held by some *other* CPU, but that's
- * OK because that just means a false *negative* on the warning.
- */
- if (rcu_rdp_cpu_online(rdp) || arch_spin_is_locked(&rcu_state.ofl_lock))
- ret = true;
- preempt_enable_notrace();
- return ret;
-}
-EXPORT_SYMBOL_GPL(rcu_lockdep_current_cpu_online);
-
-#endif /* #if defined(CONFIG_PROVE_RCU) && defined(CONFIG_HOTPLUG_CPU) */
-
/*
* When trying to report a quiescent state on behalf of some other CPU,
* it is our responsibility to check for and handle potential overflow
@@ -925,6 +866,24 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp)
rdp->rcu_iw_gp_seq = rnp->gp_seq;
irq_work_queue_on(&rdp->rcu_iw, rdp->cpu);
}
+
+ if (rcu_cpu_stall_cputime && rdp->snap_record.gp_seq != rdp->gp_seq) {
+ int cpu = rdp->cpu;
+ struct rcu_snap_record *rsrp;
+ struct kernel_cpustat *kcsp;
+
+ kcsp = &kcpustat_cpu(cpu);
+
+ rsrp = &rdp->snap_record;
+ rsrp->cputime_irq = kcpustat_field(kcsp, CPUTIME_IRQ, cpu);
+ rsrp->cputime_softirq = kcpustat_field(kcsp, CPUTIME_SOFTIRQ, cpu);
+ rsrp->cputime_system = kcpustat_field(kcsp, CPUTIME_SYSTEM, cpu);
+ rsrp->nr_hardirqs = kstat_cpu_irqs_sum(rdp->cpu);
+ rsrp->nr_softirqs = kstat_cpu_softirqs_sum(rdp->cpu);
+ rsrp->nr_csw = nr_context_switches_cpu(rdp->cpu);
+ rsrp->jiffies = jiffies;
+ rsrp->gp_seq = rdp->gp_seq;
+ }
}
return 0;
@@ -1350,19 +1309,12 @@ static void rcu_strict_gp_boundary(void *unused)
invoke_rcu_core();
}
-// Has rcu_init() been invoked? This is used (for example) to determine
-// whether spinlocks may be acquired safely.
-static bool rcu_init_invoked(void)
-{
- return !!rcu_state.n_online_cpus;
-}
-
// Make the polled API aware of the beginning of a grace period.
static void rcu_poll_gp_seq_start(unsigned long *snap)
{
struct rcu_node *rnp = rcu_get_root();
- if (rcu_init_invoked())
+ if (rcu_scheduler_active != RCU_SCHEDULER_INACTIVE)
raw_lockdep_assert_held_rcu_node(rnp);
// If RCU was idle, note beginning of GP.
@@ -1378,7 +1330,7 @@ static void rcu_poll_gp_seq_end(unsigned long *snap)
{
struct rcu_node *rnp = rcu_get_root();
- if (rcu_init_invoked())
+ if (rcu_scheduler_active != RCU_SCHEDULER_INACTIVE)
raw_lockdep_assert_held_rcu_node(rnp);
// If the previously noted GP is still in effect, record the
@@ -1401,7 +1353,8 @@ static void rcu_poll_gp_seq_start_unlocked(unsigned long *snap)
struct rcu_node *rnp = rcu_get_root();
if (rcu_init_invoked()) {
- lockdep_assert_irqs_enabled();
+ if (rcu_scheduler_active != RCU_SCHEDULER_INACTIVE)
+ lockdep_assert_irqs_enabled();
raw_spin_lock_irqsave_rcu_node(rnp, flags);
}
rcu_poll_gp_seq_start(snap);
@@ -1417,7 +1370,8 @@ static void rcu_poll_gp_seq_end_unlocked(unsigned long *snap)
struct rcu_node *rnp = rcu_get_root();
if (rcu_init_invoked()) {
- lockdep_assert_irqs_enabled();
+ if (rcu_scheduler_active != RCU_SCHEDULER_INACTIVE)
+ lockdep_assert_irqs_enabled();
raw_spin_lock_irqsave_rcu_node(rnp, flags);
}
rcu_poll_gp_seq_end(snap);
@@ -2090,92 +2044,6 @@ rcu_check_quiescent_state(struct rcu_data *rdp)
}
/*
- * Near the end of the offline process. Trace the fact that this CPU
- * is going offline.
- */
-int rcutree_dying_cpu(unsigned int cpu)
-{
- bool blkd;
- struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu);
- struct rcu_node *rnp = rdp->mynode;
-
- if (!IS_ENABLED(CONFIG_HOTPLUG_CPU))
- return 0;
-
- blkd = !!(READ_ONCE(rnp->qsmask) & rdp->grpmask);
- trace_rcu_grace_period(rcu_state.name, READ_ONCE(rnp->gp_seq),
- blkd ? TPS("cpuofl-bgp") : TPS("cpuofl"));
- return 0;
-}
-
-/*
- * All CPUs for the specified rcu_node structure have gone offline,
- * and all tasks that were preempted within an RCU read-side critical
- * section while running on one of those CPUs have since exited their RCU
- * read-side critical section. Some other CPU is reporting this fact with
- * the specified rcu_node structure's ->lock held and interrupts disabled.
- * This function therefore goes up the tree of rcu_node structures,
- * clearing the corresponding bits in the ->qsmaskinit fields. Note that
- * the leaf rcu_node structure's ->qsmaskinit field has already been
- * updated.
- *
- * This function does check that the specified rcu_node structure has
- * all CPUs offline and no blocked tasks, so it is OK to invoke it
- * prematurely. That said, invoking it after the fact will cost you
- * a needless lock acquisition. So once it has done its work, don't
- * invoke it again.
- */
-static void rcu_cleanup_dead_rnp(struct rcu_node *rnp_leaf)
-{
- long mask;
- struct rcu_node *rnp = rnp_leaf;
-
- raw_lockdep_assert_held_rcu_node(rnp_leaf);
- if (!IS_ENABLED(CONFIG_HOTPLUG_CPU) ||
- WARN_ON_ONCE(rnp_leaf->qsmaskinit) ||
- WARN_ON_ONCE(rcu_preempt_has_tasks(rnp_leaf)))
- return;
- for (;;) {
- mask = rnp->grpmask;
- rnp = rnp->parent;
- if (!rnp)
- break;
- raw_spin_lock_rcu_node(rnp); /* irqs already disabled. */
- rnp->qsmaskinit &= ~mask;
- /* Between grace periods, so better already be zero! */
- WARN_ON_ONCE(rnp->qsmask);
- if (rnp->qsmaskinit) {
- raw_spin_unlock_rcu_node(rnp);
- /* irqs remain disabled. */
- return;
- }
- raw_spin_unlock_rcu_node(rnp); /* irqs remain disabled. */
- }
-}
-
-/*
- * The CPU has been completely removed, and some other CPU is reporting
- * this fact from process context. Do the remainder of the cleanup.
- * There can only be one CPU hotplug operation at a time, so no need for
- * explicit locking.
- */
-int rcutree_dead_cpu(unsigned int cpu)
-{
- struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu);
- struct rcu_node *rnp = rdp->mynode; /* Outgoing CPU's rdp & rnp. */
-
- if (!IS_ENABLED(CONFIG_HOTPLUG_CPU))
- return 0;
-
- WRITE_ONCE(rcu_state.n_online_cpus, rcu_state.n_online_cpus - 1);
- /* Adjust any no-longer-needed kthreads. */
- rcu_boost_kthread_setaffinity(rnp, -1);
- // Stop-machine done, so allow nohz_full to disable tick.
- tick_dep_clear(TICK_DEP_BIT_RCU);
- return 0;
-}
-
-/*
* Invoke any RCU callbacks that have made it to the end of their grace
* period. Throttle as specified by rdp->blimit.
*/
@@ -2207,7 +2075,7 @@ static void rcu_do_batch(struct rcu_data *rdp)
*/
rcu_nocb_lock_irqsave(rdp, flags);
WARN_ON_ONCE(cpu_is_offline(smp_processor_id()));
- pending = rcu_segcblist_n_cbs(&rdp->cblist);
+ pending = rcu_segcblist_get_seglen(&rdp->cblist, RCU_DONE_TAIL);
div = READ_ONCE(rcu_divisor);
div = div < 0 ? 7 : div > sizeof(long) * 8 - 2 ? sizeof(long) * 8 - 2 : div;
bl = max(rdp->blimit, pending >> div);
@@ -2725,10 +2593,11 @@ static void check_cb_ovld(struct rcu_data *rdp)
}
static void
-__call_rcu_common(struct rcu_head *head, rcu_callback_t func, bool lazy)
+__call_rcu_common(struct rcu_head *head, rcu_callback_t func, bool lazy_in)
{
static atomic_t doublefrees;
unsigned long flags;
+ bool lazy;
struct rcu_data *rdp;
bool was_alldone;
@@ -2753,6 +2622,7 @@ __call_rcu_common(struct rcu_head *head, rcu_callback_t func, bool lazy)
kasan_record_aux_stack_noalloc(head);
local_irq_save(flags);
rdp = this_cpu_ptr(&rcu_data);
+ lazy = lazy_in && !rcu_async_should_hurry();
/* Add the callback to our list. */
if (unlikely(!rcu_segcblist_is_enabled(&rdp->cblist))) {
@@ -2874,13 +2744,15 @@ EXPORT_SYMBOL_GPL(call_rcu);
/**
* struct kvfree_rcu_bulk_data - single block to store kvfree_rcu() pointers
+ * @list: List node. All blocks are linked between each other
+ * @gp_snap: Snapshot of RCU state for objects placed to this bulk
* @nr_records: Number of active pointers in the array
- * @next: Next bulk object in the block chain
* @records: Array of the kvfree_rcu() pointers
*/
struct kvfree_rcu_bulk_data {
+ struct list_head list;
+ unsigned long gp_snap;
unsigned long nr_records;
- struct kvfree_rcu_bulk_data *next;
void *records[];
};
@@ -2896,26 +2768,28 @@ struct kvfree_rcu_bulk_data {
* struct kfree_rcu_cpu_work - single batch of kfree_rcu() requests
* @rcu_work: Let queue_rcu_work() invoke workqueue handler after grace period
* @head_free: List of kfree_rcu() objects waiting for a grace period
- * @bkvhead_free: Bulk-List of kvfree_rcu() objects waiting for a grace period
+ * @bulk_head_free: Bulk-List of kvfree_rcu() objects waiting for a grace period
* @krcp: Pointer to @kfree_rcu_cpu structure
*/
struct kfree_rcu_cpu_work {
struct rcu_work rcu_work;
struct rcu_head *head_free;
- struct kvfree_rcu_bulk_data *bkvhead_free[FREE_N_CHANNELS];
+ struct list_head bulk_head_free[FREE_N_CHANNELS];
struct kfree_rcu_cpu *krcp;
};
/**
* struct kfree_rcu_cpu - batch up kfree_rcu() requests for RCU grace period
* @head: List of kfree_rcu() objects not yet waiting for a grace period
- * @bkvhead: Bulk-List of kvfree_rcu() objects not yet waiting for a grace period
+ * @head_gp_snap: Snapshot of RCU state for objects placed to "@head"
+ * @bulk_head: Bulk-List of kvfree_rcu() objects not yet waiting for a grace period
* @krw_arr: Array of batches of kfree_rcu() objects waiting for a grace period
* @lock: Synchronize access to this structure
* @monitor_work: Promote @head to @head_free after KFREE_DRAIN_JIFFIES
* @initialized: The @rcu_work fields have been initialized
- * @count: Number of objects for which GP not started
+ * @head_count: Number of objects in rcu_head singular list
+ * @bulk_count: Number of objects in bulk-list
* @bkvcache:
* A simple cache list that contains objects for reuse purpose.
* In order to save some per-cpu space the list is singular.
@@ -2933,13 +2807,20 @@ struct kfree_rcu_cpu_work {
* the interactions with the slab allocators.
*/
struct kfree_rcu_cpu {
+ // Objects queued on a linked list
+ // through their rcu_head structures.
struct rcu_head *head;
- struct kvfree_rcu_bulk_data *bkvhead[FREE_N_CHANNELS];
+ unsigned long head_gp_snap;
+ atomic_t head_count;
+
+ // Objects queued on a bulk-list.
+ struct list_head bulk_head[FREE_N_CHANNELS];
+ atomic_t bulk_count[FREE_N_CHANNELS];
+
struct kfree_rcu_cpu_work krw_arr[KFREE_N_BATCHES];
raw_spinlock_t lock;
struct delayed_work monitor_work;
bool initialized;
- int count;
struct delayed_work page_cache_work;
atomic_t backoff_page_cache_fill;
@@ -3027,29 +2908,87 @@ drain_page_cache(struct kfree_rcu_cpu *krcp)
return freed;
}
+static void
+kvfree_rcu_bulk(struct kfree_rcu_cpu *krcp,
+ struct kvfree_rcu_bulk_data *bnode, int idx)
+{
+ unsigned long flags;
+ int i;
+
+ debug_rcu_bhead_unqueue(bnode);
+
+ rcu_lock_acquire(&rcu_callback_map);
+ if (idx == 0) { // kmalloc() / kfree().
+ trace_rcu_invoke_kfree_bulk_callback(
+ rcu_state.name, bnode->nr_records,
+ bnode->records);
+
+ kfree_bulk(bnode->nr_records, bnode->records);
+ } else { // vmalloc() / vfree().
+ for (i = 0; i < bnode->nr_records; i++) {
+ trace_rcu_invoke_kvfree_callback(
+ rcu_state.name, bnode->records[i], 0);
+
+ vfree(bnode->records[i]);
+ }
+ }
+ rcu_lock_release(&rcu_callback_map);
+
+ raw_spin_lock_irqsave(&krcp->lock, flags);
+ if (put_cached_bnode(krcp, bnode))
+ bnode = NULL;
+ raw_spin_unlock_irqrestore(&krcp->lock, flags);
+
+ if (bnode)
+ free_page((unsigned long) bnode);
+
+ cond_resched_tasks_rcu_qs();
+}
+
+static void
+kvfree_rcu_list(struct rcu_head *head)
+{
+ struct rcu_head *next;
+
+ for (; head; head = next) {
+ void *ptr = (void *) head->func;
+ unsigned long offset = (void *) head - ptr;
+
+ next = head->next;
+ debug_rcu_head_unqueue((struct rcu_head *)ptr);
+ rcu_lock_acquire(&rcu_callback_map);
+ trace_rcu_invoke_kvfree_callback(rcu_state.name, head, offset);
+
+ if (!WARN_ON_ONCE(!__is_kvfree_rcu_offset(offset)))
+ kvfree(ptr);
+
+ rcu_lock_release(&rcu_callback_map);
+ cond_resched_tasks_rcu_qs();
+ }
+}
+
/*
* This function is invoked in workqueue context after a grace period.
- * It frees all the objects queued on ->bkvhead_free or ->head_free.
+ * It frees all the objects queued on ->bulk_head_free or ->head_free.
*/
static void kfree_rcu_work(struct work_struct *work)
{
unsigned long flags;
- struct kvfree_rcu_bulk_data *bkvhead[FREE_N_CHANNELS], *bnext;
- struct rcu_head *head, *next;
+ struct kvfree_rcu_bulk_data *bnode, *n;
+ struct list_head bulk_head[FREE_N_CHANNELS];
+ struct rcu_head *head;
struct kfree_rcu_cpu *krcp;
struct kfree_rcu_cpu_work *krwp;
- int i, j;
+ int i;
krwp = container_of(to_rcu_work(work),
- struct kfree_rcu_cpu_work, rcu_work);
+ struct kfree_rcu_cpu_work, rcu_work);
krcp = krwp->krcp;
raw_spin_lock_irqsave(&krcp->lock, flags);
// Channels 1 and 2.
- for (i = 0; i < FREE_N_CHANNELS; i++) {
- bkvhead[i] = krwp->bkvhead_free[i];
- krwp->bkvhead_free[i] = NULL;
- }
+ for (i = 0; i < FREE_N_CHANNELS; i++)
+ list_replace_init(&krwp->bulk_head_free[i], &bulk_head[i]);
// Channel 3.
head = krwp->head_free;
@@ -3058,39 +2997,9 @@ static void kfree_rcu_work(struct work_struct *work)
// Handle the first two channels.
for (i = 0; i < FREE_N_CHANNELS; i++) {
- for (; bkvhead[i]; bkvhead[i] = bnext) {
- bnext = bkvhead[i]->next;
- debug_rcu_bhead_unqueue(bkvhead[i]);
-
- rcu_lock_acquire(&rcu_callback_map);
- if (i == 0) { // kmalloc() / kfree().
- trace_rcu_invoke_kfree_bulk_callback(
- rcu_state.name, bkvhead[i]->nr_records,
- bkvhead[i]->records);
-
- kfree_bulk(bkvhead[i]->nr_records,
- bkvhead[i]->records);
- } else { // vmalloc() / vfree().
- for (j = 0; j < bkvhead[i]->nr_records; j++) {
- trace_rcu_invoke_kvfree_callback(
- rcu_state.name,
- bkvhead[i]->records[j], 0);
-
- vfree(bkvhead[i]->records[j]);
- }
- }
- rcu_lock_release(&rcu_callback_map);
-
- raw_spin_lock_irqsave(&krcp->lock, flags);
- if (put_cached_bnode(krcp, bkvhead[i]))
- bkvhead[i] = NULL;
- raw_spin_unlock_irqrestore(&krcp->lock, flags);
-
- if (bkvhead[i])
- free_page((unsigned long) bkvhead[i]);
-
- cond_resched_tasks_rcu_qs();
- }
+ // Start from the tail page, so a GP is likely passed for it.
+ list_for_each_entry_safe(bnode, n, &bulk_head[i], list)
+ kvfree_rcu_bulk(krcp, bnode, i);
}
/*
@@ -3100,21 +3009,7 @@ static void kfree_rcu_work(struct work_struct *work)
* queued on a linked list through their rcu_head structures.
* This list is named "Channel 3".
*/
- for (; head; head = next) {
- unsigned long offset = (unsigned long)head->func;
- void *ptr = (void *)head - offset;
-
- next = head->next;
- debug_rcu_head_unqueue((struct rcu_head *)ptr);
- rcu_lock_acquire(&rcu_callback_map);
- trace_rcu_invoke_kvfree_callback(rcu_state.name, head, offset);
-
- if (!WARN_ON_ONCE(!__is_kvfree_rcu_offset(offset)))
- kvfree(ptr);
-
- rcu_lock_release(&rcu_callback_map);
- cond_resched_tasks_rcu_qs();
- }
+ kvfree_rcu_list(head);
}
static bool
@@ -3123,10 +3018,21 @@ need_offload_krc(struct kfree_rcu_cpu *krcp)
int i;
for (i = 0; i < FREE_N_CHANNELS; i++)
- if (krcp->bkvhead[i])
+ if (!list_empty(&krcp->bulk_head[i]))
return true;
- return !!krcp->head;
+ return !!READ_ONCE(krcp->head);
+}
+
+static int krc_count(struct kfree_rcu_cpu *krcp)
+{
+ int sum = atomic_read(&krcp->head_count);
+ int i;
+
+ for (i = 0; i < FREE_N_CHANNELS; i++)
+ sum += atomic_read(&krcp->bulk_count[i]);
+
+ return sum;
}
static void
@@ -3134,7 +3040,7 @@ schedule_delayed_monitor_work(struct kfree_rcu_cpu *krcp)
{
long delay, delay_left;
- delay = READ_ONCE(krcp->count) >= KVFREE_BULK_MAX_ENTR ? 1:KFREE_DRAIN_JIFFIES;
+ delay = krc_count(krcp) >= KVFREE_BULK_MAX_ENTR ? 1:KFREE_DRAIN_JIFFIES;
if (delayed_work_pending(&krcp->monitor_work)) {
delay_left = krcp->monitor_work.timer.expires - jiffies;
if (delay < delay_left)
@@ -3144,6 +3050,44 @@ schedule_delayed_monitor_work(struct kfree_rcu_cpu *krcp)
queue_delayed_work(system_wq, &krcp->monitor_work, delay);
}
+static void
+kvfree_rcu_drain_ready(struct kfree_rcu_cpu *krcp)
+{
+ struct list_head bulk_ready[FREE_N_CHANNELS];
+ struct kvfree_rcu_bulk_data *bnode, *n;
+ struct rcu_head *head_ready = NULL;
+ unsigned long flags;
+ int i;
+
+ raw_spin_lock_irqsave(&krcp->lock, flags);
+ for (i = 0; i < FREE_N_CHANNELS; i++) {
+ INIT_LIST_HEAD(&bulk_ready[i]);
+
+ list_for_each_entry_safe_reverse(bnode, n, &krcp->bulk_head[i], list) {
+ if (!poll_state_synchronize_rcu(bnode->gp_snap))
+ break;
+
+ atomic_sub(bnode->nr_records, &krcp->bulk_count[i]);
+ list_move(&bnode->list, &bulk_ready[i]);
+ }
+ }
+
+ if (krcp->head && poll_state_synchronize_rcu(krcp->head_gp_snap)) {
+ head_ready = krcp->head;
+ atomic_set(&krcp->head_count, 0);
+ WRITE_ONCE(krcp->head, NULL);
+ }
+ raw_spin_unlock_irqrestore(&krcp->lock, flags);
+
+ for (i = 0; i < FREE_N_CHANNELS; i++) {
+ list_for_each_entry_safe(bnode, n, &bulk_ready[i], list)
+ kvfree_rcu_bulk(krcp, bnode, i);
+ }
+
+ if (head_ready)
+ kvfree_rcu_list(head_ready);
+}
+
/*
* This function is invoked after the KFREE_DRAIN_JIFFIES timeout.
*/
@@ -3154,26 +3098,31 @@ static void kfree_rcu_monitor(struct work_struct *work)
unsigned long flags;
int i, j;
+ // Drain ready for reclaim.
+ kvfree_rcu_drain_ready(krcp);
+
raw_spin_lock_irqsave(&krcp->lock, flags);
// Attempt to start a new batch.
for (i = 0; i < KFREE_N_BATCHES; i++) {
struct kfree_rcu_cpu_work *krwp = &(krcp->krw_arr[i]);
- // Try to detach bkvhead or head and attach it over any
+ // Try to detach bulk_head or head and attach it over any
// available corresponding free channel. It can be that
// a previous RCU batch is in progress, it means that
// immediately to queue another one is not possible so
// in that case the monitor work is rearmed.
- if ((krcp->bkvhead[0] && !krwp->bkvhead_free[0]) ||
- (krcp->bkvhead[1] && !krwp->bkvhead_free[1]) ||
- (krcp->head && !krwp->head_free)) {
+ if ((!list_empty(&krcp->bulk_head[0]) && list_empty(&krwp->bulk_head_free[0])) ||
+ (!list_empty(&krcp->bulk_head[1]) && list_empty(&krwp->bulk_head_free[1])) ||
+ (READ_ONCE(krcp->head) && !krwp->head_free)) {
+
// Channel 1 corresponds to the SLAB-pointer bulk path.
// Channel 2 corresponds to vmalloc-pointer bulk path.
for (j = 0; j < FREE_N_CHANNELS; j++) {
- if (!krwp->bkvhead_free[j]) {
- krwp->bkvhead_free[j] = krcp->bkvhead[j];
- krcp->bkvhead[j] = NULL;
+ if (list_empty(&krwp->bulk_head_free[j])) {
+ atomic_set(&krcp->bulk_count[j], 0);
+ list_replace_init(&krcp->bulk_head[j],
+ &krwp->bulk_head_free[j]);
}
}
@@ -3181,11 +3130,10 @@ static void kfree_rcu_monitor(struct work_struct *work)
// objects queued on the linked list.
if (!krwp->head_free) {
krwp->head_free = krcp->head;
- krcp->head = NULL;
+ atomic_set(&krcp->head_count, 0);
+ WRITE_ONCE(krcp->head, NULL);
}
- WRITE_ONCE(krcp->count, 0);
-
// One work is per one batch, so there are three
// "free channels", the batch can handle. It can
// be that the work is in the pending state when
@@ -3195,6 +3143,8 @@ static void kfree_rcu_monitor(struct work_struct *work)
}
}
+ raw_spin_unlock_irqrestore(&krcp->lock, flags);
+
// If there is nothing to detach, it means that our job is
// successfully done here. In case of having at least one
// of the channels that is still busy we should rearm the
@@ -3202,8 +3152,6 @@ static void kfree_rcu_monitor(struct work_struct *work)
// still in progress.
if (need_offload_krc(krcp))
schedule_delayed_monitor_work(krcp);
-
- raw_spin_unlock_irqrestore(&krcp->lock, flags);
}
static enum hrtimer_restart
@@ -3286,10 +3234,11 @@ add_ptr_to_bulk_krc_lock(struct kfree_rcu_cpu **krcp,
return false;
idx = !!is_vmalloc_addr(ptr);
+ bnode = list_first_entry_or_null(&(*krcp)->bulk_head[idx],
+ struct kvfree_rcu_bulk_data, list);
/* Check if a new block is required. */
- if (!(*krcp)->bkvhead[idx] ||
- (*krcp)->bkvhead[idx]->nr_records == KVFREE_BULK_MAX_ENTR) {
+ if (!bnode || bnode->nr_records == KVFREE_BULK_MAX_ENTR) {
bnode = get_cached_bnode(*krcp);
if (!bnode && can_alloc) {
krc_this_cpu_unlock(*krcp, *flags);
@@ -3313,17 +3262,15 @@ add_ptr_to_bulk_krc_lock(struct kfree_rcu_cpu **krcp,
if (!bnode)
return false;
- /* Initialize the new block. */
+ // Initialize the new block and attach it.
bnode->nr_records = 0;
- bnode->next = (*krcp)->bkvhead[idx];
-
- /* Attach it to the head. */
- (*krcp)->bkvhead[idx] = bnode;
+ list_add(&bnode->list, &(*krcp)->bulk_head[idx]);
}
- /* Finally insert. */
- (*krcp)->bkvhead[idx]->records
- [(*krcp)->bkvhead[idx]->nr_records++] = ptr;
+ // Finally insert and update the GP for this page.
+ bnode->records[bnode->nr_records++] = ptr;
+ bnode->gp_snap = get_state_synchronize_rcu();
+ atomic_inc(&(*krcp)->bulk_count[idx]);
return true;
}
@@ -3340,26 +3287,21 @@ add_ptr_to_bulk_krc_lock(struct kfree_rcu_cpu **krcp,
* be free'd in workqueue context. This allows us to: batch requests together to
* reduce the number of grace periods during heavy kfree_rcu()/kvfree_rcu() load.
*/
-void kvfree_call_rcu(struct rcu_head *head, rcu_callback_t func)
+void kvfree_call_rcu(struct rcu_head *head, void *ptr)
{
unsigned long flags;
struct kfree_rcu_cpu *krcp;
bool success;
- void *ptr;
- if (head) {
- ptr = (void *) head - (unsigned long) func;
- } else {
- /*
- * Please note there is a limitation for the head-less
- * variant, that is why there is a clear rule for such
- * objects: it can be used from might_sleep() context
- * only. For other places please embed an rcu_head to
- * your data.
- */
+ /*
+ * Please note there is a limitation for the head-less
+ * variant, that is why there is a clear rule for such
+ * objects: it can be used from might_sleep() context
+ * only. For other places please embed an rcu_head to
+ * your data.
+ */
+ if (!head)
might_sleep();
- ptr = (unsigned long *) func;
- }
// Queue the object but don't yet schedule the batch.
if (debug_rcu_head_queue(ptr)) {
@@ -3380,14 +3322,16 @@ void kvfree_call_rcu(struct rcu_head *head, rcu_callback_t func)
// Inline if kvfree_rcu(one_arg) call.
goto unlock_return;
- head->func = func;
+ head->func = ptr;
head->next = krcp->head;
- krcp->head = head;
+ WRITE_ONCE(krcp->head, head);
+ atomic_inc(&krcp->head_count);
+
+ // Take a snapshot for this krcp.
+ krcp->head_gp_snap = get_state_synchronize_rcu();
success = true;
}
- WRITE_ONCE(krcp->count, krcp->count + 1);
-
// Set timer to drain after KFREE_DRAIN_JIFFIES.
if (rcu_scheduler_active == RCU_SCHEDULER_RUNNING)
schedule_delayed_monitor_work(krcp);
@@ -3418,7 +3362,7 @@ kfree_rcu_shrink_count(struct shrinker *shrink, struct shrink_control *sc)
for_each_possible_cpu(cpu) {
struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu);
- count += READ_ONCE(krcp->count);
+ count += krc_count(krcp);
count += READ_ONCE(krcp->nr_bkv_objs);
atomic_set(&krcp->backoff_page_cache_fill, 1);
}
@@ -3435,7 +3379,7 @@ kfree_rcu_shrink_scan(struct shrinker *shrink, struct shrink_control *sc)
int count;
struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu);
- count = krcp->count;
+ count = krc_count(krcp);
count += drain_page_cache(krcp);
kfree_rcu_monitor(&krcp->monitor_work.work);
@@ -3459,15 +3403,12 @@ static struct shrinker kfree_rcu_shrinker = {
void __init kfree_rcu_scheduler_running(void)
{
int cpu;
- unsigned long flags;
for_each_possible_cpu(cpu) {
struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu);
- raw_spin_lock_irqsave(&krcp->lock, flags);
if (need_offload_krc(krcp))
schedule_delayed_monitor_work(krcp);
- raw_spin_unlock_irqrestore(&krcp->lock, flags);
}
}
@@ -3483,9 +3424,10 @@ void __init kfree_rcu_scheduler_running(void)
*/
static int rcu_blocking_is_gp(void)
{
- if (rcu_scheduler_active != RCU_SCHEDULER_INACTIVE)
+ if (rcu_scheduler_active != RCU_SCHEDULER_INACTIVE) {
+ might_sleep();
return false;
- might_sleep(); /* Check for RCU read-side critical section. */
+ }
return true;
}
@@ -3709,7 +3651,9 @@ EXPORT_SYMBOL_GPL(start_poll_synchronize_rcu_full);
* If @false is returned, it is the caller's responsibility to invoke this
* function later on until it does return @true. Alternatively, the caller
* can explicitly wait for a grace period, for example, by passing @oldstate
- * to cond_synchronize_rcu() or by directly invoking synchronize_rcu().
+ * to either cond_synchronize_rcu() or cond_synchronize_rcu_expedited()
+ * on the one hand or by directly invoking either synchronize_rcu() or
+ * synchronize_rcu_expedited() on the other.
*
* Yes, this function does not take counter wrap into account.
* But counter wrap is harmless. If the counter wraps, we have waited for
@@ -3720,6 +3664,12 @@ EXPORT_SYMBOL_GPL(start_poll_synchronize_rcu_full);
* completed. Alternatively, they can use get_completed_synchronize_rcu()
* to get a guaranteed-completed grace-period state.
*
+ * In addition, because oldstate compresses the grace-period state for
+ * both normal and expedited grace periods into a single unsigned long,
+ * it can miss a grace period when synchronize_rcu() runs concurrently
+ * with synchronize_rcu_expedited(). If this is unacceptable, please
+ * instead use the _full() variant of these polling APIs.
+ *
* This function provides the same memory-ordering guarantees that
* would be provided by a synchronize_rcu() that was invoked at the call
* to the function that provided @oldstate, and that returned at the end
@@ -4078,6 +4028,155 @@ retry:
EXPORT_SYMBOL_GPL(rcu_barrier);
/*
+ * Compute the mask of online CPUs for the specified rcu_node structure.
+ * This will not be stable unless the rcu_node structure's ->lock is
+ * held, but the bit corresponding to the current CPU will be stable
+ * in most contexts.
+ */
+static unsigned long rcu_rnp_online_cpus(struct rcu_node *rnp)
+{
+ return READ_ONCE(rnp->qsmaskinitnext);
+}
+
+/*
+ * Is the CPU corresponding to the specified rcu_data structure online
+ * from RCU's perspective? This perspective is given by that structure's
+ * ->qsmaskinitnext field rather than by the global cpu_online_mask.
+ */
+static bool rcu_rdp_cpu_online(struct rcu_data *rdp)
+{
+ return !!(rdp->grpmask & rcu_rnp_online_cpus(rdp->mynode));
+}
+
+#if defined(CONFIG_PROVE_RCU) && defined(CONFIG_HOTPLUG_CPU)
+
+/*
+ * Is the current CPU online as far as RCU is concerned?
+ *
+ * Disable preemption to avoid false positives that could otherwise
+ * happen due to the current CPU number being sampled, this task being
+ * preempted, its old CPU being taken offline, resuming on some other CPU,
+ * then determining that its old CPU is now offline.
+ *
+ * Disable checking if in an NMI handler because we cannot safely
+ * report errors from NMI handlers anyway. In addition, it is OK to use
+ * RCU on an offline processor during initial boot, hence the check for
+ * rcu_scheduler_fully_active.
+ */
+bool rcu_lockdep_current_cpu_online(void)
+{
+ struct rcu_data *rdp;
+ bool ret = false;
+
+ if (in_nmi() || !rcu_scheduler_fully_active)
+ return true;
+ preempt_disable_notrace();
+ rdp = this_cpu_ptr(&rcu_data);
+ /*
+ * Strictly, we care here about the case where the current CPU is
+ * in rcu_cpu_starting() and thus has an excuse for rdp->grpmask
+ * not being up to date. So arch_spin_is_locked() might have a
+ * false positive if it's held by some *other* CPU, but that's
+ * OK because that just means a false *negative* on the warning.
+ */
+ if (rcu_rdp_cpu_online(rdp) || arch_spin_is_locked(&rcu_state.ofl_lock))
+ ret = true;
+ preempt_enable_notrace();
+ return ret;
+}
+EXPORT_SYMBOL_GPL(rcu_lockdep_current_cpu_online);
+
+#endif /* #if defined(CONFIG_PROVE_RCU) && defined(CONFIG_HOTPLUG_CPU) */
+
+// Has rcu_init() been invoked? This is used (for example) to determine
+// whether spinlocks may be acquired safely.
+static bool rcu_init_invoked(void)
+{
+ return !!rcu_state.n_online_cpus;
+}
+
+/*
+ * Near the end of the offline process. Trace the fact that this CPU
+ * is going offline.
+ */
+int rcutree_dying_cpu(unsigned int cpu)
+{
+ bool blkd;
+ struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu);
+ struct rcu_node *rnp = rdp->mynode;
+
+ if (!IS_ENABLED(CONFIG_HOTPLUG_CPU))
+ return 0;
+
+ blkd = !!(READ_ONCE(rnp->qsmask) & rdp->grpmask);
+ trace_rcu_grace_period(rcu_state.name, READ_ONCE(rnp->gp_seq),
+ blkd ? TPS("cpuofl-bgp") : TPS("cpuofl"));
+ return 0;
+}
+
+/*
+ * All CPUs for the specified rcu_node structure have gone offline,
+ * and all tasks that were preempted within an RCU read-side critical
+ * section while running on one of those CPUs have since exited their RCU
+ * read-side critical section. Some other CPU is reporting this fact with
+ * the specified rcu_node structure's ->lock held and interrupts disabled.
+ * This function therefore goes up the tree of rcu_node structures,
+ * clearing the corresponding bits in the ->qsmaskinit fields. Note that
+ * the leaf rcu_node structure's ->qsmaskinit field has already been
+ * updated.
+ *
+ * This function does check that the specified rcu_node structure has
+ * all CPUs offline and no blocked tasks, so it is OK to invoke it
+ * prematurely. That said, invoking it after the fact will cost you
+ * a needless lock acquisition. So once it has done its work, don't
+ * invoke it again.
+ */
+static void rcu_cleanup_dead_rnp(struct rcu_node *rnp_leaf)
+{
+ long mask;
+ struct rcu_node *rnp = rnp_leaf;
+
+ raw_lockdep_assert_held_rcu_node(rnp_leaf);
+ if (!IS_ENABLED(CONFIG_HOTPLUG_CPU) ||
+ WARN_ON_ONCE(rnp_leaf->qsmaskinit) ||
+ WARN_ON_ONCE(rcu_preempt_has_tasks(rnp_leaf)))
+ return;
+ for (;;) {
+ mask = rnp->grpmask;
+ rnp = rnp->parent;
+ if (!rnp)
+ break;
+ raw_spin_lock_rcu_node(rnp); /* irqs already disabled. */
+ rnp->qsmaskinit &= ~mask;
+ /* Between grace periods, so better already be zero! */
+ WARN_ON_ONCE(rnp->qsmask);
+ if (rnp->qsmaskinit) {
+ raw_spin_unlock_rcu_node(rnp);
+ /* irqs remain disabled. */
+ return;
+ }
+ raw_spin_unlock_rcu_node(rnp); /* irqs remain disabled. */
+ }
+}
+
+/*
+ * The CPU has been completely removed, and some other CPU is reporting
+ * this fact from process context. Do the remainder of the cleanup.
+ * There can only be one CPU hotplug operation at a time, so no need for
+ * explicit locking.
+ */
+int rcutree_dead_cpu(unsigned int cpu)
+{
+ if (!IS_ENABLED(CONFIG_HOTPLUG_CPU))
+ return 0;
+
+ WRITE_ONCE(rcu_state.n_online_cpus, rcu_state.n_online_cpus - 1);
+ // Stop-machine done, so allow nohz_full to disable tick.
+ tick_dep_clear(TICK_DEP_BIT_RCU);
+ return 0;
+}
+
+/*
* Propagate ->qsinitmask bits up the rcu_node tree to account for the
* first CPU in a given leaf rcu_node structure coming online. The caller
* must hold the corresponding leaf rcu_node ->lock with interrupts
@@ -4406,11 +4505,13 @@ static int rcu_pm_notify(struct notifier_block *self,
switch (action) {
case PM_HIBERNATION_PREPARE:
case PM_SUSPEND_PREPARE:
+ rcu_async_hurry();
rcu_expedite_gp();
break;
case PM_POST_HIBERNATION:
case PM_POST_SUSPEND:
rcu_unexpedite_gp();
+ rcu_async_relax();
break;
default:
break;
@@ -4764,7 +4865,7 @@ struct workqueue_struct *rcu_gp_wq;
static void __init kfree_rcu_batch_init(void)
{
int cpu;
- int i;
+ int i, j;
/* Clamp it to [0:100] seconds interval. */
if (rcu_delay_page_cache_fill_msec < 0 ||
@@ -4784,8 +4885,14 @@ static void __init kfree_rcu_batch_init(void)
for (i = 0; i < KFREE_N_BATCHES; i++) {
INIT_RCU_WORK(&krcp->krw_arr[i].rcu_work, kfree_rcu_work);
krcp->krw_arr[i].krcp = krcp;
+
+ for (j = 0; j < FREE_N_CHANNELS; j++)
+ INIT_LIST_HEAD(&krcp->krw_arr[i].bulk_head_free[j]);
}
+ for (i = 0; i < FREE_N_CHANNELS; i++)
+ INIT_LIST_HEAD(&krcp->bulk_head[i]);
+
INIT_DELAYED_WORK(&krcp->monitor_work, kfree_rcu_monitor);
INIT_DELAYED_WORK(&krcp->page_cache_work, fill_page_cache_func);
krcp->initialized = true;
@@ -4836,6 +4943,8 @@ void __init rcu_init(void)
// Kick-start any polled grace periods that started early.
if (!(per_cpu_ptr(&rcu_data, cpu)->mynode->exp_seq_poll_rq & 0x1))
(void)start_poll_synchronize_rcu_expedited();
+
+ rcu_test_sync_prims();
}
#include "tree_stall.h"
diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h
index fcb5d696eb17..192536916f9a 100644
--- a/kernel/rcu/tree.h
+++ b/kernel/rcu/tree.h
@@ -158,6 +158,23 @@ union rcu_noqs {
u16 s; /* Set of bits, aggregate OR here. */
};
+/*
+ * Record the snapshot of the core stats at half of the first RCU stall timeout.
+ * The member gp_seq is used to ensure that all members are updated only once
+ * during the sampling period. The snapshot is taken only if this gp_seq is not
+ * equal to rdp->gp_seq.
+ */
+struct rcu_snap_record {
+ unsigned long gp_seq; /* Track rdp->gp_seq counter */
+ u64 cputime_irq; /* Accumulated cputime of hard irqs */
+ u64 cputime_softirq;/* Accumulated cputime of soft irqs */
+ u64 cputime_system; /* Accumulated cputime of kernel tasks */
+ unsigned long nr_hardirqs; /* Accumulated number of hard irqs */
+ unsigned int nr_softirqs; /* Accumulated number of soft irqs */
+ unsigned long long nr_csw; /* Accumulated number of task switches */
+ unsigned long jiffies; /* Track jiffies value */
+};
+
/* Per-CPU data for read-copy update. */
struct rcu_data {
/* 1) quiescent-state and grace-period handling : */
@@ -262,6 +279,8 @@ struct rcu_data {
short rcu_onl_gp_flags; /* ->gp_flags at last online. */
unsigned long last_fqs_resched; /* Time of last rcu_resched(). */
unsigned long last_sched_clock; /* Jiffies of last rcu_sched_clock_irq(). */
+ struct rcu_snap_record snap_record; /* Snapshot of core stats at half of */
+ /* the first RCU stall timeout */
long lazy_len; /* Length of buffered lazy callbacks. */
int cpu;
diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h
index ed6c3cce28f2..249c2967d9e6 100644
--- a/kernel/rcu/tree_exp.h
+++ b/kernel/rcu/tree_exp.h
@@ -11,6 +11,7 @@
static void rcu_exp_handler(void *unused);
static int rcu_print_task_exp_stall(struct rcu_node *rnp);
+static void rcu_exp_print_detail_task_stall_rnp(struct rcu_node *rnp);
/*
* Record the start of an expedited grace period.
@@ -667,8 +668,11 @@ static void synchronize_rcu_expedited_wait(void)
mask = leaf_node_cpu_bit(rnp, cpu);
if (!(READ_ONCE(rnp->expmask) & mask))
continue;
+ preempt_disable(); // For smp_processor_id() in dump_cpu_task().
dump_cpu_task(cpu);
+ preempt_enable();
}
+ rcu_exp_print_detail_task_stall_rnp(rnp);
}
jiffies_stall = 3 * rcu_exp_jiffies_till_stall_check() + 3;
panic_on_rcu_stall();
@@ -811,6 +815,36 @@ static int rcu_print_task_exp_stall(struct rcu_node *rnp)
return ndetected;
}
+/*
+ * Scan the current list of tasks blocked within RCU read-side critical
+ * sections, dumping the stack of each that is blocking the current
+ * expedited grace period.
+ */
+static void rcu_exp_print_detail_task_stall_rnp(struct rcu_node *rnp)
+{
+ unsigned long flags;
+ struct task_struct *t;
+
+ if (!rcu_exp_stall_task_details)
+ return;
+ raw_spin_lock_irqsave_rcu_node(rnp, flags);
+ if (!READ_ONCE(rnp->exp_tasks)) {
+ raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
+ return;
+ }
+ t = list_entry(rnp->exp_tasks->prev,
+ struct task_struct, rcu_node_entry);
+ list_for_each_entry_continue(t, &rnp->blkd_tasks, rcu_node_entry) {
+ /*
+ * We could be printing a lot while holding a spinlock.
+ * Avoid triggering hard lockup.
+ */
+ touch_nmi_watchdog();
+ sched_show_task(t);
+ }
+ raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
+}
+
#else /* #ifdef CONFIG_PREEMPT_RCU */
/* Request an expedited quiescent state. */
@@ -883,6 +917,15 @@ static int rcu_print_task_exp_stall(struct rcu_node *rnp)
return 0;
}
+/*
+ * Because preemptible RCU does not exist, we never have to print out
+ * tasks blocked within RCU read-side critical sections that are blocking
+ * the current expedited grace period.
+ */
+static void rcu_exp_print_detail_task_stall_rnp(struct rcu_node *rnp)
+{
+}
+
#endif /* #else #ifdef CONFIG_PREEMPT_RCU */
/**
diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h
index 5653560573e2..b10b8349bb2a 100644
--- a/kernel/rcu/tree_stall.h
+++ b/kernel/rcu/tree_stall.h
@@ -39,7 +39,7 @@ int rcu_exp_jiffies_till_stall_check(void)
// CONFIG_RCU_EXP_CPU_STALL_TIMEOUT, so check the allowed range.
// The minimum clamped value is "2UL", because at least one full
// tick has to be guaranteed.
- till_stall_check = clamp(msecs_to_jiffies(cpu_stall_timeout), 2UL, 21UL * HZ);
+ till_stall_check = clamp(msecs_to_jiffies(cpu_stall_timeout), 2UL, 300UL * HZ);
if (cpu_stall_timeout && jiffies_to_msecs(till_stall_check) != cpu_stall_timeout)
WRITE_ONCE(rcu_exp_cpu_stall_timeout, jiffies_to_msecs(till_stall_check));
@@ -428,6 +428,35 @@ static bool rcu_is_rcuc_kthread_starving(struct rcu_data *rdp, unsigned long *jp
return j > 2 * HZ;
}
+static void print_cpu_stat_info(int cpu)
+{
+ struct rcu_snap_record rsr, *rsrp;
+ struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu);
+ struct kernel_cpustat *kcsp = &kcpustat_cpu(cpu);
+
+ if (!rcu_cpu_stall_cputime)
+ return;
+
+ rsrp = &rdp->snap_record;
+ if (rsrp->gp_seq != rdp->gp_seq)
+ return;
+
+ rsr.cputime_irq = kcpustat_field(kcsp, CPUTIME_IRQ, cpu);
+ rsr.cputime_softirq = kcpustat_field(kcsp, CPUTIME_SOFTIRQ, cpu);
+ rsr.cputime_system = kcpustat_field(kcsp, CPUTIME_SYSTEM, cpu);
+
+ pr_err("\t hardirqs softirqs csw/system\n");
+ pr_err("\t number: %8ld %10d %12lld\n",
+ kstat_cpu_irqs_sum(cpu) - rsrp->nr_hardirqs,
+ kstat_cpu_softirqs_sum(cpu) - rsrp->nr_softirqs,
+ nr_context_switches_cpu(cpu) - rsrp->nr_csw);
+ pr_err("\tcputime: %8lld %10lld %12lld ==> %d(ms)\n",
+ div_u64(rsr.cputime_irq - rsrp->cputime_irq, NSEC_PER_MSEC),
+ div_u64(rsr.cputime_softirq - rsrp->cputime_softirq, NSEC_PER_MSEC),
+ div_u64(rsr.cputime_system - rsrp->cputime_system, NSEC_PER_MSEC),
+ jiffies_to_msecs(jiffies - rsrp->jiffies));
+}
+
/*
* Print out diagnostic information for the specified stalled CPU.
*
@@ -484,6 +513,8 @@ static void print_cpu_stall_info(int cpu)
data_race(rcu_state.n_force_qs) - rcu_state.n_force_qs_gpstart,
rcuc_starved ? buf : "",
falsepositive ? " (false positive?)" : "");
+
+ print_cpu_stat_info(cpu);
}
/* Complain about starvation of grace-period kthread. */
@@ -588,7 +619,7 @@ static void print_other_cpu_stall(unsigned long gp_seq, unsigned long gps)
for_each_possible_cpu(cpu)
totqlen += rcu_get_n_cbs_cpu(cpu);
- pr_cont("\t(detected by %d, t=%ld jiffies, g=%ld, q=%lu ncpus=%d)\n",
+ pr_err("\t(detected by %d, t=%ld jiffies, g=%ld, q=%lu ncpus=%d)\n",
smp_processor_id(), (long)(jiffies - gps),
(long)rcu_seq_current(&rcu_state.gp_seq), totqlen, rcu_state.n_online_cpus);
if (ndetected) {
@@ -649,7 +680,7 @@ static void print_cpu_stall(unsigned long gps)
raw_spin_unlock_irqrestore_rcu_node(rdp->mynode, flags);
for_each_possible_cpu(cpu)
totqlen += rcu_get_n_cbs_cpu(cpu);
- pr_cont("\t(t=%lu jiffies g=%ld q=%lu ncpus=%d)\n",
+ pr_err("\t(t=%lu jiffies g=%ld q=%lu ncpus=%d)\n",
jiffies - gps,
(long)rcu_seq_current(&rcu_state.gp_seq), totqlen, rcu_state.n_online_cpus);
diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c
index f5e6a2f95a2a..19bf6fa3ee6a 100644
--- a/kernel/rcu/update.c
+++ b/kernel/rcu/update.c
@@ -144,8 +144,45 @@ bool rcu_gp_is_normal(void)
}
EXPORT_SYMBOL_GPL(rcu_gp_is_normal);
-static atomic_t rcu_expedited_nesting = ATOMIC_INIT(1);
+static atomic_t rcu_async_hurry_nesting = ATOMIC_INIT(1);
+/*
+ * Should call_rcu() callbacks be processed with urgency or are
+ * they OK being executed with arbitrary delays?
+ */
+bool rcu_async_should_hurry(void)
+{
+ return !IS_ENABLED(CONFIG_RCU_LAZY) ||
+ atomic_read(&rcu_async_hurry_nesting);
+}
+EXPORT_SYMBOL_GPL(rcu_async_should_hurry);
+
+/**
+ * rcu_async_hurry - Make future async RCU callbacks not lazy.
+ *
+ * After a call to this function, future calls to call_rcu()
+ * will be processed in a timely fashion.
+ */
+void rcu_async_hurry(void)
+{
+ if (IS_ENABLED(CONFIG_RCU_LAZY))
+ atomic_inc(&rcu_async_hurry_nesting);
+}
+EXPORT_SYMBOL_GPL(rcu_async_hurry);
+/**
+ * rcu_async_relax - Make future async RCU callbacks lazy.
+ *
+ * After a call to this function, future calls to call_rcu()
+ * will be processed in a lazy fashion.
+ */
+void rcu_async_relax(void)
+{
+ if (IS_ENABLED(CONFIG_RCU_LAZY))
+ atomic_dec(&rcu_async_hurry_nesting);
+}
+EXPORT_SYMBOL_GPL(rcu_async_relax);
+
+static atomic_t rcu_expedited_nesting = ATOMIC_INIT(1);
/*
* Should normal grace-period primitives be expedited? Intended for
* use within RCU. Note that this function takes the rcu_expedited
@@ -195,6 +232,7 @@ static bool rcu_boot_ended __read_mostly;
void rcu_end_inkernel_boot(void)
{
rcu_unexpedite_gp();
+ rcu_async_relax();
if (rcu_normal_after_boot)
WRITE_ONCE(rcu_normal, 1);
rcu_boot_ended = true;
@@ -220,6 +258,7 @@ void rcu_test_sync_prims(void)
{
if (!IS_ENABLED(CONFIG_PROVE_RCU))
return;
+ pr_info("Running RCU synchronous self tests\n");
synchronize_rcu();
synchronize_rcu_expedited();
}
@@ -508,6 +547,10 @@ int rcu_cpu_stall_timeout __read_mostly = CONFIG_RCU_CPU_STALL_TIMEOUT;
module_param(rcu_cpu_stall_timeout, int, 0644);
int rcu_exp_cpu_stall_timeout __read_mostly = CONFIG_RCU_EXP_CPU_STALL_TIMEOUT;
module_param(rcu_exp_cpu_stall_timeout, int, 0644);
+int rcu_cpu_stall_cputime __read_mostly = IS_ENABLED(CONFIG_RCU_CPU_STALL_CPUTIME);
+module_param(rcu_cpu_stall_cputime, int, 0644);
+bool rcu_exp_stall_task_details __read_mostly;
+module_param(rcu_exp_stall_task_details, bool, 0644);
#endif /* #ifdef CONFIG_RCU_STALL_COMMON */
// Suppress boot-time RCU CPU stall warnings and rcutorture writer stall
@@ -555,9 +598,12 @@ struct early_boot_kfree_rcu {
static void early_boot_test_call_rcu(void)
{
static struct rcu_head head;
+ int idx;
static struct rcu_head shead;
struct early_boot_kfree_rcu *rhp;
+ idx = srcu_down_read(&early_srcu);
+ srcu_up_read(&early_srcu, idx);
call_rcu(&head, test_callback);
early_srcu_cookie = start_poll_synchronize_srcu(&early_srcu);
call_srcu(&early_srcu, &shead, test_callback);
@@ -586,6 +632,7 @@ static int rcu_verify_early_boot_tests(void)
early_boot_test_counter++;
srcu_barrier(&early_srcu);
WARN_ON_ONCE(!poll_state_synchronize_srcu(&early_srcu, early_srcu_cookie));
+ cleanup_srcu_struct(&early_srcu);
}
if (rcu_self_test_counter != early_boot_test_counter) {
WARN_ON(1);
diff --git a/kernel/relay.c b/kernel/relay.c
index d7edc934c56d..ef12532168d9 100644
--- a/kernel/relay.c
+++ b/kernel/relay.c
@@ -148,13 +148,13 @@ static struct rchan_buf *relay_create_buf(struct rchan *chan)
{
struct rchan_buf *buf;
- if (chan->n_subbufs > KMALLOC_MAX_SIZE / sizeof(size_t *))
+ if (chan->n_subbufs > KMALLOC_MAX_SIZE / sizeof(size_t))
return NULL;
buf = kzalloc(sizeof(struct rchan_buf), GFP_KERNEL);
if (!buf)
return NULL;
- buf->padding = kmalloc_array(chan->n_subbufs, sizeof(size_t *),
+ buf->padding = kmalloc_array(chan->n_subbufs, sizeof(size_t),
GFP_KERNEL);
if (!buf->padding)
goto free_buf;
@@ -507,7 +507,7 @@ struct rchan *relay_open(const char *base_filename,
chan->private_data = private_data;
if (base_filename) {
chan->has_base_filename = 1;
- strlcpy(chan->base_filename, base_filename, NAME_MAX);
+ strscpy(chan->base_filename, base_filename, NAME_MAX);
}
chan->cb = cb;
kref_init(&chan->kref);
@@ -578,7 +578,7 @@ int relay_late_setup_files(struct rchan *chan,
if (!chan || !base_filename)
return -EINVAL;
- strlcpy(chan->base_filename, base_filename, NAME_MAX);
+ strscpy(chan->base_filename, base_filename, NAME_MAX);
mutex_lock(&relay_channels_mutex);
/* Is chan already set up? */
diff --git a/kernel/resource.c b/kernel/resource.c
index 4c5e80b92f2f..ddbbacb9fb50 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -888,7 +888,7 @@ void insert_resource_expand_to_fit(struct resource *root, struct resource *new)
if (conflict->end > new->end)
new->end = conflict->end;
- printk("Expanded resource %s due to conflict with %s\n", new->name, conflict->name);
+ pr_info("Expanded resource %s due to conflict with %s\n", new->name, conflict->name);
}
write_unlock(&resource_lock);
}
@@ -1283,9 +1283,7 @@ void __release_region(struct resource *parent, resource_size_t start,
write_unlock(&resource_lock);
- printk(KERN_WARNING "Trying to free nonexistent resource "
- "<%016llx-%016llx>\n", (unsigned long long)start,
- (unsigned long long)end);
+ pr_warn("Trying to free nonexistent resource <%pa-%pa>\n", &start, &end);
}
EXPORT_SYMBOL(__release_region);
@@ -1658,6 +1656,7 @@ __setup("reserve=", reserve_setup);
int iomem_map_sanity_check(resource_size_t addr, unsigned long size)
{
struct resource *p = &iomem_resource;
+ resource_size_t end = addr + size - 1;
int err = 0;
loff_t l;
@@ -1667,12 +1666,12 @@ int iomem_map_sanity_check(resource_size_t addr, unsigned long size)
* We can probably skip the resources without
* IORESOURCE_IO attribute?
*/
- if (p->start >= addr + size)
+ if (p->start > end)
continue;
if (p->end < addr)
continue;
if (PFN_DOWN(p->start) <= PFN_DOWN(addr) &&
- PFN_DOWN(p->end) >= PFN_DOWN(addr + size - 1))
+ PFN_DOWN(p->end) >= PFN_DOWN(end))
continue;
/*
* if a resource is "BUSY", it's not a hardware resource
@@ -1683,10 +1682,8 @@ int iomem_map_sanity_check(resource_size_t addr, unsigned long size)
if (p->flags & IORESOURCE_BUSY)
continue;
- printk(KERN_WARNING "resource sanity check: requesting [mem %#010llx-%#010llx], which spans more than %s %pR\n",
- (unsigned long long)addr,
- (unsigned long long)(addr + size - 1),
- p->name, p);
+ pr_warn("resource sanity check: requesting [mem %pa-%pa], which spans more than %s %pR\n",
+ &addr, &end, p->name, p);
err = -1;
break;
}
@@ -1707,18 +1704,15 @@ static int strict_iomem_checks;
*
* Returns true if exclusive to the kernel, otherwise returns false.
*/
-bool iomem_is_exclusive(u64 addr)
+bool resource_is_exclusive(struct resource *root, u64 addr, resource_size_t size)
{
const unsigned int exclusive_system_ram = IORESOURCE_SYSTEM_RAM |
IORESOURCE_EXCLUSIVE;
bool skip_children = false, err = false;
- int size = PAGE_SIZE;
struct resource *p;
- addr = addr & PAGE_MASK;
-
read_lock(&resource_lock);
- for_each_resource(&iomem_resource, p, skip_children) {
+ for_each_resource(root, p, skip_children) {
if (p->start >= addr + size)
break;
if (p->end < addr) {
@@ -1757,6 +1751,12 @@ bool iomem_is_exclusive(u64 addr)
return err;
}
+bool iomem_is_exclusive(u64 addr)
+{
+ return resource_is_exclusive(&iomem_resource, addr & PAGE_MASK,
+ PAGE_SIZE);
+}
+
struct resource_entry *resource_list_create_entry(struct resource *res,
size_t extra_size)
{
diff --git a/kernel/rseq.c b/kernel/rseq.c
index d38ab944105d..9de6e35fe679 100644
--- a/kernel/rseq.c
+++ b/kernel/rseq.c
@@ -18,6 +18,9 @@
#define CREATE_TRACE_POINTS
#include <trace/events/rseq.h>
+/* The original rseq structure size (including padding) is 32 bytes. */
+#define ORIG_RSEQ_SIZE 32
+
#define RSEQ_CS_NO_RESTART_FLAGS (RSEQ_CS_FLAG_NO_RESTART_ON_PREEMPT | \
RSEQ_CS_FLAG_NO_RESTART_ON_SIGNAL | \
RSEQ_CS_FLAG_NO_RESTART_ON_MIGRATE)
@@ -82,15 +85,25 @@
* F1. <failure>
*/
-static int rseq_update_cpu_id(struct task_struct *t)
+static int rseq_update_cpu_node_id(struct task_struct *t)
{
- u32 cpu_id = raw_smp_processor_id();
struct rseq __user *rseq = t->rseq;
+ u32 cpu_id = raw_smp_processor_id();
+ u32 node_id = cpu_to_node(cpu_id);
+ u32 mm_cid = task_mm_cid(t);
- if (!user_write_access_begin(rseq, sizeof(*rseq)))
+ WARN_ON_ONCE((int) mm_cid < 0);
+ if (!user_write_access_begin(rseq, t->rseq_len))
goto efault;
unsafe_put_user(cpu_id, &rseq->cpu_id_start, efault_end);
unsafe_put_user(cpu_id, &rseq->cpu_id, efault_end);
+ unsafe_put_user(node_id, &rseq->node_id, efault_end);
+ unsafe_put_user(mm_cid, &rseq->mm_cid, efault_end);
+ /*
+ * Additional feature fields added after ORIG_RSEQ_SIZE
+ * need to be conditionally updated only if
+ * t->rseq_len != ORIG_RSEQ_SIZE.
+ */
user_write_access_end();
trace_rseq_update(t);
return 0;
@@ -101,9 +114,10 @@ efault:
return -EFAULT;
}
-static int rseq_reset_rseq_cpu_id(struct task_struct *t)
+static int rseq_reset_rseq_cpu_node_id(struct task_struct *t)
{
- u32 cpu_id_start = 0, cpu_id = RSEQ_CPU_ID_UNINITIALIZED;
+ u32 cpu_id_start = 0, cpu_id = RSEQ_CPU_ID_UNINITIALIZED, node_id = 0,
+ mm_cid = 0;
/*
* Reset cpu_id_start to its initial state (0).
@@ -117,6 +131,21 @@ static int rseq_reset_rseq_cpu_id(struct task_struct *t)
*/
if (put_user(cpu_id, &t->rseq->cpu_id))
return -EFAULT;
+ /*
+ * Reset node_id to its initial state (0).
+ */
+ if (put_user(node_id, &t->rseq->node_id))
+ return -EFAULT;
+ /*
+ * Reset mm_cid to its initial state (0).
+ */
+ if (put_user(mm_cid, &t->rseq->mm_cid))
+ return -EFAULT;
+ /*
+ * Additional feature fields added after ORIG_RSEQ_SIZE
+ * need to be conditionally reset only if
+ * t->rseq_len != ORIG_RSEQ_SIZE.
+ */
return 0;
}
@@ -301,7 +330,7 @@ void __rseq_handle_notify_resume(struct ksignal *ksig, struct pt_regs *regs)
if (unlikely(ret < 0))
goto error;
}
- if (unlikely(rseq_update_cpu_id(t)))
+ if (unlikely(rseq_update_cpu_node_id(t)))
goto error;
return;
@@ -344,15 +373,16 @@ SYSCALL_DEFINE4(rseq, struct rseq __user *, rseq, u32, rseq_len,
/* Unregister rseq for current thread. */
if (current->rseq != rseq || !current->rseq)
return -EINVAL;
- if (rseq_len != sizeof(*rseq))
+ if (rseq_len != current->rseq_len)
return -EINVAL;
if (current->rseq_sig != sig)
return -EPERM;
- ret = rseq_reset_rseq_cpu_id(current);
+ ret = rseq_reset_rseq_cpu_node_id(current);
if (ret)
return ret;
current->rseq = NULL;
current->rseq_sig = 0;
+ current->rseq_len = 0;
return 0;
}
@@ -365,7 +395,7 @@ SYSCALL_DEFINE4(rseq, struct rseq __user *, rseq, u32, rseq_len,
* the provided address differs from the prior
* one.
*/
- if (current->rseq != rseq || rseq_len != sizeof(*rseq))
+ if (current->rseq != rseq || rseq_len != current->rseq_len)
return -EINVAL;
if (current->rseq_sig != sig)
return -EPERM;
@@ -374,15 +404,24 @@ SYSCALL_DEFINE4(rseq, struct rseq __user *, rseq, u32, rseq_len,
}
/*
- * If there was no rseq previously registered,
- * ensure the provided rseq is properly aligned and valid.
+ * If there was no rseq previously registered, ensure the provided rseq
+ * is properly aligned, as communicated to user-space through the ELF
+ * auxiliary vector AT_RSEQ_ALIGN. If rseq_len is the original rseq
+ * size, the required alignment is the original struct rseq alignment.
+ *
+ * In order to be valid, rseq_len is either the original rseq size, or
+ * large enough to contain all supported fields, as communicated to
+ * user-space through the ELF auxiliary vector AT_RSEQ_FEATURE_SIZE.
*/
- if (!IS_ALIGNED((unsigned long)rseq, __alignof__(*rseq)) ||
- rseq_len != sizeof(*rseq))
+ if (rseq_len < ORIG_RSEQ_SIZE ||
+ (rseq_len == ORIG_RSEQ_SIZE && !IS_ALIGNED((unsigned long)rseq, ORIG_RSEQ_SIZE)) ||
+ (rseq_len != ORIG_RSEQ_SIZE && (!IS_ALIGNED((unsigned long)rseq, __alignof__(*rseq)) ||
+ rseq_len < offsetof(struct rseq, end))))
return -EINVAL;
if (!access_ok(rseq, rseq_len))
return -EFAULT;
current->rseq = rseq;
+ current->rseq_len = rseq_len;
current->rseq_sig = sig;
/*
* If rseq was previously inactive, and has just been
diff --git a/kernel/sched/clock.c b/kernel/sched/clock.c
index e374c0c923da..5732fa75ebab 100644
--- a/kernel/sched/clock.c
+++ b/kernel/sched/clock.c
@@ -93,7 +93,7 @@ struct sched_clock_data {
static DEFINE_PER_CPU_SHARED_ALIGNED(struct sched_clock_data, sched_clock_data);
-notrace static inline struct sched_clock_data *this_scd(void)
+static __always_inline struct sched_clock_data *this_scd(void)
{
return this_cpu_ptr(&sched_clock_data);
}
@@ -244,12 +244,12 @@ late_initcall(sched_clock_init_late);
* min, max except they take wrapping into account
*/
-notrace static inline u64 wrap_min(u64 x, u64 y)
+static __always_inline u64 wrap_min(u64 x, u64 y)
{
return (s64)(x - y) < 0 ? x : y;
}
-notrace static inline u64 wrap_max(u64 x, u64 y)
+static __always_inline u64 wrap_max(u64 x, u64 y)
{
return (s64)(x - y) > 0 ? x : y;
}
@@ -260,7 +260,7 @@ notrace static inline u64 wrap_max(u64 x, u64 y)
* - filter out backward motion
* - use the GTOD tick value to create a window to filter crazy TSC values
*/
-notrace static u64 sched_clock_local(struct sched_clock_data *scd)
+static __always_inline u64 sched_clock_local(struct sched_clock_data *scd)
{
u64 now, clock, old_clock, min_clock, max_clock, gtod;
s64 delta;
@@ -287,13 +287,28 @@ again:
clock = wrap_max(clock, min_clock);
clock = wrap_min(clock, max_clock);
- if (!try_cmpxchg64(&scd->clock, &old_clock, clock))
+ if (!arch_try_cmpxchg64(&scd->clock, &old_clock, clock))
goto again;
return clock;
}
-notrace static u64 sched_clock_remote(struct sched_clock_data *scd)
+noinstr u64 local_clock(void)
+{
+ u64 clock;
+
+ if (static_branch_likely(&__sched_clock_stable))
+ return sched_clock() + __sched_clock_offset;
+
+ preempt_disable_notrace();
+ clock = sched_clock_local(this_scd());
+ preempt_enable_notrace();
+
+ return clock;
+}
+EXPORT_SYMBOL_GPL(local_clock);
+
+static notrace u64 sched_clock_remote(struct sched_clock_data *scd)
{
struct sched_clock_data *my_scd = this_scd();
u64 this_clock, remote_clock;
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index daff72f00385..af017e038b48 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -152,7 +152,7 @@ __read_mostly int scheduler_running;
DEFINE_STATIC_KEY_FALSE(__sched_core_enabled);
/* kernel prio, less is more */
-static inline int __task_prio(struct task_struct *p)
+static inline int __task_prio(const struct task_struct *p)
{
if (p->sched_class == &stop_sched_class) /* trumps deadline */
return -2;
@@ -174,7 +174,8 @@ static inline int __task_prio(struct task_struct *p)
*/
/* real prio, less is less */
-static inline bool prio_less(struct task_struct *a, struct task_struct *b, bool in_fi)
+static inline bool prio_less(const struct task_struct *a,
+ const struct task_struct *b, bool in_fi)
{
int pa = __task_prio(a), pb = __task_prio(b);
@@ -194,7 +195,8 @@ static inline bool prio_less(struct task_struct *a, struct task_struct *b, bool
return false;
}
-static inline bool __sched_core_less(struct task_struct *a, struct task_struct *b)
+static inline bool __sched_core_less(const struct task_struct *a,
+ const struct task_struct *b)
{
if (a->core_cookie < b->core_cookie)
return true;
@@ -1392,7 +1394,7 @@ static inline void uclamp_idle_reset(struct rq *rq, enum uclamp_id clamp_id,
if (!(rq->uclamp_flags & UCLAMP_FLAG_IDLE))
return;
- WRITE_ONCE(rq->uclamp[clamp_id].value, clamp_value);
+ uclamp_rq_set(rq, clamp_id, clamp_value);
}
static inline
@@ -1543,8 +1545,8 @@ static inline void uclamp_rq_inc_id(struct rq *rq, struct task_struct *p,
if (bucket->tasks == 1 || uc_se->value > bucket->value)
bucket->value = uc_se->value;
- if (uc_se->value > READ_ONCE(uc_rq->value))
- WRITE_ONCE(uc_rq->value, uc_se->value);
+ if (uc_se->value > uclamp_rq_get(rq, clamp_id))
+ uclamp_rq_set(rq, clamp_id, uc_se->value);
}
/*
@@ -1610,7 +1612,7 @@ static inline void uclamp_rq_dec_id(struct rq *rq, struct task_struct *p,
if (likely(bucket->tasks))
return;
- rq_clamp = READ_ONCE(uc_rq->value);
+ rq_clamp = uclamp_rq_get(rq, clamp_id);
/*
* Defensive programming: this should never happen. If it happens,
* e.g. due to future modification, warn and fixup the expected value.
@@ -1618,7 +1620,7 @@ static inline void uclamp_rq_dec_id(struct rq *rq, struct task_struct *p,
SCHED_WARN_ON(bucket->value > rq_clamp);
if (bucket->value >= rq_clamp) {
bkt_clamp = uclamp_rq_max_value(rq, clamp_id, uc_se->value);
- WRITE_ONCE(uc_rq->value, bkt_clamp);
+ uclamp_rq_set(rq, clamp_id, bkt_clamp);
}
}
@@ -2053,7 +2055,7 @@ static inline void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
if (!(flags & ENQUEUE_RESTORE)) {
sched_info_enqueue(rq, p);
- psi_enqueue(p, flags & ENQUEUE_WAKEUP);
+ psi_enqueue(p, (flags & ENQUEUE_WAKEUP) && !(flags & ENQUEUE_MIGRATED));
}
uclamp_rq_inc(rq, p);
@@ -2189,14 +2191,18 @@ void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
#ifdef CONFIG_SMP
static void
-__do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask, u32 flags);
+__do_set_cpus_allowed(struct task_struct *p, struct affinity_context *ctx);
static int __set_cpus_allowed_ptr(struct task_struct *p,
- const struct cpumask *new_mask,
- u32 flags);
+ struct affinity_context *ctx);
static void migrate_disable_switch(struct rq *rq, struct task_struct *p)
{
+ struct affinity_context ac = {
+ .new_mask = cpumask_of(rq->cpu),
+ .flags = SCA_MIGRATE_DISABLE,
+ };
+
if (likely(!p->migration_disabled))
return;
@@ -2206,7 +2212,7 @@ static void migrate_disable_switch(struct rq *rq, struct task_struct *p)
/*
* Violates locking rules! see comment in __do_set_cpus_allowed().
*/
- __do_set_cpus_allowed(p, cpumask_of(rq->cpu), SCA_MIGRATE_DISABLE);
+ __do_set_cpus_allowed(p, &ac);
}
void migrate_disable(void)
@@ -2228,6 +2234,10 @@ EXPORT_SYMBOL_GPL(migrate_disable);
void migrate_enable(void)
{
struct task_struct *p = current;
+ struct affinity_context ac = {
+ .new_mask = &p->cpus_mask,
+ .flags = SCA_MIGRATE_ENABLE,
+ };
if (p->migration_disabled > 1) {
p->migration_disabled--;
@@ -2243,7 +2253,7 @@ void migrate_enable(void)
*/
preempt_disable();
if (p->cpus_ptr != &p->cpus_mask)
- __set_cpus_allowed_ptr(p, &p->cpus_mask, SCA_MIGRATE_ENABLE);
+ __set_cpus_allowed_ptr(p, &ac);
/*
* Mustn't clear migration_disabled() until cpus_ptr points back at the
* regular cpus_mask, otherwise things that race (eg.
@@ -2523,19 +2533,25 @@ out_unlock:
* sched_class::set_cpus_allowed must do the below, but is not required to
* actually call this function.
*/
-void set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_mask, u32 flags)
+void set_cpus_allowed_common(struct task_struct *p, struct affinity_context *ctx)
{
- if (flags & (SCA_MIGRATE_ENABLE | SCA_MIGRATE_DISABLE)) {
- p->cpus_ptr = new_mask;
+ if (ctx->flags & (SCA_MIGRATE_ENABLE | SCA_MIGRATE_DISABLE)) {
+ p->cpus_ptr = ctx->new_mask;
return;
}
- cpumask_copy(&p->cpus_mask, new_mask);
- p->nr_cpus_allowed = cpumask_weight(new_mask);
+ cpumask_copy(&p->cpus_mask, ctx->new_mask);
+ p->nr_cpus_allowed = cpumask_weight(ctx->new_mask);
+
+ /*
+ * Swap in a new user_cpus_ptr if the SCA_USER flag is set.
+ */
+ if (ctx->flags & SCA_USER)
+ swap(p->user_cpus_ptr, ctx->user_mask);
}
static void
-__do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask, u32 flags)
+__do_set_cpus_allowed(struct task_struct *p, struct affinity_context *ctx)
{
struct rq *rq = task_rq(p);
bool queued, running;
@@ -2552,7 +2568,7 @@ __do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask, u32
*
* XXX do further audits, this smells like something putrid.
*/
- if (flags & SCA_MIGRATE_DISABLE)
+ if (ctx->flags & SCA_MIGRATE_DISABLE)
SCHED_WARN_ON(!p->on_cpu);
else
lockdep_assert_held(&p->pi_lock);
@@ -2571,7 +2587,7 @@ __do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask, u32
if (running)
put_prev_task(rq, p);
- p->sched_class->set_cpus_allowed(p, new_mask, flags);
+ p->sched_class->set_cpus_allowed(p, ctx);
if (queued)
enqueue_task(rq, p, ENQUEUE_RESTORE | ENQUEUE_NOCLOCK);
@@ -2579,22 +2595,82 @@ __do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask, u32
set_next_task(rq, p);
}
+/*
+ * Used for kthread_bind() and select_fallback_rq(), in both cases the user
+ * affinity (if any) should be destroyed too.
+ */
void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
{
- __do_set_cpus_allowed(p, new_mask, 0);
+ struct affinity_context ac = {
+ .new_mask = new_mask,
+ .user_mask = NULL,
+ .flags = SCA_USER, /* clear the user requested mask */
+ };
+ union cpumask_rcuhead {
+ cpumask_t cpumask;
+ struct rcu_head rcu;
+ };
+
+ __do_set_cpus_allowed(p, &ac);
+
+ /*
+ * Because this is called with p->pi_lock held, it is not possible
+ * to use kfree() here (when PREEMPT_RT=y), therefore punt to using
+ * kfree_rcu().
+ */
+ kfree_rcu((union cpumask_rcuhead *)ac.user_mask, rcu);
+}
+
+static cpumask_t *alloc_user_cpus_ptr(int node)
+{
+ /*
+ * See do_set_cpus_allowed() above for the rcu_head usage.
+ */
+ int size = max_t(int, cpumask_size(), sizeof(struct rcu_head));
+
+ return kmalloc_node(size, GFP_KERNEL, node);
}
int dup_user_cpus_ptr(struct task_struct *dst, struct task_struct *src,
int node)
{
- if (!src->user_cpus_ptr)
+ cpumask_t *user_mask;
+ unsigned long flags;
+
+ /*
+ * Always clear dst->user_cpus_ptr first, as src's and dst's
+ * user_cpus_ptr may differ by now due to racing.
+ */
+ dst->user_cpus_ptr = NULL;
+
+ /*
+ * This check is racy and losing the race is a valid situation.
+ * It is not worth the extra overhead of taking the pi_lock on
+ * every fork/clone.
+ */
+ if (data_race(!src->user_cpus_ptr))
return 0;
- dst->user_cpus_ptr = kmalloc_node(cpumask_size(), GFP_KERNEL, node);
- if (!dst->user_cpus_ptr)
+ user_mask = alloc_user_cpus_ptr(node);
+ if (!user_mask)
return -ENOMEM;
- cpumask_copy(dst->user_cpus_ptr, src->user_cpus_ptr);
+ /*
+ * Use pi_lock to protect content of user_cpus_ptr
+ *
+ * Though unlikely, user_cpus_ptr can be reset to NULL by a concurrent
+ * do_set_cpus_allowed().
+ */
+ raw_spin_lock_irqsave(&src->pi_lock, flags);
+ if (src->user_cpus_ptr) {
+ swap(dst->user_cpus_ptr, user_mask);
+ cpumask_copy(dst->user_cpus_ptr, src->user_cpus_ptr);
+ }
+ raw_spin_unlock_irqrestore(&src->pi_lock, flags);
+
+ if (unlikely(user_mask))
+ kfree(user_mask);
+
return 0;
}
@@ -2690,6 +2766,8 @@ void release_user_cpus_ptr(struct task_struct *p)
*/
static int affine_move_task(struct rq *rq, struct task_struct *p, struct rq_flags *rf,
int dest_cpu, unsigned int flags)
+ __releases(rq->lock)
+ __releases(p->pi_lock)
{
struct set_affinity_pending my_pending = { }, *pending = NULL;
bool stop_pending, complete = false;
@@ -2832,8 +2910,7 @@ static int affine_move_task(struct rq *rq, struct task_struct *p, struct rq_flag
* Called with both p->pi_lock and rq->lock held; drops both before returning.
*/
static int __set_cpus_allowed_ptr_locked(struct task_struct *p,
- const struct cpumask *new_mask,
- u32 flags,
+ struct affinity_context *ctx,
struct rq *rq,
struct rq_flags *rf)
__releases(rq->lock)
@@ -2842,7 +2919,6 @@ static int __set_cpus_allowed_ptr_locked(struct task_struct *p,
const struct cpumask *cpu_allowed_mask = task_cpu_possible_mask(p);
const struct cpumask *cpu_valid_mask = cpu_active_mask;
bool kthread = p->flags & PF_KTHREAD;
- struct cpumask *user_mask = NULL;
unsigned int dest_cpu;
int ret = 0;
@@ -2862,7 +2938,7 @@ static int __set_cpus_allowed_ptr_locked(struct task_struct *p,
cpu_valid_mask = cpu_online_mask;
}
- if (!kthread && !cpumask_subset(new_mask, cpu_allowed_mask)) {
+ if (!kthread && !cpumask_subset(ctx->new_mask, cpu_allowed_mask)) {
ret = -EINVAL;
goto out;
}
@@ -2871,18 +2947,21 @@ static int __set_cpus_allowed_ptr_locked(struct task_struct *p,
* Must re-check here, to close a race against __kthread_bind(),
* sched_setaffinity() is not guaranteed to observe the flag.
*/
- if ((flags & SCA_CHECK) && (p->flags & PF_NO_SETAFFINITY)) {
+ if ((ctx->flags & SCA_CHECK) && (p->flags & PF_NO_SETAFFINITY)) {
ret = -EINVAL;
goto out;
}
- if (!(flags & SCA_MIGRATE_ENABLE)) {
- if (cpumask_equal(&p->cpus_mask, new_mask))
+ if (!(ctx->flags & SCA_MIGRATE_ENABLE)) {
+ if (cpumask_equal(&p->cpus_mask, ctx->new_mask)) {
+ if (ctx->flags & SCA_USER)
+ swap(p->user_cpus_ptr, ctx->user_mask);
goto out;
+ }
if (WARN_ON_ONCE(p == current &&
is_migration_disabled(p) &&
- !cpumask_test_cpu(task_cpu(p), new_mask))) {
+ !cpumask_test_cpu(task_cpu(p), ctx->new_mask))) {
ret = -EBUSY;
goto out;
}
@@ -2893,22 +2972,15 @@ static int __set_cpus_allowed_ptr_locked(struct task_struct *p,
* for groups of tasks (ie. cpuset), so that load balancing is not
* immediately required to distribute the tasks within their new mask.
*/
- dest_cpu = cpumask_any_and_distribute(cpu_valid_mask, new_mask);
+ dest_cpu = cpumask_any_and_distribute(cpu_valid_mask, ctx->new_mask);
if (dest_cpu >= nr_cpu_ids) {
ret = -EINVAL;
goto out;
}
- __do_set_cpus_allowed(p, new_mask, flags);
-
- if (flags & SCA_USER)
- user_mask = clear_user_cpus_ptr(p);
-
- ret = affine_move_task(rq, p, rf, dest_cpu, flags);
+ __do_set_cpus_allowed(p, ctx);
- kfree(user_mask);
-
- return ret;
+ return affine_move_task(rq, p, rf, dest_cpu, ctx->flags);
out:
task_rq_unlock(rq, p, rf);
@@ -2926,25 +2998,41 @@ out:
* call is not atomic; no spinlocks may be held.
*/
static int __set_cpus_allowed_ptr(struct task_struct *p,
- const struct cpumask *new_mask, u32 flags)
+ struct affinity_context *ctx)
{
struct rq_flags rf;
struct rq *rq;
rq = task_rq_lock(p, &rf);
- return __set_cpus_allowed_ptr_locked(p, new_mask, flags, rq, &rf);
+ /*
+ * Masking should be skipped if SCA_USER or any of the SCA_MIGRATE_*
+ * flags are set.
+ */
+ if (p->user_cpus_ptr &&
+ !(ctx->flags & (SCA_USER | SCA_MIGRATE_ENABLE | SCA_MIGRATE_DISABLE)) &&
+ cpumask_and(rq->scratch_mask, ctx->new_mask, p->user_cpus_ptr))
+ ctx->new_mask = rq->scratch_mask;
+
+ return __set_cpus_allowed_ptr_locked(p, ctx, rq, &rf);
}
int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
{
- return __set_cpus_allowed_ptr(p, new_mask, 0);
+ struct affinity_context ac = {
+ .new_mask = new_mask,
+ .flags = 0,
+ };
+
+ return __set_cpus_allowed_ptr(p, &ac);
}
EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr);
/*
* Change a given task's CPU affinity to the intersection of its current
- * affinity mask and @subset_mask, writing the resulting mask to @new_mask
- * and pointing @p->user_cpus_ptr to a copy of the old mask.
+ * affinity mask and @subset_mask, writing the resulting mask to @new_mask.
+ * If user_cpus_ptr is defined, use it as the basis for restricting CPU
+ * affinity or use cpu_online_mask instead.
+ *
* If the resulting mask is empty, leave the affinity unchanged and return
* -EINVAL.
*/
@@ -2952,17 +3040,14 @@ static int restrict_cpus_allowed_ptr(struct task_struct *p,
struct cpumask *new_mask,
const struct cpumask *subset_mask)
{
- struct cpumask *user_mask = NULL;
+ struct affinity_context ac = {
+ .new_mask = new_mask,
+ .flags = 0,
+ };
struct rq_flags rf;
struct rq *rq;
int err;
- if (!p->user_cpus_ptr) {
- user_mask = kmalloc(cpumask_size(), GFP_KERNEL);
- if (!user_mask)
- return -ENOMEM;
- }
-
rq = task_rq_lock(p, &rf);
/*
@@ -2975,31 +3060,21 @@ static int restrict_cpus_allowed_ptr(struct task_struct *p,
goto err_unlock;
}
- if (!cpumask_and(new_mask, &p->cpus_mask, subset_mask)) {
+ if (!cpumask_and(new_mask, task_user_cpus(p), subset_mask)) {
err = -EINVAL;
goto err_unlock;
}
- /*
- * We're about to butcher the task affinity, so keep track of what
- * the user asked for in case we're able to restore it later on.
- */
- if (user_mask) {
- cpumask_copy(user_mask, p->cpus_ptr);
- p->user_cpus_ptr = user_mask;
- }
-
- return __set_cpus_allowed_ptr_locked(p, new_mask, 0, rq, &rf);
+ return __set_cpus_allowed_ptr_locked(p, &ac, rq, &rf);
err_unlock:
task_rq_unlock(rq, p, &rf);
- kfree(user_mask);
return err;
}
/*
* Restrict the CPU affinity of task @p so that it is a subset of
- * task_cpu_possible_mask() and point @p->user_cpu_ptr to a copy of the
+ * task_cpu_possible_mask() and point @p->user_cpus_ptr to a copy of the
* old affinity mask. If the resulting mask is empty, we warn and walk
* up the cpuset hierarchy until we find a suitable mask.
*/
@@ -3043,34 +3118,29 @@ out_free_mask:
}
static int
-__sched_setaffinity(struct task_struct *p, const struct cpumask *mask);
+__sched_setaffinity(struct task_struct *p, struct affinity_context *ctx);
/*
* Restore the affinity of a task @p which was previously restricted by a
- * call to force_compatible_cpus_allowed_ptr(). This will clear (and free)
- * @p->user_cpus_ptr.
+ * call to force_compatible_cpus_allowed_ptr().
*
* It is the caller's responsibility to serialise this with any calls to
* force_compatible_cpus_allowed_ptr(@p).
*/
void relax_compatible_cpus_allowed_ptr(struct task_struct *p)
{
- struct cpumask *user_mask = p->user_cpus_ptr;
- unsigned long flags;
+ struct affinity_context ac = {
+ .new_mask = task_user_cpus(p),
+ .flags = 0,
+ };
+ int ret;
/*
- * Try to restore the old affinity mask. If this fails, then
- * we free the mask explicitly to avoid it being inherited across
- * a subsequent fork().
+ * Try to restore the old affinity mask with __sched_setaffinity().
+ * Cpuset masking will be done there too.
*/
- if (!user_mask || !__sched_setaffinity(p, user_mask))
- return;
-
- raw_spin_lock_irqsave(&p->pi_lock, flags);
- user_mask = clear_user_cpus_ptr(p);
- raw_spin_unlock_irqrestore(&p->pi_lock, flags);
-
- kfree(user_mask);
+ ret = __sched_setaffinity(p, &ac);
+ WARN_ON_ONCE(ret);
}
void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
@@ -3548,10 +3618,9 @@ void sched_set_stop_task(int cpu, struct task_struct *stop)
#else /* CONFIG_SMP */
static inline int __set_cpus_allowed_ptr(struct task_struct *p,
- const struct cpumask *new_mask,
- u32 flags)
+ struct affinity_context *ctx)
{
- return set_cpus_allowed_ptr(p, new_mask);
+ return set_cpus_allowed_ptr(p, ctx->new_mask);
}
static inline void migrate_disable_switch(struct rq *rq, struct task_struct *p) { }
@@ -3561,6 +3630,11 @@ static inline bool rq_has_pinned_tasks(struct rq *rq)
return false;
}
+static inline cpumask_t *alloc_user_cpus_ptr(int node)
+{
+ return NULL;
+}
+
#endif /* !CONFIG_SMP */
static void
@@ -3603,14 +3677,39 @@ ttwu_stat(struct task_struct *p, int cpu, int wake_flags)
}
/*
- * Mark the task runnable and perform wakeup-preemption.
+ * Mark the task runnable.
*/
-static void ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags,
- struct rq_flags *rf)
+static inline void ttwu_do_wakeup(struct task_struct *p)
{
- check_preempt_curr(rq, p, wake_flags);
WRITE_ONCE(p->__state, TASK_RUNNING);
trace_sched_wakeup(p);
+}
+
+static void
+ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags,
+ struct rq_flags *rf)
+{
+ int en_flags = ENQUEUE_WAKEUP | ENQUEUE_NOCLOCK;
+
+ lockdep_assert_rq_held(rq);
+
+ if (p->sched_contributes_to_load)
+ rq->nr_uninterruptible--;
+
+#ifdef CONFIG_SMP
+ if (wake_flags & WF_MIGRATED)
+ en_flags |= ENQUEUE_MIGRATED;
+ else
+#endif
+ if (p->in_iowait) {
+ delayacct_blkio_end(p);
+ atomic_dec(&task_rq(p)->nr_iowait);
+ }
+
+ activate_task(rq, p, en_flags);
+ check_preempt_curr(rq, p, wake_flags);
+
+ ttwu_do_wakeup(p);
#ifdef CONFIG_SMP
if (p->sched_class->task_woken) {
@@ -3640,31 +3739,6 @@ static void ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags,
#endif
}
-static void
-ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags,
- struct rq_flags *rf)
-{
- int en_flags = ENQUEUE_WAKEUP | ENQUEUE_NOCLOCK;
-
- lockdep_assert_rq_held(rq);
-
- if (p->sched_contributes_to_load)
- rq->nr_uninterruptible--;
-
-#ifdef CONFIG_SMP
- if (wake_flags & WF_MIGRATED)
- en_flags |= ENQUEUE_MIGRATED;
- else
-#endif
- if (p->in_iowait) {
- delayacct_blkio_end(p);
- atomic_dec(&task_rq(p)->nr_iowait);
- }
-
- activate_task(rq, p, en_flags);
- ttwu_do_wakeup(rq, p, wake_flags, rf);
-}
-
/*
* Consider @p being inside a wait loop:
*
@@ -3698,9 +3772,15 @@ static int ttwu_runnable(struct task_struct *p, int wake_flags)
rq = __task_rq_lock(p, &rf);
if (task_on_rq_queued(p)) {
- /* check_preempt_curr() may use rq clock */
- update_rq_clock(rq);
- ttwu_do_wakeup(rq, p, wake_flags, &rf);
+ if (!task_on_cpu(rq, p)) {
+ /*
+ * When on_rq && !on_cpu the task is preempted, see if
+ * it should preempt the task that is current now.
+ */
+ update_rq_clock(rq);
+ check_preempt_curr(rq, p, wake_flags);
+ }
+ ttwu_do_wakeup(p);
ret = 1;
}
__task_rq_unlock(rq, &rf);
@@ -3719,13 +3799,6 @@ void sched_ttwu_pending(void *arg)
if (!llist)
return;
- /*
- * rq::ttwu_pending racy indication of out-standing wakeups.
- * Races such that false-negatives are possible, since they
- * are shorter lived that false-positives would be.
- */
- WRITE_ONCE(rq->ttwu_pending, 0);
-
rq_lock_irqsave(rq, &rf);
update_rq_clock(rq);
@@ -3739,6 +3812,17 @@ void sched_ttwu_pending(void *arg)
ttwu_do_activate(rq, p, p->sched_remote_wakeup ? WF_MIGRATED : 0, &rf);
}
+ /*
+ * Must be after enqueueing at least one task such that
+ * idle_cpu() does not observe a false-negative -- if it does,
+ * it is possible for select_idle_siblings() to stack a number
+ * of tasks on this CPU during that window.
+ *
+ * It is ok to clear ttwu_pending when another task is pending.
+ * We will receive an IPI after local IRQs are enabled and then enqueue it.
+ * Since now nr_running > 0, idle_cpu() will always get the correct result.
+ */
+ WRITE_ONCE(rq->ttwu_pending, 0);
rq_unlock_irqrestore(rq, &rf);
}
@@ -4062,8 +4146,7 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
goto out;
trace_sched_waking(p);
- WRITE_ONCE(p->__state, TASK_RUNNING);
- trace_sched_wakeup(p);
+ ttwu_do_wakeup(p);
goto out;
}
@@ -4419,7 +4502,7 @@ static void reset_memory_tiering(void)
}
}
-int sysctl_numa_balancing(struct ctl_table *table, int write,
+static int sysctl_numa_balancing(struct ctl_table *table, int write,
void *buffer, size_t *lenp, loff_t *ppos)
{
struct ctl_table t;
@@ -4546,6 +4629,17 @@ static struct ctl_table sched_core_sysctls[] = {
.proc_handler = sysctl_sched_uclamp_handler,
},
#endif /* CONFIG_UCLAMP_TASK */
+#ifdef CONFIG_NUMA_BALANCING
+ {
+ .procname = "numa_balancing",
+ .data = NULL, /* filled in by handler */
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = sysctl_numa_balancing,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_FOUR,
+ },
+#endif /* CONFIG_NUMA_BALANCING */
{}
};
static int __init sched_core_sysctl_init(void)
@@ -5017,6 +5111,7 @@ prepare_task_switch(struct rq *rq, struct task_struct *prev,
sched_info_switch(rq, prev, next);
perf_event_task_sched_out(prev, next);
rseq_preempt(prev);
+ switch_mm_cid(prev, next);
fire_sched_out_preempt_notifiers(prev, next);
kmap_local_sched_out();
prepare_task(next);
@@ -5247,6 +5342,11 @@ bool single_task_running(void)
}
EXPORT_SYMBOL(single_task_running);
+unsigned long long nr_context_switches_cpu(int cpu)
+{
+ return cpu_rq(cpu)->nr_switches;
+}
+
unsigned long long nr_context_switches(void)
{
int i;
@@ -5469,7 +5569,9 @@ void scheduler_tick(void)
unsigned long thermal_pressure;
u64 resched_latency;
- arch_scale_freq_tick();
+ if (housekeeping_cpu(cpu, HK_TYPE_TICK))
+ arch_scale_freq_tick();
+
sched_clock_tick();
rq_lock(rq, &rf);
@@ -5747,8 +5849,7 @@ static noinline void __schedule_bug(struct task_struct *prev)
pr_err("Preemption disabled at:");
print_ip_sym(KERN_ERR, preempt_disable_ip);
}
- if (panic_on_warn)
- panic("scheduling while atomic\n");
+ check_panic_on_warn("scheduling while atomic");
dump_stack();
add_taint(TAINT_WARN, LOCKDEP_STILL_OK);
@@ -6172,7 +6273,7 @@ static bool steal_cookie_task(int cpu, struct sched_domain *sd)
{
int i;
- for_each_cpu_wrap(i, sched_domain_span(sd), cpu) {
+ for_each_cpu_wrap(i, sched_domain_span(sd), cpu + 1) {
if (i == cpu)
continue;
@@ -8106,7 +8207,7 @@ int dl_task_check_affinity(struct task_struct *p, const struct cpumask *mask)
#endif
static int
-__sched_setaffinity(struct task_struct *p, const struct cpumask *mask)
+__sched_setaffinity(struct task_struct *p, struct affinity_context *ctx)
{
int retval;
cpumask_var_t cpus_allowed, new_mask;
@@ -8120,13 +8221,16 @@ __sched_setaffinity(struct task_struct *p, const struct cpumask *mask)
}
cpuset_cpus_allowed(p, cpus_allowed);
- cpumask_and(new_mask, mask, cpus_allowed);
+ cpumask_and(new_mask, ctx->new_mask, cpus_allowed);
+
+ ctx->new_mask = new_mask;
+ ctx->flags |= SCA_CHECK;
retval = dl_task_check_affinity(p, new_mask);
if (retval)
goto out_free_new_mask;
-again:
- retval = __set_cpus_allowed_ptr(p, new_mask, SCA_CHECK | SCA_USER);
+
+ retval = __set_cpus_allowed_ptr(p, ctx);
if (retval)
goto out_free_new_mask;
@@ -8137,7 +8241,24 @@ again:
* Just reset the cpumask to the cpuset's cpus_allowed.
*/
cpumask_copy(new_mask, cpus_allowed);
- goto again;
+
+ /*
+ * If SCA_USER is set, a 2nd call to __set_cpus_allowed_ptr()
+ * will restore the previous user_cpus_ptr value.
+ *
+ * In the unlikely event a previous user_cpus_ptr exists,
+ * we need to further restrict the mask to what is allowed
+ * by that old user_cpus_ptr.
+ */
+ if (unlikely((ctx->flags & SCA_USER) && ctx->user_mask)) {
+ bool empty = !cpumask_and(new_mask, new_mask,
+ ctx->user_mask);
+
+ if (WARN_ON_ONCE(empty))
+ cpumask_copy(new_mask, cpus_allowed);
+ }
+ __set_cpus_allowed_ptr(p, ctx);
+ retval = -EINVAL;
}
out_free_new_mask:
@@ -8149,6 +8270,8 @@ out_free_cpus_allowed:
long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
{
+ struct affinity_context ac;
+ struct cpumask *user_mask;
struct task_struct *p;
int retval;
@@ -8183,7 +8306,27 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
if (retval)
goto out_put_task;
- retval = __sched_setaffinity(p, in_mask);
+ /*
+ * With non-SMP configs, user_cpus_ptr/user_mask isn't used and
+ * alloc_user_cpus_ptr() returns NULL.
+ */
+ user_mask = alloc_user_cpus_ptr(NUMA_NO_NODE);
+ if (user_mask) {
+ cpumask_copy(user_mask, in_mask);
+ } else if (IS_ENABLED(CONFIG_SMP)) {
+ retval = -ENOMEM;
+ goto out_put_task;
+ }
+
+ ac = (struct affinity_context){
+ .new_mask = in_mask,
+ .user_mask = user_mask,
+ .flags = SCA_USER,
+ };
+
+ retval = __sched_setaffinity(p, &ac);
+ kfree(ac.user_mask);
+
out_put_task:
put_task_struct(p);
return retval;
@@ -8964,6 +9107,12 @@ void show_state_filter(unsigned int state_filter)
*/
void __init init_idle(struct task_struct *idle, int cpu)
{
+#ifdef CONFIG_SMP
+ struct affinity_context ac = (struct affinity_context) {
+ .new_mask = cpumask_of(cpu),
+ .flags = 0,
+ };
+#endif
struct rq *rq = cpu_rq(cpu);
unsigned long flags;
@@ -8988,7 +9137,7 @@ void __init init_idle(struct task_struct *idle, int cpu)
*
* And since this is boot we can forgo the serialization.
*/
- set_cpus_allowed_common(idle, cpumask_of(cpu), 0);
+ set_cpus_allowed_common(idle, &ac);
#endif
/*
* We're having a chicken and egg problem, even though we are
@@ -9775,6 +9924,7 @@ void __init sched_init(void)
rq->core_cookie = 0UL;
#endif
+ zalloc_cpumask_var_node(&rq->scratch_mask, GFP_KERNEL, cpu_to_node(i));
}
set_load_weight(&init_task, false);
@@ -11228,3 +11378,53 @@ void call_trace_sched_update_nr_running(struct rq *rq, int count)
{
trace_sched_update_nr_running_tp(rq, count);
}
+
+#ifdef CONFIG_SCHED_MM_CID
+void sched_mm_cid_exit_signals(struct task_struct *t)
+{
+ struct mm_struct *mm = t->mm;
+ unsigned long flags;
+
+ if (!mm)
+ return;
+ local_irq_save(flags);
+ mm_cid_put(mm, t->mm_cid);
+ t->mm_cid = -1;
+ t->mm_cid_active = 0;
+ local_irq_restore(flags);
+}
+
+void sched_mm_cid_before_execve(struct task_struct *t)
+{
+ struct mm_struct *mm = t->mm;
+ unsigned long flags;
+
+ if (!mm)
+ return;
+ local_irq_save(flags);
+ mm_cid_put(mm, t->mm_cid);
+ t->mm_cid = -1;
+ t->mm_cid_active = 0;
+ local_irq_restore(flags);
+}
+
+void sched_mm_cid_after_execve(struct task_struct *t)
+{
+ struct mm_struct *mm = t->mm;
+ unsigned long flags;
+
+ if (!mm)
+ return;
+ local_irq_save(flags);
+ t->mm_cid = mm_cid_get(mm);
+ t->mm_cid_active = 1;
+ local_irq_restore(flags);
+ rseq_set_notify_resume(t);
+}
+
+void sched_mm_cid_fork(struct task_struct *t)
+{
+ WARN_ON_ONCE(!t->mm || t->mm_cid != -1);
+ t->mm_cid_active = 1;
+}
+#endif
diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c
index 1207c78f85c1..5c840151f3bb 100644
--- a/kernel/sched/cpufreq_schedutil.c
+++ b/kernel/sched/cpufreq_schedutil.c
@@ -48,7 +48,6 @@ struct sugov_cpu {
unsigned long util;
unsigned long bw_dl;
- unsigned long max;
/* The field below is for single-CPU policies only: */
#ifdef CONFIG_NO_HZ_COMMON
@@ -158,7 +157,6 @@ static void sugov_get_util(struct sugov_cpu *sg_cpu)
{
struct rq *rq = cpu_rq(sg_cpu->cpu);
- sg_cpu->max = arch_scale_cpu_capacity(sg_cpu->cpu);
sg_cpu->bw_dl = cpu_bw_dl(rq);
sg_cpu->util = effective_cpu_util(sg_cpu->cpu, cpu_util_cfs(sg_cpu->cpu),
FREQUENCY_UTIL, NULL);
@@ -238,6 +236,7 @@ static void sugov_iowait_boost(struct sugov_cpu *sg_cpu, u64 time,
* sugov_iowait_apply() - Apply the IO boost to a CPU.
* @sg_cpu: the sugov data for the cpu to boost
* @time: the update time from the caller
+ * @max_cap: the max CPU capacity
*
* A CPU running a task which woken up after an IO operation can have its
* utilization boosted to speed up the completion of those IO operations.
@@ -251,7 +250,8 @@ static void sugov_iowait_boost(struct sugov_cpu *sg_cpu, u64 time,
* This mechanism is designed to boost high frequently IO waiting tasks, while
* being more conservative on tasks which does sporadic IO operations.
*/
-static void sugov_iowait_apply(struct sugov_cpu *sg_cpu, u64 time)
+static void sugov_iowait_apply(struct sugov_cpu *sg_cpu, u64 time,
+ unsigned long max_cap)
{
unsigned long boost;
@@ -280,7 +280,7 @@ static void sugov_iowait_apply(struct sugov_cpu *sg_cpu, u64 time)
* sg_cpu->util is already in capacity scale; convert iowait_boost
* into the same scale so we can compare.
*/
- boost = (sg_cpu->iowait_boost * sg_cpu->max) >> SCHED_CAPACITY_SHIFT;
+ boost = (sg_cpu->iowait_boost * max_cap) >> SCHED_CAPACITY_SHIFT;
boost = uclamp_rq_util_with(cpu_rq(sg_cpu->cpu), boost, NULL);
if (sg_cpu->util < boost)
sg_cpu->util = boost;
@@ -310,7 +310,8 @@ static inline void ignore_dl_rate_limit(struct sugov_cpu *sg_cpu)
}
static inline bool sugov_update_single_common(struct sugov_cpu *sg_cpu,
- u64 time, unsigned int flags)
+ u64 time, unsigned long max_cap,
+ unsigned int flags)
{
sugov_iowait_boost(sg_cpu, time, flags);
sg_cpu->last_update = time;
@@ -321,7 +322,7 @@ static inline bool sugov_update_single_common(struct sugov_cpu *sg_cpu,
return false;
sugov_get_util(sg_cpu);
- sugov_iowait_apply(sg_cpu, time);
+ sugov_iowait_apply(sg_cpu, time, max_cap);
return true;
}
@@ -332,12 +333,15 @@ static void sugov_update_single_freq(struct update_util_data *hook, u64 time,
struct sugov_cpu *sg_cpu = container_of(hook, struct sugov_cpu, update_util);
struct sugov_policy *sg_policy = sg_cpu->sg_policy;
unsigned int cached_freq = sg_policy->cached_raw_freq;
+ unsigned long max_cap;
unsigned int next_f;
- if (!sugov_update_single_common(sg_cpu, time, flags))
+ max_cap = arch_scale_cpu_capacity(sg_cpu->cpu);
+
+ if (!sugov_update_single_common(sg_cpu, time, max_cap, flags))
return;
- next_f = get_next_freq(sg_policy, sg_cpu->util, sg_cpu->max);
+ next_f = get_next_freq(sg_policy, sg_cpu->util, max_cap);
/*
* Do not reduce the frequency if the CPU has not been idle
* recently, as the reduction is likely to be premature then.
@@ -374,6 +378,7 @@ static void sugov_update_single_perf(struct update_util_data *hook, u64 time,
{
struct sugov_cpu *sg_cpu = container_of(hook, struct sugov_cpu, update_util);
unsigned long prev_util = sg_cpu->util;
+ unsigned long max_cap;
/*
* Fall back to the "frequency" path if frequency invariance is not
@@ -385,7 +390,9 @@ static void sugov_update_single_perf(struct update_util_data *hook, u64 time,
return;
}
- if (!sugov_update_single_common(sg_cpu, time, flags))
+ max_cap = arch_scale_cpu_capacity(sg_cpu->cpu);
+
+ if (!sugov_update_single_common(sg_cpu, time, max_cap, flags))
return;
/*
@@ -399,7 +406,7 @@ static void sugov_update_single_perf(struct update_util_data *hook, u64 time,
sg_cpu->util = prev_util;
cpufreq_driver_adjust_perf(sg_cpu->cpu, map_util_perf(sg_cpu->bw_dl),
- map_util_perf(sg_cpu->util), sg_cpu->max);
+ map_util_perf(sg_cpu->util), max_cap);
sg_cpu->sg_policy->last_freq_update_time = time;
}
@@ -408,25 +415,21 @@ static unsigned int sugov_next_freq_shared(struct sugov_cpu *sg_cpu, u64 time)
{
struct sugov_policy *sg_policy = sg_cpu->sg_policy;
struct cpufreq_policy *policy = sg_policy->policy;
- unsigned long util = 0, max = 1;
+ unsigned long util = 0, max_cap;
unsigned int j;
+ max_cap = arch_scale_cpu_capacity(sg_cpu->cpu);
+
for_each_cpu(j, policy->cpus) {
struct sugov_cpu *j_sg_cpu = &per_cpu(sugov_cpu, j);
- unsigned long j_util, j_max;
sugov_get_util(j_sg_cpu);
- sugov_iowait_apply(j_sg_cpu, time);
- j_util = j_sg_cpu->util;
- j_max = j_sg_cpu->max;
+ sugov_iowait_apply(j_sg_cpu, time, max_cap);
- if (j_util * max > j_max * util) {
- util = j_util;
- max = j_max;
- }
+ util = max(j_sg_cpu->util, util);
}
- return get_next_freq(sg_policy, util, max);
+ return get_next_freq(sg_policy, util, max_cap);
}
static void
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index 95fc77853743..af7952f12e6c 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -3,6 +3,10 @@
* Simple CPU accounting cgroup controller
*/
+#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
+ #include <asm/cputime.h>
+#endif
+
#ifdef CONFIG_IRQ_TIME_ACCOUNTING
/*
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index 9ae8f41e3372..71b24371a6f7 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -2485,8 +2485,7 @@ static void task_woken_dl(struct rq *rq, struct task_struct *p)
}
static void set_cpus_allowed_dl(struct task_struct *p,
- const struct cpumask *new_mask,
- u32 flags)
+ struct affinity_context *ctx)
{
struct root_domain *src_rd;
struct rq *rq;
@@ -2501,7 +2500,7 @@ static void set_cpus_allowed_dl(struct task_struct *p,
* update. We already made space for us in the destination
* domain (see cpuset_can_attach()).
*/
- if (!cpumask_intersects(src_rd->span, new_mask)) {
+ if (!cpumask_intersects(src_rd->span, ctx->new_mask)) {
struct dl_bw *src_dl_b;
src_dl_b = dl_bw_of(cpu_of(rq));
@@ -2515,7 +2514,7 @@ static void set_cpus_allowed_dl(struct task_struct *p,
raw_spin_unlock(&src_dl_b->lock);
}
- set_cpus_allowed_common(p, new_mask, flags);
+ set_cpus_allowed_common(p, ctx);
}
/* Assumes rq->lock is held */
@@ -2664,17 +2663,20 @@ static void switched_to_dl(struct rq *rq, struct task_struct *p)
static void prio_changed_dl(struct rq *rq, struct task_struct *p,
int oldprio)
{
- if (task_on_rq_queued(p) || task_current(rq, p)) {
+ if (!task_on_rq_queued(p))
+ return;
+
#ifdef CONFIG_SMP
- /*
- * This might be too much, but unfortunately
- * we don't have the old deadline value, and
- * we can't argue if the task is increasing
- * or lowering its prio, so...
- */
- if (!rq->dl.overloaded)
- deadline_queue_pull_task(rq);
+ /*
+ * This might be too much, but unfortunately
+ * we don't have the old deadline value, and
+ * we can't argue if the task is increasing
+ * or lowering its prio, so...
+ */
+ if (!rq->dl.overloaded)
+ deadline_queue_pull_task(rq);
+ if (task_current(rq, p)) {
/*
* If we now have a earlier deadline task than p,
* then reschedule, provided p is still on this
@@ -2682,15 +2684,24 @@ static void prio_changed_dl(struct rq *rq, struct task_struct *p,
*/
if (dl_time_before(rq->dl.earliest_dl.curr, p->dl.deadline))
resched_curr(rq);
-#else
+ } else {
/*
- * Again, we don't know if p has a earlier
- * or later deadline, so let's blindly set a
- * (maybe not needed) rescheduling point.
+ * Current may not be deadline in case p was throttled but we
+ * have just replenished it (e.g. rt_mutex_setprio()).
+ *
+ * Otherwise, if p was given an earlier deadline, reschedule.
*/
- resched_curr(rq);
-#endif /* CONFIG_SMP */
+ if (!dl_task(rq->curr) ||
+ dl_time_before(p->dl.deadline, rq->curr->dl.deadline))
+ resched_curr(rq);
}
+#else
+ /*
+ * We don't know if p has a earlier or later deadline, so let's blindly
+ * set a (maybe not needed) rescheduling point.
+ */
+ resched_curr(rq);
+#endif
}
DEFINE_SCHED_CLASS(dl) = {
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index e4a0b8bd941c..ff4dbbae3b10 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -178,6 +178,11 @@ int __weak arch_asym_cpu_priority(int cpu)
static unsigned int sysctl_sched_cfs_bandwidth_slice = 5000UL;
#endif
+#ifdef CONFIG_NUMA_BALANCING
+/* Restrict the NUMA promotion throughput (MB/s) for each target node. */
+static unsigned int sysctl_numa_balancing_promote_rate_limit = 65536;
+#endif
+
#ifdef CONFIG_SYSCTL
static struct ctl_table sched_fair_sysctls[] = {
{
@@ -197,6 +202,16 @@ static struct ctl_table sched_fair_sysctls[] = {
.extra1 = SYSCTL_ONE,
},
#endif
+#ifdef CONFIG_NUMA_BALANCING
+ {
+ .procname = "numa_balancing_promote_rate_limit_MBps",
+ .data = &sysctl_numa_balancing_promote_rate_limit,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = SYSCTL_ZERO,
+ },
+#endif /* CONFIG_NUMA_BALANCING */
{}
};
@@ -453,7 +468,7 @@ is_same_group(struct sched_entity *se, struct sched_entity *pse)
return NULL;
}
-static inline struct sched_entity *parent_entity(struct sched_entity *se)
+static inline struct sched_entity *parent_entity(const struct sched_entity *se)
{
return se->parent;
}
@@ -580,8 +595,8 @@ static inline u64 min_vruntime(u64 min_vruntime, u64 vruntime)
return min_vruntime;
}
-static inline bool entity_before(struct sched_entity *a,
- struct sched_entity *b)
+static inline bool entity_before(const struct sched_entity *a,
+ const struct sched_entity *b)
{
return (s64)(a->vruntime - b->vruntime) < 0;
}
@@ -1094,9 +1109,6 @@ unsigned int sysctl_numa_balancing_scan_delay = 1000;
/* The page with hint page fault latency < threshold in ms is considered hot */
unsigned int sysctl_numa_balancing_hot_threshold = MSEC_PER_SEC;
-/* Restrict the NUMA promotion throughput (MB/s) for each target node. */
-unsigned int sysctl_numa_balancing_promote_rate_limit = 65536;
-
struct numa_group {
refcount_t refcount;
@@ -1792,7 +1804,7 @@ static void update_numa_stats(struct task_numa_env *env,
ns->nr_running += rq->cfs.h_nr_running;
ns->compute_capacity += capacity_of(cpu);
- if (find_idle && !rq->nr_running && idle_cpu(cpu)) {
+ if (find_idle && idle_core < 0 && !rq->nr_running && idle_cpu(cpu)) {
if (READ_ONCE(rq->numa_migrate_on) ||
!cpumask_test_cpu(cpu, env->p->cpus_ptr))
continue;
@@ -1824,7 +1836,7 @@ static void task_numa_assign(struct task_numa_env *env,
int start = env->dst_cpu;
/* Find alternative idle CPU. */
- for_each_cpu_wrap(cpu, cpumask_of_node(env->dst_nid), start) {
+ for_each_cpu_wrap(cpu, cpumask_of_node(env->dst_nid), start + 1) {
if (cpu == env->best_cpu || !idle_cpu(cpu) ||
!cpumask_test_cpu(cpu, env->p->cpus_ptr)) {
continue;
@@ -2964,7 +2976,7 @@ static void task_numa_work(struct callback_head *work)
}
next_scan = now + msecs_to_jiffies(p->numa_scan_period);
- if (cmpxchg(&mm->numa_next_scan, migrate, next_scan) != migrate)
+ if (!try_cmpxchg(&mm->numa_next_scan, &migrate, next_scan))
return;
/*
@@ -4280,14 +4292,16 @@ static inline unsigned long task_util_est(struct task_struct *p)
}
#ifdef CONFIG_UCLAMP_TASK
-static inline unsigned long uclamp_task_util(struct task_struct *p)
+static inline unsigned long uclamp_task_util(struct task_struct *p,
+ unsigned long uclamp_min,
+ unsigned long uclamp_max)
{
- return clamp(task_util_est(p),
- uclamp_eff_value(p, UCLAMP_MIN),
- uclamp_eff_value(p, UCLAMP_MAX));
+ return clamp(task_util_est(p), uclamp_min, uclamp_max);
}
#else
-static inline unsigned long uclamp_task_util(struct task_struct *p)
+static inline unsigned long uclamp_task_util(struct task_struct *p,
+ unsigned long uclamp_min,
+ unsigned long uclamp_max)
{
return task_util_est(p);
}
@@ -4426,10 +4440,135 @@ done:
trace_sched_util_est_se_tp(&p->se);
}
-static inline int task_fits_capacity(struct task_struct *p,
- unsigned long capacity)
+static inline int util_fits_cpu(unsigned long util,
+ unsigned long uclamp_min,
+ unsigned long uclamp_max,
+ int cpu)
{
- return fits_capacity(uclamp_task_util(p), capacity);
+ unsigned long capacity_orig, capacity_orig_thermal;
+ unsigned long capacity = capacity_of(cpu);
+ bool fits, uclamp_max_fits;
+
+ /*
+ * Check if the real util fits without any uclamp boost/cap applied.
+ */
+ fits = fits_capacity(util, capacity);
+
+ if (!uclamp_is_used())
+ return fits;
+
+ /*
+ * We must use capacity_orig_of() for comparing against uclamp_min and
+ * uclamp_max. We only care about capacity pressure (by using
+ * capacity_of()) for comparing against the real util.
+ *
+ * If a task is boosted to 1024 for example, we don't want a tiny
+ * pressure to skew the check whether it fits a CPU or not.
+ *
+ * Similarly if a task is capped to capacity_orig_of(little_cpu), it
+ * should fit a little cpu even if there's some pressure.
+ *
+ * Only exception is for thermal pressure since it has a direct impact
+ * on available OPP of the system.
+ *
+ * We honour it for uclamp_min only as a drop in performance level
+ * could result in not getting the requested minimum performance level.
+ *
+ * For uclamp_max, we can tolerate a drop in performance level as the
+ * goal is to cap the task. So it's okay if it's getting less.
+ */
+ capacity_orig = capacity_orig_of(cpu);
+ capacity_orig_thermal = capacity_orig - arch_scale_thermal_pressure(cpu);
+
+ /*
+ * We want to force a task to fit a cpu as implied by uclamp_max.
+ * But we do have some corner cases to cater for..
+ *
+ *
+ * C=z
+ * | ___
+ * | C=y | |
+ * |_ _ _ _ _ _ _ _ _ ___ _ _ _ | _ | _ _ _ _ _ uclamp_max
+ * | C=x | | | |
+ * | ___ | | | |
+ * | | | | | | | (util somewhere in this region)
+ * | | | | | | |
+ * | | | | | | |
+ * +----------------------------------------
+ * cpu0 cpu1 cpu2
+ *
+ * In the above example if a task is capped to a specific performance
+ * point, y, then when:
+ *
+ * * util = 80% of x then it does not fit on cpu0 and should migrate
+ * to cpu1
+ * * util = 80% of y then it is forced to fit on cpu1 to honour
+ * uclamp_max request.
+ *
+ * which is what we're enforcing here. A task always fits if
+ * uclamp_max <= capacity_orig. But when uclamp_max > capacity_orig,
+ * the normal upmigration rules should withhold still.
+ *
+ * Only exception is when we are on max capacity, then we need to be
+ * careful not to block overutilized state. This is so because:
+ *
+ * 1. There's no concept of capping at max_capacity! We can't go
+ * beyond this performance level anyway.
+ * 2. The system is being saturated when we're operating near
+ * max capacity, it doesn't make sense to block overutilized.
+ */
+ uclamp_max_fits = (capacity_orig == SCHED_CAPACITY_SCALE) && (uclamp_max == SCHED_CAPACITY_SCALE);
+ uclamp_max_fits = !uclamp_max_fits && (uclamp_max <= capacity_orig);
+ fits = fits || uclamp_max_fits;
+
+ /*
+ *
+ * C=z
+ * | ___ (region a, capped, util >= uclamp_max)
+ * | C=y | |
+ * |_ _ _ _ _ _ _ _ _ ___ _ _ _ | _ | _ _ _ _ _ uclamp_max
+ * | C=x | | | |
+ * | ___ | | | | (region b, uclamp_min <= util <= uclamp_max)
+ * |_ _ _|_ _|_ _ _ _| _ | _ _ _| _ | _ _ _ _ _ uclamp_min
+ * | | | | | | |
+ * | | | | | | | (region c, boosted, util < uclamp_min)
+ * +----------------------------------------
+ * cpu0 cpu1 cpu2
+ *
+ * a) If util > uclamp_max, then we're capped, we don't care about
+ * actual fitness value here. We only care if uclamp_max fits
+ * capacity without taking margin/pressure into account.
+ * See comment above.
+ *
+ * b) If uclamp_min <= util <= uclamp_max, then the normal
+ * fits_capacity() rules apply. Except we need to ensure that we
+ * enforce we remain within uclamp_max, see comment above.
+ *
+ * c) If util < uclamp_min, then we are boosted. Same as (b) but we
+ * need to take into account the boosted value fits the CPU without
+ * taking margin/pressure into account.
+ *
+ * Cases (a) and (b) are handled in the 'fits' variable already. We
+ * just need to consider an extra check for case (c) after ensuring we
+ * handle the case uclamp_min > uclamp_max.
+ */
+ uclamp_min = min(uclamp_min, uclamp_max);
+ if (fits && (util < uclamp_min) && (uclamp_min > capacity_orig_thermal))
+ return -1;
+
+ return fits;
+}
+
+static inline int task_fits_cpu(struct task_struct *p, int cpu)
+{
+ unsigned long uclamp_min = uclamp_eff_value(p, UCLAMP_MIN);
+ unsigned long uclamp_max = uclamp_eff_value(p, UCLAMP_MAX);
+ unsigned long util = task_util_est(p);
+ /*
+ * Return true only if the cpu fully fits the task requirements, which
+ * include the utilization but also the performance hints.
+ */
+ return (util_fits_cpu(util, uclamp_min, uclamp_max, cpu) > 0);
}
static inline void update_misfit_status(struct task_struct *p, struct rq *rq)
@@ -4442,7 +4581,7 @@ static inline void update_misfit_status(struct task_struct *p, struct rq *rq)
return;
}
- if (task_fits_capacity(p, capacity_of(cpu_of(rq)))) {
+ if (task_fits_cpu(p, cpu_of(rq))) {
rq->misfit_task_load = 0;
return;
}
@@ -4513,6 +4652,7 @@ static void
place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
{
u64 vruntime = cfs_rq->min_vruntime;
+ u64 sleep_time;
/*
* The 'current' period is already promised to the current tasks,
@@ -4542,8 +4682,18 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
vruntime -= thresh;
}
- /* ensure we never gain time by being placed backwards. */
- se->vruntime = max_vruntime(se->vruntime, vruntime);
+ /*
+ * Pull vruntime of the entity being placed to the base level of
+ * cfs_rq, to prevent boosting it if placed backwards. If the entity
+ * slept for a long time, don't even try to compare its vruntime with
+ * the base as it may be too far off and the comparison may get
+ * inversed due to s64 overflow.
+ */
+ sleep_time = rq_clock_task(rq_of(cfs_rq)) - se->exec_start;
+ if ((s64)sleep_time > 60LL * NSEC_PER_SEC)
+ se->vruntime = vruntime;
+ else
+ se->vruntime = max_vruntime(se->vruntime, vruntime);
}
static void check_enqueue_throttle(struct cfs_rq *cfs_rq);
@@ -4753,7 +4903,13 @@ check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
struct sched_entity *se;
s64 delta;
- ideal_runtime = sched_slice(cfs_rq, curr);
+ /*
+ * When many tasks blow up the sched_period; it is possible that
+ * sched_slice() reports unusually large results (when many tasks are
+ * very light for example). Therefore impose a maximum.
+ */
+ ideal_runtime = min_t(u64, sched_slice(cfs_rq, curr), sysctl_sched_latency);
+
delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime;
if (delta_exec > ideal_runtime) {
resched_curr(rq_of(cfs_rq));
@@ -5318,22 +5474,105 @@ unthrottle_throttle:
resched_curr(rq);
}
-static void distribute_cfs_runtime(struct cfs_bandwidth *cfs_b)
+#ifdef CONFIG_SMP
+static void __cfsb_csd_unthrottle(void *arg)
{
- struct cfs_rq *cfs_rq;
+ struct cfs_rq *cursor, *tmp;
+ struct rq *rq = arg;
+ struct rq_flags rf;
+
+ rq_lock(rq, &rf);
+
+ /*
+ * Since we hold rq lock we're safe from concurrent manipulation of
+ * the CSD list. However, this RCU critical section annotates the
+ * fact that we pair with sched_free_group_rcu(), so that we cannot
+ * race with group being freed in the window between removing it
+ * from the list and advancing to the next entry in the list.
+ */
+ rcu_read_lock();
+
+ list_for_each_entry_safe(cursor, tmp, &rq->cfsb_csd_list,
+ throttled_csd_list) {
+ list_del_init(&cursor->throttled_csd_list);
+
+ if (cfs_rq_throttled(cursor))
+ unthrottle_cfs_rq(cursor);
+ }
+
+ rcu_read_unlock();
+
+ rq_unlock(rq, &rf);
+}
+
+static inline void __unthrottle_cfs_rq_async(struct cfs_rq *cfs_rq)
+{
+ struct rq *rq = rq_of(cfs_rq);
+ bool first;
+
+ if (rq == this_rq()) {
+ unthrottle_cfs_rq(cfs_rq);
+ return;
+ }
+
+ /* Already enqueued */
+ if (SCHED_WARN_ON(!list_empty(&cfs_rq->throttled_csd_list)))
+ return;
+
+ first = list_empty(&rq->cfsb_csd_list);
+ list_add_tail(&cfs_rq->throttled_csd_list, &rq->cfsb_csd_list);
+ if (first)
+ smp_call_function_single_async(cpu_of(rq), &rq->cfsb_csd);
+}
+#else
+static inline void __unthrottle_cfs_rq_async(struct cfs_rq *cfs_rq)
+{
+ unthrottle_cfs_rq(cfs_rq);
+}
+#endif
+
+static void unthrottle_cfs_rq_async(struct cfs_rq *cfs_rq)
+{
+ lockdep_assert_rq_held(rq_of(cfs_rq));
+
+ if (SCHED_WARN_ON(!cfs_rq_throttled(cfs_rq) ||
+ cfs_rq->runtime_remaining <= 0))
+ return;
+
+ __unthrottle_cfs_rq_async(cfs_rq);
+}
+
+static bool distribute_cfs_runtime(struct cfs_bandwidth *cfs_b)
+{
+ struct cfs_rq *local_unthrottle = NULL;
+ int this_cpu = smp_processor_id();
u64 runtime, remaining = 1;
+ bool throttled = false;
+ struct cfs_rq *cfs_rq;
+ struct rq_flags rf;
+ struct rq *rq;
rcu_read_lock();
list_for_each_entry_rcu(cfs_rq, &cfs_b->throttled_cfs_rq,
throttled_list) {
- struct rq *rq = rq_of(cfs_rq);
- struct rq_flags rf;
+ rq = rq_of(cfs_rq);
+
+ if (!remaining) {
+ throttled = true;
+ break;
+ }
rq_lock_irqsave(rq, &rf);
if (!cfs_rq_throttled(cfs_rq))
goto next;
- /* By the above check, this should never be true */
+#ifdef CONFIG_SMP
+ /* Already queued for async unthrottle */
+ if (!list_empty(&cfs_rq->throttled_csd_list))
+ goto next;
+#endif
+
+ /* By the above checks, this should never be true */
SCHED_WARN_ON(cfs_rq->runtime_remaining > 0);
raw_spin_lock(&cfs_b->lock);
@@ -5347,16 +5586,30 @@ static void distribute_cfs_runtime(struct cfs_bandwidth *cfs_b)
cfs_rq->runtime_remaining += runtime;
/* we check whether we're throttled above */
- if (cfs_rq->runtime_remaining > 0)
- unthrottle_cfs_rq(cfs_rq);
+ if (cfs_rq->runtime_remaining > 0) {
+ if (cpu_of(rq) != this_cpu ||
+ SCHED_WARN_ON(local_unthrottle))
+ unthrottle_cfs_rq_async(cfs_rq);
+ else
+ local_unthrottle = cfs_rq;
+ } else {
+ throttled = true;
+ }
next:
rq_unlock_irqrestore(rq, &rf);
-
- if (!remaining)
- break;
}
rcu_read_unlock();
+
+ if (local_unthrottle) {
+ rq = cpu_rq(this_cpu);
+ rq_lock_irqsave(rq, &rf);
+ if (cfs_rq_throttled(local_unthrottle))
+ unthrottle_cfs_rq(local_unthrottle);
+ rq_unlock_irqrestore(rq, &rf);
+ }
+
+ return throttled;
}
/*
@@ -5401,10 +5654,8 @@ static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun, u
while (throttled && cfs_b->runtime > 0) {
raw_spin_unlock_irqrestore(&cfs_b->lock, flags);
/* we can't nest cfs_b->lock while distributing bandwidth */
- distribute_cfs_runtime(cfs_b);
+ throttled = distribute_cfs_runtime(cfs_b);
raw_spin_lock_irqsave(&cfs_b->lock, flags);
-
- throttled = !list_empty(&cfs_b->throttled_cfs_rq);
}
/*
@@ -5681,6 +5932,9 @@ static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq)
{
cfs_rq->runtime_enabled = 0;
INIT_LIST_HEAD(&cfs_rq->throttled_list);
+#ifdef CONFIG_SMP
+ INIT_LIST_HEAD(&cfs_rq->throttled_csd_list);
+#endif
}
void start_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
@@ -5697,12 +5951,38 @@ void start_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
{
+ int __maybe_unused i;
+
/* init_cfs_bandwidth() was not called */
if (!cfs_b->throttled_cfs_rq.next)
return;
hrtimer_cancel(&cfs_b->period_timer);
hrtimer_cancel(&cfs_b->slack_timer);
+
+ /*
+ * It is possible that we still have some cfs_rq's pending on a CSD
+ * list, though this race is very rare. In order for this to occur, we
+ * must have raced with the last task leaving the group while there
+ * exist throttled cfs_rq(s), and the period_timer must have queued the
+ * CSD item but the remote cpu has not yet processed it. To handle this,
+ * we can simply flush all pending CSD work inline here. We're
+ * guaranteed at this point that no additional cfs_rq of this group can
+ * join a CSD list.
+ */
+#ifdef CONFIG_SMP
+ for_each_possible_cpu(i) {
+ struct rq *rq = cpu_rq(i);
+ unsigned long flags;
+
+ if (list_empty(&rq->cfsb_csd_list))
+ continue;
+
+ local_irq_save(flags);
+ __cfsb_csd_unthrottle(rq);
+ local_irq_restore(flags);
+ }
+#endif
}
/*
@@ -5862,7 +6142,11 @@ static inline void hrtick_update(struct rq *rq)
#ifdef CONFIG_SMP
static inline bool cpu_overutilized(int cpu)
{
- return !fits_capacity(cpu_util_cfs(cpu), capacity_of(cpu));
+ unsigned long rq_util_min = uclamp_rq_get(cpu_rq(cpu), UCLAMP_MIN);
+ unsigned long rq_util_max = uclamp_rq_get(cpu_rq(cpu), UCLAMP_MAX);
+
+ /* Return true only if the utilization doesn't fit CPU's capacity */
+ return !util_fits_cpu(cpu_util_cfs(cpu), rq_util_min, rq_util_max, cpu);
}
static inline void update_overutilized_status(struct rq *rq)
@@ -6654,36 +6938,62 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool
static int
select_idle_capacity(struct task_struct *p, struct sched_domain *sd, int target)
{
- unsigned long task_util, best_cap = 0;
+ unsigned long task_util, util_min, util_max, best_cap = 0;
+ int fits, best_fits = 0;
int cpu, best_cpu = -1;
struct cpumask *cpus;
cpus = this_cpu_cpumask_var_ptr(select_rq_mask);
cpumask_and(cpus, sched_domain_span(sd), p->cpus_ptr);
- task_util = uclamp_task_util(p);
+ task_util = task_util_est(p);
+ util_min = uclamp_eff_value(p, UCLAMP_MIN);
+ util_max = uclamp_eff_value(p, UCLAMP_MAX);
- for_each_cpu_wrap(cpu, cpus, target) {
+ for_each_cpu_wrap(cpu, cpus, target + 1) {
unsigned long cpu_cap = capacity_of(cpu);
if (!available_idle_cpu(cpu) && !sched_idle_cpu(cpu))
continue;
- if (fits_capacity(task_util, cpu_cap))
+
+ fits = util_fits_cpu(task_util, util_min, util_max, cpu);
+
+ /* This CPU fits with all requirements */
+ if (fits > 0)
return cpu;
+ /*
+ * Only the min performance hint (i.e. uclamp_min) doesn't fit.
+ * Look for the CPU with best capacity.
+ */
+ else if (fits < 0)
+ cpu_cap = capacity_orig_of(cpu) - thermal_load_avg(cpu_rq(cpu));
- if (cpu_cap > best_cap) {
+ /*
+ * First, select CPU which fits better (-1 being better than 0).
+ * Then, select the one with best capacity at same level.
+ */
+ if ((fits < best_fits) ||
+ ((fits == best_fits) && (cpu_cap > best_cap))) {
best_cap = cpu_cap;
best_cpu = cpu;
+ best_fits = fits;
}
}
return best_cpu;
}
-static inline bool asym_fits_capacity(unsigned long task_util, int cpu)
+static inline bool asym_fits_cpu(unsigned long util,
+ unsigned long util_min,
+ unsigned long util_max,
+ int cpu)
{
if (sched_asym_cpucap_active())
- return fits_capacity(task_util, capacity_of(cpu));
+ /*
+ * Return true only if the cpu fully fits the task requirements
+ * which include the utilization and the performance hints.
+ */
+ return (util_fits_cpu(util, util_min, util_max, cpu) > 0);
return true;
}
@@ -6695,7 +7005,7 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
{
bool has_idle_core = false;
struct sched_domain *sd;
- unsigned long task_util;
+ unsigned long task_util, util_min, util_max;
int i, recent_used_cpu;
/*
@@ -6704,7 +7014,9 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
*/
if (sched_asym_cpucap_active()) {
sync_entity_load_avg(&p->se);
- task_util = uclamp_task_util(p);
+ task_util = task_util_est(p);
+ util_min = uclamp_eff_value(p, UCLAMP_MIN);
+ util_max = uclamp_eff_value(p, UCLAMP_MAX);
}
/*
@@ -6713,7 +7025,7 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
lockdep_assert_irqs_disabled();
if ((available_idle_cpu(target) || sched_idle_cpu(target)) &&
- asym_fits_capacity(task_util, target))
+ asym_fits_cpu(task_util, util_min, util_max, target))
return target;
/*
@@ -6721,7 +7033,7 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
*/
if (prev != target && cpus_share_cache(prev, target) &&
(available_idle_cpu(prev) || sched_idle_cpu(prev)) &&
- asym_fits_capacity(task_util, prev))
+ asym_fits_cpu(task_util, util_min, util_max, prev))
return prev;
/*
@@ -6736,7 +7048,7 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
in_task() &&
prev == smp_processor_id() &&
this_rq()->nr_running <= 1 &&
- asym_fits_capacity(task_util, prev)) {
+ asym_fits_cpu(task_util, util_min, util_max, prev)) {
return prev;
}
@@ -6748,7 +7060,7 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
cpus_share_cache(recent_used_cpu, target) &&
(available_idle_cpu(recent_used_cpu) || sched_idle_cpu(recent_used_cpu)) &&
cpumask_test_cpu(p->recent_used_cpu, p->cpus_ptr) &&
- asym_fits_capacity(task_util, recent_used_cpu)) {
+ asym_fits_cpu(task_util, util_min, util_max, recent_used_cpu)) {
return recent_used_cpu;
}
@@ -7044,8 +7356,13 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
{
struct cpumask *cpus = this_cpu_cpumask_var_ptr(select_rq_mask);
unsigned long prev_delta = ULONG_MAX, best_delta = ULONG_MAX;
+ unsigned long p_util_min = uclamp_is_used() ? uclamp_eff_value(p, UCLAMP_MIN) : 0;
+ unsigned long p_util_max = uclamp_is_used() ? uclamp_eff_value(p, UCLAMP_MAX) : 1024;
struct root_domain *rd = this_rq()->rd;
int cpu, best_energy_cpu, target = -1;
+ int prev_fits = -1, best_fits = -1;
+ unsigned long best_thermal_cap = 0;
+ unsigned long prev_thermal_cap = 0;
struct sched_domain *sd;
struct perf_domain *pd;
struct energy_env eenv;
@@ -7068,17 +7385,20 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
target = prev_cpu;
sync_entity_load_avg(&p->se);
- if (!task_util_est(p))
+ if (!uclamp_task_util(p, p_util_min, p_util_max))
goto unlock;
eenv_task_busy_time(&eenv, p, prev_cpu);
for (; pd; pd = pd->next) {
+ unsigned long util_min = p_util_min, util_max = p_util_max;
unsigned long cpu_cap, cpu_thermal_cap, util;
unsigned long cur_delta, max_spare_cap = 0;
- bool compute_prev_delta = false;
+ unsigned long rq_util_min, rq_util_max;
+ unsigned long prev_spare_cap = 0;
int max_spare_cap_cpu = -1;
unsigned long base_energy;
+ int fits, max_fits = -1;
cpumask_and(cpus, perf_domain_span(pd), cpu_online_mask);
@@ -7094,6 +7414,8 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
eenv.pd_cap = 0;
for_each_cpu(cpu, cpus) {
+ struct rq *rq = cpu_rq(cpu);
+
eenv.pd_cap += cpu_thermal_cap;
if (!cpumask_test_cpu(cpu, sched_domain_span(sd)))
@@ -7112,26 +7434,45 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
* much capacity we can get out of the CPU; this is
* aligned with sched_cpu_util().
*/
- util = uclamp_rq_util_with(cpu_rq(cpu), util, p);
- if (!fits_capacity(util, cpu_cap))
+ if (uclamp_is_used() && !uclamp_rq_is_idle(rq)) {
+ /*
+ * Open code uclamp_rq_util_with() except for
+ * the clamp() part. Ie: apply max aggregation
+ * only. util_fits_cpu() logic requires to
+ * operate on non clamped util but must use the
+ * max-aggregated uclamp_{min, max}.
+ */
+ rq_util_min = uclamp_rq_get(rq, UCLAMP_MIN);
+ rq_util_max = uclamp_rq_get(rq, UCLAMP_MAX);
+
+ util_min = max(rq_util_min, p_util_min);
+ util_max = max(rq_util_max, p_util_max);
+ }
+
+ fits = util_fits_cpu(util, util_min, util_max, cpu);
+ if (!fits)
continue;
lsub_positive(&cpu_cap, util);
if (cpu == prev_cpu) {
/* Always use prev_cpu as a candidate. */
- compute_prev_delta = true;
- } else if (cpu_cap > max_spare_cap) {
+ prev_spare_cap = cpu_cap;
+ prev_fits = fits;
+ } else if ((fits > max_fits) ||
+ ((fits == max_fits) && (cpu_cap > max_spare_cap))) {
/*
* Find the CPU with the maximum spare capacity
- * in the performance domain.
+ * among the remaining CPUs in the performance
+ * domain.
*/
max_spare_cap = cpu_cap;
max_spare_cap_cpu = cpu;
+ max_fits = fits;
}
}
- if (max_spare_cap_cpu < 0 && !compute_prev_delta)
+ if (max_spare_cap_cpu < 0 && prev_spare_cap == 0)
continue;
eenv_pd_busy_time(&eenv, cpus, p);
@@ -7139,33 +7480,57 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
base_energy = compute_energy(&eenv, pd, cpus, p, -1);
/* Evaluate the energy impact of using prev_cpu. */
- if (compute_prev_delta) {
+ if (prev_spare_cap > 0) {
prev_delta = compute_energy(&eenv, pd, cpus, p,
prev_cpu);
/* CPU utilization has changed */
if (prev_delta < base_energy)
goto unlock;
prev_delta -= base_energy;
+ prev_thermal_cap = cpu_thermal_cap;
best_delta = min(best_delta, prev_delta);
}
/* Evaluate the energy impact of using max_spare_cap_cpu. */
- if (max_spare_cap_cpu >= 0) {
+ if (max_spare_cap_cpu >= 0 && max_spare_cap > prev_spare_cap) {
+ /* Current best energy cpu fits better */
+ if (max_fits < best_fits)
+ continue;
+
+ /*
+ * Both don't fit performance hint (i.e. uclamp_min)
+ * but best energy cpu has better capacity.
+ */
+ if ((max_fits < 0) &&
+ (cpu_thermal_cap <= best_thermal_cap))
+ continue;
+
cur_delta = compute_energy(&eenv, pd, cpus, p,
max_spare_cap_cpu);
/* CPU utilization has changed */
if (cur_delta < base_energy)
goto unlock;
cur_delta -= base_energy;
- if (cur_delta < best_delta) {
- best_delta = cur_delta;
- best_energy_cpu = max_spare_cap_cpu;
- }
+
+ /*
+ * Both fit for the task but best energy cpu has lower
+ * energy impact.
+ */
+ if ((max_fits > 0) && (best_fits > 0) &&
+ (cur_delta >= best_delta))
+ continue;
+
+ best_delta = cur_delta;
+ best_energy_cpu = max_spare_cap_cpu;
+ best_fits = max_fits;
+ best_thermal_cap = cpu_thermal_cap;
}
}
rcu_read_unlock();
- if (best_delta < prev_delta)
+ if ((best_fits > prev_fits) ||
+ ((best_fits > 0) && (best_delta < prev_delta)) ||
+ ((best_fits < 0) && (best_thermal_cap > prev_thermal_cap)))
target = best_energy_cpu;
return target;
@@ -8276,7 +8641,7 @@ static int detach_tasks(struct lb_env *env)
case migrate_misfit:
/* This is not a misfit task */
- if (task_fits_capacity(p, capacity_of(env->src_cpu)))
+ if (task_fits_cpu(p, env->src_cpu))
goto next;
env->imbalance = 0;
@@ -9281,6 +9646,10 @@ static inline void update_sg_wakeup_stats(struct sched_domain *sd,
memset(sgs, 0, sizeof(*sgs));
+ /* Assume that task can't fit any CPU of the group */
+ if (sd->flags & SD_ASYM_CPUCAPACITY)
+ sgs->group_misfit_task_load = 1;
+
for_each_cpu(i, sched_group_span(group)) {
struct rq *rq = cpu_rq(i);
unsigned int local;
@@ -9300,12 +9669,12 @@ static inline void update_sg_wakeup_stats(struct sched_domain *sd,
if (!nr_running && idle_cpu_without(i, p))
sgs->idle_cpus++;
- }
+ /* Check if task fits in the CPU */
+ if (sd->flags & SD_ASYM_CPUCAPACITY &&
+ sgs->group_misfit_task_load &&
+ task_fits_cpu(p, i))
+ sgs->group_misfit_task_load = 0;
- /* Check if task fits in the group */
- if (sd->flags & SD_ASYM_CPUCAPACITY &&
- !task_fits_capacity(p, group->sgc->max_capacity)) {
- sgs->group_misfit_task_load = 1;
}
sgs->group_capacity = group->sgc->capacity;
@@ -9898,24 +10267,23 @@ static struct sched_group *find_busiest_group(struct lb_env *env)
*/
update_sd_lb_stats(env, &sds);
- if (sched_energy_enabled()) {
- struct root_domain *rd = env->dst_rq->rd;
-
- if (rcu_dereference(rd->pd) && !READ_ONCE(rd->overutilized))
- goto out_balanced;
- }
-
- local = &sds.local_stat;
- busiest = &sds.busiest_stat;
-
/* There is no busy sibling group to pull tasks from */
if (!sds.busiest)
goto out_balanced;
+ busiest = &sds.busiest_stat;
+
/* Misfit tasks should be dealt with regardless of the avg load */
if (busiest->group_type == group_misfit_task)
goto force_balance;
+ if (sched_energy_enabled()) {
+ struct root_domain *rd = env->dst_rq->rd;
+
+ if (rcu_dereference(rd->pd) && !READ_ONCE(rd->overutilized))
+ goto out_balanced;
+ }
+
/* ASYM feature bypasses nice load balance check */
if (busiest->group_type == group_asym_packing)
goto force_balance;
@@ -9928,6 +10296,7 @@ static struct sched_group *find_busiest_group(struct lb_env *env)
if (busiest->group_type == group_imbalanced)
goto force_balance;
+ local = &sds.local_stat;
/*
* If the local group is busier than the selected busiest group
* don't try and pull any tasks.
@@ -11491,7 +11860,8 @@ static inline void task_tick_core(struct rq *rq, struct task_struct *curr)
/*
* se_fi_update - Update the cfs_rq->min_vruntime_fi in a CFS hierarchy if needed.
*/
-static void se_fi_update(struct sched_entity *se, unsigned int fi_seq, bool forceidle)
+static void se_fi_update(const struct sched_entity *se, unsigned int fi_seq,
+ bool forceidle)
{
for_each_sched_entity(se) {
struct cfs_rq *cfs_rq = cfs_rq_of(se);
@@ -11516,11 +11886,12 @@ void task_vruntime_update(struct rq *rq, struct task_struct *p, bool in_fi)
se_fi_update(se, rq->core->core_forceidle_seq, in_fi);
}
-bool cfs_prio_less(struct task_struct *a, struct task_struct *b, bool in_fi)
+bool cfs_prio_less(const struct task_struct *a, const struct task_struct *b,
+ bool in_fi)
{
struct rq *rq = task_rq(a);
- struct sched_entity *sea = &a->se;
- struct sched_entity *seb = &b->se;
+ const struct sched_entity *sea = &a->se;
+ const struct sched_entity *seb = &b->se;
struct cfs_rq *cfs_rqa;
struct cfs_rq *cfs_rqb;
s64 delta;
@@ -12237,6 +12608,11 @@ __init void init_sched_fair_class(void)
for_each_possible_cpu(i) {
zalloc_cpumask_var_node(&per_cpu(load_balance_mask, i), GFP_KERNEL, cpu_to_node(i));
zalloc_cpumask_var_node(&per_cpu(select_rq_mask, i), GFP_KERNEL, cpu_to_node(i));
+
+#ifdef CONFIG_CFS_BANDWIDTH
+ INIT_CSD(&cpu_rq(i)->cfsb_csd, __cfsb_csd_unthrottle, cpu_rq(i));
+ INIT_LIST_HEAD(&cpu_rq(i)->cfsb_csd_list);
+#endif
}
open_softirq(SCHED_SOFTIRQ, run_rebalance_domains);
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
index f26ab2675f7d..e9ef66be2870 100644
--- a/kernel/sched/idle.c
+++ b/kernel/sched/idle.c
@@ -51,18 +51,22 @@ __setup("hlt", cpu_idle_nopoll_setup);
static noinline int __cpuidle cpu_idle_poll(void)
{
+ instrumentation_begin();
trace_cpu_idle(0, smp_processor_id());
stop_critical_timings();
- ct_idle_enter();
- local_irq_enable();
+ ct_cpuidle_enter();
+ raw_local_irq_enable();
while (!tif_need_resched() &&
(cpu_idle_force_poll || tick_check_broadcast_expired()))
cpu_relax();
+ raw_local_irq_disable();
- ct_idle_exit();
+ ct_cpuidle_exit();
start_critical_timings();
trace_cpu_idle(PWR_EVENT_EXIT, smp_processor_id());
+ local_irq_enable();
+ instrumentation_end();
return 1;
}
@@ -75,7 +79,6 @@ void __weak arch_cpu_idle_dead(void) { }
void __weak arch_cpu_idle(void)
{
cpu_idle_force_poll = 1;
- raw_local_irq_enable();
}
/**
@@ -85,44 +88,20 @@ void __weak arch_cpu_idle(void)
*/
void __cpuidle default_idle_call(void)
{
- if (current_clr_polling_and_test()) {
- local_irq_enable();
- } else {
-
+ instrumentation_begin();
+ if (!current_clr_polling_and_test()) {
trace_cpu_idle(1, smp_processor_id());
stop_critical_timings();
- /*
- * arch_cpu_idle() is supposed to enable IRQs, however
- * we can't do that because of RCU and tracing.
- *
- * Trace IRQs enable here, then switch off RCU, and have
- * arch_cpu_idle() use raw_local_irq_enable(). Note that
- * ct_idle_enter() relies on lockdep IRQ state, so switch that
- * last -- this is very similar to the entry code.
- */
- trace_hardirqs_on_prepare();
- lockdep_hardirqs_on_prepare();
- ct_idle_enter();
- lockdep_hardirqs_on(_THIS_IP_);
-
+ ct_cpuidle_enter();
arch_cpu_idle();
-
- /*
- * OK, so IRQs are enabled here, but RCU needs them disabled to
- * turn itself back on.. funny thing is that disabling IRQs
- * will cause tracing, which needs RCU. Jump through hoops to
- * make it 'work'.
- */
- raw_local_irq_disable();
- lockdep_hardirqs_off(_THIS_IP_);
- ct_idle_exit();
- lockdep_hardirqs_on(_THIS_IP_);
- raw_local_irq_enable();
+ ct_cpuidle_exit();
start_critical_timings();
trace_cpu_idle(PWR_EVENT_EXIT, smp_processor_id());
}
+ local_irq_enable();
+ instrumentation_end();
}
static int call_cpuidle_s2idle(struct cpuidle_driver *drv,
diff --git a/kernel/sched/membarrier.c b/kernel/sched/membarrier.c
index 0c5be7ebb1dc..2ad881d07752 100644
--- a/kernel/sched/membarrier.c
+++ b/kernel/sched/membarrier.c
@@ -159,7 +159,8 @@
| MEMBARRIER_CMD_PRIVATE_EXPEDITED \
| MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED \
| MEMBARRIER_PRIVATE_EXPEDITED_SYNC_CORE_BITMASK \
- | MEMBARRIER_PRIVATE_EXPEDITED_RSEQ_BITMASK)
+ | MEMBARRIER_PRIVATE_EXPEDITED_RSEQ_BITMASK \
+ | MEMBARRIER_CMD_GET_REGISTRATIONS)
static void ipi_mb(void *info)
{
@@ -540,6 +541,40 @@ static int membarrier_register_private_expedited(int flags)
return 0;
}
+static int membarrier_get_registrations(void)
+{
+ struct task_struct *p = current;
+ struct mm_struct *mm = p->mm;
+ int registrations_mask = 0, membarrier_state, i;
+ static const int states[] = {
+ MEMBARRIER_STATE_GLOBAL_EXPEDITED |
+ MEMBARRIER_STATE_GLOBAL_EXPEDITED_READY,
+ MEMBARRIER_STATE_PRIVATE_EXPEDITED |
+ MEMBARRIER_STATE_PRIVATE_EXPEDITED_READY,
+ MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE |
+ MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE_READY,
+ MEMBARRIER_STATE_PRIVATE_EXPEDITED_RSEQ |
+ MEMBARRIER_STATE_PRIVATE_EXPEDITED_RSEQ_READY
+ };
+ static const int registration_cmds[] = {
+ MEMBARRIER_CMD_REGISTER_GLOBAL_EXPEDITED,
+ MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED,
+ MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_SYNC_CORE,
+ MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_RSEQ
+ };
+ BUILD_BUG_ON(ARRAY_SIZE(states) != ARRAY_SIZE(registration_cmds));
+
+ membarrier_state = atomic_read(&mm->membarrier_state);
+ for (i = 0; i < ARRAY_SIZE(states); ++i) {
+ if (membarrier_state & states[i]) {
+ registrations_mask |= registration_cmds[i];
+ membarrier_state &= ~states[i];
+ }
+ }
+ WARN_ON_ONCE(membarrier_state != 0);
+ return registrations_mask;
+}
+
/**
* sys_membarrier - issue memory barriers on a set of threads
* @cmd: Takes command values defined in enum membarrier_cmd.
@@ -623,6 +658,8 @@ SYSCALL_DEFINE3(membarrier, int, cmd, unsigned int, flags, int, cpu_id)
return membarrier_private_expedited(MEMBARRIER_FLAG_RSEQ, cpu_id);
case MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_RSEQ:
return membarrier_register_private_expedited(MEMBARRIER_FLAG_RSEQ);
+ case MEMBARRIER_CMD_GET_REGISTRATIONS:
+ return membarrier_get_registrations();
default:
return -EINVAL;
}
diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c
index ee2ecc081422..02e011cabe91 100644
--- a/kernel/sched/psi.c
+++ b/kernel/sched/psi.c
@@ -189,6 +189,7 @@ static void group_init(struct psi_group *group)
INIT_DELAYED_WORK(&group->avgs_work, psi_avgs_work);
mutex_init(&group->avgs_lock);
/* Init trigger-related members */
+ atomic_set(&group->poll_scheduled, 0);
mutex_init(&group->trigger_lock);
INIT_LIST_HEAD(&group->triggers);
group->poll_min_period = U32_MAX;
@@ -242,6 +243,8 @@ static void get_recent_times(struct psi_group *group, int cpu,
u32 *pchanged_states)
{
struct psi_group_cpu *groupc = per_cpu_ptr(group->pcpu, cpu);
+ int current_cpu = raw_smp_processor_id();
+ unsigned int tasks[NR_PSI_TASK_COUNTS];
u64 now, state_start;
enum psi_states s;
unsigned int seq;
@@ -256,6 +259,8 @@ static void get_recent_times(struct psi_group *group, int cpu,
memcpy(times, groupc->times, sizeof(groupc->times));
state_mask = groupc->state_mask;
state_start = groupc->state_start;
+ if (cpu == current_cpu)
+ memcpy(tasks, groupc->tasks, sizeof(groupc->tasks));
} while (read_seqcount_retry(&groupc->seq, seq));
/* Calculate state time deltas against the previous snapshot */
@@ -280,6 +285,28 @@ static void get_recent_times(struct psi_group *group, int cpu,
if (delta)
*pchanged_states |= (1 << s);
}
+
+ /*
+ * When collect_percpu_times() is called from avgs_work, we don't want
+ * to re-arm avgs_work when all CPUs are IDLE. But the current CPU running
+ * this avgs_work is never IDLE, because avgs_work can't be shut off.
+ * So for the current CPU, we need to re-arm avgs_work only when
+ * (NR_RUNNING > 1 || NR_IOWAIT > 0 || NR_MEMSTALL > 0), for other CPUs
+ * we can just check PSI_NONIDLE delta.
+ */
+ if (current_work() == &group->avgs_work.work) {
+ bool reschedule;
+
+ if (cpu == current_cpu)
+ reschedule = tasks[NR_RUNNING] +
+ tasks[NR_IOWAIT] +
+ tasks[NR_MEMSTALL] > 1;
+ else
+ reschedule = *pchanged_states & (1 << PSI_NONIDLE);
+
+ if (reschedule)
+ *pchanged_states |= PSI_STATE_RESCHEDULE;
+ }
}
static void calc_avgs(unsigned long avg[3], int missed_periods,
@@ -415,7 +442,6 @@ static void psi_avgs_work(struct work_struct *work)
struct delayed_work *dwork;
struct psi_group *group;
u32 changed_states;
- bool nonidle;
u64 now;
dwork = to_delayed_work(work);
@@ -426,7 +452,6 @@ static void psi_avgs_work(struct work_struct *work)
now = sched_clock();
collect_percpu_times(group, PSI_AVGS, &changed_states);
- nonidle = changed_states & (1 << PSI_NONIDLE);
/*
* If there is task activity, periodically fold the per-cpu
* times and feed samples into the running averages. If things
@@ -437,7 +462,7 @@ static void psi_avgs_work(struct work_struct *work)
if (now >= group->avg_next_update)
group->avg_next_update = update_averages(group, now);
- if (nonidle) {
+ if (changed_states & PSI_STATE_RESCHEDULE) {
schedule_delayed_work(dwork, nsecs_to_jiffies(
group->avg_next_update - now) + 1);
}
@@ -539,10 +564,12 @@ static u64 update_triggers(struct psi_group *group, u64 now)
/* Calculate growth since last update */
growth = window_update(&t->win, now, total[t->state]);
- if (growth < t->threshold)
- continue;
+ if (!t->pending_event) {
+ if (growth < t->threshold)
+ continue;
- t->pending_event = true;
+ t->pending_event = true;
+ }
}
/* Limit event signaling to once per window */
if (now < t->last_event_time + t->win.size)
@@ -563,18 +590,17 @@ static u64 update_triggers(struct psi_group *group, u64 now)
return now + group->poll_min_period;
}
-/* Schedule polling if it's not already scheduled. */
-static void psi_schedule_poll_work(struct psi_group *group, unsigned long delay)
+/* Schedule polling if it's not already scheduled, or if forced. */
+static void psi_schedule_poll_work(struct psi_group *group, unsigned long delay,
+ bool force)
{
struct task_struct *task;
/*
- * Do not reschedule if already scheduled.
- * Possible race with a timer scheduled after this check but before
- * mod_timer below can be tolerated because group->polling_next_update
- * will keep updates on schedule.
+ * atomic_xchg should be called even when !force to provide a
+ * full memory barrier (see the comment inside psi_poll_work).
*/
- if (timer_pending(&group->poll_timer))
+ if (atomic_xchg(&group->poll_scheduled, 1) && !force)
return;
rcu_read_lock();
@@ -586,12 +612,15 @@ static void psi_schedule_poll_work(struct psi_group *group, unsigned long delay)
*/
if (likely(task))
mod_timer(&group->poll_timer, jiffies + delay);
+ else
+ atomic_set(&group->poll_scheduled, 0);
rcu_read_unlock();
}
static void psi_poll_work(struct psi_group *group)
{
+ bool force_reschedule = false;
u32 changed_states;
u64 now;
@@ -599,6 +628,43 @@ static void psi_poll_work(struct psi_group *group)
now = sched_clock();
+ if (now > group->polling_until) {
+ /*
+ * We are either about to start or might stop polling if no
+ * state change was recorded. Resetting poll_scheduled leaves
+ * a small window for psi_group_change to sneak in and schedule
+ * an immediate poll_work before we get to rescheduling. One
+ * potential extra wakeup at the end of the polling window
+ * should be negligible and polling_next_update still keeps
+ * updates correctly on schedule.
+ */
+ atomic_set(&group->poll_scheduled, 0);
+ /*
+ * A task change can race with the poll worker that is supposed to
+ * report on it. To avoid missing events, ensure ordering between
+ * poll_scheduled and the task state accesses, such that if the poll
+ * worker misses the state update, the task change is guaranteed to
+ * reschedule the poll worker:
+ *
+ * poll worker:
+ * atomic_set(poll_scheduled, 0)
+ * smp_mb()
+ * LOAD states
+ *
+ * task change:
+ * STORE states
+ * if atomic_xchg(poll_scheduled, 1) == 0:
+ * schedule poll worker
+ *
+ * The atomic_xchg() implies a full barrier.
+ */
+ smp_mb();
+ } else {
+ /* Polling window is not over, keep rescheduling */
+ force_reschedule = true;
+ }
+
+
collect_percpu_times(group, PSI_POLL, &changed_states);
if (changed_states & group->poll_states) {
@@ -624,7 +690,8 @@ static void psi_poll_work(struct psi_group *group)
group->polling_next_update = update_triggers(group, now);
psi_schedule_poll_work(group,
- nsecs_to_jiffies(group->polling_next_update - now) + 1);
+ nsecs_to_jiffies(group->polling_next_update - now) + 1,
+ force_reschedule);
out:
mutex_unlock(&group->trigger_lock);
@@ -785,7 +852,7 @@ static void psi_group_change(struct psi_group *group, int cpu,
write_seqcount_end(&groupc->seq);
if (state_mask & group->poll_states)
- psi_schedule_poll_work(group, 1);
+ psi_schedule_poll_work(group, 1, false);
if (wake_clock && !delayed_work_pending(&group->avgs_work))
schedule_delayed_work(&group->avgs_work, PSI_FREQ);
@@ -939,7 +1006,7 @@ void psi_account_irqtime(struct task_struct *task, u32 delta)
write_seqcount_end(&groupc->seq);
if (group->poll_states & (1 << PSI_IRQ_FULL))
- psi_schedule_poll_work(group, 1);
+ psi_schedule_poll_work(group, 1, false);
} while ((group = group->parent));
}
#endif
@@ -1276,10 +1343,11 @@ void psi_trigger_destroy(struct psi_trigger *t)
group = t->group;
/*
- * Wakeup waiters to stop polling. Can happen if cgroup is deleted
- * from under a polling process.
+ * Wakeup waiters to stop polling and clear the queue to prevent it from
+ * being accessed later. Can happen if cgroup is deleted from under a
+ * polling process.
*/
- wake_up_interruptible(&t->event_wait);
+ wake_up_pollfree(&t->event_wait);
mutex_lock(&group->trigger_lock);
@@ -1325,6 +1393,7 @@ void psi_trigger_destroy(struct psi_trigger *t)
* can no longer be found through group->poll_task.
*/
kthread_stop(task_to_destroy);
+ atomic_set(&group->poll_scheduled, 0);
}
kfree(t);
}
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index ed2a47e4ddae..0a11f44adee5 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -1777,6 +1777,8 @@ static struct sched_rt_entity *pick_next_rt_entity(struct rt_rq *rt_rq)
BUG_ON(idx >= MAX_RT_PRIO);
queue = array->queue + idx;
+ if (SCHED_WARN_ON(list_empty(queue)))
+ return NULL;
next = list_entry(queue->next, struct sched_rt_entity, run_list);
return next;
@@ -1789,7 +1791,8 @@ static struct task_struct *_pick_next_task_rt(struct rq *rq)
do {
rt_se = pick_next_rt_entity(rt_rq);
- BUG_ON(!rt_se);
+ if (unlikely(!rt_se))
+ return NULL;
rt_rq = group_rt_rq(rt_se);
} while (rt_rq);
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index a4a20046e586..3e8df6d31c1e 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -248,7 +248,7 @@ static inline void update_avg(u64 *avg, u64 sample)
#define SCHED_DL_FLAGS (SCHED_FLAG_RECLAIM | SCHED_FLAG_DL_OVERRUN | SCHED_FLAG_SUGOV)
-static inline bool dl_entity_is_special(struct sched_dl_entity *dl_se)
+static inline bool dl_entity_is_special(const struct sched_dl_entity *dl_se)
{
#ifdef CONFIG_CPU_FREQ_GOV_SCHEDUTIL
return unlikely(dl_se->flags & SCHED_FLAG_SUGOV);
@@ -260,8 +260,8 @@ static inline bool dl_entity_is_special(struct sched_dl_entity *dl_se)
/*
* Tells if entity @a should preempt entity @b.
*/
-static inline bool
-dl_entity_preempt(struct sched_dl_entity *a, struct sched_dl_entity *b)
+static inline bool dl_entity_preempt(const struct sched_dl_entity *a,
+ const struct sched_dl_entity *b)
{
return dl_entity_is_special(a) ||
dl_time_before(a->deadline, b->deadline);
@@ -645,6 +645,9 @@ struct cfs_rq {
int throttled;
int throttle_count;
struct list_head throttled_list;
+#ifdef CONFIG_SMP
+ struct list_head throttled_csd_list;
+#endif
#endif /* CONFIG_CFS_BANDWIDTH */
#endif /* CONFIG_FAIR_GROUP_SCHED */
};
@@ -1150,6 +1153,14 @@ struct rq {
unsigned int core_forceidle_occupation;
u64 core_forceidle_start;
#endif
+
+ /* Scratch cpumask to be temporarily used under rq_lock */
+ cpumask_var_t scratch_mask;
+
+#if defined(CONFIG_CFS_BANDWIDTH) && defined(CONFIG_SMP)
+ call_single_data_t cfsb_csd;
+ struct list_head cfsb_csd_list;
+#endif
};
#ifdef CONFIG_FAIR_GROUP_SCHED
@@ -1232,7 +1243,8 @@ static inline raw_spinlock_t *__rq_lockp(struct rq *rq)
return &rq->__lock;
}
-bool cfs_prio_less(struct task_struct *a, struct task_struct *b, bool fi);
+bool cfs_prio_less(const struct task_struct *a, const struct task_struct *b,
+ bool fi);
/*
* Helpers to check if the CPU's core cookie matches with the task's cookie
@@ -1411,7 +1423,7 @@ static inline struct cfs_rq *task_cfs_rq(struct task_struct *p)
}
/* runqueue on which this entity is (to be) queued */
-static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se)
+static inline struct cfs_rq *cfs_rq_of(const struct sched_entity *se)
{
return se->cfs_rq;
}
@@ -1424,19 +1436,16 @@ static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
#else
-static inline struct task_struct *task_of(struct sched_entity *se)
-{
- return container_of(se, struct task_struct, se);
-}
+#define task_of(_se) container_of(_se, struct task_struct, se)
-static inline struct cfs_rq *task_cfs_rq(struct task_struct *p)
+static inline struct cfs_rq *task_cfs_rq(const struct task_struct *p)
{
return &task_rq(p)->cfs;
}
-static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se)
+static inline struct cfs_rq *cfs_rq_of(const struct sched_entity *se)
{
- struct task_struct *p = task_of(se);
+ const struct task_struct *p = task_of(se);
struct rq *rq = task_rq(p);
return &rq->cfs;
@@ -1877,6 +1886,13 @@ static inline void dirty_sched_domain_sysctl(int cpu)
#endif
extern int sched_update_scaling(void);
+
+static inline const struct cpumask *task_user_cpus(struct task_struct *p)
+{
+ if (!p->user_cpus_ptr)
+ return cpu_possible_mask; /* &init_task.cpus_mask */
+ return p->user_cpus_ptr;
+}
#endif /* CONFIG_SMP */
#include "stats.h"
@@ -2144,6 +2160,12 @@ extern const u32 sched_prio_to_wmult[40];
#define RETRY_TASK ((void *)-1UL)
+struct affinity_context {
+ const struct cpumask *new_mask;
+ struct cpumask *user_mask;
+ unsigned int flags;
+};
+
struct sched_class {
#ifdef CONFIG_UCLAMP_TASK
@@ -2172,9 +2194,7 @@ struct sched_class {
void (*task_woken)(struct rq *this_rq, struct task_struct *task);
- void (*set_cpus_allowed)(struct task_struct *p,
- const struct cpumask *newmask,
- u32 flags);
+ void (*set_cpus_allowed)(struct task_struct *p, struct affinity_context *ctx);
void (*rq_online)(struct rq *rq);
void (*rq_offline)(struct rq *rq);
@@ -2285,7 +2305,7 @@ extern void update_group_capacity(struct sched_domain *sd, int cpu);
extern void trigger_load_balance(struct rq *rq);
-extern void set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_mask, u32 flags);
+extern void set_cpus_allowed_common(struct task_struct *p, struct affinity_context *ctx);
static inline struct task_struct *get_push_task(struct rq *rq)
{
@@ -2979,6 +2999,23 @@ static inline unsigned long cpu_util_rt(struct rq *rq)
#ifdef CONFIG_UCLAMP_TASK
unsigned long uclamp_eff_value(struct task_struct *p, enum uclamp_id clamp_id);
+static inline unsigned long uclamp_rq_get(struct rq *rq,
+ enum uclamp_id clamp_id)
+{
+ return READ_ONCE(rq->uclamp[clamp_id].value);
+}
+
+static inline void uclamp_rq_set(struct rq *rq, enum uclamp_id clamp_id,
+ unsigned int value)
+{
+ WRITE_ONCE(rq->uclamp[clamp_id].value, value);
+}
+
+static inline bool uclamp_rq_is_idle(struct rq *rq)
+{
+ return rq->uclamp_flags & UCLAMP_FLAG_IDLE;
+}
+
/**
* uclamp_rq_util_with - clamp @util with @rq and @p effective uclamp values.
* @rq: The rq to clamp against. Must not be NULL.
@@ -3014,12 +3051,12 @@ unsigned long uclamp_rq_util_with(struct rq *rq, unsigned long util,
* Ignore last runnable task's max clamp, as this task will
* reset it. Similarly, no need to read the rq's min clamp.
*/
- if (rq->uclamp_flags & UCLAMP_FLAG_IDLE)
+ if (uclamp_rq_is_idle(rq))
goto out;
}
- min_util = max_t(unsigned long, min_util, READ_ONCE(rq->uclamp[UCLAMP_MIN].value));
- max_util = max_t(unsigned long, max_util, READ_ONCE(rq->uclamp[UCLAMP_MAX].value));
+ min_util = max_t(unsigned long, min_util, uclamp_rq_get(rq, UCLAMP_MIN));
+ max_util = max_t(unsigned long, max_util, uclamp_rq_get(rq, UCLAMP_MAX));
out:
/*
* Since CPU's {min,max}_util clamps are MAX aggregated considering
@@ -3060,6 +3097,15 @@ static inline bool uclamp_is_used(void)
return static_branch_likely(&sched_uclamp_used);
}
#else /* CONFIG_UCLAMP_TASK */
+static inline unsigned long uclamp_eff_value(struct task_struct *p,
+ enum uclamp_id clamp_id)
+{
+ if (clamp_id == UCLAMP_MIN)
+ return 0;
+
+ return SCHED_CAPACITY_SCALE;
+}
+
static inline
unsigned long uclamp_rq_util_with(struct rq *rq, unsigned long util,
struct task_struct *p)
@@ -3073,6 +3119,25 @@ static inline bool uclamp_is_used(void)
{
return false;
}
+
+static inline unsigned long uclamp_rq_get(struct rq *rq,
+ enum uclamp_id clamp_id)
+{
+ if (clamp_id == UCLAMP_MIN)
+ return 0;
+
+ return SCHED_CAPACITY_SCALE;
+}
+
+static inline void uclamp_rq_set(struct rq *rq, enum uclamp_id clamp_id,
+ unsigned int value)
+{
+}
+
+static inline bool uclamp_rq_is_idle(struct rq *rq)
+{
+ return false;
+}
#endif /* CONFIG_UCLAMP_TASK */
#ifdef CONFIG_HAVE_SCHED_AVG_IRQ
@@ -3183,4 +3248,62 @@ static inline void update_current_exec_runtime(struct task_struct *curr,
cgroup_account_cputime(curr, delta_exec);
}
+#ifdef CONFIG_SCHED_MM_CID
+static inline int __mm_cid_get(struct mm_struct *mm)
+{
+ struct cpumask *cpumask;
+ int cid;
+
+ cpumask = mm_cidmask(mm);
+ cid = cpumask_first_zero(cpumask);
+ if (cid >= nr_cpu_ids)
+ return -1;
+ __cpumask_set_cpu(cid, cpumask);
+ return cid;
+}
+
+static inline void mm_cid_put(struct mm_struct *mm, int cid)
+{
+ lockdep_assert_irqs_disabled();
+ if (cid < 0)
+ return;
+ raw_spin_lock(&mm->cid_lock);
+ __cpumask_clear_cpu(cid, mm_cidmask(mm));
+ raw_spin_unlock(&mm->cid_lock);
+}
+
+static inline int mm_cid_get(struct mm_struct *mm)
+{
+ int ret;
+
+ lockdep_assert_irqs_disabled();
+ raw_spin_lock(&mm->cid_lock);
+ ret = __mm_cid_get(mm);
+ raw_spin_unlock(&mm->cid_lock);
+ return ret;
+}
+
+static inline void switch_mm_cid(struct task_struct *prev, struct task_struct *next)
+{
+ if (prev->mm_cid_active) {
+ if (next->mm_cid_active && next->mm == prev->mm) {
+ /*
+ * Context switch between threads in same mm, hand over
+ * the mm_cid from prev to next.
+ */
+ next->mm_cid = prev->mm_cid;
+ prev->mm_cid = -1;
+ return;
+ }
+ mm_cid_put(prev->mm, prev->mm_cid);
+ prev->mm_cid = -1;
+ }
+ if (next->mm_cid_active)
+ next->mm_cid = mm_cid_get(next->mm);
+}
+
+#else
+static inline void switch_mm_cid(struct task_struct *prev, struct task_struct *next) { }
+#endif
+
#endif /* _KERNEL_SCHED_SCHED_H */
diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h
index 84a188913cc9..38f3698f5e5b 100644
--- a/kernel/sched/stats.h
+++ b/kernel/sched/stats.h
@@ -128,11 +128,9 @@ static inline void psi_enqueue(struct task_struct *p, bool wakeup)
if (p->in_memstall)
set |= TSK_MEMSTALL_RUNNING;
- if (!wakeup || p->sched_psi_wake_requeue) {
+ if (!wakeup) {
if (p->in_memstall)
set |= TSK_MEMSTALL;
- if (p->sched_psi_wake_requeue)
- p->sched_psi_wake_requeue = 0;
} else {
if (p->in_iowait)
clear |= TSK_IOWAIT;
@@ -143,8 +141,6 @@ static inline void psi_enqueue(struct task_struct *p, bool wakeup)
static inline void psi_dequeue(struct task_struct *p, bool sleep)
{
- int clear = TSK_RUNNING;
-
if (static_branch_likely(&psi_disabled))
return;
@@ -157,10 +153,7 @@ static inline void psi_dequeue(struct task_struct *p, bool sleep)
if (sleep)
return;
- if (p->in_memstall)
- clear |= (TSK_MEMSTALL | TSK_MEMSTALL_RUNNING);
-
- psi_task_change(p, clear, 0);
+ psi_task_change(p, p->psi_flags, 0);
}
static inline void psi_ttwu_dequeue(struct task_struct *p)
@@ -172,19 +165,12 @@ static inline void psi_ttwu_dequeue(struct task_struct *p)
* deregister its sleep-persistent psi states from the old
* queue, and let psi_enqueue() know it has to requeue.
*/
- if (unlikely(p->in_iowait || p->in_memstall)) {
+ if (unlikely(p->psi_flags)) {
struct rq_flags rf;
struct rq *rq;
- int clear = 0;
-
- if (p->in_iowait)
- clear |= TSK_IOWAIT;
- if (p->in_memstall)
- clear |= TSK_MEMSTALL;
rq = __task_rq_lock(p, &rf);
- psi_task_change(p, clear, 0);
- p->sched_psi_wake_requeue = 1;
+ psi_task_change(p, p->psi_flags, 0);
__task_rq_unlock(rq, &rf);
}
}
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index 8739c2a5a54e..051aaf65c749 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -3,6 +3,8 @@
* Scheduler topology setup/handling methods
*/
+#include <linux/bsearch.h>
+
DEFINE_MUTEX(sched_domains_mutex);
/* Protected by sched_domains_mutex: */
@@ -578,7 +580,7 @@ out:
*/
struct root_domain def_root_domain;
-void init_defrootdomain(void)
+void __init init_defrootdomain(void)
{
init_rootdomain(&def_root_domain);
@@ -2067,6 +2069,99 @@ unlock:
return found;
}
+struct __cmp_key {
+ const struct cpumask *cpus;
+ struct cpumask ***masks;
+ int node;
+ int cpu;
+ int w;
+};
+
+static int hop_cmp(const void *a, const void *b)
+{
+ struct cpumask **prev_hop, **cur_hop = *(struct cpumask ***)b;
+ struct __cmp_key *k = (struct __cmp_key *)a;
+
+ if (cpumask_weight_and(k->cpus, cur_hop[k->node]) <= k->cpu)
+ return 1;
+
+ if (b == k->masks) {
+ k->w = 0;
+ return 0;
+ }
+
+ prev_hop = *((struct cpumask ***)b - 1);
+ k->w = cpumask_weight_and(k->cpus, prev_hop[k->node]);
+ if (k->w <= k->cpu)
+ return 0;
+
+ return -1;
+}
+
+/*
+ * sched_numa_find_nth_cpu() - given the NUMA topology, find the Nth cpu in
+ * @cpus, counting outward from @node by increasing hop distance.
+ * @cpus: cpumask to find a cpu from
+ * @cpu: N, the position of the cpu to find (@node: node to count hops from)
+ *
+ * returns: cpu, or nr_cpu_ids when nothing found.
+ */
+int sched_numa_find_nth_cpu(const struct cpumask *cpus, int cpu, int node)
+{
+ struct __cmp_key k = { .cpus = cpus, .node = node, .cpu = cpu };
+ struct cpumask ***hop_masks;
+ int hop, ret = nr_cpu_ids;
+
+ rcu_read_lock();
+
+ k.masks = rcu_dereference(sched_domains_numa_masks);
+ if (!k.masks)
+ goto unlock;
+
+ hop_masks = bsearch(&k, k.masks, sched_domains_numa_levels, sizeof(k.masks[0]), hop_cmp);
+ hop = hop_masks - k.masks;
+
+ ret = hop ?
+ cpumask_nth_and_andnot(cpu - k.w, cpus, k.masks[hop][node], k.masks[hop-1][node]) :
+ cpumask_nth_and(cpu, cpus, k.masks[0][node]);
+unlock:
+ rcu_read_unlock();
+ return ret;
+}
+EXPORT_SYMBOL_GPL(sched_numa_find_nth_cpu);
+
+/**
+ * sched_numa_hop_mask() - Get the cpumask of CPUs at most @hops hops away from
+ * @node
+ * @node: The node to count hops from.
+ * @hops: Include CPUs up to that many hops away. 0 means local node.
+ *
+ * Return: On success, a pointer to a cpumask of CPUs at most @hops away from
+ * @node, an error value otherwise.
+ *
+ * Requires rcu_lock to be held. Returned cpumask is only valid within that
+ * read-side section, copy it if required beyond that.
+ *
+ * Note that not all hops are equal in distance; see sched_init_numa() for how
+ * distances and masks are handled.
+ * Also note that this is a reflection of sched_domains_numa_masks, which may change
+ * during the lifetime of the system (offline nodes are taken out of the masks).
+ */
+const struct cpumask *sched_numa_hop_mask(unsigned int node, unsigned int hops)
+{
+ struct cpumask ***masks;
+
+ if (node >= nr_node_ids || hops >= sched_domains_numa_levels)
+ return ERR_PTR(-EINVAL);
+
+ masks = rcu_dereference(sched_domains_numa_masks);
+ if (!masks)
+ return ERR_PTR(-EBUSY);
+
+ return masks[hops][node];
+}
+EXPORT_SYMBOL_GPL(sched_numa_hop_mask);
+
#endif /* CONFIG_NUMA */
static int __sdt_alloc(const struct cpumask *cpu_map)
@@ -2451,7 +2546,7 @@ void free_sched_domains(cpumask_var_t doms[], unsigned int ndoms)
* Set up scheduler domains and groups. For now this just excludes isolated
* CPUs, but could be used to exclude other special cases in the future.
*/
-int sched_init_domains(const struct cpumask *cpu_map)
+int __init sched_init_domains(const struct cpumask *cpu_map)
{
int err;
diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c
index 9860bb9a847c..133b74730738 100644
--- a/kernel/sched/wait.c
+++ b/kernel/sched/wait.c
@@ -121,11 +121,12 @@ static int __wake_up_common(struct wait_queue_head *wq_head, unsigned int mode,
return nr_exclusive;
}
-static void __wake_up_common_lock(struct wait_queue_head *wq_head, unsigned int mode,
+static int __wake_up_common_lock(struct wait_queue_head *wq_head, unsigned int mode,
int nr_exclusive, int wake_flags, void *key)
{
unsigned long flags;
wait_queue_entry_t bookmark;
+ int remaining = nr_exclusive;
bookmark.flags = 0;
bookmark.private = NULL;
@@ -134,10 +135,12 @@ static void __wake_up_common_lock(struct wait_queue_head *wq_head, unsigned int
do {
spin_lock_irqsave(&wq_head->lock, flags);
- nr_exclusive = __wake_up_common(wq_head, mode, nr_exclusive,
+ remaining = __wake_up_common(wq_head, mode, remaining,
wake_flags, key, &bookmark);
spin_unlock_irqrestore(&wq_head->lock, flags);
} while (bookmark.flags & WQ_FLAG_BOOKMARK);
+
+ return nr_exclusive - remaining;
}
/**
@@ -147,13 +150,14 @@ static void __wake_up_common_lock(struct wait_queue_head *wq_head, unsigned int
* @nr_exclusive: how many wake-one or wake-many threads to wake up
* @key: is directly passed to the wakeup function
*
- * If this function wakes up a task, it executes a full memory barrier before
- * accessing the task state.
+ * If this function wakes up a task, it executes a full memory barrier
+ * before accessing the task state. Returns the number of exclusive
+ * tasks that were woken up.
*/
-void __wake_up(struct wait_queue_head *wq_head, unsigned int mode,
- int nr_exclusive, void *key)
+int __wake_up(struct wait_queue_head *wq_head, unsigned int mode,
+ int nr_exclusive, void *key)
{
- __wake_up_common_lock(wq_head, mode, nr_exclusive, 0, key);
+ return __wake_up_common_lock(wq_head, mode, nr_exclusive, 0, key);
}
EXPORT_SYMBOL(__wake_up);
diff --git a/kernel/scs.c b/kernel/scs.c
index b7e1b096d906..d7809affe740 100644
--- a/kernel/scs.c
+++ b/kernel/scs.c
@@ -12,6 +12,10 @@
#include <linux/vmalloc.h>
#include <linux/vmstat.h>
+#ifdef CONFIG_DYNAMIC_SCS
+DEFINE_STATIC_KEY_FALSE(dynamic_scs_enabled);
+#endif
+
static void __scs_account(void *s, int account)
{
struct page *scs_page = vmalloc_to_page(s);
@@ -101,14 +105,20 @@ static int scs_cleanup(unsigned int cpu)
void __init scs_init(void)
{
+ if (!scs_is_enabled())
+ return;
cpuhp_setup_state(CPUHP_BP_PREPARE_DYN, "scs:scs_cache", NULL,
scs_cleanup);
}
int scs_prepare(struct task_struct *tsk, int node)
{
- void *s = scs_alloc(node);
+ void *s;
+ if (!scs_is_enabled())
+ return 0;
+
+ s = scs_alloc(node);
if (!s)
return -ENOMEM;
@@ -148,7 +158,7 @@ void scs_release(struct task_struct *tsk)
{
void *s = task_scs(tsk);
- if (!s)
+ if (!scs_is_enabled() || !s)
return;
WARN(task_scs_end_corrupted(tsk),
diff --git a/kernel/seccomp.c b/kernel/seccomp.c
index e9852d1b4a5e..cebf26445f9e 100644
--- a/kernel/seccomp.c
+++ b/kernel/seccomp.c
@@ -388,6 +388,7 @@ static inline bool seccomp_cache_check_allow(const struct seccomp_filter *sfilte
}
#endif /* SECCOMP_ARCH_NATIVE */
+#define ACTION_ONLY(ret) ((s32)((ret) & (SECCOMP_RET_ACTION_FULL)))
/**
* seccomp_run_filters - evaluates all seccomp filters against @sd
* @sd: optional seccomp data to be passed to filters
@@ -397,7 +398,6 @@ static inline bool seccomp_cache_check_allow(const struct seccomp_filter *sfilte
*
* Returns valid seccomp BPF response codes.
*/
-#define ACTION_ONLY(ret) ((s32)((ret) & (SECCOMP_RET_ACTION_FULL)))
static u32 seccomp_run_filters(const struct seccomp_data *sd,
struct seccomp_filter **match)
{
diff --git a/kernel/signal.c b/kernel/signal.c
index d140672185a4..8cb28f1df294 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -1255,7 +1255,7 @@ int send_signal_locked(int sig, struct kernel_siginfo *info,
static void print_fatal_signal(int signr)
{
- struct pt_regs *regs = signal_pt_regs();
+ struct pt_regs *regs = task_pt_regs(current);
pr_info("potentially unexpected fatal signal %d.\n", signr);
#if defined(__i386__) && !defined(__arch_um__)
@@ -2693,6 +2693,7 @@ relock:
/* Has this task already been marked for death? */
if ((signal->flags & SIGNAL_GROUP_EXIT) ||
signal->group_exec_task) {
+ clear_siginfo(&ksig->info);
ksig->info.si_signo = signr = SIGKILL;
sigdelset(&current->pending.signal, SIGKILL);
trace_signal_deliver(SIGKILL, SEND_SIG_NOINFO,
@@ -2950,6 +2951,7 @@ void exit_signals(struct task_struct *tsk)
cgroup_threadgroup_change_begin(tsk);
if (thread_group_empty(tsk) || (tsk->signal->flags & SIGNAL_GROUP_EXIT)) {
+ sched_mm_cid_exit_signals(tsk);
tsk->flags |= PF_EXITING;
cgroup_threadgroup_change_end(tsk);
return;
@@ -2960,6 +2962,7 @@ void exit_signals(struct task_struct *tsk)
* From now this task is not visible for group-wide signals,
* see wants_signal(), do_signal_stop().
*/
+ sched_mm_cid_exit_signals(tsk);
tsk->flags |= PF_EXITING;
cgroup_threadgroup_change_end(tsk);
diff --git a/kernel/static_call_inline.c b/kernel/static_call_inline.c
index dc5665b62814..639397b5491c 100644
--- a/kernel/static_call_inline.c
+++ b/kernel/static_call_inline.c
@@ -15,7 +15,18 @@ extern struct static_call_site __start_static_call_sites[],
extern struct static_call_tramp_key __start_static_call_tramp_key[],
__stop_static_call_tramp_key[];
-static bool static_call_initialized;
+static int static_call_initialized;
+
+/*
+ * Must be called before early_initcall() to be effective.
+ */
+void static_call_force_reinit(void)
+{
+ if (WARN_ON_ONCE(!static_call_initialized))
+ return;
+
+ static_call_initialized++;
+}
/* mutex to protect key modules/sites */
static DEFINE_MUTEX(static_call_mutex);
@@ -475,7 +486,8 @@ int __init static_call_init(void)
{
int ret;
- if (static_call_initialized)
+ /* See static_call_force_reinit(). */
+ if (static_call_initialized == 1)
return 0;
cpus_read_lock();
@@ -490,11 +502,12 @@ int __init static_call_init(void)
BUG();
}
- static_call_initialized = true;
-
#ifdef CONFIG_MODULES
- register_module_notifier(&static_call_module_nb);
+ if (!static_call_initialized)
+ register_module_notifier(&static_call_module_nb);
#endif
+
+ static_call_initialized = 1;
return 0;
}
early_initcall(static_call_init);
diff --git a/kernel/sys.c b/kernel/sys.c
index 5fd54bf0e886..88b31f096fb2 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -1442,6 +1442,8 @@ static int do_prlimit(struct task_struct *tsk, unsigned int resource,
if (resource >= RLIM_NLIMITS)
return -EINVAL;
+ resource = array_index_nospec(resource, RLIM_NLIMITS);
+
if (new_rlim) {
if (new_rlim->rlim_cur > new_rlim->rlim_max)
return -EINVAL;
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index c6d9dec11b74..137d4abe3eda 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -1633,25 +1633,6 @@ int proc_do_static_key(struct ctl_table *table, int write,
}
static struct ctl_table kern_table[] = {
-#ifdef CONFIG_NUMA_BALANCING
- {
- .procname = "numa_balancing",
- .data = NULL, /* filled in by handler */
- .maxlen = sizeof(unsigned int),
- .mode = 0644,
- .proc_handler = sysctl_numa_balancing,
- .extra1 = SYSCTL_ZERO,
- .extra2 = SYSCTL_FOUR,
- },
- {
- .procname = "numa_balancing_promote_rate_limit_MBps",
- .data = &sysctl_numa_balancing_promote_rate_limit,
- .maxlen = sizeof(unsigned int),
- .mode = 0644,
- .proc_handler = proc_dointvec_minmax,
- .extra1 = SYSCTL_ZERO,
- },
-#endif /* CONFIG_NUMA_BALANCING */
{
.procname = "panic",
.data = &panic_timeout,
@@ -2125,6 +2106,7 @@ static struct ctl_table vm_table[] = {
.mode = 0644,
.proc_handler = proc_dointvec_minmax,
.extra1 = SYSCTL_ZERO,
+ .extra2 = (void *)&page_cluster_max,
},
{
.procname = "dirtytime_expire_seconds",
diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig
index a41753be1a2b..bae8f11070be 100644
--- a/kernel/time/Kconfig
+++ b/kernel/time/Kconfig
@@ -200,10 +200,14 @@ config CLOCKSOURCE_WATCHDOG_MAX_SKEW_US
int "Clocksource watchdog maximum allowable skew (in μs)"
depends on CLOCKSOURCE_WATCHDOG
range 50 1000
- default 100
+ default 125
help
Specify the maximum amount of allowable watchdog skew in
microseconds before reporting the clocksource to be unstable.
+ The default is based on a half-second clocksource watchdog
+ interval and NTP's maximum frequency drift of 500 parts
+ per million. If the clocksource is good enough for NTP,
+ it is good enough for the clocksource watchdog!
endmenu
endif
diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c
index 5897828b9d7e..7e5dff602585 100644
--- a/kernel/time/alarmtimer.c
+++ b/kernel/time/alarmtimer.c
@@ -470,11 +470,35 @@ u64 alarm_forward(struct alarm *alarm, ktime_t now, ktime_t interval)
}
EXPORT_SYMBOL_GPL(alarm_forward);
-u64 alarm_forward_now(struct alarm *alarm, ktime_t interval)
+static u64 __alarm_forward_now(struct alarm *alarm, ktime_t interval, bool throttle)
{
struct alarm_base *base = &alarm_bases[alarm->type];
+ ktime_t now = base->get_ktime();
+
+ if (IS_ENABLED(CONFIG_HIGH_RES_TIMERS) && throttle) {
+ /*
+ * Same issue as with posix_timer_fn(). Timers which are
+ * periodic but the signal is ignored can starve the system
+ * with a very small interval. The real fix which was
+ * promised in the context of posix_timer_fn() never
+ * materialized, but someone should really work on it.
+ *
+ * To prevent DOS fake @now to be 1 jiffie out which keeps
+ * the overrun accounting correct but creates an
+ * inconsistency vs. timer_gettime(2).
+ */
+ ktime_t kj = NSEC_PER_SEC / HZ;
+
+ if (interval < kj)
+ now = ktime_add(now, kj);
+ }
+
+ return alarm_forward(alarm, now, interval);
+}
- return alarm_forward(alarm, base->get_ktime(), interval);
+u64 alarm_forward_now(struct alarm *alarm, ktime_t interval)
+{
+ return __alarm_forward_now(alarm, interval, false);
}
EXPORT_SYMBOL_GPL(alarm_forward_now);
@@ -551,9 +575,10 @@ static enum alarmtimer_restart alarm_handle_timer(struct alarm *alarm,
if (posix_timer_event(ptr, si_private) && ptr->it_interval) {
/*
* Handle ignored signals and rearm the timer. This will go
- * away once we handle ignored signals proper.
+ * away once we handle ignored signals proper. Ensure that
+ * small intervals cannot starve the system.
*/
- ptr->it_overrun += alarm_forward_now(alarm, ptr->it_interval);
+ ptr->it_overrun += __alarm_forward_now(alarm, ptr->it_interval, true);
++ptr->it_requeue_pending;
ptr->it_active = 1;
result = ALARMTIMER_RESTART;
diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c
index 5d85014d59b5..960143b183cd 100644
--- a/kernel/time/clockevents.c
+++ b/kernel/time/clockevents.c
@@ -76,7 +76,7 @@ static u64 cev_delta2ns(unsigned long latch, struct clock_event_device *evt,
}
/**
- * clockevents_delta2ns - Convert a latch value (device ticks) to nanoseconds
+ * clockevent_delta2ns - Convert a latch value (device ticks) to nanoseconds
* @latch: value to convert
* @evt: pointer to clock event device descriptor
*
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index 8058bec87ace..91836b727cef 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -96,6 +96,11 @@ static int finished_booting;
static u64 suspend_start;
/*
+ * Interval: 0.5sec.
+ */
+#define WATCHDOG_INTERVAL (HZ >> 1)
+
+/*
* Threshold: 0.0312s, when doubled: 0.0625s.
* Also a default for cs->uncertainty_margin when registering clocks.
*/
@@ -106,11 +111,14 @@ static u64 suspend_start;
* clocksource surrounding a read of the clocksource being validated.
* This delay could be due to SMIs, NMIs, or to VCPU preemptions. Used as
* a lower bound for cs->uncertainty_margin values when registering clocks.
+ *
+ * The default of 500 parts per million is based on NTP's limits.
+ * If a clocksource is good enough for NTP, it is good enough for us!
*/
#ifdef CONFIG_CLOCKSOURCE_WATCHDOG_MAX_SKEW_US
#define MAX_SKEW_USEC CONFIG_CLOCKSOURCE_WATCHDOG_MAX_SKEW_US
#else
-#define MAX_SKEW_USEC 100
+#define MAX_SKEW_USEC (125 * WATCHDOG_INTERVAL / HZ)
#endif
#define WATCHDOG_MAX_SKEW (MAX_SKEW_USEC * NSEC_PER_USEC)
@@ -140,11 +148,6 @@ static inline void clocksource_watchdog_unlock(unsigned long *flags)
static int clocksource_watchdog_kthread(void *data);
static void __clocksource_change_rating(struct clocksource *cs, int rating);
-/*
- * Interval: 0.5sec.
- */
-#define WATCHDOG_INTERVAL (HZ >> 1)
-
static void clocksource_watchdog_work(struct work_struct *work)
{
/*
@@ -257,8 +260,8 @@ static enum wd_read_status cs_watchdog_read(struct clocksource *cs, u64 *csnow,
goto skip_test;
}
- pr_warn("timekeeping watchdog on CPU%d: %s read-back delay of %lldns, attempt %d, marking unstable\n",
- smp_processor_id(), watchdog->name, wd_delay, nretries);
+ pr_warn("timekeeping watchdog on CPU%d: wd-%s-wd excessive read-back delay of %lldns vs. limit of %ldns, wd-wd read-back delay only %lldns, attempt %d, marking %s unstable\n",
+ smp_processor_id(), cs->name, wd_delay, WATCHDOG_MAX_SKEW, wd_seq_delay, nretries, cs->name);
return WD_READ_UNSTABLE;
skip_test:
@@ -310,7 +313,7 @@ static void clocksource_verify_choose_cpus(void)
* CPUs that are currently online.
*/
for (i = 1; i < n; i++) {
- cpu = prandom_u32_max(nr_cpu_ids);
+ cpu = get_random_u32_below(nr_cpu_ids);
cpu = cpumask_next(cpu - 1, cpu_online_mask);
if (cpu >= nr_cpu_ids)
cpu = cpumask_first(cpu_online_mask);
@@ -384,6 +387,15 @@ void clocksource_verify_percpu(struct clocksource *cs)
}
EXPORT_SYMBOL_GPL(clocksource_verify_percpu);
+static inline void clocksource_reset_watchdog(void)
+{
+ struct clocksource *cs;
+
+ list_for_each_entry(cs, &watchdog_list, wd_list)
+ cs->flags &= ~CLOCK_SOURCE_WATCHDOG;
+}
+
+
static void clocksource_watchdog(struct timer_list *unused)
{
u64 csnow, wdnow, cslast, wdlast, delta;
@@ -391,6 +403,7 @@ static void clocksource_watchdog(struct timer_list *unused)
int64_t wd_nsec, cs_nsec;
struct clocksource *cs;
enum wd_read_status read_ret;
+ unsigned long extra_wait = 0;
u32 md;
spin_lock(&watchdog_lock);
@@ -410,13 +423,30 @@ static void clocksource_watchdog(struct timer_list *unused)
read_ret = cs_watchdog_read(cs, &csnow, &wdnow);
- if (read_ret != WD_READ_SUCCESS) {
- if (read_ret == WD_READ_UNSTABLE)
- /* Clock readout unreliable, so give it up. */
- __clocksource_unstable(cs);
+ if (read_ret == WD_READ_UNSTABLE) {
+ /* Clock readout unreliable, so give it up. */
+ __clocksource_unstable(cs);
continue;
}
+ /*
+ * When WD_READ_SKIP is returned, it means the system is likely
+ * under very heavy load, where the latency of reading
+ * watchdog/clocksource is very big, and affects the accuracy of
+ * watchdog check. So give system some space and suspend the
+ * watchdog check for 5 minutes.
+ */
+ if (read_ret == WD_READ_SKIP) {
+ /*
+ * As the watchdog timer will be suspended, and
+ * cs->last could keep unchanged for 5 minutes, reset
+ * the counters.
+ */
+ clocksource_reset_watchdog();
+ extra_wait = HZ * 300;
+ break;
+ }
+
/* Clocksource initialized ? */
if (!(cs->flags & CLOCK_SOURCE_WATCHDOG) ||
atomic_read(&watchdog_reset_pending)) {
@@ -443,12 +473,20 @@ static void clocksource_watchdog(struct timer_list *unused)
/* Check the deviation from the watchdog clocksource. */
md = cs->uncertainty_margin + watchdog->uncertainty_margin;
if (abs(cs_nsec - wd_nsec) > md) {
+ u64 cs_wd_msec;
+ u64 wd_msec;
+ u32 wd_rem;
+
pr_warn("timekeeping watchdog on CPU%d: Marking clocksource '%s' as unstable because the skew is too large:\n",
smp_processor_id(), cs->name);
pr_warn(" '%s' wd_nsec: %lld wd_now: %llx wd_last: %llx mask: %llx\n",
watchdog->name, wd_nsec, wdnow, wdlast, watchdog->mask);
pr_warn(" '%s' cs_nsec: %lld cs_now: %llx cs_last: %llx mask: %llx\n",
cs->name, cs_nsec, csnow, cslast, cs->mask);
+ cs_wd_msec = div_u64_rem(cs_nsec - wd_nsec, 1000U * 1000U, &wd_rem);
+ wd_msec = div_u64_rem(wd_nsec, 1000U * 1000U, &wd_rem);
+ pr_warn(" Clocksource '%s' skewed %lld ns (%lld ms) over watchdog '%s' interval of %lld ns (%lld ms)\n",
+ cs->name, cs_nsec - wd_nsec, cs_wd_msec, watchdog->name, wd_nsec, wd_msec);
if (curr_clocksource == cs)
pr_warn(" '%s' is current clocksource.\n", cs->name);
else if (curr_clocksource)
@@ -512,7 +550,7 @@ static void clocksource_watchdog(struct timer_list *unused)
* pair clocksource_stop_watchdog() clocksource_start_watchdog().
*/
if (!timer_pending(&watchdog_timer)) {
- watchdog_timer.expires += WATCHDOG_INTERVAL;
+ watchdog_timer.expires += WATCHDOG_INTERVAL + extra_wait;
add_timer_on(&watchdog_timer, next_cpu);
}
out:
@@ -537,14 +575,6 @@ static inline void clocksource_stop_watchdog(void)
watchdog_running = 0;
}
-static inline void clocksource_reset_watchdog(void)
-{
- struct clocksource *cs;
-
- list_for_each_entry(cs, &watchdog_list, wd_list)
- cs->flags &= ~CLOCK_SOURCE_WATCHDOG;
-}
-
static void clocksource_resume_watchdog(void)
{
atomic_inc(&watchdog_reset_pending);
diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c
index 3ae661ab6260..e8c08292defc 100644
--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -2089,7 +2089,7 @@ long hrtimer_nanosleep(ktime_t rqtp, const enum hrtimer_mode mode,
u64 slack;
slack = current->timer_slack_ns;
- if (dl_task(current) || rt_task(current))
+ if (rt_task(current))
slack = 0;
hrtimer_init_sleeper_on_stack(&t, clockid, mode);
@@ -2126,6 +2126,7 @@ SYSCALL_DEFINE2(nanosleep, struct __kernel_timespec __user *, rqtp,
if (!timespec64_valid(&tu))
return -EINVAL;
+ current->restart_block.fn = do_no_restart_syscall;
current->restart_block.nanosleep.type = rmtp ? TT_NATIVE : TT_NONE;
current->restart_block.nanosleep.rmtp = rmtp;
return hrtimer_nanosleep(timespec64_to_ktime(tu), HRTIMER_MODE_REL,
@@ -2147,6 +2148,7 @@ SYSCALL_DEFINE2(nanosleep_time32, struct old_timespec32 __user *, rqtp,
if (!timespec64_valid(&tu))
return -EINVAL;
+ current->restart_block.fn = do_no_restart_syscall;
current->restart_block.nanosleep.type = rmtp ? TT_COMPAT : TT_NONE;
current->restart_block.nanosleep.compat_rmtp = rmtp;
return hrtimer_nanosleep(timespec64_to_ktime(tu), HRTIMER_MODE_REL,
@@ -2270,7 +2272,7 @@ void __init hrtimers_init(void)
/**
* schedule_hrtimeout_range_clock - sleep until timeout
* @expires: timeout value (ktime_t)
- * @delta: slack in expires timeout (ktime_t)
+ * @delta: slack in expires timeout (ktime_t) for SCHED_OTHER tasks
* @mode: timer mode
* @clock_id: timer clock to be used
*/
@@ -2297,6 +2299,13 @@ schedule_hrtimeout_range_clock(ktime_t *expires, u64 delta,
return -EINTR;
}
+ /*
+ * Override any slack passed by the user if under
+ * rt constraints.
+ */
+ if (rt_task(current))
+ delta = 0;
+
hrtimer_init_sleeper_on_stack(&t, clock_id, mode);
hrtimer_set_expires_range_ns(&t.timer, *expires, delta);
hrtimer_sleeper_start_expires(&t, mode);
@@ -2316,7 +2325,7 @@ EXPORT_SYMBOL_GPL(schedule_hrtimeout_range_clock);
/**
* schedule_hrtimeout_range - sleep until timeout
* @expires: timeout value (ktime_t)
- * @delta: slack in expires timeout (ktime_t)
+ * @delta: slack in expires timeout (ktime_t) for SCHED_OTHER tasks
* @mode: timer mode
*
* Make the current task sleep until the given expiry time has
@@ -2324,7 +2333,8 @@ EXPORT_SYMBOL_GPL(schedule_hrtimeout_range_clock);
* the current task state has been set (see set_current_state()).
*
* The @delta argument gives the kernel the freedom to schedule the
- * actual wakeup to a time that is both power and performance friendly.
+ * actual wakeup to a time that is both power and performance friendly
+ * for regular (non RT/DL) tasks.
* The kernel give the normal best effort behavior for "@expires+@delta",
* but may decide to fire the timer earlier, but no earlier than @expires.
*
diff --git a/kernel/time/namespace.c b/kernel/time/namespace.c
index aec832801c26..0775b9ec952a 100644
--- a/kernel/time/namespace.c
+++ b/kernel/time/namespace.c
@@ -192,6 +192,24 @@ static void timens_setup_vdso_data(struct vdso_data *vdata,
offset[CLOCK_BOOTTIME_ALARM] = boottime;
}
+struct page *find_timens_vvar_page(struct vm_area_struct *vma)
+{
+ if (likely(vma->vm_mm == current->mm))
+ return current->nsproxy->time_ns->vvar_page;
+
+ /*
+ * VM_PFNMAP | VM_IO protect .fault() handler from being called
+ * through interfaces like /proc/$pid/mem or
+ * process_vm_{readv,writev}() as long as there's no .access()
+ * in special_mapping_vmops().
+ * For more details see check_vma_flags() and __access_remote_vm()
+ */
+
+ WARN(1, "vvar_page accessed remotely");
+
+ return NULL;
+}
+
/*
* Protects possibly multiple offsets writers racing each other
* and tasks entering the namespace.
diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c
index cb925e8ef9a8..2f5e9b34022c 100644
--- a/kernel/time/posix-cpu-timers.c
+++ b/kernel/time/posix-cpu-timers.c
@@ -243,13 +243,12 @@ static void proc_sample_cputime_atomic(struct task_cputime_atomic *at,
*/
static inline void __update_gt_cputime(atomic64_t *cputime, u64 sum_cputime)
{
- u64 curr_cputime;
-retry:
- curr_cputime = atomic64_read(cputime);
- if (sum_cputime > curr_cputime) {
- if (atomic64_cmpxchg(cputime, curr_cputime, sum_cputime) != curr_cputime)
- goto retry;
- }
+ u64 curr_cputime = atomic64_read(cputime);
+
+ do {
+ if (sum_cputime <= curr_cputime)
+ return;
+ } while (!atomic64_try_cmpxchg(cputime, &curr_cputime, sum_cputime));
}
static void update_gt_cputime(struct task_cputime_atomic *cputime_atomic,
diff --git a/kernel/time/posix-stubs.c b/kernel/time/posix-stubs.c
index 90ea5f373e50..828aeecbd1e8 100644
--- a/kernel/time/posix-stubs.c
+++ b/kernel/time/posix-stubs.c
@@ -147,6 +147,7 @@ SYSCALL_DEFINE4(clock_nanosleep, const clockid_t, which_clock, int, flags,
return -EINVAL;
if (flags & TIMER_ABSTIME)
rmtp = NULL;
+ current->restart_block.fn = do_no_restart_syscall;
current->restart_block.nanosleep.type = rmtp ? TT_NATIVE : TT_NONE;
current->restart_block.nanosleep.rmtp = rmtp;
texp = timespec64_to_ktime(t);
@@ -240,6 +241,7 @@ SYSCALL_DEFINE4(clock_nanosleep_time32, clockid_t, which_clock, int, flags,
return -EINVAL;
if (flags & TIMER_ABSTIME)
rmtp = NULL;
+ current->restart_block.fn = do_no_restart_syscall;
current->restart_block.nanosleep.type = rmtp ? TT_COMPAT : TT_NONE;
current->restart_block.nanosleep.compat_rmtp = rmtp;
texp = timespec64_to_ktime(t);
diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c
index 5dead89308b7..0c8a87a11b39 100644
--- a/kernel/time/posix-timers.c
+++ b/kernel/time/posix-timers.c
@@ -1270,6 +1270,7 @@ SYSCALL_DEFINE4(clock_nanosleep, const clockid_t, which_clock, int, flags,
return -EINVAL;
if (flags & TIMER_ABSTIME)
rmtp = NULL;
+ current->restart_block.fn = do_no_restart_syscall;
current->restart_block.nanosleep.type = rmtp ? TT_NATIVE : TT_NONE;
current->restart_block.nanosleep.rmtp = rmtp;
@@ -1297,6 +1298,7 @@ SYSCALL_DEFINE4(clock_nanosleep_time32, clockid_t, which_clock, int, flags,
return -EINVAL;
if (flags & TIMER_ABSTIME)
rmtp = NULL;
+ current->restart_block.fn = do_no_restart_syscall;
current->restart_block.nanosleep.type = rmtp ? TT_COMPAT : TT_NONE;
current->restart_block.nanosleep.compat_rmtp = rmtp;
diff --git a/kernel/time/test_udelay.c b/kernel/time/test_udelay.c
index 13b11eb62685..20d5df631570 100644
--- a/kernel/time/test_udelay.c
+++ b/kernel/time/test_udelay.c
@@ -149,7 +149,7 @@ module_init(udelay_test_init);
static void __exit udelay_test_exit(void)
{
mutex_lock(&udelay_test_lock);
- debugfs_remove(debugfs_lookup(DEBUGFS_FILENAME, NULL));
+ debugfs_lookup_and_remove(DEBUGFS_FILENAME, NULL);
mutex_unlock(&udelay_test_lock);
}
diff --git a/kernel/time/tick-broadcast-hrtimer.c b/kernel/time/tick-broadcast-hrtimer.c
index 797eb93103ad..e28f9210f8a1 100644
--- a/kernel/time/tick-broadcast-hrtimer.c
+++ b/kernel/time/tick-broadcast-hrtimer.c
@@ -56,25 +56,20 @@ static int bc_set_next(ktime_t expires, struct clock_event_device *bc)
* hrtimer callback function is currently running, then
* hrtimer_start() cannot move it and the timer stays on the CPU on
* which it is assigned at the moment.
+ */
+ hrtimer_start(&bctimer, expires, HRTIMER_MODE_ABS_PINNED_HARD);
+ /*
+ * The core tick broadcast mode expects bc->bound_on to be set
+ * correctly to prevent a CPU which has the broadcast hrtimer
+ * armed from going deep idle.
*
- * As this can be called from idle code, the hrtimer_start()
- * invocation has to be wrapped with RCU_NONIDLE() as
- * hrtimer_start() can call into tracing.
+ * As tick_broadcast_lock is held, nothing can change the cpu
+ * base which was just established in hrtimer_start() above. So
+ * the below access is safe even without holding the hrtimer
+ * base lock.
*/
- RCU_NONIDLE( {
- hrtimer_start(&bctimer, expires, HRTIMER_MODE_ABS_PINNED_HARD);
- /*
- * The core tick broadcast mode expects bc->bound_on to be set
- * correctly to prevent a CPU which has the broadcast hrtimer
- * armed from going deep idle.
- *
- * As tick_broadcast_lock is held, nothing can change the cpu
- * base which was just established in hrtimer_start() above. So
- * the below access is safe even without holding the hrtimer
- * base lock.
- */
- bc->bound_on = bctimer.base->cpu_base->cpu;
- } );
+ bc->bound_on = bctimer.base->cpu_base->cpu;
+
return 0;
}
diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c
index f7fe6fe36173..93bf2b4e47e5 100644
--- a/kernel/time/tick-broadcast.c
+++ b/kernel/time/tick-broadcast.c
@@ -622,9 +622,13 @@ struct cpumask *tick_get_broadcast_oneshot_mask(void)
* to avoid a deep idle transition as we are about to get the
* broadcast IPI right away.
*/
-int tick_check_broadcast_expired(void)
+noinstr int tick_check_broadcast_expired(void)
{
+#ifdef _ASM_GENERIC_BITOPS_INSTRUMENTED_NON_ATOMIC_H
+ return arch_test_bit(smp_processor_id(), cpumask_bits(tick_broadcast_force_mask));
+#else
return cpumask_test_cpu(smp_processor_id(), tick_broadcast_force_mask);
+#endif
}
/*
diff --git a/kernel/time/tick-oneshot.c b/kernel/time/tick-oneshot.c
index 475ecceda768..5e2c2c26b3cc 100644
--- a/kernel/time/tick-oneshot.c
+++ b/kernel/time/tick-oneshot.c
@@ -18,7 +18,7 @@
#include "tick-internal.h"
/**
- * tick_program_event
+ * tick_program_event - program the CPU local timer device for the next event
*/
int tick_program_event(ktime_t expires, int force)
{
@@ -99,7 +99,7 @@ int tick_switch_to_oneshot(void (*handler)(struct clock_event_device *))
}
/**
- * tick_check_oneshot_mode - check whether the system is in oneshot mode
+ * tick_oneshot_mode_active - check whether the system is in oneshot mode
*
* returns 1 when either nohz or highres are enabled. otherwise 0.
*/
diff --git a/kernel/time/time.c b/kernel/time/time.c
index 526257b3727c..f4198af60fee 100644
--- a/kernel/time/time.c
+++ b/kernel/time/time.c
@@ -462,7 +462,7 @@ struct __kernel_old_timeval ns_to_kernel_old_timeval(s64 nsec)
EXPORT_SYMBOL(ns_to_kernel_old_timeval);
/**
- * set_normalized_timespec - set timespec sec and nsec parts and normalize
+ * set_normalized_timespec64 - set timespec sec and nsec parts and normalize
*
* @ts: pointer to timespec variable to be set
* @sec: seconds to set
@@ -526,7 +526,7 @@ struct timespec64 ns_to_timespec64(s64 nsec)
EXPORT_SYMBOL(ns_to_timespec64);
/**
- * msecs_to_jiffies: - convert milliseconds to jiffies
+ * __msecs_to_jiffies: - convert milliseconds to jiffies
* @m: time in milliseconds
*
* conversion is done as follows:
@@ -541,12 +541,12 @@ EXPORT_SYMBOL(ns_to_timespec64);
* handling any 32-bit overflows.
* for the details see __msecs_to_jiffies()
*
- * msecs_to_jiffies() checks for the passed in value being a constant
+ * __msecs_to_jiffies() checks for the passed in value being a constant
* via __builtin_constant_p() allowing gcc to eliminate most of the
* code, __msecs_to_jiffies() is called if the value passed does not
* allow constant folding and the actual conversion must be done at
* runtime.
- * the _msecs_to_jiffies helpers are the HZ dependent conversion
+ * The _msecs_to_jiffies helpers are the HZ dependent conversion
* routines found in include/linux/jiffies.h
*/
unsigned long __msecs_to_jiffies(const unsigned int m)
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index f72b9f1de178..5579ead449f2 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -1590,10 +1590,10 @@ void __weak read_persistent_clock64(struct timespec64 *ts)
/**
* read_persistent_wall_and_boot_offset - Read persistent clock, and also offset
* from the boot.
+ * @wall_time: current time as returned by persistent clock
+ * @boot_offset: offset that is defined as wall_time - boot_time
*
* Weak dummy function for arches that do not yet support it.
- * @wall_time: - current time as returned by persistent clock
- * @boot_offset: - offset that is defined as wall_time - boot_time
*
* The default function calculates offset based on the current value of
* local_clock(). This way architectures that support sched_clock() but don't
@@ -1701,7 +1701,7 @@ static void __timekeeping_inject_sleeptime(struct timekeeper *tk,
}
#if defined(CONFIG_PM_SLEEP) && defined(CONFIG_RTC_HCTOSYS_DEVICE)
-/**
+/*
* We have three kinds of time sources to use for sleep time
* injection, the preference order is:
* 1) non-stop clocksource
@@ -1722,7 +1722,7 @@ bool timekeeping_rtc_skipresume(void)
return !suspend_timing_needed;
}
-/**
+/*
* 1) can be determined whether to use or not only when doing
* timekeeping_resume() which is invoked after rtc_suspend(),
* so we can't skip rtc_suspend() surely if system has 1).
diff --git a/kernel/time/timer.c b/kernel/time/timer.c
index 717fcb9fb14a..63a8ce7177dd 100644
--- a/kernel/time/timer.c
+++ b/kernel/time/timer.c
@@ -1017,7 +1017,7 @@ __mod_timer(struct timer_list *timer, unsigned long expires, unsigned int option
unsigned int idx = UINT_MAX;
int ret = 0;
- BUG_ON(!timer->function);
+ debug_assert_init(timer);
/*
* This is a common optimization triggered by the networking code - if
@@ -1044,6 +1044,14 @@ __mod_timer(struct timer_list *timer, unsigned long expires, unsigned int option
* dequeue/enqueue dance.
*/
base = lock_timer_base(timer, &flags);
+ /*
+ * Has @timer been shutdown? This needs to be evaluated
+ * while holding base lock to prevent a race against the
+ * shutdown code.
+ */
+ if (!timer->function)
+ goto out_unlock;
+
forward_timer_base(base);
if (timer_pending(timer) && (options & MOD_TIMER_REDUCE) &&
@@ -1070,6 +1078,14 @@ __mod_timer(struct timer_list *timer, unsigned long expires, unsigned int option
}
} else {
base = lock_timer_base(timer, &flags);
+ /*
+ * Has @timer been shutdown? This needs to be evaluated
+ * while holding base lock to prevent a race against the
+ * shutdown code.
+ */
+ if (!timer->function)
+ goto out_unlock;
+
forward_timer_base(base);
}
@@ -1083,7 +1099,7 @@ __mod_timer(struct timer_list *timer, unsigned long expires, unsigned int option
/*
* We are trying to schedule the timer on the new base.
* However we can't change timer's base while it is running,
- * otherwise del_timer_sync() can't detect that the timer's
+ * otherwise timer_delete_sync() can't detect that the timer's
* handler yet has not finished. This also guarantees that the
* timer is serialized wrt itself.
*/
@@ -1121,14 +1137,20 @@ out_unlock:
}
/**
- * mod_timer_pending - modify a pending timer's timeout
- * @timer: the pending timer to be modified
- * @expires: new timeout in jiffies
+ * mod_timer_pending - Modify a pending timer's timeout
+ * @timer: The pending timer to be modified
+ * @expires: New absolute timeout in jiffies
+ *
+ * mod_timer_pending() is the same for pending timers as mod_timer(), but
+ * will not activate inactive timers.
*
- * mod_timer_pending() is the same for pending timers as mod_timer(),
- * but will not re-activate and modify already deleted timers.
+ * If @timer->function == NULL then the start operation is silently
+ * discarded.
*
- * It is useful for unserialized use of timers.
+ * Return:
+ * * %0 - The timer was inactive and not modified or was in
+ * shutdown state and the operation was discarded
+ * * %1 - The timer was active and requeued to expire at @expires
*/
int mod_timer_pending(struct timer_list *timer, unsigned long expires)
{
@@ -1137,24 +1159,31 @@ int mod_timer_pending(struct timer_list *timer, unsigned long expires)
EXPORT_SYMBOL(mod_timer_pending);
/**
- * mod_timer - modify a timer's timeout
- * @timer: the timer to be modified
- * @expires: new timeout in jiffies
- *
- * mod_timer() is a more efficient way to update the expire field of an
- * active timer (if the timer is inactive it will be activated)
+ * mod_timer - Modify a timer's timeout
+ * @timer: The timer to be modified
+ * @expires: New absolute timeout in jiffies
*
* mod_timer(timer, expires) is equivalent to:
*
* del_timer(timer); timer->expires = expires; add_timer(timer);
*
+ * mod_timer() is more efficient than the above open coded sequence. In
+ * case that the timer is inactive, the del_timer() part is a NOP. The
+ * timer is in any case activated with the new expiry time @expires.
+ *
* Note that if there are multiple unserialized concurrent users of the
* same timer, then mod_timer() is the only safe way to modify the timeout,
* since add_timer() cannot modify an already running timer.
*
- * The function returns whether it has modified a pending timer or not.
- * (ie. mod_timer() of an inactive timer returns 0, mod_timer() of an
- * active timer returns 1.)
+ * If @timer->function == NULL then the start operation is silently
+ * discarded. In this case the return value is 0 and meaningless.
+ *
+ * Return:
+ * * %0 - The timer was inactive and started or was in shutdown
+ * state and the operation was discarded
+ * * %1 - The timer was active and requeued to expire at @expires or
+ * the timer was active and not modified because @expires did
+ * not change the effective expiry time
*/
int mod_timer(struct timer_list *timer, unsigned long expires)
{
@@ -1165,11 +1194,22 @@ EXPORT_SYMBOL(mod_timer);
/**
* timer_reduce - Modify a timer's timeout if it would reduce the timeout
* @timer: The timer to be modified
- * @expires: New timeout in jiffies
+ * @expires: New absolute timeout in jiffies
*
* timer_reduce() is very similar to mod_timer(), except that it will only
- * modify a running timer if that would reduce the expiration time (it will
- * start a timer that isn't running).
+ * modify an enqueued timer if that would reduce the expiration time. If
+ * @timer is not enqueued it starts the timer.
+ *
+ * If @timer->function == NULL then the start operation is silently
+ * discarded.
+ *
+ * Return:
+ * * %0 - The timer was inactive and started or was in shutdown
+ * state and the operation was discarded
+ * * %1 - The timer was active and requeued to expire at @expires or
+ * the timer was active and not modified because @expires
+ * did not change the effective expiry time such that the
+ * timer would expire earlier than already scheduled
*/
int timer_reduce(struct timer_list *timer, unsigned long expires)
{
@@ -1178,39 +1218,51 @@ int timer_reduce(struct timer_list *timer, unsigned long expires)
EXPORT_SYMBOL(timer_reduce);
/**
- * add_timer - start a timer
- * @timer: the timer to be added
+ * add_timer - Start a timer
+ * @timer: The timer to be started
*
- * The kernel will do a ->function(@timer) callback from the
- * timer interrupt at the ->expires point in the future. The
- * current time is 'jiffies'.
+ * Start @timer to expire at @timer->expires in the future. @timer->expires
+ * is the absolute expiry time measured in 'jiffies'. When the timer expires
+ * timer->function(timer) will be invoked from soft interrupt context.
*
- * The timer's ->expires, ->function fields must be set prior calling this
- * function.
+ * The @timer->expires and @timer->function fields must be set prior
+ * to calling this function.
*
- * Timers with an ->expires field in the past will be executed in the next
- * timer tick.
+ * If @timer->function == NULL then the start operation is silently
+ * discarded.
+ *
+ * If @timer->expires is already in the past @timer will be queued to
+ * expire at the next timer tick.
+ *
+ * This can only operate on an inactive timer. Attempts to invoke this on
+ * an active timer are rejected with a warning.
*/
void add_timer(struct timer_list *timer)
{
- BUG_ON(timer_pending(timer));
+ if (WARN_ON_ONCE(timer_pending(timer)))
+ return;
__mod_timer(timer, timer->expires, MOD_TIMER_NOTPENDING);
}
EXPORT_SYMBOL(add_timer);
/**
- * add_timer_on - start a timer on a particular CPU
- * @timer: the timer to be added
- * @cpu: the CPU to start it on
+ * add_timer_on - Start a timer on a particular CPU
+ * @timer: The timer to be started
+ * @cpu: The CPU to start it on
+ *
+ * Same as add_timer() except that it starts the timer on the given CPU.
*
- * This is not very scalable on SMP. Double adds are not possible.
+ * See add_timer() for further details.
*/
void add_timer_on(struct timer_list *timer, int cpu)
{
struct timer_base *new_base, *base;
unsigned long flags;
- BUG_ON(timer_pending(timer) || !timer->function);
+ debug_assert_init(timer);
+
+ if (WARN_ON_ONCE(timer_pending(timer)))
+ return;
new_base = get_timer_cpu_base(timer->flags, cpu);
@@ -1220,6 +1272,13 @@ void add_timer_on(struct timer_list *timer, int cpu)
* wrong base locked. See lock_timer_base().
*/
base = lock_timer_base(timer, &flags);
+ /*
+ * Has @timer been shutdown? This needs to be evaluated while
+ * holding base lock to prevent a race against the shutdown code.
+ */
+ if (!timer->function)
+ goto out_unlock;
+
if (base != new_base) {
timer->flags |= TIMER_MIGRATING;
@@ -1233,22 +1292,27 @@ void add_timer_on(struct timer_list *timer, int cpu)
debug_timer_activate(timer);
internal_add_timer(base, timer);
+out_unlock:
raw_spin_unlock_irqrestore(&base->lock, flags);
}
EXPORT_SYMBOL_GPL(add_timer_on);
/**
- * del_timer - deactivate a timer.
- * @timer: the timer to be deactivated
- *
- * del_timer() deactivates a timer - this works on both active and inactive
- * timers.
- *
- * The function returns whether it has deactivated a pending timer or not.
- * (ie. del_timer() of an inactive timer returns 0, del_timer() of an
- * active timer returns 1.)
+ * __timer_delete - Internal function: Deactivate a timer
+ * @timer: The timer to be deactivated
+ * @shutdown: If true, this indicates that the timer is about to be
+ * shutdown permanently.
+ *
+ * If @shutdown is true then @timer->function is set to NULL under the
+ * timer base lock which prevents further rearming of the timer. In that
+ * case any attempt to rearm @timer after this function returns will be
+ * silently ignored.
+ *
+ * Return:
+ * * %0 - The timer was not pending
+ * * %1 - The timer was pending and deactivated
*/
-int del_timer(struct timer_list *timer)
+static int __timer_delete(struct timer_list *timer, bool shutdown)
{
struct timer_base *base;
unsigned long flags;
@@ -1256,24 +1320,90 @@ int del_timer(struct timer_list *timer)
debug_assert_init(timer);
- if (timer_pending(timer)) {
+ /*
+ * If @shutdown is set then the lock has to be taken whether the
+ * timer is pending or not to protect against a concurrent rearm
+ * which might hit between the lockless pending check and the lock
+ * acquisition. By taking the lock it is ensured that such a newly
+ * enqueued timer is dequeued and cannot end up with
+ * timer->function == NULL in the expiry code.
+ *
+ * If timer->function is currently executed, then this makes sure
+ * that the callback cannot requeue the timer.
+ */
+ if (timer_pending(timer) || shutdown) {
base = lock_timer_base(timer, &flags);
ret = detach_if_pending(timer, base, true);
+ if (shutdown)
+ timer->function = NULL;
raw_spin_unlock_irqrestore(&base->lock, flags);
}
return ret;
}
-EXPORT_SYMBOL(del_timer);
/**
- * try_to_del_timer_sync - Try to deactivate a timer
- * @timer: timer to delete
+ * timer_delete - Deactivate a timer
+ * @timer: The timer to be deactivated
+ *
+ * The function only deactivates a pending timer, but contrary to
+ * timer_delete_sync() it does not take into account whether the timer's
+ * callback function is concurrently executed on a different CPU or not.
+ * Nor does it prevent rearming of the timer. If @timer can be rearmed
+ * concurrently then the return value of this function is meaningless.
+ *
+ * Return:
+ * * %0 - The timer was not pending
+ * * %1 - The timer was pending and deactivated
+ */
+int timer_delete(struct timer_list *timer)
+{
+ /* shutdown == false: the helper leaves @timer->function untouched */
+ return __timer_delete(timer, false);
+}
+EXPORT_SYMBOL(timer_delete);
+
+/**
+ * timer_shutdown - Deactivate a timer and prevent rearming
+ * @timer: The timer to be deactivated
*
- * This function tries to deactivate a timer. Upon successful (ret >= 0)
- * exit the timer is not queued and the handler is not running on any CPU.
+ * The function does not wait for a possibly running timer callback on a
+ * different CPU but it prevents rearming of the timer. Any attempt to arm
+ * @timer after this function returns will be silently ignored.
+ *
+ * This function is useful for teardown code and should only be used when
+ * timer_shutdown_sync() cannot be invoked due to locking or context constraints.
+ *
+ * Return:
+ * * %0 - The timer was not pending
+ * * %1 - The timer was pending
*/
-int try_to_del_timer_sync(struct timer_list *timer)
+int timer_shutdown(struct timer_list *timer)
+{
+ /* shutdown == true: the helper clears @timer->function under the base lock */
+ return __timer_delete(timer, true);
+}
+EXPORT_SYMBOL_GPL(timer_shutdown);
+
+/**
+ * __try_to_del_timer_sync - Internal function: Try to deactivate a timer
+ * @timer: Timer to deactivate
+ * @shutdown: If true, this indicates that the timer is about to be
+ * shut down permanently.
+ *
+ * If @shutdown is true then @timer->function is set to NULL under the
+ * timer base lock which prevents further rearming of the timer. Any
+ * attempt to rearm @timer after this function returns will be silently
+ * ignored.
+ *
+ * This function cannot guarantee that the timer cannot be rearmed
+ * right after dropping the base lock if @shutdown is false. That
+ * needs to be prevented by the calling code if necessary.
+ *
+ * Return:
+ * * %0 - The timer was not pending
+ * * %1 - The timer was pending and deactivated
+ * * %-1 - The timer callback function is running on a different CPU
+ */
+static int __try_to_del_timer_sync(struct timer_list *timer, bool shutdown)
{
struct timer_base *base;
unsigned long flags;
@@ -1285,11 +1415,34 @@ int try_to_del_timer_sync(struct timer_list *timer)
if (base->running_timer != timer)
ret = detach_if_pending(timer, base, true);
+ if (shutdown)
+ timer->function = NULL;
raw_spin_unlock_irqrestore(&base->lock, flags);
return ret;
}
+
+/**
+ * try_to_del_timer_sync - Try to deactivate a timer
+ * @timer: Timer to deactivate
+ *
+ * This function tries to deactivate a timer. On success the timer is not
+ * queued and the timer callback function is not running on any CPU.
+ *
+ * This function does not guarantee that the timer cannot be rearmed right
+ * after dropping the base lock. That needs to be prevented by the calling
+ * code if necessary.
+ *
+ * Return:
+ * * %0 - The timer was not pending
+ * * %1 - The timer was pending and deactivated
+ * * %-1 - The timer callback function is running on a different CPU
+ */
+int try_to_del_timer_sync(struct timer_list *timer)
+{
+ /* shutdown == false: @timer->function is not cleared by the helper */
+ return __try_to_del_timer_sync(timer, false);
+}
EXPORT_SYMBOL(try_to_del_timer_sync);
#ifdef CONFIG_PREEMPT_RT
@@ -1365,44 +1518,29 @@ static inline void timer_sync_wait_running(struct timer_base *base) { }
static inline void del_timer_wait_running(struct timer_list *timer) { }
#endif
-#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT)
/**
- * del_timer_sync - deactivate a timer and wait for the handler to finish.
- * @timer: the timer to be deactivated
- *
- * This function only differs from del_timer() on SMP: besides deactivating
- * the timer it also makes sure the handler has finished executing on other
- * CPUs.
- *
- * Synchronization rules: Callers must prevent restarting of the timer,
- * otherwise this function is meaningless. It must not be called from
- * interrupt contexts unless the timer is an irqsafe one. The caller must
- * not hold locks which would prevent completion of the timer's
- * handler. The timer's handler must not call add_timer_on(). Upon exit the
- * timer is not queued and the handler is not running on any CPU.
- *
- * Note: For !irqsafe timers, you must not hold locks that are held in
- * interrupt context while calling this function. Even if the lock has
- * nothing to do with the timer in question. Here's why::
- *
- * CPU0 CPU1
- * ---- ----
- * <SOFTIRQ>
- * call_timer_fn();
- * base->running_timer = mytimer;
- * spin_lock_irq(somelock);
- * <IRQ>
- * spin_lock(somelock);
- * del_timer_sync(mytimer);
- * while (base->running_timer == mytimer);
- *
- * Now del_timer_sync() will never return and never release somelock.
- * The interrupt on the other CPU is waiting to grab somelock but
- * it has interrupted the softirq that CPU0 is waiting to finish.
- *
- * The function returns whether it has deactivated a pending timer or not.
+ * __timer_delete_sync - Internal function: Deactivate a timer and wait
+ * for the handler to finish.
+ * @timer: The timer to be deactivated
+ * @shutdown: If true, @timer->function will be set to NULL under the
+ * timer base lock which prevents rearming of @timer
+ *
+ * If @shutdown is not set the timer can be rearmed later. If the timer can
+ * be rearmed concurrently, i.e. after dropping the base lock then the
+ * return value is meaningless.
+ *
+ * If @shutdown is set then @timer->function is set to NULL under timer
+ * base lock which prevents rearming of the timer. Any attempt to rearm
+ * a shutdown timer is silently ignored.
+ *
+ * If the timer should be reused after shutdown it has to be initialized
+ * again.
+ *
+ * Return:
+ * * %0 - The timer was not pending
+ * * %1 - The timer was pending and deactivated
*/
-int del_timer_sync(struct timer_list *timer)
+static int __timer_delete_sync(struct timer_list *timer, bool shutdown)
{
int ret;
@@ -1422,7 +1560,7 @@ int del_timer_sync(struct timer_list *timer)
* don't use it in hardirq context, because it
* could lead to deadlock.
*/
- WARN_ON(in_irq() && !(timer->flags & TIMER_IRQSAFE));
+ WARN_ON(in_hardirq() && !(timer->flags & TIMER_IRQSAFE));
/*
* Must be able to sleep on PREEMPT_RT because of the slowpath in
@@ -1432,7 +1570,7 @@ int del_timer_sync(struct timer_list *timer)
lockdep_assert_preemption_enabled();
do {
- ret = try_to_del_timer_sync(timer);
+ ret = __try_to_del_timer_sync(timer, shutdown);
if (unlikely(ret < 0)) {
del_timer_wait_running(timer);
@@ -1442,8 +1580,96 @@ int del_timer_sync(struct timer_list *timer)
return ret;
}
-EXPORT_SYMBOL(del_timer_sync);
-#endif
+
+/**
+ * timer_delete_sync - Deactivate a timer and wait for the handler to finish.
+ * @timer: The timer to be deactivated
+ *
+ * Synchronization rules: Callers must prevent restarting of the timer,
+ * otherwise this function is meaningless. It must not be called from
+ * interrupt contexts unless the timer is an irqsafe one. The caller must
+ * not hold locks which would prevent completion of the timer's callback
+ * function. The timer's handler must not call add_timer_on(). Upon exit
+ * the timer is not queued and the handler is not running on any CPU.
+ *
+ * For !irqsafe timers, the caller must not hold locks that are held in
+ * interrupt context. Even if the lock has nothing to do with the timer in
+ * question. Here's why::
+ *
+ * CPU0 CPU1
+ * ---- ----
+ * <SOFTIRQ>
+ * call_timer_fn();
+ * base->running_timer = mytimer;
+ * spin_lock_irq(somelock);
+ * <IRQ>
+ * spin_lock(somelock);
+ * timer_delete_sync(mytimer);
+ * while (base->running_timer == mytimer);
+ *
+ * Now timer_delete_sync() will never return and never release somelock.
+ * The interrupt on the other CPU is waiting to grab somelock but it has
+ * interrupted the softirq that CPU0 is waiting to finish.
+ *
+ * This function cannot guarantee that the timer is not rearmed again by
+ * some concurrent or preempting code, right after it dropped the base
+ * lock. If there is the possibility of a concurrent rearm then the return
+ * value of the function is meaningless.
+ *
+ * If such a guarantee is needed, e.g. for teardown situations then use
+ * timer_shutdown_sync() instead.
+ *
+ * Return:
+ * * %0 - The timer was not pending
+ * * %1 - The timer was pending and deactivated
+ */
+int timer_delete_sync(struct timer_list *timer)
+{
+ /* shutdown == false: passed down so that @timer->function is kept */
+ return __timer_delete_sync(timer, false);
+}
+EXPORT_SYMBOL(timer_delete_sync);
+
+/**
+ * timer_shutdown_sync - Shutdown a timer and prevent rearming
+ * @timer: The timer to be shut down
+ *
+ * When the function returns it is guaranteed that:
+ * - @timer is not queued
+ * - The callback function of @timer is not running
+ * - @timer cannot be enqueued again. Any attempt to rearm
+ * @timer is silently ignored.
+ *
+ * See timer_delete_sync() for synchronization rules.
+ *
+ * This function is useful for final teardown of an infrastructure where
+ * the timer is subject to a circular dependency problem.
+ *
+ * A common pattern for this is a timer and a workqueue where the timer can
+ * schedule work and work can arm the timer. On shutdown the workqueue must
+ * be destroyed and the timer must be prevented from rearming. Unless the
+ * code has conditionals like 'if (mything->in_shutdown)' to prevent that
+ * there is no way to get this correct with timer_delete_sync().
+ *
+ * timer_shutdown_sync() solves this problem. The correct ordering of
+ * calls in this case is:
+ *
+ * timer_shutdown_sync(&mything->timer);
+ * workqueue_destroy(&mything->workqueue);
+ *
+ * After this 'mything' can be safely freed.
+ *
+ * This obviously implies that the timer is not required to be functional
+ * for the rest of the shutdown operation.
+ *
+ * Return:
+ * * %0 - The timer was not pending
+ * * %1 - The timer was pending
+ */
+int timer_shutdown_sync(struct timer_list *timer)
+{
+ /* shutdown == true: passed down so that @timer->function is set to NULL */
+ return __timer_delete_sync(timer, true);
+}
+EXPORT_SYMBOL_GPL(timer_shutdown_sync);
static void call_timer_fn(struct timer_list *timer,
void (*fn)(struct timer_list *),
@@ -1465,8 +1691,8 @@ static void call_timer_fn(struct timer_list *timer,
#endif
/*
* Couple the lock chain with the lock chain at
- * del_timer_sync() by acquiring the lock_map around the fn()
- * call here and in del_timer_sync().
+ * timer_delete_sync() by acquiring the lock_map around the fn()
+ * call here and in timer_delete_sync().
*/
lock_map_acquire(&lockdep_map);
@@ -1509,6 +1735,12 @@ static void expire_timers(struct timer_base *base, struct hlist_head *head)
fn = timer->function;
+ if (WARN_ON_ONCE(!fn)) {
+ /* Should never happen. Emphasis on should! */
+ base->running_timer = NULL;
+ continue;
+ }
+
if (timer->flags & TIMER_IRQSAFE) {
raw_spin_unlock(&base->lock);
call_timer_fn(timer, fn, baseclk);
@@ -1933,7 +2165,7 @@ signed long __sched schedule_timeout(signed long timeout)
timer_setup_on_stack(&timer.timer, process_timeout, 0);
__mod_timer(&timer.timer, expire, MOD_TIMER_NOTPENDING);
schedule();
- del_singleshot_timer_sync(&timer.timer);
+ del_timer_sync(&timer.timer);
/* Remove the timer from the object tracker */
destroy_timer_on_stack(&timer.timer);
@@ -2017,8 +2249,6 @@ int timers_dead_cpu(unsigned int cpu)
struct timer_base *new_base;
int b, i;
- BUG_ON(cpu_online(cpu));
-
for (b = 0; b < NR_BASES; b++) {
old_base = per_cpu_ptr(&timer_bases[b], cpu);
new_base = get_cpu_ptr(&timer_bases[b]);
@@ -2035,7 +2265,8 @@ int timers_dead_cpu(unsigned int cpu)
*/
forward_timer_base(new_base);
- BUG_ON(old_base->running_timer);
+ WARN_ON_ONCE(old_base->running_timer);
+ old_base->running_timer = NULL;
for (i = 0; i < WHEEL_SIZE; i++)
migrate_timer_list(new_base, old_base->vectors + i);
diff --git a/kernel/torture.c b/kernel/torture.c
index 789aeb0e1159..1a0519b836ac 100644
--- a/kernel/torture.c
+++ b/kernel/torture.c
@@ -450,7 +450,7 @@ unsigned long
torture_random(struct torture_random_state *trsp)
{
if (--trsp->trs_count < 0) {
- trsp->trs_state += (unsigned long)local_clock();
+ trsp->trs_state += (unsigned long)local_clock() + raw_smp_processor_id();
trsp->trs_count = TORTURE_RANDOM_REFRESH;
}
trsp->trs_state = trsp->trs_state * TORTURE_RANDOM_MULT +
@@ -915,7 +915,7 @@ void torture_kthread_stopping(char *title)
VERBOSE_TOROUT_STRING(buf);
while (!kthread_should_stop()) {
torture_shutdown_absorb(title);
- schedule_timeout_uninterruptible(1);
+ schedule_timeout_uninterruptible(HZ / 20);
}
}
EXPORT_SYMBOL_GPL(torture_kthread_stopping);
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index e9e95c790b8e..a856d4a34c67 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -42,14 +42,17 @@ config HAVE_DYNAMIC_FTRACE_WITH_REGS
config HAVE_DYNAMIC_FTRACE_WITH_DIRECT_CALLS
bool
+config HAVE_DYNAMIC_FTRACE_WITH_CALL_OPS
+ bool
+
config HAVE_DYNAMIC_FTRACE_WITH_ARGS
bool
help
If this is set, then arguments and stack can be found from
- the pt_regs passed into the function callback regs parameter
+ the ftrace_regs passed into the function callback regs parameter
by default, even without setting the REGS flag in the ftrace_ops.
- This allows for use of regs_get_kernel_argument() and
- kernel_stack_pointer().
+ This allows for use of ftrace_regs_get_argument() and
+ ftrace_regs_get_stack_pointer().
config HAVE_DYNAMIC_FTRACE_NO_PATCHABLE
bool
@@ -82,6 +85,13 @@ config HAVE_OBJTOOL_MCOUNT
help
Arch supports objtool --mcount
+config HAVE_OBJTOOL_NOP_MCOUNT
+ bool
+ help
+ Arch supports the objtool options --mcount with --mnop.
+ An architecture can select this if it wants to enable nop'ing
+ of ftrace locations.
+
config HAVE_C_RECORDMCOUNT
bool
help
@@ -232,7 +242,7 @@ config DYNAMIC_FTRACE
enabled, and the functions not enabled will not affect
performance of the system.
- See the files in /sys/kernel/debug/tracing:
+ See the files in /sys/kernel/tracing:
available_filter_functions
set_ftrace_filter
set_ftrace_notrace
@@ -250,6 +260,10 @@ config DYNAMIC_FTRACE_WITH_DIRECT_CALLS
depends on DYNAMIC_FTRACE_WITH_REGS
depends on HAVE_DYNAMIC_FTRACE_WITH_DIRECT_CALLS
+config DYNAMIC_FTRACE_WITH_CALL_OPS
+ def_bool y
+ depends on HAVE_DYNAMIC_FTRACE_WITH_CALL_OPS
+
config DYNAMIC_FTRACE_WITH_ARGS
def_bool y
depends on DYNAMIC_FTRACE
@@ -292,7 +306,7 @@ config STACK_TRACER
select KALLSYMS
help
This special tracer records the maximum stack footprint of the
- kernel and displays it in /sys/kernel/debug/tracing/stack_trace.
+ kernel and displays it in /sys/kernel/tracing/stack_trace.
This tracer works by hooking into every function call that the
kernel executes, and keeping a maximum stack depth value and
@@ -332,7 +346,7 @@ config IRQSOFF_TRACER
disabled by default and can be runtime (re-)started
via:
- echo 0 > /sys/kernel/debug/tracing/tracing_max_latency
+ echo 0 > /sys/kernel/tracing/tracing_max_latency
(Note that kernel size and overhead increase with this option
enabled. This option and the preempt-off timing option can be
@@ -356,7 +370,7 @@ config PREEMPT_TRACER
disabled by default and can be runtime (re-)started
via:
- echo 0 > /sys/kernel/debug/tracing/tracing_max_latency
+ echo 0 > /sys/kernel/tracing/tracing_max_latency
(Note that kernel size and overhead increase with this option
enabled. This option and the irqs-off timing option can be
@@ -375,6 +389,7 @@ config SCHED_TRACER
config HWLAT_TRACER
bool "Tracer to detect hardware latencies (like SMIs)"
select GENERIC_TRACER
+ select TRACER_MAX_TRACE
help
This tracer, when enabled will create one or more kernel threads,
depending on what the cpumask file is set to, which each thread
@@ -410,6 +425,7 @@ config HWLAT_TRACER
config OSNOISE_TRACER
bool "OS Noise tracer"
select GENERIC_TRACER
+ select TRACER_MAX_TRACE
help
In the context of high-performance computing (HPC), the Operating
System Noise (osnoise) refers to the interference experienced by an
@@ -506,7 +522,7 @@ config TRACER_SNAPSHOT
Allow tracing users to take snapshot of the current buffer using the
ftrace interface, e.g.:
- echo 1 > /sys/kernel/debug/tracing/snapshot
+ echo 1 > /sys/kernel/tracing/snapshot
cat snapshot
config TRACER_SNAPSHOT_PER_CPU_SWAP
@@ -518,7 +534,7 @@ config TRACER_SNAPSHOT_PER_CPU_SWAP
full swap (all buffers). If this is set, then the following is
allowed:
- echo 1 > /sys/kernel/debug/tracing/per_cpu/cpu2/snapshot
+ echo 1 > /sys/kernel/tracing/per_cpu/cpu2/snapshot
After which, only the tracing buffer for CPU 2 was swapped with
the main tracing buffer, and the other CPU buffers remain the same.
@@ -565,7 +581,7 @@ config PROFILE_ANNOTATED_BRANCHES
This tracer profiles all likely and unlikely macros
in the kernel. It will display the results in:
- /sys/kernel/debug/tracing/trace_stat/branch_annotated
+ /sys/kernel/tracing/trace_stat/branch_annotated
Note: this will add a significant overhead; only turn this
on if you need to profile the system's use of these macros.
@@ -578,7 +594,7 @@ config PROFILE_ALL_BRANCHES
taken in the kernel is recorded whether it hit or miss.
The results will be displayed in:
- /sys/kernel/debug/tracing/trace_stat/branch_all
+ /sys/kernel/tracing/trace_stat/branch_all
This option also enables the likely/unlikely profiler.
@@ -629,8 +645,8 @@ config BLK_DEV_IO_TRACE
Tracing also is possible using the ftrace interface, e.g.:
echo 1 > /sys/block/sda/sda1/trace/enable
- echo blk > /sys/kernel/debug/tracing/current_tracer
- cat /sys/kernel/debug/tracing/trace_pipe
+ echo blk > /sys/kernel/tracing/current_tracer
+ cat /sys/kernel/tracing/trace_pipe
If unsure, say N.
@@ -924,8 +940,8 @@ config RING_BUFFER_RECORD_RECURSION
default y
help
The ring buffer has its own internal recursion. Although when
- recursion happens it wont cause harm because of the protection,
- but it does cause an unwanted overhead. Enabling this option will
+ recursion happens it won't cause harm because of the protection,
+ but it does cause unwanted overhead. Enabling this option will
place where recursion was detected into the ftrace "recursed_functions"
file.
@@ -1008,8 +1024,8 @@ config RING_BUFFER_STARTUP_TEST
The test runs for 10 seconds. This will slow your boot time
by at least 10 more seconds.
- At the end of the test, statics and more checks are done.
- It will output the stats of each per cpu buffer. What
+ At the end of the test, statistics and more checks are done.
+ It will output the stats of each per cpu buffer: what
was written, the sizes, what was read, what was lost, and
other similar details.
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index a995ea1ef849..5743be559415 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -320,8 +320,8 @@ static void blk_trace_free(struct request_queue *q, struct blk_trace *bt)
* under 'q->debugfs_dir', thus lookup and remove them.
*/
if (!bt->dir) {
- debugfs_remove(debugfs_lookup("dropped", q->debugfs_dir));
- debugfs_remove(debugfs_lookup("msg", q->debugfs_dir));
+ debugfs_lookup_and_remove("dropped", q->debugfs_dir);
+ debugfs_lookup_and_remove("msg", q->debugfs_dir);
} else {
debugfs_remove(bt->dir);
}
@@ -721,7 +721,7 @@ EXPORT_SYMBOL_GPL(blk_trace_startstop);
*/
/**
- * blk_trace_ioctl: - handle the ioctls associated with tracing
+ * blk_trace_ioctl - handle the ioctls associated with tracing
* @bdev: the block device
* @cmd: the ioctl cmd
* @arg: the argument data, if any
@@ -769,7 +769,7 @@ int blk_trace_ioctl(struct block_device *bdev, unsigned cmd, char __user *arg)
}
/**
- * blk_trace_shutdown: - stop and cleanup trace structures
+ * blk_trace_shutdown - stop and cleanup trace structures
* @q: the request queue associated with the device
*
**/
@@ -1548,7 +1548,8 @@ blk_trace_event_print_binary(struct trace_iterator *iter, int flags,
static enum print_line_t blk_tracer_print_line(struct trace_iterator *iter)
{
- if (!(blk_tracer_flags.val & TRACE_BLK_OPT_CLASSIC))
+ if ((iter->ent->type != TRACE_BLK) ||
+ !(blk_tracer_flags.val & TRACE_BLK_OPT_CLASSIC))
return TRACE_TYPE_UNHANDLED;
return print_one_line(iter, true);
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index 1ed08967fb97..e8da032bb6fc 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -6,6 +6,7 @@
#include <linux/types.h>
#include <linux/slab.h>
#include <linux/bpf.h>
+#include <linux/bpf_verifier.h>
#include <linux/bpf_perf_event.h>
#include <linux/btf.h>
#include <linux/filter.h>
@@ -368,8 +369,6 @@ static const struct bpf_func_proto *bpf_get_probe_write_proto(void)
return &bpf_probe_write_user_proto;
}
-static DEFINE_RAW_SPINLOCK(trace_printk_lock);
-
#define MAX_TRACE_PRINTK_VARARGS 3
#define BPF_TRACE_PRINTK_SIZE 1024
@@ -377,23 +376,22 @@ BPF_CALL_5(bpf_trace_printk, char *, fmt, u32, fmt_size, u64, arg1,
u64, arg2, u64, arg3)
{
u64 args[MAX_TRACE_PRINTK_VARARGS] = { arg1, arg2, arg3 };
- u32 *bin_args;
- static char buf[BPF_TRACE_PRINTK_SIZE];
- unsigned long flags;
+ struct bpf_bprintf_data data = {
+ .get_bin_args = true,
+ .get_buf = true,
+ };
int ret;
- ret = bpf_bprintf_prepare(fmt, fmt_size, args, &bin_args,
- MAX_TRACE_PRINTK_VARARGS);
+ ret = bpf_bprintf_prepare(fmt, fmt_size, args,
+ MAX_TRACE_PRINTK_VARARGS, &data);
if (ret < 0)
return ret;
- raw_spin_lock_irqsave(&trace_printk_lock, flags);
- ret = bstr_printf(buf, sizeof(buf), fmt, bin_args);
+ ret = bstr_printf(data.buf, MAX_BPRINTF_BUF, fmt, data.bin_args);
- trace_bpf_trace_printk(buf);
- raw_spin_unlock_irqrestore(&trace_printk_lock, flags);
+ trace_bpf_trace_printk(data.buf);
- bpf_bprintf_cleanup();
+ bpf_bprintf_cleanup(&data);
return ret;
}
@@ -426,30 +424,29 @@ const struct bpf_func_proto *bpf_get_trace_printk_proto(void)
return &bpf_trace_printk_proto;
}
-BPF_CALL_4(bpf_trace_vprintk, char *, fmt, u32, fmt_size, const void *, data,
+BPF_CALL_4(bpf_trace_vprintk, char *, fmt, u32, fmt_size, const void *, args,
u32, data_len)
{
- static char buf[BPF_TRACE_PRINTK_SIZE];
- unsigned long flags;
+ struct bpf_bprintf_data data = {
+ .get_bin_args = true,
+ .get_buf = true,
+ };
int ret, num_args;
- u32 *bin_args;
if (data_len & 7 || data_len > MAX_BPRINTF_VARARGS * 8 ||
- (data_len && !data))
+ (data_len && !args))
return -EINVAL;
num_args = data_len / 8;
- ret = bpf_bprintf_prepare(fmt, fmt_size, data, &bin_args, num_args);
+ ret = bpf_bprintf_prepare(fmt, fmt_size, args, num_args, &data);
if (ret < 0)
return ret;
- raw_spin_lock_irqsave(&trace_printk_lock, flags);
- ret = bstr_printf(buf, sizeof(buf), fmt, bin_args);
+ ret = bstr_printf(data.buf, MAX_BPRINTF_BUF, fmt, data.bin_args);
- trace_bpf_trace_printk(buf);
- raw_spin_unlock_irqrestore(&trace_printk_lock, flags);
+ trace_bpf_trace_printk(data.buf);
- bpf_bprintf_cleanup();
+ bpf_bprintf_cleanup(&data);
return ret;
}
@@ -471,23 +468,25 @@ const struct bpf_func_proto *bpf_get_trace_vprintk_proto(void)
}
BPF_CALL_5(bpf_seq_printf, struct seq_file *, m, char *, fmt, u32, fmt_size,
- const void *, data, u32, data_len)
+ const void *, args, u32, data_len)
{
+ struct bpf_bprintf_data data = {
+ .get_bin_args = true,
+ };
int err, num_args;
- u32 *bin_args;
if (data_len & 7 || data_len > MAX_BPRINTF_VARARGS * 8 ||
- (data_len && !data))
+ (data_len && !args))
return -EINVAL;
num_args = data_len / 8;
- err = bpf_bprintf_prepare(fmt, fmt_size, data, &bin_args, num_args);
+ err = bpf_bprintf_prepare(fmt, fmt_size, args, num_args, &data);
if (err < 0)
return err;
- seq_bprintf(m, fmt, bin_args);
+ seq_bprintf(m, fmt, data.bin_args);
- bpf_bprintf_cleanup();
+ bpf_bprintf_cleanup(&data);
return seq_has_overflowed(m) ? -EOVERFLOW : 0;
}
@@ -686,8 +685,7 @@ BPF_CALL_5(bpf_perf_event_output, struct pt_regs *, regs, struct bpf_map *, map,
}
perf_sample_data_init(sd, 0, 0);
- sd->raw = &raw;
- sd->sample_flags |= PERF_SAMPLE_RAW;
+ perf_sample_save_raw_data(sd, &raw);
err = __bpf_perf_event_output(regs, map, flags, sd);
@@ -745,8 +743,7 @@ u64 bpf_event_output(struct bpf_map *map, u64 flags, void *meta, u64 meta_size,
perf_fetch_caller_regs(regs);
perf_sample_data_init(sd, 0, 0);
- sd->raw = &raw;
- sd->sample_flags |= PERF_SAMPLE_RAW;
+ perf_sample_save_raw_data(sd, &raw);
ret = __bpf_perf_event_output(regs, map, flags, sd);
out:
@@ -773,7 +770,7 @@ BPF_CALL_0(bpf_get_current_task_btf)
const struct bpf_func_proto bpf_get_current_task_btf_proto = {
.func = bpf_get_current_task_btf,
.gpl_only = true,
- .ret_type = RET_PTR_TO_BTF_ID,
+ .ret_type = RET_PTR_TO_BTF_ID_TRUSTED,
.ret_btf_id = &btf_tracing_ids[BTF_TRACING_TYPE_TASK],
};
@@ -832,6 +829,7 @@ static void do_bpf_send_signal(struct irq_work *entry)
work = container_of(entry, struct send_signal_irq_work, irq_work);
group_send_sig_info(work->sig, SEND_SIG_PRIV, work->task, work->type);
+ put_task_struct(work->task);
}
static int bpf_send_signal_common(u32 sig, enum pid_type type)
@@ -847,6 +845,9 @@ static int bpf_send_signal_common(u32 sig, enum pid_type type)
return -EPERM;
if (unlikely(!nmi_uaccess_okay()))
return -EPERM;
+ /* Task should not be pid=1 to avoid kernel panic. */
+ if (unlikely(is_global_init(current)))
+ return -EPERM;
if (irqs_disabled()) {
/* Do an early check on signal validity. Otherwise,
@@ -863,7 +864,7 @@ static int bpf_send_signal_common(u32 sig, enum pid_type type)
* to the irq_work. The current task may change when queued
* irq works get executed.
*/
- work->task = current;
+ work->task = get_task_struct(current);
work->sig = sig;
work->type = type;
irq_work_queue(&work->irq_work);
@@ -1234,7 +1235,7 @@ __diag_ignore_all("-Wmissing-prototypes",
* Return: a bpf_key pointer with a valid key pointer if the key is found, a
* NULL pointer otherwise.
*/
-struct bpf_key *bpf_lookup_user_key(u32 serial, u64 flags)
+__bpf_kfunc struct bpf_key *bpf_lookup_user_key(u32 serial, u64 flags)
{
key_ref_t key_ref;
struct bpf_key *bkey;
@@ -1283,7 +1284,7 @@ struct bpf_key *bpf_lookup_user_key(u32 serial, u64 flags)
* Return: a bpf_key pointer with an invalid key pointer set from the
* pre-determined ID on success, a NULL pointer otherwise
*/
-struct bpf_key *bpf_lookup_system_key(u64 id)
+__bpf_kfunc struct bpf_key *bpf_lookup_system_key(u64 id)
{
struct bpf_key *bkey;
@@ -1307,7 +1308,7 @@ struct bpf_key *bpf_lookup_system_key(u64 id)
* Decrement the reference count of the key inside *bkey*, if the pointer
* is valid, and free *bkey*.
*/
-void bpf_key_put(struct bpf_key *bkey)
+__bpf_kfunc void bpf_key_put(struct bpf_key *bkey)
{
if (bkey->has_ref)
key_put(bkey->key);
@@ -1327,7 +1328,7 @@ void bpf_key_put(struct bpf_key *bkey)
*
* Return: 0 on success, a negative value on error.
*/
-int bpf_verify_pkcs7_signature(struct bpf_dynptr_kern *data_ptr,
+__bpf_kfunc int bpf_verify_pkcs7_signature(struct bpf_dynptr_kern *data_ptr,
struct bpf_dynptr_kern *sig_ptr,
struct bpf_key *trusted_keyring)
{
@@ -1456,6 +1457,10 @@ bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
return &bpf_get_current_cgroup_id_proto;
case BPF_FUNC_get_current_ancestor_cgroup_id:
return &bpf_get_current_ancestor_cgroup_id_proto;
+ case BPF_FUNC_cgrp_storage_get:
+ return &bpf_cgrp_storage_get_proto;
+ case BPF_FUNC_cgrp_storage_delete:
+ return &bpf_cgrp_storage_delete_proto;
#endif
case BPF_FUNC_send_signal:
return &bpf_send_signal_proto;
@@ -1480,9 +1485,9 @@ bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
case BPF_FUNC_get_task_stack:
return &bpf_get_task_stack_proto;
case BPF_FUNC_copy_from_user:
- return prog->aux->sleepable ? &bpf_copy_from_user_proto : NULL;
+ return &bpf_copy_from_user_proto;
case BPF_FUNC_copy_from_user_task:
- return prog->aux->sleepable ? &bpf_copy_from_user_task_proto : NULL;
+ return &bpf_copy_from_user_task_proto;
case BPF_FUNC_snprintf_btf:
return &bpf_snprintf_btf_proto;
case BPF_FUNC_per_cpu_ptr:
@@ -1490,8 +1495,12 @@ bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
case BPF_FUNC_this_cpu_ptr:
return &bpf_this_cpu_ptr_proto;
case BPF_FUNC_task_storage_get:
+ if (bpf_prog_check_recur(prog))
+ return &bpf_task_storage_get_recur_proto;
return &bpf_task_storage_get_proto;
case BPF_FUNC_task_storage_delete:
+ if (bpf_prog_check_recur(prog))
+ return &bpf_task_storage_delete_recur_proto;
return &bpf_task_storage_delete_proto;
case BPF_FUNC_for_each_map_elem:
return &bpf_for_each_map_elem_proto;
@@ -2452,6 +2461,8 @@ struct bpf_kprobe_multi_link {
unsigned long *addrs;
u64 *cookies;
u32 cnt;
+ u32 mods_cnt;
+ struct module **mods;
};
struct bpf_kprobe_multi_run_ctx {
@@ -2507,6 +2518,14 @@ error:
return err;
}
+/* Call module_put() on each of the first @cnt entries of @mods. */
+static void kprobe_multi_put_modules(struct module **mods, u32 cnt)
+{
+ u32 i;
+
+ for (i = 0; i < cnt; i++)
+ module_put(mods[i]);
+}
+
static void free_user_syms(struct user_syms *us)
{
kvfree(us->syms);
@@ -2519,6 +2538,7 @@ static void bpf_kprobe_multi_link_release(struct bpf_link *link)
kmulti_link = container_of(link, struct bpf_kprobe_multi_link, link);
unregister_fprobe(&kmulti_link->fp);
+ kprobe_multi_put_modules(kmulti_link->mods, kmulti_link->mods_cnt);
}
static void bpf_kprobe_multi_link_dealloc(struct bpf_link *link)
@@ -2528,6 +2548,7 @@ static void bpf_kprobe_multi_link_dealloc(struct bpf_link *link)
kmulti_link = container_of(link, struct bpf_kprobe_multi_link, link);
kvfree(kmulti_link->addrs);
kvfree(kmulti_link->cookies);
+ kfree(kmulti_link->mods);
kfree(kmulti_link);
}
@@ -2550,7 +2571,7 @@ static void bpf_kprobe_multi_cookie_swap(void *a, void *b, int size, const void
swap(*cookie_a, *cookie_b);
}
-static int __bpf_kprobe_multi_cookie_cmp(const void *a, const void *b)
+static int bpf_kprobe_multi_addrs_cmp(const void *a, const void *b)
{
const unsigned long *addr_a = a, *addr_b = b;
@@ -2561,7 +2582,7 @@ static int __bpf_kprobe_multi_cookie_cmp(const void *a, const void *b)
static int bpf_kprobe_multi_cookie_cmp(const void *a, const void *b, const void *priv)
{
- return __bpf_kprobe_multi_cookie_cmp(a, b);
+ return bpf_kprobe_multi_addrs_cmp(a, b);
}
static u64 bpf_kprobe_multi_cookie(struct bpf_run_ctx *ctx)
@@ -2579,7 +2600,7 @@ static u64 bpf_kprobe_multi_cookie(struct bpf_run_ctx *ctx)
return 0;
entry_ip = run_ctx->entry_ip;
addr = bsearch(&entry_ip, link->addrs, link->cnt, sizeof(entry_ip),
- __bpf_kprobe_multi_cookie_cmp);
+ bpf_kprobe_multi_addrs_cmp);
if (!addr)
return 0;
cookie = link->cookies + (addr - link->addrs);
@@ -2663,6 +2684,79 @@ static void symbols_swap_r(void *a, void *b, int size, const void *priv)
}
}
+struct modules_array {
+ struct module **mods;
+ int mods_cnt;
+ int mods_cap;
+};
+
+static int add_module(struct modules_array *arr, struct module *mod)
+{
+ struct module **mods;
+
+ if (arr->mods_cnt == arr->mods_cap) {
+ arr->mods_cap = max(16, arr->mods_cap * 3 / 2);
+ mods = krealloc_array(arr->mods, arr->mods_cap, sizeof(*mods), GFP_KERNEL);
+ if (!mods)
+ return -ENOMEM;
+ arr->mods = mods;
+ }
+
+ arr->mods[arr->mods_cnt] = mod;
+ arr->mods_cnt++;
+ return 0;
+}
+
+static bool has_module(struct modules_array *arr, struct module *mod)
+{
+ int i;
+
+ for (i = arr->mods_cnt - 1; i >= 0; i--) {
+ if (arr->mods[i] == mod)
+ return true;
+ }
+ return false;
+}
+
+static int get_modules_for_addrs(struct module ***mods, unsigned long *addrs, u32 addrs_cnt)
+{
+ struct modules_array arr = {};
+ u32 i, err = 0;
+
+ for (i = 0; i < addrs_cnt; i++) {
+ struct module *mod;
+
+ preempt_disable();
+ mod = __module_address(addrs[i]);
+ /* Either no module or it's already stored */
+ if (!mod || has_module(&arr, mod)) {
+ preempt_enable();
+ continue;
+ }
+ if (!try_module_get(mod))
+ err = -EINVAL;
+ preempt_enable();
+ if (err)
+ break;
+ err = add_module(&arr, mod);
+ if (err) {
+ module_put(mod);
+ break;
+ }
+ }
+
+ /* We return either err < 0 in case of error, ... */
+ if (err) {
+ kprobe_multi_put_modules(arr.mods, arr.mods_cnt);
+ kfree(arr.mods);
+ return err;
+ }
+
+ /* or number of modules found if everything is ok. */
+ *mods = arr.mods;
+ return arr.mods_cnt;
+}
+
int bpf_kprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *prog)
{
struct bpf_kprobe_multi_link *link = NULL;
@@ -2775,8 +2869,16 @@ int bpf_kprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *pr
link);
}
+ err = get_modules_for_addrs(&link->mods, addrs, cnt);
+ if (err < 0) {
+ bpf_link_cleanup(&link_primer);
+ return err;
+ }
+ link->mods_cnt = err;
+
err = register_fprobe_ips(&link->fp, addrs, cnt);
if (err) {
+ kprobe_multi_put_modules(link->mods, link->mods_cnt);
bpf_link_cleanup(&link_primer);
return err;
}
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 33236241f236..29baa97d0d53 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -125,6 +125,33 @@ struct ftrace_ops global_ops;
void ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip,
struct ftrace_ops *op, struct ftrace_regs *fregs);
+#ifdef CONFIG_DYNAMIC_FTRACE_WITH_CALL_OPS
+/*
+ * Stub used to invoke the list ops without requiring a separate trampoline.
+ */
+const struct ftrace_ops ftrace_list_ops = {
+ .func = ftrace_ops_list_func,
+ .flags = FTRACE_OPS_FL_STUB,
+};
+
+static void ftrace_ops_nop_func(unsigned long ip, unsigned long parent_ip,
+ struct ftrace_ops *op,
+ struct ftrace_regs *fregs)
+{
+ /* do nothing */
+}
+
+/*
+ * Stub used when a call site is disabled. May be called transiently by threads
+ * which have made it into ftrace_caller but haven't yet recovered the ops at
+ * the point the call site is disabled.
+ */
+const struct ftrace_ops ftrace_nop_ops = {
+ .func = ftrace_ops_nop_func,
+ .flags = FTRACE_OPS_FL_STUB,
+};
+#endif
+
static inline void ftrace_ops_init(struct ftrace_ops *ops)
{
#ifdef CONFIG_DYNAMIC_FTRACE
@@ -163,7 +190,7 @@ static void ftrace_sync_ipi(void *data)
static ftrace_func_t ftrace_ops_get_list_func(struct ftrace_ops *ops)
{
/*
- * If this is a dynamic, RCU, or per CPU ops, or we force list func,
+ * If this is a dynamic or RCU ops, or we force list func,
* then it needs to call the list anyway.
*/
if (ops->flags & (FTRACE_OPS_FL_DYNAMIC | FTRACE_OPS_FL_RCU) ||
@@ -1248,12 +1275,17 @@ static void free_ftrace_hash_rcu(struct ftrace_hash *hash)
call_rcu(&hash->rcu, __free_ftrace_hash_rcu);
}
+/**
+ * ftrace_free_filter - remove all filters for an ftrace_ops
+ * @ops: the ops to remove the filters from
+ */
void ftrace_free_filter(struct ftrace_ops *ops)
{
ftrace_ops_init(ops);
free_ftrace_hash(ops->func_hash->filter_hash);
free_ftrace_hash(ops->func_hash->notrace_hash);
}
+EXPORT_SYMBOL_GPL(ftrace_free_filter);
static struct ftrace_hash *alloc_ftrace_hash(int size_bits)
{
@@ -1814,6 +1846,18 @@ static bool __ftrace_hash_rec_update(struct ftrace_ops *ops,
* if rec count is zero.
*/
}
+
+ /*
+ * If the rec has a single associated ops, and ops->func can be
+ * called directly, allow the call site to call via the ops.
+ */
+ if (IS_ENABLED(CONFIG_DYNAMIC_FTRACE_WITH_CALL_OPS) &&
+ ftrace_rec_count(rec) == 1 &&
+ ftrace_ops_get_func(ops) == ops->func)
+ rec->flags |= FTRACE_FL_CALL_OPS;
+ else
+ rec->flags &= ~FTRACE_FL_CALL_OPS;
+
count++;
/* Must match FTRACE_UPDATE_CALLS in ftrace_modify_all_code() */
@@ -2108,8 +2152,9 @@ void ftrace_bug(int failed, struct dyn_ftrace *rec)
struct ftrace_ops *ops = NULL;
pr_info("ftrace record flags: %lx\n", rec->flags);
- pr_cont(" (%ld)%s", ftrace_rec_count(rec),
- rec->flags & FTRACE_FL_REGS ? " R" : " ");
+ pr_cont(" (%ld)%s%s", ftrace_rec_count(rec),
+ rec->flags & FTRACE_FL_REGS ? " R" : " ",
+ rec->flags & FTRACE_FL_CALL_OPS ? " O" : " ");
if (rec->flags & FTRACE_FL_TRAMP_EN) {
ops = ftrace_find_tramp_ops_any(rec);
if (ops) {
@@ -2177,6 +2222,7 @@ static int ftrace_check_record(struct dyn_ftrace *rec, bool enable, bool update)
* want the direct enabled (it will be done via the
* direct helper). But if DIRECT_EN is set, and
* the count is not one, we need to clear it.
+ *
*/
if (ftrace_rec_count(rec) == 1) {
if (!(rec->flags & FTRACE_FL_DIRECT) !=
@@ -2185,6 +2231,19 @@ static int ftrace_check_record(struct dyn_ftrace *rec, bool enable, bool update)
} else if (rec->flags & FTRACE_FL_DIRECT_EN) {
flag |= FTRACE_FL_DIRECT;
}
+
+ /*
+ * Ops calls are special, as count matters.
+ * As with direct calls, they must only be enabled when count
+ * is one, otherwise they'll be handled via the list ops.
+ */
+ if (ftrace_rec_count(rec) == 1) {
+ if (!(rec->flags & FTRACE_FL_CALL_OPS) !=
+ !(rec->flags & FTRACE_FL_CALL_OPS_EN))
+ flag |= FTRACE_FL_CALL_OPS;
+ } else if (rec->flags & FTRACE_FL_CALL_OPS_EN) {
+ flag |= FTRACE_FL_CALL_OPS;
+ }
}
/* If the state of this record hasn't changed, then do nothing */
@@ -2229,6 +2288,21 @@ static int ftrace_check_record(struct dyn_ftrace *rec, bool enable, bool update)
rec->flags &= ~FTRACE_FL_DIRECT_EN;
}
}
+
+ if (flag & FTRACE_FL_CALL_OPS) {
+ if (ftrace_rec_count(rec) == 1) {
+ if (rec->flags & FTRACE_FL_CALL_OPS)
+ rec->flags |= FTRACE_FL_CALL_OPS_EN;
+ else
+ rec->flags &= ~FTRACE_FL_CALL_OPS_EN;
+ } else {
+ /*
+ * Can only call directly if there's
+ * only one set of associated ops.
+ */
+ rec->flags &= ~FTRACE_FL_CALL_OPS_EN;
+ }
+ }
}
/*
@@ -2258,7 +2332,8 @@ static int ftrace_check_record(struct dyn_ftrace *rec, bool enable, bool update)
* and REGS states. The _EN flags must be disabled though.
*/
rec->flags &= ~(FTRACE_FL_ENABLED | FTRACE_FL_TRAMP_EN |
- FTRACE_FL_REGS_EN | FTRACE_FL_DIRECT_EN);
+ FTRACE_FL_REGS_EN | FTRACE_FL_DIRECT_EN |
+ FTRACE_FL_CALL_OPS_EN);
}
ftrace_bug_type = FTRACE_BUG_NOP;
@@ -2431,6 +2506,25 @@ ftrace_find_tramp_ops_new(struct dyn_ftrace *rec)
return NULL;
}
+struct ftrace_ops *
+ftrace_find_unique_ops(struct dyn_ftrace *rec)
+{
+ struct ftrace_ops *op, *found = NULL;
+ unsigned long ip = rec->ip;
+
+ do_for_each_ftrace_op(op, ftrace_ops_list) {
+
+ if (hash_contains_ip(ip, op->func_hash)) {
+ if (found)
+ return NULL;
+ found = op;
+ }
+
+ } while_for_each_ftrace_op(op);
+
+ return found;
+}
+
#ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS
/* Protected by rcu_tasks for reading, and direct_mutex for writing */
static struct ftrace_hash *direct_functions = EMPTY_HASH;
@@ -2488,14 +2582,13 @@ ftrace_add_rec_direct(unsigned long ip, unsigned long addr,
static void call_direct_funcs(unsigned long ip, unsigned long pip,
struct ftrace_ops *ops, struct ftrace_regs *fregs)
{
- struct pt_regs *regs = ftrace_get_regs(fregs);
unsigned long addr;
addr = ftrace_find_rec_direct(ip);
if (!addr)
return;
- arch_ftrace_set_direct_caller(regs, addr);
+ arch_ftrace_set_direct_caller(fregs, addr);
}
struct ftrace_ops direct_ops = {
@@ -2763,6 +2856,19 @@ void __weak ftrace_arch_code_modify_post_process(void)
{
}
+static int update_ftrace_func(ftrace_func_t func)
+{
+ static ftrace_func_t save_func;
+
+ /* Avoid updating if it hasn't changed */
+ if (func == save_func)
+ return 0;
+
+ save_func = func;
+
+ return ftrace_update_ftrace_func(func);
+}
+
void ftrace_modify_all_code(int command)
{
int update = command & FTRACE_UPDATE_TRACE_FUNC;
@@ -2783,7 +2889,7 @@ void ftrace_modify_all_code(int command)
* traced.
*/
if (update) {
- err = ftrace_update_ftrace_func(ftrace_ops_list_func);
+ err = update_ftrace_func(ftrace_ops_list_func);
if (FTRACE_WARN_ON(err))
return;
}
@@ -2799,7 +2905,7 @@ void ftrace_modify_all_code(int command)
/* If irqs are disabled, we are in stop machine */
if (!irqs_disabled())
smp_call_function(ftrace_sync_ipi, NULL, 1);
- err = ftrace_update_ftrace_func(ftrace_trace_function);
+ err = update_ftrace_func(ftrace_trace_function);
if (FTRACE_WARN_ON(err))
return;
}
@@ -3071,8 +3177,6 @@ out:
/*
* Dynamic ops may be freed, we must make sure that all
* callers are done before leaving this function.
- * The same goes for freeing the per_cpu data of the per_cpu
- * ops.
*/
if (ops->flags & FTRACE_OPS_FL_DYNAMIC) {
/*
@@ -3770,11 +3874,12 @@ static int t_show(struct seq_file *m, void *v)
if (iter->flags & FTRACE_ITER_ENABLED) {
struct ftrace_ops *ops;
- seq_printf(m, " (%ld)%s%s%s",
+ seq_printf(m, " (%ld)%s%s%s%s",
ftrace_rec_count(rec),
rec->flags & FTRACE_FL_REGS ? " R" : " ",
rec->flags & FTRACE_FL_IPMODIFY ? " I" : " ",
- rec->flags & FTRACE_FL_DIRECT ? " D" : " ");
+ rec->flags & FTRACE_FL_DIRECT ? " D" : " ",
+ rec->flags & FTRACE_FL_CALL_OPS ? " O" : " ");
if (rec->flags & FTRACE_FL_TRAMP_EN) {
ops = ftrace_find_tramp_ops_any(rec);
if (ops) {
@@ -3790,6 +3895,15 @@ static int t_show(struct seq_file *m, void *v)
} else {
add_trampoline_func(m, NULL, rec);
}
+ if (rec->flags & FTRACE_FL_CALL_OPS_EN) {
+ ops = ftrace_find_unique_ops(rec);
+ if (ops) {
+ seq_printf(m, "\tops: %pS (%pS)",
+ ops, ops->func);
+ } else {
+ seq_puts(m, "\tops: ERROR!");
+ }
+ }
if (rec->flags & FTRACE_FL_DIRECT) {
unsigned long direct;
@@ -4193,6 +4307,7 @@ match_records(struct ftrace_hash *hash, char *func, int len, char *mod)
}
found = 1;
}
+ cond_resched();
} while_for_each_ftrace_rec();
out_unlock:
mutex_unlock(&ftrace_lock);
@@ -5828,6 +5943,10 @@ EXPORT_SYMBOL_GPL(modify_ftrace_direct_multi);
*
* Filters denote which functions should be enabled when tracing is enabled
* If @ip is NULL, it fails to update filter.
+ *
+ * This can allocate memory which must be freed before @ops can be freed,
+ * either by removing each filtered addr or by using
+ * ftrace_free_filter(@ops).
*/
int ftrace_set_filter_ip(struct ftrace_ops *ops, unsigned long ip,
int remove, int reset)
@@ -5847,7 +5966,11 @@ EXPORT_SYMBOL_GPL(ftrace_set_filter_ip);
*
* Filters denote which functions should be enabled when tracing is enabled
* If @ips array or any ip specified within is NULL , it fails to update filter.
- */
+ *
+ * This can allocate memory which must be freed before @ops can be freed,
+ * either by removing each filtered addr or by using
+ * ftrace_free_filter(@ops).
+ */
int ftrace_set_filter_ips(struct ftrace_ops *ops, unsigned long *ips,
unsigned int cnt, int remove, int reset)
{
@@ -5889,6 +6012,10 @@ ftrace_set_regex(struct ftrace_ops *ops, unsigned char *buf, int len,
*
* Filters denote which functions should be enabled when tracing is enabled.
* If @buf is NULL and reset is set, all functions will be enabled for tracing.
+ *
+ * This can allocate memory which must be freed before @ops can be freed,
+ * either by removing each filtered addr or by using
+ * ftrace_free_filter(@ops).
*/
int ftrace_set_filter(struct ftrace_ops *ops, unsigned char *buf,
int len, int reset)
@@ -5908,6 +6035,10 @@ EXPORT_SYMBOL_GPL(ftrace_set_filter);
* Notrace Filters denote which functions should not be enabled when tracing
* is enabled. If @buf is NULL and reset is set, all functions will be enabled
* for tracing.
+ *
+ * This can allocate memory which must be freed before @ops can be freed,
+ * either by removing each filtered addr or by using
+ * ftrace_free_filter(@ops).
*/
int ftrace_set_notrace(struct ftrace_ops *ops, unsigned char *buf,
int len, int reset)
@@ -7519,8 +7650,6 @@ __ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip,
/*
* Check the following for each ops before calling their func:
* if RCU flag is set, then rcu_is_watching() must be true
- * if PER_CPU is set, then ftrace_function_local_disable()
- * must be false
* Otherwise test if the ip matches the ops filter
*
* If any of the above fails then the op->func() is not executed.
@@ -7570,8 +7699,8 @@ NOKPROBE_SYMBOL(arch_ftrace_ops_list_func);
/*
* If there's only one function registered but it does not support
- * recursion, needs RCU protection and/or requires per cpu handling, then
- * this function will be called by the mcount trampoline.
+ * recursion and/or needs RCU protection, then this function will be called
+ * by the mcount trampoline.
*/
static void ftrace_ops_assist_func(unsigned long ip, unsigned long parent_ip,
struct ftrace_ops *op, struct ftrace_regs *fregs)
@@ -8258,6 +8387,10 @@ struct kallsyms_data {
size_t found;
};
+/* This function gets called for all kernel and module symbols
+ * and returns 1 in case we resolved all the requested symbols,
+ * 0 otherwise.
+ */
static int kallsyms_callback(void *data, const char *name,
struct module *mod, unsigned long addr)
{
@@ -8300,17 +8433,19 @@ static int kallsyms_callback(void *data, const char *name,
int ftrace_lookup_symbols(const char **sorted_syms, size_t cnt, unsigned long *addrs)
{
struct kallsyms_data args;
- int err;
+ int found_all;
memset(addrs, 0, sizeof(*addrs) * cnt);
args.addrs = addrs;
args.syms = sorted_syms;
args.cnt = cnt;
args.found = 0;
- err = kallsyms_on_each_symbol(kallsyms_callback, &args);
- if (err < 0)
- return err;
- return args.found == args.cnt ? 0 : -ESRCH;
+
+ found_all = kallsyms_on_each_symbol(kallsyms_callback, &args);
+ if (found_all)
+ return 0;
+ found_all = module_kallsyms_on_each_symbol(NULL, kallsyms_callback, &args);
+ return found_all ? 0 : -ESRCH;
}
#ifdef CONFIG_SYSCTL
diff --git a/kernel/trace/kprobe_event_gen_test.c b/kernel/trace/kprobe_event_gen_test.c
index c736487fc0e4..4850fdfe27f1 100644
--- a/kernel/trace/kprobe_event_gen_test.c
+++ b/kernel/trace/kprobe_event_gen_test.c
@@ -21,7 +21,7 @@
* Then:
*
* # insmod kernel/trace/kprobe_event_gen_test.ko
- * # cat /sys/kernel/debug/tracing/trace
+ * # cat /sys/kernel/tracing/trace
*
* You should see many instances of the "gen_kprobe_test" and
* "gen_kretprobe_test" events in the trace buffer.
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index b21bf14bae9b..af50d931b020 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -1581,19 +1581,6 @@ static int rb_check_bpage(struct ring_buffer_per_cpu *cpu_buffer,
}
/**
- * rb_check_list - make sure a pointer to a list has the last bits zero
- */
-static int rb_check_list(struct ring_buffer_per_cpu *cpu_buffer,
- struct list_head *list)
-{
- if (RB_WARN_ON(cpu_buffer, rb_list_head(list->prev) != list->prev))
- return 1;
- if (RB_WARN_ON(cpu_buffer, rb_list_head(list->next) != list->next))
- return 1;
- return 0;
-}
-
-/**
* rb_check_pages - integrity check of buffer pages
* @cpu_buffer: CPU buffer with pages to test
*
@@ -1602,36 +1589,27 @@ static int rb_check_list(struct ring_buffer_per_cpu *cpu_buffer,
*/
static int rb_check_pages(struct ring_buffer_per_cpu *cpu_buffer)
{
- struct list_head *head = cpu_buffer->pages;
- struct buffer_page *bpage, *tmp;
-
- /* Reset the head page if it exists */
- if (cpu_buffer->head_page)
- rb_set_head_page(cpu_buffer);
+ struct list_head *head = rb_list_head(cpu_buffer->pages);
+ struct list_head *tmp;
- rb_head_page_deactivate(cpu_buffer);
-
- if (RB_WARN_ON(cpu_buffer, head->next->prev != head))
- return -1;
- if (RB_WARN_ON(cpu_buffer, head->prev->next != head))
+ if (RB_WARN_ON(cpu_buffer,
+ rb_list_head(rb_list_head(head->next)->prev) != head))
return -1;
- if (rb_check_list(cpu_buffer, head))
+ if (RB_WARN_ON(cpu_buffer,
+ rb_list_head(rb_list_head(head->prev)->next) != head))
return -1;
- list_for_each_entry_safe(bpage, tmp, head, list) {
+ for (tmp = rb_list_head(head->next); tmp != head; tmp = rb_list_head(tmp->next)) {
if (RB_WARN_ON(cpu_buffer,
- bpage->list.next->prev != &bpage->list))
+ rb_list_head(rb_list_head(tmp->next)->prev) != tmp))
return -1;
+
if (RB_WARN_ON(cpu_buffer,
- bpage->list.prev->next != &bpage->list))
- return -1;
- if (rb_check_list(cpu_buffer, &bpage->list))
+ rb_list_head(rb_list_head(tmp->prev)->next) != tmp))
return -1;
}
- rb_head_page_activate(cpu_buffer);
-
return 0;
}
@@ -2062,8 +2040,10 @@ rb_insert_pages(struct ring_buffer_per_cpu *cpu_buffer)
{
struct list_head *pages = &cpu_buffer->new_pages;
int retries, success;
+ unsigned long flags;
- raw_spin_lock_irq(&cpu_buffer->reader_lock);
+ /* Can be called at early boot up, where interrupts must not be enabled */
+ raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
/*
* We are holding the reader lock, so the reader page won't be swapped
* in the ring buffer. Now we are racing with the writer trying to
@@ -2120,7 +2100,7 @@ rb_insert_pages(struct ring_buffer_per_cpu *cpu_buffer)
* tracing
*/
RB_WARN_ON(cpu_buffer, !success);
- raw_spin_unlock_irq(&cpu_buffer->reader_lock);
+ raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
/* free pages if they weren't inserted */
if (!success) {
@@ -2248,8 +2228,16 @@ int ring_buffer_resize(struct trace_buffer *buffer, unsigned long size,
rb_update_pages(cpu_buffer);
cpu_buffer->nr_pages_to_update = 0;
} else {
- schedule_work_on(cpu,
- &cpu_buffer->update_pages_work);
+ /* Run directly if possible. */
+ migrate_disable();
+ if (cpu != smp_processor_id()) {
+ migrate_enable();
+ schedule_work_on(cpu,
+ &cpu_buffer->update_pages_work);
+ } else {
+ update_pages_handler(&cpu_buffer->update_pages_work);
+ migrate_enable();
+ }
}
}
@@ -2298,9 +2286,17 @@ int ring_buffer_resize(struct trace_buffer *buffer, unsigned long size,
if (!cpu_online(cpu_id))
rb_update_pages(cpu_buffer);
else {
- schedule_work_on(cpu_id,
- &cpu_buffer->update_pages_work);
- wait_for_completion(&cpu_buffer->update_done);
+ /* Run directly if possible. */
+ migrate_disable();
+ if (cpu_id == smp_processor_id()) {
+ rb_update_pages(cpu_buffer);
+ migrate_enable();
+ } else {
+ migrate_enable();
+ schedule_work_on(cpu_id,
+ &cpu_buffer->update_pages_work);
+ wait_for_completion(&cpu_buffer->update_done);
+ }
}
cpu_buffer->nr_pages_to_update = 0;
@@ -2868,7 +2864,7 @@ rb_check_timestamp(struct ring_buffer_per_cpu *cpu_buffer,
sched_clock_stable() ? "" :
"If you just came from a suspend/resume,\n"
"please switch to the trace global clock:\n"
- " echo global > /sys/kernel/debug/tracing/trace_clock\n"
+ " echo global > /sys/kernel/tracing/trace_clock\n"
"or add trace_clock=global to the kernel command line\n");
}
@@ -3180,8 +3176,7 @@ static inline void rb_event_discard(struct ring_buffer_event *event)
event->time_delta = 1;
}
-static void rb_commit(struct ring_buffer_per_cpu *cpu_buffer,
- struct ring_buffer_event *event)
+static void rb_commit(struct ring_buffer_per_cpu *cpu_buffer)
{
local_inc(&cpu_buffer->entries);
rb_end_commit(cpu_buffer);
@@ -3383,15 +3378,14 @@ void ring_buffer_nest_end(struct trace_buffer *buffer)
*
* Must be paired with ring_buffer_lock_reserve.
*/
-int ring_buffer_unlock_commit(struct trace_buffer *buffer,
- struct ring_buffer_event *event)
+int ring_buffer_unlock_commit(struct trace_buffer *buffer)
{
struct ring_buffer_per_cpu *cpu_buffer;
int cpu = raw_smp_processor_id();
cpu_buffer = buffer->buffers[cpu];
- rb_commit(cpu_buffer, event);
+ rb_commit(cpu_buffer);
rb_wakeups(buffer, cpu_buffer);
@@ -3977,7 +3971,7 @@ int ring_buffer_write(struct trace_buffer *buffer,
memcpy(body, data, length);
- rb_commit(cpu_buffer, event);
+ rb_commit(cpu_buffer);
rb_wakeups(buffer, cpu_buffer);
@@ -5610,11 +5604,16 @@ EXPORT_SYMBOL_GPL(ring_buffer_alloc_read_page);
*/
void ring_buffer_free_read_page(struct trace_buffer *buffer, int cpu, void *data)
{
- struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[cpu];
+ struct ring_buffer_per_cpu *cpu_buffer;
struct buffer_data_page *bpage = data;
struct page *page = virt_to_page(bpage);
unsigned long flags;
+ if (!buffer || !buffer->buffers || !buffer->buffers[cpu])
+ return;
+
+ cpu_buffer = buffer->buffers[cpu];
+
/* If the page is still in use someplace else, we can't reuse it */
if (page_ref_count(page) > 1)
goto out;
@@ -5998,7 +5997,7 @@ static __init int rb_write_something(struct rb_test_data *data, bool nested)
}
out:
- ring_buffer_unlock_commit(data->buffer, event);
+ ring_buffer_unlock_commit(data->buffer);
return 0;
}
diff --git a/kernel/trace/ring_buffer_benchmark.c b/kernel/trace/ring_buffer_benchmark.c
index 78e576575b79..aef34673d79d 100644
--- a/kernel/trace/ring_buffer_benchmark.c
+++ b/kernel/trace/ring_buffer_benchmark.c
@@ -258,7 +258,7 @@ static void ring_buffer_producer(void)
hit++;
entry = ring_buffer_event_data(event);
*entry = smp_processor_id();
- ring_buffer_unlock_commit(buffer, event);
+ ring_buffer_unlock_commit(buffer);
}
}
end_time = ktime_get();
diff --git a/kernel/trace/rv/monitors/wip/wip.h b/kernel/trace/rv/monitors/wip/wip.h
index dacc37b62a2c..2e373f2c65ed 100644
--- a/kernel/trace/rv/monitors/wip/wip.h
+++ b/kernel/trace/rv/monitors/wip/wip.h
@@ -27,7 +27,7 @@ struct automaton_wip {
bool final_states[state_max_wip];
};
-static struct automaton_wip automaton_wip = {
+static const struct automaton_wip automaton_wip = {
.state_names = {
"preemptive",
"non_preemptive"
diff --git a/kernel/trace/rv/monitors/wwnr/wwnr.h b/kernel/trace/rv/monitors/wwnr/wwnr.h
index 118e576b91b4..d0d9c4b8121b 100644
--- a/kernel/trace/rv/monitors/wwnr/wwnr.h
+++ b/kernel/trace/rv/monitors/wwnr/wwnr.h
@@ -27,7 +27,7 @@ struct automaton_wwnr {
bool final_states[state_max_wwnr];
};
-static struct automaton_wwnr automaton_wwnr = {
+static const struct automaton_wwnr automaton_wwnr = {
.state_names = {
"not_running",
"running"
diff --git a/kernel/trace/rv/rv.c b/kernel/trace/rv/rv.c
index 6c97cc2d754a..7e9061828c24 100644
--- a/kernel/trace/rv/rv.c
+++ b/kernel/trace/rv/rv.c
@@ -516,7 +516,7 @@ static ssize_t enabled_monitors_write(struct file *filp, const char __user *user
struct rv_monitor_def *mdef;
int retval = -EINVAL;
bool enable = true;
- char *ptr = buff;
+ char *ptr;
int len;
if (count < 1 || count > MAX_RV_MONITOR_NAME_SIZE + 1)
diff --git a/kernel/trace/synth_event_gen_test.c b/kernel/trace/synth_event_gen_test.c
index 8d77526892f4..8dfe85499d4a 100644
--- a/kernel/trace/synth_event_gen_test.c
+++ b/kernel/trace/synth_event_gen_test.c
@@ -22,7 +22,7 @@
* Then:
*
* # insmod kernel/trace/synth_event_gen_test.ko
- * # cat /sys/kernel/debug/tracing/trace
+ * # cat /sys/kernel/tracing/trace
*
* You should see several events in the trace buffer -
* "create_synth_test", "empty_synth_test", and several instances of
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 5cfc95a52bc3..45551c7b4c36 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -19,7 +19,6 @@
#include <linux/kallsyms.h>
#include <linux/security.h>
#include <linux/seq_file.h>
-#include <linux/notifier.h>
#include <linux/irqflags.h>
#include <linux/debugfs.h>
#include <linux/tracefs.h>
@@ -50,6 +49,8 @@
#include <linux/irq_work.h>
#include <linux/workqueue.h>
+#include <asm/setup.h> /* COMMAND_LINE_SIZE */
+
#include "trace.h"
#include "trace_output.h"
@@ -85,7 +86,7 @@ void __init disable_tracing_selftest(const char *reason)
#endif
/* Pipe tracepoints to printk */
-struct trace_iterator *tracepoint_print_iter;
+static struct trace_iterator *tracepoint_print_iter;
int tracepoint_printk;
static bool tracepoint_printk_stop_on_boot __initdata;
static DEFINE_STATIC_KEY_FALSE(tracepoint_printk_key);
@@ -187,6 +188,12 @@ static char *default_bootup_tracer;
static bool allocate_snapshot;
static bool snapshot_at_boot;
+static char boot_instance_info[COMMAND_LINE_SIZE] __initdata;
+static int boot_instance_index;
+
+static char boot_snapshot_info[COMMAND_LINE_SIZE] __initdata;
+static int boot_snapshot_index;
+
static int __init set_cmdline_ftrace(char *str)
{
strlcpy(bootup_tracer_buf, str, MAX_TRACER_SIZE);
@@ -223,9 +230,22 @@ __setup("traceoff_on_warning", stop_trace_on_warning);
static int __init boot_alloc_snapshot(char *str)
{
- allocate_snapshot = true;
- /* We also need the main ring buffer expanded */
- ring_buffer_expanded = true;
+ char *slot = boot_snapshot_info + boot_snapshot_index;
+ int left = sizeof(boot_snapshot_info) - boot_snapshot_index;
+ int ret;
+
+ if (str[0] == '=') {
+ str++;
+ if (strlen(str) >= left)
+ return -1;
+
+ ret = snprintf(slot, left, "%s\t", str);
+ boot_snapshot_index += ret;
+ } else {
+ allocate_snapshot = true;
+ /* We also need the main ring buffer expanded */
+ ring_buffer_expanded = true;
+ }
return 1;
}
__setup("alloc_snapshot", boot_alloc_snapshot);
@@ -240,6 +260,23 @@ static int __init boot_snapshot(char *str)
__setup("ftrace_boot_snapshot", boot_snapshot);
+static int __init boot_instance(char *str)
+{
+ char *slot = boot_instance_info + boot_instance_index;
+ int left = sizeof(boot_instance_info) - boot_instance_index;
+ int ret;
+
+ if (strlen(str) >= left)
+ return -1;
+
+ ret = snprintf(slot, left, "%s\t", str);
+ boot_instance_index += ret;
+
+ return 1;
+}
+__setup("trace_instance=", boot_instance);
+
+
static char trace_boot_options_buf[MAX_TRACER_SIZE] __initdata;
static int __init set_trace_boot_options(char *str)
@@ -999,16 +1036,11 @@ __buffer_unlock_commit(struct trace_buffer *buffer, struct ring_buffer_event *ev
/* ring_buffer_unlock_commit() enables preemption */
preempt_enable_notrace();
} else
- ring_buffer_unlock_commit(buffer, event);
+ ring_buffer_unlock_commit(buffer);
}
-/**
- * __trace_puts - write a constant string into the trace buffer.
- * @ip: The address of the caller
- * @str: The constant string to write
- * @size: The size of the string.
- */
-int __trace_puts(unsigned long ip, const char *str, int size)
+int __trace_array_puts(struct trace_array *tr, unsigned long ip,
+ const char *str, int size)
{
struct ring_buffer_event *event;
struct trace_buffer *buffer;
@@ -1016,7 +1048,7 @@ int __trace_puts(unsigned long ip, const char *str, int size)
unsigned int trace_ctx;
int alloc;
- if (!(global_trace.trace_flags & TRACE_ITER_PRINTK))
+ if (!(tr->trace_flags & TRACE_ITER_PRINTK))
return 0;
if (unlikely(tracing_selftest_running || tracing_disabled))
@@ -1025,7 +1057,7 @@ int __trace_puts(unsigned long ip, const char *str, int size)
alloc = sizeof(*entry) + size + 2; /* possible \n added */
trace_ctx = tracing_gen_ctx();
- buffer = global_trace.array_buffer.buffer;
+ buffer = tr->array_buffer.buffer;
ring_buffer_nest_start(buffer);
event = __trace_buffer_lock_reserve(buffer, TRACE_PRINT, alloc,
trace_ctx);
@@ -1047,11 +1079,23 @@ int __trace_puts(unsigned long ip, const char *str, int size)
entry->buf[size] = '\0';
__buffer_unlock_commit(buffer, event);
- ftrace_trace_stack(&global_trace, buffer, trace_ctx, 4, NULL);
+ ftrace_trace_stack(tr, buffer, trace_ctx, 4, NULL);
out:
ring_buffer_nest_end(buffer);
return size;
}
+EXPORT_SYMBOL_GPL(__trace_array_puts);
+
+/**
+ * __trace_puts - write a constant string into the trace buffer.
+ * @ip: The address of the caller
+ * @str: The constant string to write
+ * @size: The size of the string.
+ */
+int __trace_puts(unsigned long ip, const char *str, int size)
+{
+ return __trace_array_puts(&global_trace, ip, str, size);
+}
EXPORT_SYMBOL_GPL(__trace_puts);
/**
@@ -1143,7 +1187,7 @@ void tracing_snapshot_instance(struct trace_array *tr)
*
* Note, make sure to allocate the snapshot with either
* a tracing_snapshot_alloc(), or by doing it manually
- * with: echo 1 > /sys/kernel/debug/tracing/snapshot
+ * with: echo 1 > /sys/kernel/tracing/snapshot
*
* If the snapshot buffer is not allocated, it will stop tracing.
* Basically making a permanent snapshot.
@@ -1421,6 +1465,7 @@ int tracing_snapshot_cond_disable(struct trace_array *tr)
return false;
}
EXPORT_SYMBOL_GPL(tracing_snapshot_cond_disable);
+#define free_snapshot(tr) do { } while (0)
#endif /* CONFIG_TRACER_SNAPSHOT */
void tracer_tracing_off(struct trace_array *tr)
@@ -1692,6 +1737,8 @@ static ssize_t trace_seq_to_buffer(struct trace_seq *s, void *buf, size_t cnt)
}
unsigned long __read_mostly tracing_thresh;
+
+#ifdef CONFIG_TRACER_MAX_TRACE
static const struct file_operations tracing_max_lat_fops;
#ifdef LATENCY_FS_NOTIFY
@@ -1748,18 +1795,14 @@ void latency_fsnotify(struct trace_array *tr)
irq_work_queue(&tr->fsnotify_irqwork);
}
-#elif defined(CONFIG_TRACER_MAX_TRACE) || defined(CONFIG_HWLAT_TRACER) \
- || defined(CONFIG_OSNOISE_TRACER)
+#else /* !LATENCY_FS_NOTIFY */
#define trace_create_maxlat_file(tr, d_tracer) \
trace_create_file("tracing_max_latency", TRACE_MODE_WRITE, \
d_tracer, &tr->max_latency, &tracing_max_lat_fops)
-#else
-#define trace_create_maxlat_file(tr, d_tracer) do { } while (0)
#endif
-#ifdef CONFIG_TRACER_MAX_TRACE
/*
* Copy the new maximum trace into the separate maximum-trace
* structure. (this way the maximum trace is permanently saved,
@@ -1834,14 +1877,15 @@ update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu,
ring_buffer_record_off(tr->max_buffer.buffer);
#ifdef CONFIG_TRACER_SNAPSHOT
- if (tr->cond_snapshot && !tr->cond_snapshot->update(tr, cond_data))
- goto out_unlock;
+ if (tr->cond_snapshot && !tr->cond_snapshot->update(tr, cond_data)) {
+ arch_spin_unlock(&tr->max_lock);
+ return;
+ }
#endif
swap(tr->array_buffer.buffer, tr->max_buffer.buffer);
__update_max_tr(tr, tsk, cpu);
- out_unlock:
arch_spin_unlock(&tr->max_lock);
}
@@ -1888,6 +1932,7 @@ update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu)
__update_max_tr(tr, tsk, cpu);
arch_spin_unlock(&tr->max_lock);
}
+
#endif /* CONFIG_TRACER_MAX_TRACE */
static int wait_on_pipe(struct trace_iterator *iter, int full)
@@ -3128,6 +3173,9 @@ void __trace_stack(struct trace_array *tr, unsigned int trace_ctx,
return;
}
+ if (WARN_ON_ONCE(IS_ENABLED(CONFIG_GENERIC_ENTRY)))
+ return;
+
/*
* When an NMI triggers, RCU is enabled via ct_nmi_enter(),
* but if the above rcu_is_watching() failed, then the NMI
@@ -5598,7 +5646,7 @@ static const char readme_msg[] =
#ifdef CONFIG_HIST_TRIGGERS
"\t s:[synthetic/]<event> <field> [<field>]\n"
#endif
- "\t e[:[<group>/][<event>]] <attached-group>.<attached-event> [<args>]\n"
+ "\t e[:[<group>/][<event>]] <attached-group>.<attached-event> [<args>] [if <filter>]\n"
"\t -:[<group>/][<event>]\n"
#ifdef CONFIG_KPROBE_EVENTS
"\t place: [<module>:]<symbol>[+<offset>]|<memaddr>\n"
@@ -5615,9 +5663,9 @@ static const char readme_msg[] =
"\t $stack<index>, $stack, $retval, $comm,\n"
#endif
"\t +|-[u]<offset>(<fetcharg>), \\imm-value, \\\"imm-string\"\n"
- "\t type: s8/16/32/64, u8/16/32/64, x8/16/32/64, string, symbol,\n"
+ "\t type: s8/16/32/64, u8/16/32/64, x8/16/32/64, char, string, symbol,\n"
"\t b<bit-width>@<bit-offset>/<container-size>, ustring,\n"
- "\t <type>\\[<array-size>\\]\n"
+ "\t symstr, <type>\\[<array-size>\\]\n"
#ifdef CONFIG_HIST_TRIGGERS
"\t field: <stype> <name>;\n"
"\t stype: u8/u16/u32/u64, s8/s16/s32/s64, pid_t,\n"
@@ -5678,6 +5726,7 @@ static const char readme_msg[] =
"\t [:size=#entries]\n"
"\t [:pause][:continue][:clear]\n"
"\t [:name=histname1]\n"
+ "\t [:nohitcount]\n"
"\t [:<handler>.<action>]\n"
"\t [if <filter>]\n\n"
"\t Note, special fields can be used as well:\n"
@@ -5724,7 +5773,9 @@ static const char readme_msg[] =
"\t .syscall display a syscall id as a syscall name\n"
"\t .log2 display log2 value rather than raw number\n"
"\t .buckets=size display values in groups of size rather than raw number\n"
- "\t .usecs display a common_timestamp in microseconds\n\n"
+ "\t .usecs display a common_timestamp in microseconds\n"
+ "\t .percent display a number of percentage value\n"
+ "\t .graph display a bar-graph of a value\n\n"
"\t The 'pause' parameter can be used to pause an existing hist\n"
"\t trigger or to start a hist trigger but not log any events\n"
"\t until told to do so. 'continue' can be used to start or\n"
@@ -5732,6 +5783,8 @@ static const char readme_msg[] =
"\t The 'clear' parameter will clear the contents of a running\n"
"\t hist trigger and leave its current paused/active state\n"
"\t unchanged.\n\n"
+ "\t The 'nohitcount' (or NOHC) parameter will suppress display of\n"
+ "\t raw hitcount in the histogram.\n\n"
"\t The enable_hist and disable_hist triggers can be used to\n"
"\t have one event conditionally start and stop another event's\n"
"\t already-attached hist trigger. The syntax is analogous to\n"
@@ -5752,7 +5805,7 @@ static const char readme_msg[] =
#ifdef CONFIG_SYNTH_EVENTS
" events/synthetic_events\t- Create/append/remove/show synthetic events\n"
"\t Write into this file to define/undefine new synthetic events.\n"
- "\t example: echo 'myevent u64 lat; char name[]' >> synthetic_events\n"
+ "\t example: echo 'myevent u64 lat; char name[]; long[] stack' >> synthetic_events\n"
#endif
#endif
;
@@ -6572,7 +6625,7 @@ out:
return ret;
}
-#if defined(CONFIG_TRACER_MAX_TRACE) || defined(CONFIG_HWLAT_TRACER)
+#ifdef CONFIG_TRACER_MAX_TRACE
static ssize_t
tracing_max_lat_read(struct file *filp, char __user *ubuf,
@@ -6796,7 +6849,20 @@ waitagain:
ret = print_trace_line(iter);
if (ret == TRACE_TYPE_PARTIAL_LINE) {
- /* don't print partial lines */
+ /*
+ * If one print_trace_line() fills the entire trace_seq in one shot,
+ * trace_seq_to_user() will return -EBUSY because save_len == 0.
+ * In this case, we need to consume it; otherwise, the loop will peek
+ * this event next time, resulting in an infinite loop.
+ */
+ if (save_len == 0) {
+ iter->seq.full = 0;
+ trace_seq_puts(&iter->seq, "[LINE TOO BIG]\n");
+ trace_consume(iter);
+ break;
+ }
+
+ /* In other cases, don't print partial lines */
iter->seq.seq.len = save_len;
break;
}
@@ -7587,7 +7653,7 @@ static const struct file_operations tracing_thresh_fops = {
.llseek = generic_file_llseek,
};
-#if defined(CONFIG_TRACER_MAX_TRACE) || defined(CONFIG_HWLAT_TRACER)
+#ifdef CONFIG_TRACER_MAX_TRACE
static const struct file_operations tracing_max_lat_fops = {
.open = tracing_open_generic,
.read = tracing_max_lat_read,
@@ -9130,9 +9196,6 @@ buffer_percent_write(struct file *filp, const char __user *ubuf,
if (val > 100)
return -EINVAL;
- if (!val)
- val = 1;
-
tr->buffer_percent = val;
(*ppos)++;
@@ -9207,10 +9270,6 @@ static int allocate_trace_buffers(struct trace_array *tr, int size)
}
tr->allocated_snapshot = allocate_snapshot;
- /*
- * Only the top level trace array gets its snapshot allocated
- * from the kernel command line.
- */
allocate_snapshot = false;
#endif
@@ -9601,7 +9660,9 @@ init_tracer_tracefs(struct trace_array *tr, struct dentry *d_tracer)
create_trace_options_dir(tr);
+#ifdef CONFIG_TRACER_MAX_TRACE
trace_create_maxlat_file(tr, d_tracer);
+#endif
if (ftrace_create_function_files(tr, d_tracer))
MEM_FAIL(1, "Could not allocate function filter files");
@@ -9855,41 +9916,41 @@ static __init int tracer_init_tracefs(void)
fs_initcall(tracer_init_tracefs);
-static int trace_panic_handler(struct notifier_block *this,
- unsigned long event, void *unused)
-{
- if (ftrace_dump_on_oops)
- ftrace_dump(ftrace_dump_on_oops);
- return NOTIFY_OK;
-}
+static int trace_die_panic_handler(struct notifier_block *self,
+ unsigned long ev, void *unused);
static struct notifier_block trace_panic_notifier = {
- .notifier_call = trace_panic_handler,
- .next = NULL,
- .priority = 150 /* priority: INT_MAX >= x >= 0 */
+ .notifier_call = trace_die_panic_handler,
+ .priority = INT_MAX - 1,
};
-static int trace_die_handler(struct notifier_block *self,
- unsigned long val,
- void *data)
-{
- switch (val) {
- case DIE_OOPS:
- if (ftrace_dump_on_oops)
- ftrace_dump(ftrace_dump_on_oops);
- break;
- default:
- break;
- }
- return NOTIFY_OK;
-}
-
static struct notifier_block trace_die_notifier = {
- .notifier_call = trace_die_handler,
- .priority = 200
+ .notifier_call = trace_die_panic_handler,
+ .priority = INT_MAX - 1,
};
/*
+ * The idea is to execute the following die/panic callback early, in order
+ * to avoid showing irrelevant information in the trace (like other panic
+ * notifier functions); we are the 2nd to run, after hung_task/rcu_stall
+ * warnings get disabled (to prevent potential log flooding).
+ */
+static int trace_die_panic_handler(struct notifier_block *self,
+ unsigned long ev, void *unused)
+{
+ if (!ftrace_dump_on_oops)
+ return NOTIFY_DONE;
+
+ /* The die notifier requires DIE_OOPS to trigger */
+ if (self == &trace_die_notifier && ev != DIE_OOPS)
+ return NOTIFY_DONE;
+
+ ftrace_dump(ftrace_dump_on_oops);
+
+ return NOTIFY_DONE;
+}
+
+/*
* printk is set to max of 1024, we really don't need it that big.
* Nothing should be printing 1000 characters anyway.
*/
@@ -10124,6 +10185,79 @@ out:
return ret;
}
+#ifdef CONFIG_TRACER_MAX_TRACE
+__init static bool tr_needs_alloc_snapshot(const char *name)
+{
+ char *test;
+ int len = strlen(name);
+ bool ret;
+
+ if (!boot_snapshot_index)
+ return false;
+
+ if (strncmp(name, boot_snapshot_info, len) == 0 &&
+ boot_snapshot_info[len] == '\t')
+ return true;
+ /* Not the first entry: look for "\t<name>\t" further into the list */
+ test = kmalloc(strlen(name) + 3, GFP_KERNEL);
+ if (!test)
+ return false;
+
+ sprintf(test, "\t%s\t", name);
+ ret = strstr(boot_snapshot_info, test) != NULL;
+ kfree(test);
+ return ret;
+}
+
+__init static void do_allocate_snapshot(const char *name)
+{
+ if (!tr_needs_alloc_snapshot(name))
+ return;
+
+ /*
+ * When allocate_snapshot is set, the next call to
+ * allocate_trace_buffers() (called by trace_array_get_by_name())
+ * will allocate the snapshot buffer. That will also clear
+ * this flag.
+ */
+ allocate_snapshot = true;
+}
+#else
+static inline void do_allocate_snapshot(const char *name) { }
+#endif
+
+__init static void enable_instances(void)
+{
+ struct trace_array *tr;
+ char *curr_str;
+ char *str;
+ char *tok;
+
+ /* A tab is always appended */
+ boot_instance_info[boot_instance_index - 1] = '\0';
+ str = boot_instance_info;
+
+ while ((curr_str = strsep(&str, "\t"))) {
+ /* First comma-separated token is the instance name; the rest are events */
+ tok = strsep(&curr_str, ",");
+
+ if (IS_ENABLED(CONFIG_TRACER_MAX_TRACE))
+ do_allocate_snapshot(tok);
+
+ tr = trace_array_get_by_name(tok);
+ if (!tr) {
+ pr_warn("Failed to create instance buffer %s\n", tok);
+ continue;
+ }
+ /* Allow user space to delete it */
+ trace_array_put(tr);
+
+ while ((tok = strsep(&curr_str, ","))) {
+ early_enable_events(tr, tok, true);
+ }
+ }
+}
+
__init static int tracer_alloc_buffers(void)
{
int ring_buf_size;
@@ -10257,10 +10391,19 @@ out:
void __init ftrace_boot_snapshot(void)
{
+ struct trace_array *tr;
+
if (snapshot_at_boot) {
tracing_snapshot();
internal_trace_puts("** Boot snapshot taken **\n");
}
+
+ list_for_each_entry(tr, &ftrace_trace_arrays, list) {
+ if (tr == &global_trace)
+ continue;
+ trace_array_puts(tr, "** Boot snapshot taken **\n");
+ tracing_snapshot_instance(tr);
+ }
}
void __init early_trace_init(void)
@@ -10275,11 +10418,16 @@ void __init early_trace_init(void)
static_key_enable(&tracepoint_printk_key.key);
}
tracer_alloc_buffers();
+
+ init_events();
}
void __init trace_init(void)
{
trace_event_init();
+
+ if (boot_instance_index)
+ enable_instances();
}
__init static void clear_boot_tracer(void)
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index d42e24507152..f3aae2be1d53 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -113,6 +113,10 @@ enum trace_type {
#define MEM_FAIL(condition, fmt, ...) \
DO_ONCE_LITE_IF(condition, pr_err, "ERROR: " fmt, ##__VA_ARGS__)
+#define HIST_STACKTRACE_DEPTH 16
+#define HIST_STACKTRACE_SIZE (HIST_STACKTRACE_DEPTH * sizeof(unsigned long))
+#define HIST_STACKTRACE_SKIP 5
+
/*
* syscalls are special, and need special handling, this is why
* they are not included in trace_entries.h
@@ -308,8 +312,7 @@ struct trace_array {
struct array_buffer max_buffer;
bool allocated_snapshot;
#endif
-#if defined(CONFIG_TRACER_MAX_TRACE) || defined(CONFIG_HWLAT_TRACER) \
- || defined(CONFIG_OSNOISE_TRACER)
+#ifdef CONFIG_TRACER_MAX_TRACE
unsigned long max_latency;
#ifdef CONFIG_FSNOTIFY
struct dentry *d_max_latency;
@@ -615,7 +618,7 @@ void trace_buffer_unlock_commit_nostack(struct trace_buffer *buffer,
bool trace_is_tracepoint_string(const char *str);
const char *trace_event_format(struct trace_iterator *iter, const char *fmt);
void trace_check_vprintf(struct trace_iterator *iter, const char *fmt,
- va_list ap);
+ va_list ap) __printf(2, 0);
int trace_empty(struct trace_iterator *iter);
@@ -688,12 +691,11 @@ void update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu,
void *cond_data);
void update_max_tr_single(struct trace_array *tr,
struct task_struct *tsk, int cpu);
-#endif /* CONFIG_TRACER_MAX_TRACE */
-#if (defined(CONFIG_TRACER_MAX_TRACE) || defined(CONFIG_HWLAT_TRACER) \
- || defined(CONFIG_OSNOISE_TRACER)) && defined(CONFIG_FSNOTIFY)
+#ifdef CONFIG_FSNOTIFY
#define LATENCY_FS_NOTIFY
#endif
+#endif /* CONFIG_TRACER_MAX_TRACE */
#ifdef LATENCY_FS_NOTIFY
void latency_fsnotify(struct trace_array *tr);
@@ -1284,6 +1286,7 @@ struct ftrace_event_field {
int offset;
int size;
int is_signed;
+ int len;
};
struct prog_entry;
@@ -1332,6 +1335,8 @@ DECLARE_PER_CPU(int, trace_buffered_event_cnt);
void trace_buffered_event_disable(void);
void trace_buffered_event_enable(void);
+void early_enable_events(struct trace_array *tr, char *buf, bool disable_first);
+
static inline void
__trace_event_discard_commit(struct trace_buffer *buffer,
struct ring_buffer_event *event)
@@ -1492,6 +1497,7 @@ extern void trace_event_enable_cmd_record(bool enable);
extern void trace_event_enable_tgid_record(bool enable);
extern int event_trace_init(void);
+extern int init_events(void);
extern int event_trace_add_tracer(struct dentry *parent, struct trace_array *tr);
extern int event_trace_del_tracer(struct trace_array *tr);
extern void __trace_early_add_events(struct trace_array *tr);
@@ -1942,8 +1948,6 @@ static inline void tracer_hardirqs_on(unsigned long a0, unsigned long a1) { }
static inline void tracer_hardirqs_off(unsigned long a0, unsigned long a1) { }
#endif
-extern struct trace_iterator *tracepoint_print_iter;
-
/*
* Reset the state of the trace_iterator so that it can read consumed data.
* Normally, the trace_iterator is used for reading the data when it is not
@@ -1956,17 +1960,30 @@ static __always_inline void trace_iterator_reset(struct trace_iterator *iter)
}
/* Check the name is good for event/group/fields */
-static inline bool is_good_name(const char *name)
+static inline bool __is_good_name(const char *name, bool hash_ok)
{
- if (!isalpha(*name) && *name != '_')
+ if (!isalpha(*name) && *name != '_' && (!hash_ok || *name != '-'))
return false;
while (*++name != '\0') {
- if (!isalpha(*name) && !isdigit(*name) && *name != '_')
+ if (!isalpha(*name) && !isdigit(*name) && *name != '_' &&
+ (!hash_ok || *name != '-'))
return false;
}
return true;
}
+/* Check the name is good for event/group/fields */
+static inline bool is_good_name(const char *name)
+{
+ return __is_good_name(name, false);
+}
+
+/* Check the name is good for system */
+static inline bool is_good_system_name(const char *name)
+{
+ return __is_good_name(name, true);
+}
+
/* Convert certain expected symbols into '_' when generating event names */
static inline void sanitize_event_name(char *name)
{
diff --git a/kernel/trace/trace_eprobe.c b/kernel/trace/trace_eprobe.c
index 352b65e2b910..594ac1d086aa 100644
--- a/kernel/trace/trace_eprobe.c
+++ b/kernel/trace/trace_eprobe.c
@@ -923,17 +923,13 @@ static int trace_eprobe_parse_filter(struct trace_eprobe *ep, int argc, const ch
p = ep->filter_str;
for (i = 0; i < argc; i++) {
- ret = snprintf(p, len, "%s ", argv[i]);
- if (ret < 0)
- goto error;
- if (ret > len) {
- ret = -E2BIG;
- goto error;
- }
+ if (i)
+ ret = snprintf(p, len, " %s", argv[i]);
+ else
+ ret = snprintf(p, len, "%s", argv[i]);
p += ret;
len -= ret;
}
- p[-1] = '\0';
/*
* Ensure the filter string can be parsed correctly. Note, this
diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c
index 61e3a2620fa3..05e791241812 100644
--- a/kernel/trace/trace_event_perf.c
+++ b/kernel/trace/trace_event_perf.c
@@ -251,16 +251,12 @@ int perf_kprobe_init(struct perf_event *p_event, bool is_retprobe)
struct trace_event_call *tp_event;
if (p_event->attr.kprobe_func) {
- func = kzalloc(KSYM_NAME_LEN, GFP_KERNEL);
- if (!func)
- return -ENOMEM;
- ret = strncpy_from_user(
- func, u64_to_user_ptr(p_event->attr.kprobe_func),
- KSYM_NAME_LEN);
- if (ret == KSYM_NAME_LEN)
- ret = -E2BIG;
- if (ret < 0)
- goto out;
+ func = strndup_user(u64_to_user_ptr(p_event->attr.kprobe_func),
+ KSYM_NAME_LEN);
+ if (IS_ERR(func)) {
+ ret = PTR_ERR(func);
+ return (ret == -EINVAL) ? -E2BIG : ret;
+ }
if (func[0] == '\0') {
kfree(func);
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index f71ea6e79b3c..654ffa40457a 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -114,7 +114,7 @@ trace_find_event_field(struct trace_event_call *call, char *name)
static int __trace_define_field(struct list_head *head, const char *type,
const char *name, int offset, int size,
- int is_signed, int filter_type)
+ int is_signed, int filter_type, int len)
{
struct ftrace_event_field *field;
@@ -133,6 +133,7 @@ static int __trace_define_field(struct list_head *head, const char *type,
field->offset = offset;
field->size = size;
field->is_signed = is_signed;
+ field->len = len;
list_add(&field->link, head);
@@ -150,14 +151,28 @@ int trace_define_field(struct trace_event_call *call, const char *type,
head = trace_get_fields(call);
return __trace_define_field(head, type, name, offset, size,
- is_signed, filter_type);
+ is_signed, filter_type, 0);
}
EXPORT_SYMBOL_GPL(trace_define_field);
+static int trace_define_field_ext(struct trace_event_call *call, const char *type,
+ const char *name, int offset, int size, int is_signed,
+ int filter_type, int len)
+{
+ struct list_head *head;
+
+ if (WARN_ON(!call->class))
+ return 0;
+
+ head = trace_get_fields(call);
+ return __trace_define_field(head, type, name, offset, size,
+ is_signed, filter_type, len);
+}
+
#define __generic_field(type, item, filter_type) \
ret = __trace_define_field(&ftrace_generic_fields, #type, \
#item, 0, 0, is_signed_type(type), \
- filter_type); \
+ filter_type, 0); \
if (ret) \
return ret;
@@ -166,7 +181,7 @@ EXPORT_SYMBOL_GPL(trace_define_field);
"common_" #item, \
offsetof(typeof(ent), item), \
sizeof(ent.item), \
- is_signed_type(type), FILTER_OTHER); \
+ is_signed_type(type), FILTER_OTHER, 0); \
if (ret) \
return ret;
@@ -1588,12 +1603,17 @@ static int f_show(struct seq_file *m, void *v)
seq_printf(m, "\tfield:%s %s;\toffset:%u;\tsize:%u;\tsigned:%d;\n",
field->type, field->name, field->offset,
field->size, !!field->is_signed);
- else
- seq_printf(m, "\tfield:%.*s %s%s;\toffset:%u;\tsize:%u;\tsigned:%d;\n",
+ else if (field->len)
+ seq_printf(m, "\tfield:%.*s %s[%d];\toffset:%u;\tsize:%u;\tsigned:%d;\n",
(int)(array_descriptor - field->type),
field->type, field->name,
- array_descriptor, field->offset,
+ field->len, field->offset,
field->size, !!field->is_signed);
+ else
+ seq_printf(m, "\tfield:%.*s %s[];\toffset:%u;\tsize:%u;\tsigned:%d;\n",
+ (int)(array_descriptor - field->type),
+ field->type, field->name,
+ field->offset, field->size, !!field->is_signed);
return 0;
}
@@ -2261,8 +2281,6 @@ create_new_subsystem(const char *name)
if (!system->name)
goto out_free;
- system->filter = NULL;
-
system->filter = kzalloc(sizeof(struct event_filter), GFP_KERNEL);
if (!system->filter)
goto out_free;
@@ -2379,9 +2397,10 @@ event_define_fields(struct trace_event_call *call)
}
offset = ALIGN(offset, field->align);
- ret = trace_define_field(call, field->type, field->name,
+ ret = trace_define_field_ext(call, field->type, field->name,
offset, field->size,
- field->is_signed, field->filter_type);
+ field->is_signed, field->filter_type,
+ field->len);
if (WARN_ON_ONCE(ret)) {
pr_err("error code is %d\n", ret);
break;
@@ -2796,6 +2815,42 @@ trace_create_new_event(struct trace_event_call *call,
return file;
}
+#define MAX_BOOT_TRIGGERS 32
+
+static struct boot_triggers {
+ const char *event;
+ char *trigger;
+} bootup_triggers[MAX_BOOT_TRIGGERS];
+
+static char bootup_trigger_buf[COMMAND_LINE_SIZE];
+static int nr_boot_triggers;
+
+static __init int setup_trace_triggers(char *str)
+{
+ char *trigger;
+ char *buf;
+ int i;
+
+ strlcpy(bootup_trigger_buf, str, COMMAND_LINE_SIZE);
+ ring_buffer_expanded = true;
+ disable_tracing_selftest("running event triggers");
+
+ buf = bootup_trigger_buf;
+ for (i = 0; i < MAX_BOOT_TRIGGERS; i++) {
+ trigger = strsep(&buf, ",");
+ if (!trigger)
+ break;
+ bootup_triggers[i].event = strsep(&trigger, ".");
+ bootup_triggers[i].trigger = trigger;
+ if (!bootup_triggers[i].trigger)
+ break;
+ }
+
+ nr_boot_triggers = i;
+ return 1;
+}
+__setup("trace_trigger=", setup_trace_triggers);
+
/* Add an event to a trace directory */
static int
__trace_add_new_event(struct trace_event_call *call, struct trace_array *tr)
@@ -2812,6 +2867,24 @@ __trace_add_new_event(struct trace_event_call *call, struct trace_array *tr)
return event_define_fields(call);
}
+static void trace_early_triggers(struct trace_event_file *file, const char *name)
+{
+ int ret;
+ int i;
+
+ for (i = 0; i < nr_boot_triggers; i++) {
+ if (strcmp(name, bootup_triggers[i].event))
+ continue;
+ mutex_lock(&event_mutex);
+ ret = trigger_process_regex(file, bootup_triggers[i].trigger);
+ mutex_unlock(&event_mutex);
+ if (ret)
+ pr_err("Failed to register trigger '%s' on event %s\n",
+ bootup_triggers[i].trigger,
+ bootup_triggers[i].event);
+ }
+}
+
/*
* Just create a descriptor for early init. A descriptor is required
* for enabling events at boot. We want to enable events before
@@ -2822,12 +2895,19 @@ __trace_early_add_new_event(struct trace_event_call *call,
struct trace_array *tr)
{
struct trace_event_file *file;
+ int ret;
file = trace_create_new_event(call, tr);
if (!file)
return -ENOMEM;
- return event_define_fields(call);
+ ret = event_define_fields(call);
+ if (ret)
+ return ret;
+
+ trace_early_triggers(file, trace_event_name(call));
+
+ return 0;
}
struct ftrace_module_file_ops;
@@ -3689,10 +3769,9 @@ static __init int event_trace_memsetup(void)
return 0;
}
-static __init void
-early_enable_events(struct trace_array *tr, bool disable_first)
+__init void
+early_enable_events(struct trace_array *tr, char *buf, bool disable_first)
{
- char *buf = bootup_event_buf;
char *token;
int ret;
@@ -3735,6 +3814,8 @@ static __init int event_trace_enable(void)
list_add(&call->list, &ftrace_events);
}
+ register_trigger_cmds();
+
/*
* We need the top trace array to have a working set of trace
* points at early init, before the debug files and directories
@@ -3743,13 +3824,12 @@ static __init int event_trace_enable(void)
*/
__trace_early_add_events(tr);
- early_enable_events(tr, false);
+ early_enable_events(tr, bootup_event_buf, false);
trace_printk_start_comm();
register_event_cmds();
- register_trigger_cmds();
return 0;
}
@@ -3772,7 +3852,7 @@ static __init int event_trace_enable_again(void)
if (!tr)
return -ENODEV;
- early_enable_events(tr, true);
+ early_enable_events(tr, bootup_event_buf, true);
return 0;
}
diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index 96acc2b71ac7..1dad64267878 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -64,6 +64,7 @@ enum filter_pred_fn {
FILTER_PRED_FN_PCHAR_USER,
FILTER_PRED_FN_PCHAR,
FILTER_PRED_FN_CPU,
+ FILTER_PRED_FN_FUNCTION,
FILTER_PRED_FN_,
FILTER_PRED_TEST_VISITED,
};
@@ -71,6 +72,7 @@ enum filter_pred_fn {
struct filter_pred {
enum filter_pred_fn fn_num;
u64 val;
+ u64 val2;
struct regex regex;
unsigned short *ops;
struct ftrace_event_field *field;
@@ -103,6 +105,7 @@ struct filter_pred {
C(INVALID_FILTER, "Meaningless filter expression"), \
C(IP_FIELD_ONLY, "Only 'ip' field is supported for function trace"), \
C(INVALID_VALUE, "Invalid value (did you forget quotes)?"), \
+ C(NO_FUNCTION, "Function not found"), \
C(ERRNO, "Error"), \
C(NO_FILTER, "No filter found")
@@ -128,7 +131,7 @@ static bool is_not(const char *str)
}
/**
- * prog_entry - a singe entry in the filter program
+ * struct prog_entry - a single entry in the filter program
* @target: Index to jump to on a branch (actually one minus the index)
* @when_to_branch: The value of the result of the predicate to do a branch
* @pred: The predicate to execute.
@@ -140,16 +143,16 @@ struct prog_entry {
};
/**
- * update_preds- assign a program entry a label target
+ * update_preds - assign a program entry a label target
* @prog: The program array
* @N: The index of the current entry in @prog
- * @when_to_branch: What to assign a program entry for its branch condition
+ * @invert: What to assign a program entry for its branch condition
*
* The program entry at @N has a target that points to the index of a program
* entry that can have its target and when_to_branch fields updated.
* Update the current program entry denoted by index @N target field to be
* that of the updated entry. This will denote the entry to update if
- * we are processing an "||" after an "&&"
+ * we are processing an "||" after an "&&".
*/
static void update_preds(struct prog_entry *prog, int N, int invert)
{
@@ -876,6 +879,17 @@ static int filter_pred_comm(struct filter_pred *pred, void *event)
return cmp ^ pred->not;
}
+/* Filter predicate for functions. */
+static int filter_pred_function(struct filter_pred *pred, void *event)
+{
+ unsigned long *addr = (unsigned long *)(event + pred->offset);
+ unsigned long start = (unsigned long)pred->val;
+ unsigned long end = (unsigned long)pred->val2;
+ int ret = *addr >= start && *addr < end;
+
+ return pred->op == OP_EQ ? ret : !ret;
+}
+
/*
* regex_match_foo - Basic regex callbacks
*
@@ -1335,6 +1349,8 @@ static int filter_pred_fn_call(struct filter_pred *pred, void *event)
return filter_pred_pchar(pred, event);
case FILTER_PRED_FN_CPU:
return filter_pred_cpu(pred, event);
+ case FILTER_PRED_FN_FUNCTION:
+ return filter_pred_function(pred, event);
case FILTER_PRED_TEST_VISITED:
return test_pred_visited_fn(pred, event);
default:
@@ -1350,8 +1366,13 @@ static int parse_pred(const char *str, void *data,
struct trace_event_call *call = data;
struct ftrace_event_field *field;
struct filter_pred *pred = NULL;
+ unsigned long offset;
+ unsigned long size;
+ unsigned long ip;
char num_buf[24]; /* Big enough to hold an address */
char *field_name;
+ char *name;
+ bool function = false;
bool ustring = false;
char q;
u64 val;
@@ -1393,6 +1414,12 @@ static int parse_pred(const char *str, void *data,
i += len;
}
+ /* See if the field is a kernel function name */
+ if ((len = str_has_prefix(str + i, ".function"))) {
+ function = true;
+ i += len;
+ }
+
while (isspace(str[i]))
i++;
@@ -1423,7 +1450,71 @@ static int parse_pred(const char *str, void *data,
pred->offset = field->offset;
pred->op = op;
- if (ftrace_event_is_function(call)) {
+ if (function) {
+ /* The field must be the same size as long */
+ if (field->size != sizeof(long)) {
+ parse_error(pe, FILT_ERR_ILLEGAL_FIELD_OP, pos + i);
+ goto err_free;
+ }
+
+ /* Function only works with '==' or '!=' and an unquoted string */
+ switch (op) {
+ case OP_NE:
+ case OP_EQ:
+ break;
+ default:
+ parse_error(pe, FILT_ERR_INVALID_OP, pos + i);
+ goto err_free;
+ }
+
+ if (isdigit(str[i])) {
+ /* We allow 0xDEADBEEF */
+ while (isalnum(str[i]))
+ i++;
+
+ len = i - s;
+ /* 0xfeedfacedeadbeef is 18 chars max */
+ if (len >= sizeof(num_buf)) {
+ parse_error(pe, FILT_ERR_OPERAND_TOO_LONG, pos + i);
+ goto err_free;
+ }
+
+ strncpy(num_buf, str + s, len);
+ num_buf[len] = 0;
+
+ ret = kstrtoul(num_buf, 0, &ip);
+ if (ret) {
+ parse_error(pe, FILT_ERR_INVALID_VALUE, pos + i);
+ goto err_free;
+ }
+ } else {
+ s = i;
+ for (; str[i] && !isspace(str[i]); i++)
+ ;
+
+ len = i - s;
+ name = kmemdup_nul(str + s, len, GFP_KERNEL);
+ if (!name)
+ goto err_mem;
+ ip = kallsyms_lookup_name(name);
+ kfree(name);
+ if (!ip) {
+ parse_error(pe, FILT_ERR_NO_FUNCTION, pos + i);
+ goto err_free;
+ }
+ }
+
+ /* Now find the function start and end address */
+ if (!kallsyms_lookup_size_offset(ip, &size, &offset)) {
+ parse_error(pe, FILT_ERR_NO_FUNCTION, pos + i);
+ goto err_free;
+ }
+
+ pred->fn_num = FILTER_PRED_FN_FUNCTION;
+ pred->val = ip - offset;
+ pred->val2 = pred->val + size;
+
+ } else if (ftrace_event_is_function(call)) {
/*
* Perf does things different with function events.
* It only allows an "ip" field, and expects a string.
diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c
index 1c82478e8dff..89877a18f933 100644
--- a/kernel/trace/trace_events_hist.c
+++ b/kernel/trace/trace_events_hist.c
@@ -69,7 +69,8 @@
C(INVALID_STR_OPERAND, "String type can not be an operand in expression"), \
C(EXPECT_NUMBER, "Expecting numeric literal"), \
C(UNARY_MINUS_SUBEXPR, "Unary minus not supported in sub-expressions"), \
- C(DIVISION_BY_ZERO, "Division by zero"),
+ C(DIVISION_BY_ZERO, "Division by zero"), \
+ C(NEED_NOHC_VAL, "Non-hitcount value is required for 'nohitcount'"),
#undef C
#define C(a, b) HIST_ERR_##a
@@ -134,6 +135,7 @@ enum hist_field_fn {
HIST_FIELD_FN_DIV_NOT_POWER2,
HIST_FIELD_FN_DIV_MULT_SHIFT,
HIST_FIELD_FN_EXECNAME,
+ HIST_FIELD_FN_STACK,
};
/*
@@ -479,10 +481,6 @@ DEFINE_HIST_FIELD_FN(u8);
#define for_each_hist_key_field(i, hist_data) \
for ((i) = (hist_data)->n_vals; (i) < (hist_data)->n_fields; (i)++)
-#define HIST_STACKTRACE_DEPTH 16
-#define HIST_STACKTRACE_SIZE (HIST_STACKTRACE_DEPTH * sizeof(unsigned long))
-#define HIST_STACKTRACE_SKIP 5
-
#define HITCOUNT_IDX 0
#define HIST_KEY_SIZE_MAX (MAX_FILTER_STR_VAL + HIST_STACKTRACE_SIZE)
@@ -506,6 +504,8 @@ enum hist_field_flags {
HIST_FIELD_FL_ALIAS = 1 << 16,
HIST_FIELD_FL_BUCKET = 1 << 17,
HIST_FIELD_FL_CONST = 1 << 18,
+ HIST_FIELD_FL_PERCENT = 1 << 19,
+ HIST_FIELD_FL_GRAPH = 1 << 20,
};
struct var_defs {
@@ -524,6 +524,7 @@ struct hist_trigger_attrs {
bool cont;
bool clear;
bool ts_in_usecs;
+ bool no_hitcount;
unsigned int map_bits;
char *assignment_str[TRACING_MAP_VARS_MAX];
@@ -617,7 +618,7 @@ struct action_data {
* event param, and is passed to the synthetic event
* invocation.
*/
- unsigned int var_ref_idx[TRACING_MAP_VARS_MAX];
+ unsigned int var_ref_idx[SYNTH_FIELDS_MAX];
struct synth_event *synth_event;
bool use_trace_keyword;
char *synth_event_name;
@@ -1356,6 +1357,13 @@ static const char *hist_field_name(struct hist_field *field,
field_name = field->name;
} else if (field->flags & HIST_FIELD_FL_TIMESTAMP)
field_name = "common_timestamp";
+ else if (field->flags & HIST_FIELD_FL_STACKTRACE) {
+ if (field->field)
+ field_name = field->field->name;
+ else
+ field_name = "stacktrace";
+ } else if (field->flags & HIST_FIELD_FL_HITCOUNT)
+ field_name = "hitcount";
if (field_name == NULL)
field_name = "";
@@ -1546,7 +1554,10 @@ parse_hist_trigger_attrs(struct trace_array *tr, char *trigger_str)
ret = parse_assignment(tr, str, attrs);
if (ret)
goto free;
- } else if (strcmp(str, "pause") == 0)
+ } else if (strcmp(str, "nohitcount") == 0 ||
+ strcmp(str, "NOHC") == 0)
+ attrs->no_hitcount = true;
+ else if (strcmp(str, "pause") == 0)
attrs->pause = true;
else if ((strcmp(str, "cont") == 0) ||
(strcmp(str, "continue") == 0))
@@ -1705,6 +1716,12 @@ static const char *get_hist_field_flags(struct hist_field *hist_field)
flags_str = "buckets";
else if (hist_field->flags & HIST_FIELD_FL_TIMESTAMP_USECS)
flags_str = "usecs";
+ else if (hist_field->flags & HIST_FIELD_FL_PERCENT)
+ flags_str = "percent";
+ else if (hist_field->flags & HIST_FIELD_FL_GRAPH)
+ flags_str = "graph";
+ else if (hist_field->flags & HIST_FIELD_FL_STACKTRACE)
+ flags_str = "stacktrace";
return flags_str;
}
@@ -1966,7 +1983,14 @@ static struct hist_field *create_hist_field(struct hist_trigger_data *hist_data,
}
if (flags & HIST_FIELD_FL_STACKTRACE) {
- hist_field->fn_num = HIST_FIELD_FN_NOP;
+ if (field)
+ hist_field->fn_num = HIST_FIELD_FN_STACK;
+ else
+ hist_field->fn_num = HIST_FIELD_FN_NOP;
+ hist_field->size = HIST_STACKTRACE_SIZE;
+ hist_field->type = kstrdup_const("unsigned long[]", GFP_KERNEL);
+ if (!hist_field->type)
+ goto free;
goto out;
}
@@ -1975,6 +1999,8 @@ static struct hist_field *create_hist_field(struct hist_trigger_data *hist_data,
hist_field->fn_num = flags & HIST_FIELD_FL_LOG2 ? HIST_FIELD_FN_LOG2 :
HIST_FIELD_FN_BUCKET;
hist_field->operands[0] = create_hist_field(hist_data, field, fl, NULL);
+ if (!hist_field->operands[0])
+ goto free;
hist_field->size = hist_field->operands[0]->size;
hist_field->type = kstrdup_const(hist_field->operands[0]->type, GFP_KERNEL);
if (!hist_field->type)
@@ -2173,7 +2199,9 @@ static struct hist_field *create_var_ref(struct hist_trigger_data *hist_data,
return ref_field;
}
}
-
+ /* Sanity check to avoid out-of-bound write on 'hist_data->var_refs' */
+ if (hist_data->n_var_refs >= TRACING_MAP_VARS_MAX)
+ return NULL;
ref_field = create_hist_field(var_field->hist_data, NULL, flags, NULL);
if (ref_field) {
if (init_var_ref(ref_field, var_field, system, event_name)) {
@@ -2295,6 +2323,8 @@ parse_field(struct hist_trigger_data *hist_data, struct trace_event_file *file,
*flags |= HIST_FIELD_FL_EXECNAME;
else if (strcmp(modifier, "syscall") == 0)
*flags |= HIST_FIELD_FL_SYSCALL;
+ else if (strcmp(modifier, "stacktrace") == 0)
+ *flags |= HIST_FIELD_FL_STACKTRACE;
else if (strcmp(modifier, "log2") == 0)
*flags |= HIST_FIELD_FL_LOG2;
else if (strcmp(modifier, "usecs") == 0)
@@ -2313,6 +2343,14 @@ parse_field(struct hist_trigger_data *hist_data, struct trace_event_file *file,
if (ret || !(*buckets))
goto error;
*flags |= HIST_FIELD_FL_BUCKET;
+ } else if (strncmp(modifier, "percent", 7) == 0) {
+ if (*flags & (HIST_FIELD_FL_VAR | HIST_FIELD_FL_KEY))
+ goto error;
+ *flags |= HIST_FIELD_FL_PERCENT;
+ } else if (strncmp(modifier, "graph", 5) == 0) {
+ if (*flags & (HIST_FIELD_FL_VAR | HIST_FIELD_FL_KEY))
+ goto error;
+ *flags |= HIST_FIELD_FL_GRAPH;
} else {
error:
hist_err(tr, HIST_ERR_BAD_FIELD_MODIFIER, errpos(modifier));
@@ -2326,8 +2364,12 @@ parse_field(struct hist_trigger_data *hist_data, struct trace_event_file *file,
hist_data->enable_timestamps = true;
if (*flags & HIST_FIELD_FL_TIMESTAMP_USECS)
hist_data->attrs->ts_in_usecs = true;
+ } else if (strcmp(field_name, "stacktrace") == 0) {
+ *flags |= HIST_FIELD_FL_STACKTRACE;
} else if (strcmp(field_name, "common_cpu") == 0)
*flags |= HIST_FIELD_FL_CPU;
+ else if (strcmp(field_name, "hitcount") == 0)
+ *flags |= HIST_FIELD_FL_HITCOUNT;
else {
field = trace_find_event_field(file->event_call, field_name);
if (!field || !field->size) {
@@ -3084,6 +3126,9 @@ static inline void __update_field_vars(struct tracing_map_elt *elt,
unsigned int i, j, var_idx;
u64 var_val;
+ /* Make sure stacktrace can fit in the string variable length */
+ BUILD_BUG_ON((HIST_STACKTRACE_DEPTH + 1) * sizeof(long) >= STR_VAR_LEN_MAX);
+
for (i = 0, j = field_var_str_start; i < n_field_vars; i++) {
struct field_var *field_var = field_vars[i];
struct hist_field *var = field_var->var;
@@ -3092,13 +3137,26 @@ static inline void __update_field_vars(struct tracing_map_elt *elt,
var_val = hist_fn_call(val, elt, buffer, rbe, rec);
var_idx = var->var.idx;
- if (val->flags & HIST_FIELD_FL_STRING) {
+ if (val->flags & (HIST_FIELD_FL_STRING |
+ HIST_FIELD_FL_STACKTRACE)) {
char *str = elt_data->field_var_str[j++];
char *val_str = (char *)(uintptr_t)var_val;
unsigned int size;
- size = min(val->size, STR_VAR_LEN_MAX);
- strscpy(str, val_str, size);
+ if (val->flags & HIST_FIELD_FL_STRING) {
+ size = min(val->size, STR_VAR_LEN_MAX);
+ strscpy(str, val_str, size);
+ } else {
+ char *stack_start = str + sizeof(unsigned long);
+ int e;
+
+ e = stack_trace_save((void *)stack_start,
+ HIST_STACKTRACE_DEPTH,
+ HIST_STACKTRACE_SKIP);
+ if (e < HIST_STACKTRACE_DEPTH - 1)
+ ((unsigned long *)stack_start)[e] = 0;
+ *((unsigned long *)str) = e;
+ }
var_val = (u64)(uintptr_t)str;
}
tracing_map_set_var(elt, var_idx, var_val);
@@ -3586,6 +3644,7 @@ static int parse_action_params(struct trace_array *tr, char *params,
while (params) {
if (data->n_params >= SYNTH_FIELDS_MAX) {
hist_err(tr, HIST_ERR_TOO_MANY_PARAMS, 0);
+ ret = -EINVAL;
goto out;
}
@@ -3796,7 +3855,8 @@ static void save_field_var(struct hist_trigger_data *hist_data,
{
hist_data->field_vars[hist_data->n_field_vars++] = field_var;
- if (field_var->val->flags & HIST_FIELD_FL_STRING)
+ /* Stack traces are saved in the string storage too */
+ if (field_var->val->flags & (HIST_FIELD_FL_STRING | HIST_FIELD_FL_STACKTRACE))
hist_data->n_field_var_str++;
}
@@ -3821,6 +3881,9 @@ static int check_synth_field(struct synth_event *event,
&& field->is_dynamic)
return 0;
+ if (strstr(hist_field->type, "long[") && field->is_stack)
+ return 0;
+
if (strcmp(field->type, hist_field->type) != 0) {
if (field->size != hist_field->size ||
(!field->is_string && field->is_signed != hist_field->is_signed))
@@ -3922,6 +3985,10 @@ static int trace_action_create(struct hist_trigger_data *hist_data,
lockdep_assert_held(&event_mutex);
+ /* Sanity check to avoid out-of-bound write on 'data->var_ref_idx' */
+ if (data->n_params > SYNTH_FIELDS_MAX)
+ return -EINVAL;
+
if (data->use_trace_keyword)
synth_event_name = data->synth_event_name;
else
@@ -4071,7 +4138,8 @@ static int action_create(struct hist_trigger_data *hist_data,
}
hist_data->save_vars[hist_data->n_save_vars++] = field_var;
- if (field_var->val->flags & HIST_FIELD_FL_STRING)
+ if (field_var->val->flags &
+ (HIST_FIELD_FL_STRING | HIST_FIELD_FL_STACKTRACE))
hist_data->n_save_var_str++;
kfree(param);
}
@@ -4210,6 +4278,19 @@ static u64 hist_field_execname(struct hist_field *hist_field,
return (u64)(unsigned long)(elt_data->comm);
}
+static u64 hist_field_stack(struct hist_field *hist_field,
+ struct tracing_map_elt *elt,
+ struct trace_buffer *buffer,
+ struct ring_buffer_event *rbe,
+ void *event)
+{
+ u32 str_item = *(u32 *)(event + hist_field->field->offset);
+ int str_loc = str_item & 0xffff;
+ char *addr = (char *)(event + str_loc);
+
+ return (u64)(unsigned long)addr;
+}
+
static u64 hist_fn_call(struct hist_field *hist_field,
struct tracing_map_elt *elt,
struct trace_buffer *buffer,
@@ -4273,6 +4354,8 @@ static u64 hist_fn_call(struct hist_field *hist_field,
return div_by_mult_and_shift(hist_field, elt, buffer, rbe, event);
case HIST_FIELD_FN_EXECNAME:
return hist_field_execname(hist_field, elt, buffer, rbe, event);
+ case HIST_FIELD_FN_STACK:
+ return hist_field_stack(hist_field, elt, buffer, rbe, event);
default:
return 0;
}
@@ -4319,7 +4402,8 @@ static int create_var_field(struct hist_trigger_data *hist_data,
if (!ret && hist_data->fields[val_idx]->flags & HIST_FIELD_FL_EXECNAME)
update_var_execname(hist_data->fields[val_idx]);
- if (!ret && hist_data->fields[val_idx]->flags & HIST_FIELD_FL_STRING)
+ if (!ret && hist_data->fields[val_idx]->flags &
+ (HIST_FIELD_FL_STRING | HIST_FIELD_FL_STACKTRACE))
hist_data->fields[val_idx]->var_str_idx = hist_data->n_var_str++;
return ret;
@@ -4328,8 +4412,8 @@ static int create_var_field(struct hist_trigger_data *hist_data,
static int create_val_fields(struct hist_trigger_data *hist_data,
struct trace_event_file *file)
{
+ unsigned int i, j = 1, n_hitcount = 0;
char *fields_str, *field_str;
- unsigned int i, j = 1;
int ret;
ret = create_hitcount_val(hist_data);
@@ -4346,8 +4430,10 @@ static int create_val_fields(struct hist_trigger_data *hist_data,
if (!field_str)
break;
- if (strcmp(field_str, "hitcount") == 0)
- continue;
+ if (strcmp(field_str, "hitcount") == 0) {
+ if (!n_hitcount++)
+ continue;
+ }
ret = create_val_field(hist_data, j++, file, field_str);
if (ret)
@@ -4357,6 +4443,12 @@ static int create_val_fields(struct hist_trigger_data *hist_data,
if (fields_str && (strcmp(fields_str, "hitcount") != 0))
ret = -EINVAL;
out:
+ /* There is only raw hitcount but nohitcount suppresses it. */
+ if (j == 1 && hist_data->attrs->no_hitcount) {
+ hist_err(hist_data->event_file->tr, HIST_ERR_NEED_NOHC_VAL, 0);
+ ret = -ENOENT;
+ }
+
return ret;
}
@@ -5052,7 +5144,8 @@ static void hist_trigger_elt_update(struct hist_trigger_data *hist_data,
if (hist_field->flags & HIST_FIELD_FL_VAR) {
var_idx = hist_field->var.idx;
- if (hist_field->flags & HIST_FIELD_FL_STRING) {
+ if (hist_field->flags &
+ (HIST_FIELD_FL_STRING | HIST_FIELD_FL_STACKTRACE)) {
unsigned int str_start, var_str_idx, idx;
char *str, *val_str;
unsigned int size;
@@ -5065,9 +5158,20 @@ static void hist_trigger_elt_update(struct hist_trigger_data *hist_data,
str = elt_data->field_var_str[idx];
val_str = (char *)(uintptr_t)hist_val;
- size = min(hist_field->size, STR_VAR_LEN_MAX);
- strscpy(str, val_str, size);
-
+ if (hist_field->flags & HIST_FIELD_FL_STRING) {
+ size = min(hist_field->size, STR_VAR_LEN_MAX);
+ strscpy(str, val_str, size);
+ } else {
+ char *stack_start = str + sizeof(unsigned long);
+ int e;
+
+ e = stack_trace_save((void *)stack_start,
+ HIST_STACKTRACE_DEPTH,
+ HIST_STACKTRACE_SKIP);
+ if (e < HIST_STACKTRACE_DEPTH - 1)
+ ((unsigned long *)stack_start)[e] = 0;
+ *((unsigned long *)str) = e;
+ }
hist_val = (u64)(uintptr_t)str;
}
tracing_map_set_var(elt, var_idx, hist_val);
@@ -5153,8 +5257,17 @@ static void event_hist_trigger(struct event_trigger_data *data,
if (key_field->flags & HIST_FIELD_FL_STACKTRACE) {
memset(entries, 0, HIST_STACKTRACE_SIZE);
- stack_trace_save(entries, HIST_STACKTRACE_DEPTH,
- HIST_STACKTRACE_SKIP);
+ if (key_field->field) {
+ unsigned long *stack, n_entries;
+
+ field_contents = hist_fn_call(key_field, elt, buffer, rbe, rec);
+ stack = (unsigned long *)(long)field_contents;
+ n_entries = *stack;
+ memcpy(entries, ++stack, n_entries * sizeof(unsigned long));
+ } else {
+ stack_trace_save(entries, HIST_STACKTRACE_DEPTH,
+ HIST_STACKTRACE_SKIP);
+ }
key = entries;
} else {
field_contents = hist_fn_call(key_field, elt, buffer, rbe, rec);
@@ -5257,7 +5370,10 @@ static void hist_trigger_print_key(struct seq_file *m,
seq_printf(m, "%s: %-30s[%3llu]", field_name,
syscall_name, uval);
} else if (key_field->flags & HIST_FIELD_FL_STACKTRACE) {
- seq_puts(m, "stacktrace:\n");
+ if (key_field->field)
+ seq_printf(m, "%s.stacktrace", key_field->field->name);
+ else
+ seq_puts(m, "stacktrace:\n");
hist_trigger_stacktrace_print(m,
key + key_field->offset,
HIST_STACKTRACE_DEPTH);
@@ -5285,33 +5401,101 @@ static void hist_trigger_print_key(struct seq_file *m,
seq_puts(m, "}");
}
+/* Get the 100 times of the percentage of @val in @total */
+static inline unsigned int __get_percentage(u64 val, u64 total)
+{
+ if (!total)
+ goto div0;
+
+ if (val < (U64_MAX / 10000))
+ return (unsigned int)div64_ul(val * 10000, total);
+
+ total = div64_u64(total, 10000);
+ if (!total)
+ goto div0;
+
+ return (unsigned int)div64_ul(val, total);
+div0:
+ return val ? UINT_MAX : 0;
+}
+
+#define BAR_CHAR '#'
+
+static inline const char *__fill_bar_str(char *buf, int size, u64 val, u64 max)
+{
+ unsigned int len = __get_percentage(val, max);
+ int i;
+
+ if (len == UINT_MAX) {
+ snprintf(buf, size, "[ERROR]");
+ return buf;
+ }
+
+ len = len * size / 10000;
+ for (i = 0; i < len && i < size; i++)
+ buf[i] = BAR_CHAR;
+ while (i < size)
+ buf[i++] = ' ';
+ buf[size] = '\0';
+
+ return buf;
+}
+
+struct hist_val_stat {
+ u64 max;
+ u64 total;
+};
+
+static void hist_trigger_print_val(struct seq_file *m, unsigned int idx,
+ const char *field_name, unsigned long flags,
+ struct hist_val_stat *stats,
+ struct tracing_map_elt *elt)
+{
+ u64 val = tracing_map_read_sum(elt, idx);
+ unsigned int pc;
+ char bar[21];
+
+ if (flags & HIST_FIELD_FL_PERCENT) {
+ pc = __get_percentage(val, stats[idx].total);
+ if (pc == UINT_MAX)
+ seq_printf(m, " %s (%%):[ERROR]", field_name);
+ else
+ seq_printf(m, " %s (%%): %3u.%02u", field_name,
+ pc / 100, pc % 100);
+ } else if (flags & HIST_FIELD_FL_GRAPH) {
+ seq_printf(m, " %s: %20s", field_name,
+ __fill_bar_str(bar, 20, val, stats[idx].max));
+ } else if (flags & HIST_FIELD_FL_HEX) {
+ seq_printf(m, " %s: %10llx", field_name, val);
+ } else {
+ seq_printf(m, " %s: %10llu", field_name, val);
+ }
+}
+
static void hist_trigger_entry_print(struct seq_file *m,
struct hist_trigger_data *hist_data,
+ struct hist_val_stat *stats,
void *key,
struct tracing_map_elt *elt)
{
const char *field_name;
- unsigned int i;
+ unsigned int i = HITCOUNT_IDX;
+ unsigned long flags;
hist_trigger_print_key(m, hist_data, key, elt);
- seq_printf(m, " hitcount: %10llu",
- tracing_map_read_sum(elt, HITCOUNT_IDX));
+ /* At first, show the raw hitcount if !nohitcount */
+ if (!hist_data->attrs->no_hitcount)
+ hist_trigger_print_val(m, i, "hitcount", 0, stats, elt);
for (i = 1; i < hist_data->n_vals; i++) {
field_name = hist_field_name(hist_data->fields[i], 0);
-
- if (hist_data->fields[i]->flags & HIST_FIELD_FL_VAR ||
- hist_data->fields[i]->flags & HIST_FIELD_FL_EXPR)
+ flags = hist_data->fields[i]->flags;
+ if (flags & HIST_FIELD_FL_VAR || flags & HIST_FIELD_FL_EXPR)
continue;
- if (hist_data->fields[i]->flags & HIST_FIELD_FL_HEX) {
- seq_printf(m, " %s: %10llx", field_name,
- tracing_map_read_sum(elt, i));
- } else {
- seq_printf(m, " %s: %10llu", field_name,
- tracing_map_read_sum(elt, i));
- }
+ seq_puts(m, " ");
+ hist_trigger_print_val(m, i, field_name, flags, stats, elt);
}
print_actions(m, hist_data, elt);
@@ -5324,7 +5508,9 @@ static int print_entries(struct seq_file *m,
{
struct tracing_map_sort_entry **sort_entries = NULL;
struct tracing_map *map = hist_data->map;
- int i, n_entries;
+ int i, j, n_entries;
+ struct hist_val_stat *stats = NULL;
+ u64 val;
n_entries = tracing_map_sort_entries(map, hist_data->sort_keys,
hist_data->n_sort_keys,
@@ -5332,11 +5518,34 @@ static int print_entries(struct seq_file *m,
if (n_entries < 0)
return n_entries;
+ /* Calculate the max and the total for each field if needed. */
+ for (j = 0; j < hist_data->n_vals; j++) {
+ if (!(hist_data->fields[j]->flags &
+ (HIST_FIELD_FL_PERCENT | HIST_FIELD_FL_GRAPH)))
+ continue;
+ if (!stats) {
+ stats = kcalloc(hist_data->n_vals, sizeof(*stats),
+ GFP_KERNEL);
+ if (!stats) {
+ n_entries = -ENOMEM;
+ goto out;
+ }
+ }
+ for (i = 0; i < n_entries; i++) {
+ val = tracing_map_read_sum(sort_entries[i]->elt, j);
+ stats[j].total += val;
+ if (stats[j].max < val)
+ stats[j].max = val;
+ }
+ }
+
for (i = 0; i < n_entries; i++)
- hist_trigger_entry_print(m, hist_data,
+ hist_trigger_entry_print(m, hist_data, stats,
sort_entries[i]->key,
sort_entries[i]->elt);
+ kfree(stats);
+out:
tracing_map_destroy_sort_entries(sort_entries, n_entries);
return n_entries;
@@ -5709,7 +5918,8 @@ static void hist_field_print(struct seq_file *m, struct hist_field *hist_field)
if (hist_field->flags) {
if (!(hist_field->flags & HIST_FIELD_FL_VAR_REF) &&
- !(hist_field->flags & HIST_FIELD_FL_EXPR)) {
+ !(hist_field->flags & HIST_FIELD_FL_EXPR) &&
+ !(hist_field->flags & HIST_FIELD_FL_STACKTRACE)) {
const char *flags = get_hist_field_flags(hist_field);
if (flags)
@@ -5726,6 +5936,7 @@ static int event_hist_trigger_print(struct seq_file *m,
struct hist_trigger_data *hist_data = data->private_data;
struct hist_field *field;
bool have_var = false;
+ bool show_val = false;
unsigned int i;
seq_puts(m, HIST_PREFIX);
@@ -5741,9 +5952,12 @@ static int event_hist_trigger_print(struct seq_file *m,
if (i > hist_data->n_vals)
seq_puts(m, ",");
- if (field->flags & HIST_FIELD_FL_STACKTRACE)
- seq_puts(m, "stacktrace");
- else
+ if (field->flags & HIST_FIELD_FL_STACKTRACE) {
+ if (field->field)
+ seq_printf(m, "%s.stacktrace", field->field->name);
+ else
+ seq_puts(m, "stacktrace");
+ } else
hist_field_print(m, field);
}
@@ -5756,12 +5970,16 @@ static int event_hist_trigger_print(struct seq_file *m,
continue;
}
- if (i == HITCOUNT_IDX)
+ if (i == HITCOUNT_IDX) {
+ if (hist_data->attrs->no_hitcount)
+ continue;
seq_puts(m, "hitcount");
- else {
- seq_puts(m, ",");
+ } else {
+ if (show_val)
+ seq_puts(m, ",");
hist_field_print(m, field);
}
+ show_val = true;
}
if (have_var) {
@@ -5812,6 +6030,8 @@ static int event_hist_trigger_print(struct seq_file *m,
seq_printf(m, ":size=%u", (1 << hist_data->map->map_bits));
if (hist_data->enable_timestamps)
seq_printf(m, ":clock=%s", hist_data->attrs->clock);
+ if (hist_data->attrs->no_hitcount)
+ seq_puts(m, ":nohitcount");
print_actions_spec(m, hist_data);
@@ -6438,7 +6658,7 @@ enable:
if (se)
se->ref++;
out:
- if (ret == 0)
+ if (ret == 0 && glob[0])
hist_err_clear();
return ret;
diff --git a/kernel/trace/trace_events_synth.c b/kernel/trace/trace_events_synth.c
index c3b582d19b62..70bddb25d9c0 100644
--- a/kernel/trace/trace_events_synth.c
+++ b/kernel/trace/trace_events_synth.c
@@ -173,6 +173,14 @@ static int synth_field_is_string(char *type)
return false;
}
+static int synth_field_is_stack(char *type)
+{
+ if (strstr(type, "long[") != NULL)
+ return true;
+
+ return false;
+}
+
static int synth_field_string_size(char *type)
{
char buf[4], *end, *start;
@@ -248,6 +256,8 @@ static int synth_field_size(char *type)
size = sizeof(gfp_t);
else if (synth_field_is_string(type))
size = synth_field_string_size(type);
+ else if (synth_field_is_stack(type))
+ size = 0;
return size;
}
@@ -292,6 +302,8 @@ static const char *synth_field_fmt(char *type)
fmt = "%x";
else if (synth_field_is_string(type))
fmt = "%.*s";
+ else if (synth_field_is_stack(type))
+ fmt = "%s";
return fmt;
}
@@ -371,6 +383,23 @@ static enum print_line_t print_synth_event(struct trace_iterator *iter,
i == se->n_fields - 1 ? "" : " ");
n_u64 += STR_VAR_LEN_MAX / sizeof(u64);
}
+ } else if (se->fields[i]->is_stack) {
+ u32 offset, data_offset, len;
+ unsigned long *p, *end;
+
+ offset = (u32)entry->fields[n_u64];
+ data_offset = offset & 0xffff;
+ len = offset >> 16;
+
+ p = (void *)entry + data_offset;
+ end = (void *)p + len - (sizeof(long) - 1);
+
+ trace_seq_printf(s, "%s=STACK:\n", se->fields[i]->name);
+
+ for (; *p && p < end; p++)
+ trace_seq_printf(s, "=> %pS\n", (void *)*p);
+ n_u64++;
+
} else {
struct trace_print_flags __flags[] = {
__def_gfpflag_names, {-1, NULL} };
@@ -416,8 +445,7 @@ static unsigned int trace_string(struct synth_trace_event *entry,
if (is_dynamic) {
u32 data_offset;
- data_offset = offsetof(typeof(*entry), fields);
- data_offset += event->n_u64 * sizeof(u64);
+ data_offset = struct_size(entry, fields, event->n_u64);
data_offset += data_size;
len = kern_fetch_store_strlen((unsigned long)str_val);
@@ -447,6 +475,43 @@ static unsigned int trace_string(struct synth_trace_event *entry,
return len;
}
+static unsigned int trace_stack(struct synth_trace_event *entry,
+ struct synth_event *event,
+ long *stack,
+ unsigned int data_size,
+ unsigned int *n_u64)
+{
+ unsigned int len;
+ u32 data_offset;
+ void *data_loc;
+
+ data_offset = struct_size(entry, fields, event->n_u64);
+ data_offset += data_size;
+
+ for (len = 0; len < HIST_STACKTRACE_DEPTH; len++) {
+ if (!stack[len])
+ break;
+ }
+
+ /* Include the zero'd element if it fits */
+ if (len < HIST_STACKTRACE_DEPTH)
+ len++;
+
+ len *= sizeof(long);
+
+ /* Find the dynamic section to copy the stack into. */
+ data_loc = (void *)entry + data_offset;
+ memcpy(data_loc, stack, len);
+
+ /* Fill in the field that holds the offset/len combo */
+ data_offset |= len << 16;
+ *(u32 *)&entry->fields[*n_u64] = data_offset;
+
+ (*n_u64)++;
+
+ return len;
+}
+
static notrace void trace_event_raw_event_synth(void *__data,
u64 *var_ref_vals,
unsigned int *var_ref_idx)
@@ -473,7 +538,12 @@ static notrace void trace_event_raw_event_synth(void *__data,
val_idx = var_ref_idx[field_pos];
str_val = (char *)(long)var_ref_vals[val_idx];
- len = kern_fetch_store_strlen((unsigned long)str_val);
+ if (event->dynamic_fields[i]->is_stack) {
+ len = *((unsigned long *)str_val);
+ len *= sizeof(unsigned long);
+ } else {
+ len = kern_fetch_store_strlen((unsigned long)str_val);
+ }
fields_size += len;
}
@@ -499,6 +569,12 @@ static notrace void trace_event_raw_event_synth(void *__data,
event->fields[i]->is_dynamic,
data_size, &n_u64);
data_size += len; /* only dynamic string increments */
+ } else if (event->fields[i]->is_stack) {
+ long *stack = (long *)(long)var_ref_vals[val_idx];
+
+ len = trace_stack(entry, event, stack,
+ data_size, &n_u64);
+ data_size += len;
} else {
struct synth_field *field = event->fields[i];
u64 val = var_ref_vals[val_idx];
@@ -561,6 +637,9 @@ static int __set_synth_event_print_fmt(struct synth_event *event,
event->fields[i]->is_dynamic)
pos += snprintf(buf + pos, LEN_OR_ZERO,
", __get_str(%s)", event->fields[i]->name);
+ else if (event->fields[i]->is_stack)
+ pos += snprintf(buf + pos, LEN_OR_ZERO,
+ ", __get_stacktrace(%s)", event->fields[i]->name);
else
pos += snprintf(buf + pos, LEN_OR_ZERO,
", REC->%s", event->fields[i]->name);
@@ -697,7 +776,8 @@ static struct synth_field *parse_synth_field(int argc, char **argv,
ret = -EINVAL;
goto free;
} else if (size == 0) {
- if (synth_field_is_string(field->type)) {
+ if (synth_field_is_string(field->type) ||
+ synth_field_is_stack(field->type)) {
char *type;
len = sizeof("__data_loc ") + strlen(field->type) + 1;
@@ -728,6 +808,8 @@ static struct synth_field *parse_synth_field(int argc, char **argv,
if (synth_field_is_string(field->type))
field->is_string = true;
+ else if (synth_field_is_stack(field->type))
+ field->is_stack = true;
field->is_signed = synth_field_signed(field->type);
out:
@@ -1282,12 +1364,12 @@ static int __create_synth_event(const char *name, const char *raw_fields)
goto err_free_arg;
}
- fields[n_fields++] = field;
if (n_fields == SYNTH_FIELDS_MAX) {
synth_err(SYNTH_ERR_TOO_MANY_FIELDS, 0);
ret = -EINVAL;
goto err_free_arg;
}
+ fields[n_fields++] = field;
n_fields_this_loop++;
}
diff --git a/kernel/trace/trace_events_trigger.c b/kernel/trace/trace_events_trigger.c
index 918730d74932..e535959939d3 100644
--- a/kernel/trace/trace_events_trigger.c
+++ b/kernel/trace/trace_events_trigger.c
@@ -1067,7 +1067,14 @@ int set_trigger_filter(char *filter_str,
/* The filter is for the 'trigger' event, not the triggered event */
ret = create_event_filter(file->tr, file->event_call,
- filter_str, false, &filter);
+ filter_str, true, &filter);
+
+ /* Only enabled set_str for error handling */
+ if (filter) {
+ kfree(filter->filter_string);
+ filter->filter_string = NULL;
+ }
+
/*
* If create_event_filter() fails, filter still needs to be freed.
* Which the calling code will do with data->filter.
@@ -1078,8 +1085,14 @@ int set_trigger_filter(char *filter_str,
rcu_assign_pointer(data->filter, filter);
if (tmp) {
- /* Make sure the call is done with the filter */
- tracepoint_synchronize_unregister();
+ /*
+ * Make sure the call is done with the filter.
+ * It is possible that a filter could fail at boot up,
+ * and then this path will be called. Avoid the synchronization
+ * in that case.
+ */
+ if (system_state != SYSTEM_BOOTING)
+ tracepoint_synchronize_unregister();
free_event_filter(tmp);
}
diff --git a/kernel/trace/trace_events_user.c b/kernel/trace/trace_events_user.c
index 539b08ae7020..908e8a13c675 100644
--- a/kernel/trace/trace_events_user.c
+++ b/kernel/trace/trace_events_user.c
@@ -1359,6 +1359,7 @@ put_user_lock:
put_user:
user_event_destroy_fields(user);
user_event_destroy_validators(user);
+ kfree(user->call.print_fmt);
kfree(user);
return ret;
}
@@ -1488,7 +1489,7 @@ static ssize_t user_events_write(struct file *file, const char __user *ubuf,
if (unlikely(*ppos != 0))
return -EFAULT;
- if (unlikely(import_single_range(WRITE, (char __user *)ubuf,
+ if (unlikely(import_single_range(ITER_SOURCE, (char __user *)ubuf,
count, &iov, &i)))
return -EFAULT;
diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c
index d960f6b11b5e..58f3946081e2 100644
--- a/kernel/trace/trace_export.c
+++ b/kernel/trace/trace_export.c
@@ -111,7 +111,8 @@ static void __always_unused ____ftrace_check_##name(void) \
#define __array(_type, _item, _len) { \
.type = #_type"["__stringify(_len)"]", .name = #_item, \
.size = sizeof(_type[_len]), .align = __alignof__(_type), \
- is_signed_type(_type), .filter_type = FILTER_OTHER },
+ is_signed_type(_type), .filter_type = FILTER_OTHER, \
+ .len = _len },
#undef __array_desc
#define __array_desc(_type, _container, _item, _len) __array(_type, _item, _len)
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index 5a75b039e586..ee77c8203bd5 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -1344,7 +1344,6 @@ __kprobe_trace_func(struct trace_kprobe *tk, struct pt_regs *regs,
return;
fbuffer.regs = regs;
- entry = fbuffer.entry = ring_buffer_event_data(fbuffer.event);
entry->ip = (unsigned long)tk->rp.kp.addr;
store_trace_args(&entry[1], &tk->tp, regs, sizeof(*entry), dsize);
@@ -1385,7 +1384,6 @@ __kretprobe_trace_func(struct trace_kprobe *tk, struct kretprobe_instance *ri,
return;
fbuffer.regs = regs;
- entry = fbuffer.entry = ring_buffer_event_data(fbuffer.event);
entry->func = (unsigned long)tk->rp.kp.addr;
entry->ret_ip = get_kretprobe_retaddr(ri);
store_trace_args(&entry[1], &tk->tp, regs, sizeof(*entry), dsize);
diff --git a/kernel/trace/trace_osnoise.c b/kernel/trace/trace_osnoise.c
index 4300c5dc4e5d..04f0fdae19a1 100644
--- a/kernel/trace/trace_osnoise.c
+++ b/kernel/trace/trace_osnoise.c
@@ -49,6 +49,28 @@
#define DEFAULT_TIMERLAT_PRIO 95 /* FIFO 95 */
/*
+ * osnoise/options entries.
+ */
+enum osnoise_options_index {
+ OSN_DEFAULTS = 0,
+ OSN_WORKLOAD,
+ OSN_PANIC_ON_STOP,
+ OSN_PREEMPT_DISABLE,
+ OSN_IRQ_DISABLE,
+ OSN_MAX
+};
+
+static const char * const osnoise_options_str[OSN_MAX] = {
+ "DEFAULTS",
+ "OSNOISE_WORKLOAD",
+ "PANIC_ON_STOP",
+ "OSNOISE_PREEMPT_DISABLE",
+ "OSNOISE_IRQ_DISABLE" };
+
+#define OSN_DEFAULT_OPTIONS 0x2
+static unsigned long osnoise_options = OSN_DEFAULT_OPTIONS;
+
+/*
* trace_array of the enabled osnoise/timerlat instances.
*/
struct osnoise_instance {
@@ -125,9 +147,8 @@ static void osnoise_unregister_instance(struct trace_array *tr)
* register/unregister serialization is provided by trace's
* trace_types_lock.
*/
- lockdep_assert_held(&trace_types_lock);
-
- list_for_each_entry_rcu(inst, &osnoise_instances, list) {
+ list_for_each_entry_rcu(inst, &osnoise_instances, list,
+ lockdep_is_held(&trace_types_lock)) {
if (inst->tr == tr) {
list_del_rcu(&inst->list);
found = 1;
@@ -1173,11 +1194,12 @@ trace_sched_switch_callback(void *data, bool preempt,
unsigned int prev_state)
{
struct osnoise_variables *osn_var = this_cpu_osn_var();
+ int workload = test_bit(OSN_WORKLOAD, &osnoise_options);
- if (p->pid != osn_var->pid)
+ if ((p->pid != osn_var->pid) || !workload)
thread_exit(osn_var, p);
- if (n->pid != osn_var->pid)
+ if ((n->pid != osn_var->pid) || !workload)
thread_entry(osn_var, n);
}
@@ -1255,6 +1277,9 @@ static __always_inline void osnoise_stop_tracing(void)
trace_array_printk_buf(tr->array_buffer.buffer, _THIS_IP_,
"stop tracing hit on cpu %d\n", smp_processor_id());
+ if (test_bit(OSN_PANIC_ON_STOP, &osnoise_options))
+ panic("tracer hit stop condition on CPU %d\n", smp_processor_id());
+
tracer_tracing_off(tr);
}
rcu_read_unlock();
@@ -1289,12 +1314,14 @@ static void notify_new_max_latency(u64 latency)
*/
static int run_osnoise(void)
{
+ bool disable_irq = test_bit(OSN_IRQ_DISABLE, &osnoise_options);
struct osnoise_variables *osn_var = this_cpu_osn_var();
u64 start, sample, last_sample;
u64 last_int_count, int_count;
s64 noise = 0, max_noise = 0;
s64 total, last_total = 0;
struct osnoise_sample s;
+ bool disable_preemption;
unsigned int threshold;
u64 runtime, stop_in;
u64 sum_noise = 0;
@@ -1302,6 +1329,12 @@ static int run_osnoise(void)
int ret = -1;
/*
+ * Disabling preemption is only required if IRQs are enabled,
+ * and the options is set on.
+ */
+ disable_preemption = !disable_irq && test_bit(OSN_PREEMPT_DISABLE, &osnoise_options);
+
+ /*
* Considers the current thread as the workload.
*/
osn_var->pid = current->pid;
@@ -1317,6 +1350,15 @@ static int run_osnoise(void)
threshold = tracing_thresh ? : 5000;
/*
+ * Apply PREEMPT and IRQ disabled options.
+ */
+ if (disable_irq)
+ local_irq_disable();
+
+ if (disable_preemption)
+ preempt_disable();
+
+ /*
* Make sure NMIs see sampling first
*/
osn_var->sampling = true;
@@ -1403,16 +1445,21 @@ static int run_osnoise(void)
* cond_resched()
*/
if (IS_ENABLED(CONFIG_PREEMPT_RCU)) {
- local_irq_disable();
+ if (!disable_irq)
+ local_irq_disable();
+
rcu_momentary_dyntick_idle();
- local_irq_enable();
+
+ if (!disable_irq)
+ local_irq_enable();
}
/*
* For the non-preemptive kernel config: let threads runs, if
- * they so wish.
+ * they so wish, unless set not do to so.
*/
- cond_resched();
+ if (!disable_irq && !disable_preemption)
+ cond_resched();
last_sample = sample;
last_int_count = int_count;
@@ -1432,6 +1479,15 @@ static int run_osnoise(void)
barrier();
/*
+ * Return to the preemptive state.
+ */
+ if (disable_preemption)
+ preempt_enable();
+
+ if (disable_irq)
+ local_irq_enable();
+
+ /*
* Save noise info.
*/
s.noise = time_to_us(sum_noise);
@@ -1483,7 +1539,7 @@ static void osnoise_sleep(void)
wake_time = ktime_add_us(ktime_get(), interval);
__set_current_state(TASK_INTERRUPTIBLE);
- while (schedule_hrtimeout_range(&wake_time, 0, HRTIMER_MODE_ABS)) {
+ while (schedule_hrtimeout(&wake_time, HRTIMER_MODE_ABS)) {
if (kthread_should_stop())
break;
}
@@ -1710,9 +1766,16 @@ static void stop_kthread(unsigned int cpu)
struct task_struct *kthread;
kthread = per_cpu(per_cpu_osnoise_var, cpu).kthread;
- if (kthread)
+ if (kthread) {
kthread_stop(kthread);
- per_cpu(per_cpu_osnoise_var, cpu).kthread = NULL;
+ per_cpu(per_cpu_osnoise_var, cpu).kthread = NULL;
+ } else {
+ if (!test_bit(OSN_WORKLOAD, &osnoise_options)) {
+ per_cpu(per_cpu_osnoise_var, cpu).sampling = false;
+ barrier();
+ return;
+ }
+ }
}
/*
@@ -1746,6 +1809,13 @@ static int start_kthread(unsigned int cpu)
snprintf(comm, 24, "timerlat/%d", cpu);
main = timerlat_main;
} else {
+ /* if no workload, just return */
+ if (!test_bit(OSN_WORKLOAD, &osnoise_options)) {
+ per_cpu(per_cpu_osnoise_var, cpu).sampling = true;
+ barrier();
+ return 0;
+ }
+
snprintf(comm, 24, "osnoise/%d", cpu);
}
@@ -1861,6 +1931,150 @@ static void osnoise_init_hotplug_support(void)
#endif /* CONFIG_HOTPLUG_CPU */
/*
+ * seq file functions for the osnoise/options file.
+ */
+static void *s_options_start(struct seq_file *s, loff_t *pos)
+{
+ int option = *pos;
+
+ mutex_lock(&interface_lock);
+
+ if (option >= OSN_MAX)
+ return NULL;
+
+ return pos;
+}
+
+static void *s_options_next(struct seq_file *s, void *v, loff_t *pos)
+{
+ int option = ++(*pos);
+
+ if (option >= OSN_MAX)
+ return NULL;
+
+ return pos;
+}
+
+static int s_options_show(struct seq_file *s, void *v)
+{
+ loff_t *pos = v;
+ int option = *pos;
+
+ if (option == OSN_DEFAULTS) {
+ if (osnoise_options == OSN_DEFAULT_OPTIONS)
+ seq_printf(s, "%s", osnoise_options_str[option]);
+ else
+ seq_printf(s, "NO_%s", osnoise_options_str[option]);
+ goto out;
+ }
+
+ if (test_bit(option, &osnoise_options))
+ seq_printf(s, "%s", osnoise_options_str[option]);
+ else
+ seq_printf(s, "NO_%s", osnoise_options_str[option]);
+
+out:
+ if (option != OSN_MAX)
+ seq_puts(s, " ");
+
+ return 0;
+}
+
+static void s_options_stop(struct seq_file *s, void *v)
+{
+ seq_puts(s, "\n");
+ mutex_unlock(&interface_lock);
+}
+
+static const struct seq_operations osnoise_options_seq_ops = {
+ .start = s_options_start,
+ .next = s_options_next,
+ .show = s_options_show,
+ .stop = s_options_stop
+};
+
+static int osnoise_options_open(struct inode *inode, struct file *file)
+{
+ return seq_open(file, &osnoise_options_seq_ops);
+};
+
+/**
+ * osnoise_options_write - Write function for "options" entry
+ * @filp: The active open file structure
+ * @ubuf: The user buffer that contains the value to write
+ * @cnt: The maximum number of bytes to write to "file"
+ * @ppos: The current position in @file
+ *
+ * Writing the option name sets the option, writing the "NO_"
+ * prefix in front of the option name disables it.
+ *
+ * Writing "DEFAULTS" resets the option values to the default ones.
+ */
+static ssize_t osnoise_options_write(struct file *filp, const char __user *ubuf,
+ size_t cnt, loff_t *ppos)
+{
+ int running, option, enable, retval;
+ char buf[256], *option_str;
+
+ if (cnt >= 256)
+ return -EINVAL;
+
+ if (copy_from_user(buf, ubuf, cnt))
+ return -EFAULT;
+
+ buf[cnt] = 0;
+
+ if (strncmp(buf, "NO_", 3)) {
+ option_str = strstrip(buf);
+ enable = true;
+ } else {
+ option_str = strstrip(&buf[3]);
+ enable = false;
+ }
+
+ option = match_string(osnoise_options_str, OSN_MAX, option_str);
+ if (option < 0)
+ return -EINVAL;
+
+ /*
+ * trace_types_lock is taken to avoid concurrency on start/stop.
+ */
+ mutex_lock(&trace_types_lock);
+ running = osnoise_has_registered_instances();
+ if (running)
+ stop_per_cpu_kthreads();
+
+ mutex_lock(&interface_lock);
+ /*
+ * avoid CPU hotplug operations that might read options.
+ */
+ cpus_read_lock();
+
+ retval = cnt;
+
+ if (enable) {
+ if (option == OSN_DEFAULTS)
+ osnoise_options = OSN_DEFAULT_OPTIONS;
+ else
+ set_bit(option, &osnoise_options);
+ } else {
+ if (option == OSN_DEFAULTS)
+ retval = -EINVAL;
+ else
+ clear_bit(option, &osnoise_options);
+ }
+
+ cpus_read_unlock();
+ mutex_unlock(&interface_lock);
+
+ if (running)
+ start_per_cpu_kthreads();
+ mutex_unlock(&trace_types_lock);
+
+ return retval;
+}
+
+/*
* osnoise_cpus_read - Read function for reading the "cpus" file
* @filp: The active open file structure
* @ubuf: The userspace provided buffer to read value into
@@ -2042,6 +2256,14 @@ static const struct file_operations cpus_fops = {
.llseek = generic_file_llseek,
};
+static const struct file_operations osnoise_options_fops = {
+ .open = osnoise_options_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release,
+ .write = osnoise_options_write
+};
+
#ifdef CONFIG_TIMERLAT_TRACER
#ifdef CONFIG_STACKTRACE
static int init_timerlat_stack_tracefs(struct dentry *top_dir)
@@ -2128,6 +2350,11 @@ static int init_tracefs(void)
if (!tmp)
goto err;
+ tmp = trace_create_file("options", TRACE_MODE_WRITE, top_dir, NULL,
+ &osnoise_options_fops);
+ if (!tmp)
+ goto err;
+
ret = init_timerlat_tracefs(top_dir);
if (ret)
goto err;
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index 67f47ea27921..bd475a00f96d 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -11,6 +11,7 @@
#include <linux/kprobes.h>
#include <linux/sched/clock.h>
#include <linux/sched/mm.h>
+#include <linux/idr.h>
#include "trace_output.h"
@@ -21,8 +22,6 @@ DECLARE_RWSEM(trace_event_sem);
static struct hlist_head event_hash[EVENT_HASHSIZE] __read_mostly;
-static int next_event_type = __TRACE_LAST_TYPE;
-
enum print_line_t trace_print_bputs_msg_only(struct trace_iterator *iter)
{
struct trace_seq *s = &iter->seq;
@@ -323,8 +322,9 @@ void trace_event_printf(struct trace_iterator *iter, const char *fmt, ...)
}
EXPORT_SYMBOL(trace_event_printf);
-static int trace_output_raw(struct trace_iterator *iter, char *name,
- char *fmt, va_list ap)
+static __printf(3, 0)
+int trace_output_raw(struct trace_iterator *iter, char *name,
+ char *fmt, va_list ap)
{
struct trace_seq *s = &iter->seq;
@@ -688,38 +688,23 @@ struct trace_event *ftrace_find_event(int type)
return NULL;
}
-static LIST_HEAD(ftrace_event_list);
+static DEFINE_IDA(trace_event_ida);
-static int trace_search_list(struct list_head **list)
+static void free_trace_event_type(int type)
{
- struct trace_event *e = NULL, *iter;
- int next = __TRACE_LAST_TYPE;
-
- if (list_empty(&ftrace_event_list)) {
- *list = &ftrace_event_list;
- return next;
- }
+ if (type >= __TRACE_LAST_TYPE)
+ ida_free(&trace_event_ida, type);
+}
- /*
- * We used up all possible max events,
- * lets see if somebody freed one.
- */
- list_for_each_entry(iter, &ftrace_event_list, list) {
- if (iter->type != next) {
- e = iter;
- break;
- }
- next++;
- }
+static int alloc_trace_event_type(void)
+{
+ int next;
- /* Did we used up all 65 thousand events??? */
- if (next > TRACE_EVENT_TYPE_MAX)
+ /* Skip static defined type numbers */
+ next = ida_alloc_range(&trace_event_ida, __TRACE_LAST_TYPE,
+ TRACE_EVENT_TYPE_MAX, GFP_KERNEL);
+ if (next < 0)
return 0;
-
- if (e)
- *list = &e->list;
- else
- *list = &ftrace_event_list;
return next;
}
@@ -761,28 +746,10 @@ int register_trace_event(struct trace_event *event)
if (WARN_ON(!event->funcs))
goto out;
- INIT_LIST_HEAD(&event->list);
-
if (!event->type) {
- struct list_head *list = NULL;
-
- if (next_event_type > TRACE_EVENT_TYPE_MAX) {
-
- event->type = trace_search_list(&list);
- if (!event->type)
- goto out;
-
- } else {
-
- event->type = next_event_type++;
- list = &ftrace_event_list;
- }
-
- if (WARN_ON(ftrace_find_event(event->type)))
+ event->type = alloc_trace_event_type();
+ if (!event->type)
goto out;
-
- list_add_tail(&event->list, list);
-
} else if (WARN(event->type > __TRACE_LAST_TYPE,
"Need to add type to trace.h")) {
goto out;
@@ -819,7 +786,7 @@ EXPORT_SYMBOL_GPL(register_trace_event);
int __unregister_trace_event(struct trace_event *event)
{
hlist_del(&event->node);
- list_del(&event->list);
+ free_trace_event_type(event->type);
return 0;
}
@@ -1568,7 +1535,7 @@ static struct trace_event *events[] __initdata = {
NULL
};
-__init static int init_events(void)
+__init int init_events(void)
{
struct trace_event *event;
int i, ret;
@@ -1581,4 +1548,3 @@ __init static int init_events(void)
return 0;
}
-early_initcall(init_events);
diff --git a/kernel/trace/trace_preemptirq.c b/kernel/trace/trace_preemptirq.c
index 1e130da1b742..e37446f7916e 100644
--- a/kernel/trace/trace_preemptirq.c
+++ b/kernel/trace/trace_preemptirq.c
@@ -15,6 +15,20 @@
#define CREATE_TRACE_POINTS
#include <trace/events/preemptirq.h>
+/*
+ * Use regular trace points on architectures that implement noinstr
+ * tooling: these calls will only happen with RCU enabled, which can
+ * use a regular tracepoint.
+ *
+ * On older architectures, use the rcuidle tracing methods (which
+ * aren't NMI-safe - so exclude NMI contexts):
+ */
+#ifdef CONFIG_ARCH_WANTS_NO_INSTR
+#define trace(point) trace_##point
+#else
+#define trace(point) if (!in_nmi()) trace_##point##_rcuidle
+#endif
+
#ifdef CONFIG_TRACE_IRQFLAGS
/* Per-cpu variable to prevent redundant calls when IRQs already off */
static DEFINE_PER_CPU(int, tracing_irq_cpu);
@@ -28,8 +42,7 @@ static DEFINE_PER_CPU(int, tracing_irq_cpu);
void trace_hardirqs_on_prepare(void)
{
if (this_cpu_read(tracing_irq_cpu)) {
- if (!in_nmi())
- trace_irq_enable(CALLER_ADDR0, CALLER_ADDR1);
+ trace(irq_enable)(CALLER_ADDR0, CALLER_ADDR1);
tracer_hardirqs_on(CALLER_ADDR0, CALLER_ADDR1);
this_cpu_write(tracing_irq_cpu, 0);
}
@@ -40,8 +53,7 @@ NOKPROBE_SYMBOL(trace_hardirqs_on_prepare);
void trace_hardirqs_on(void)
{
if (this_cpu_read(tracing_irq_cpu)) {
- if (!in_nmi())
- trace_irq_enable_rcuidle(CALLER_ADDR0, CALLER_ADDR1);
+ trace(irq_enable)(CALLER_ADDR0, CALLER_ADDR1);
tracer_hardirqs_on(CALLER_ADDR0, CALLER_ADDR1);
this_cpu_write(tracing_irq_cpu, 0);
}
@@ -63,8 +75,7 @@ void trace_hardirqs_off_finish(void)
if (!this_cpu_read(tracing_irq_cpu)) {
this_cpu_write(tracing_irq_cpu, 1);
tracer_hardirqs_off(CALLER_ADDR0, CALLER_ADDR1);
- if (!in_nmi())
- trace_irq_disable(CALLER_ADDR0, CALLER_ADDR1);
+ trace(irq_disable)(CALLER_ADDR0, CALLER_ADDR1);
}
}
@@ -78,56 +89,24 @@ void trace_hardirqs_off(void)
if (!this_cpu_read(tracing_irq_cpu)) {
this_cpu_write(tracing_irq_cpu, 1);
tracer_hardirqs_off(CALLER_ADDR0, CALLER_ADDR1);
- if (!in_nmi())
- trace_irq_disable_rcuidle(CALLER_ADDR0, CALLER_ADDR1);
+ trace(irq_disable)(CALLER_ADDR0, CALLER_ADDR1);
}
}
EXPORT_SYMBOL(trace_hardirqs_off);
NOKPROBE_SYMBOL(trace_hardirqs_off);
-
-__visible void trace_hardirqs_on_caller(unsigned long caller_addr)
-{
- if (this_cpu_read(tracing_irq_cpu)) {
- if (!in_nmi())
- trace_irq_enable_rcuidle(CALLER_ADDR0, caller_addr);
- tracer_hardirqs_on(CALLER_ADDR0, caller_addr);
- this_cpu_write(tracing_irq_cpu, 0);
- }
-
- lockdep_hardirqs_on_prepare();
- lockdep_hardirqs_on(caller_addr);
-}
-EXPORT_SYMBOL(trace_hardirqs_on_caller);
-NOKPROBE_SYMBOL(trace_hardirqs_on_caller);
-
-__visible void trace_hardirqs_off_caller(unsigned long caller_addr)
-{
- lockdep_hardirqs_off(caller_addr);
-
- if (!this_cpu_read(tracing_irq_cpu)) {
- this_cpu_write(tracing_irq_cpu, 1);
- tracer_hardirqs_off(CALLER_ADDR0, caller_addr);
- if (!in_nmi())
- trace_irq_disable_rcuidle(CALLER_ADDR0, caller_addr);
- }
-}
-EXPORT_SYMBOL(trace_hardirqs_off_caller);
-NOKPROBE_SYMBOL(trace_hardirqs_off_caller);
#endif /* CONFIG_TRACE_IRQFLAGS */
#ifdef CONFIG_TRACE_PREEMPT_TOGGLE
void trace_preempt_on(unsigned long a0, unsigned long a1)
{
- if (!in_nmi())
- trace_preempt_enable_rcuidle(a0, a1);
+ trace(preempt_enable)(a0, a1);
tracer_preempt_on(a0, a1);
}
void trace_preempt_off(unsigned long a0, unsigned long a1)
{
- if (!in_nmi())
- trace_preempt_disable_rcuidle(a0, a1);
+ trace(preempt_disable)(a0, a1);
tracer_preempt_off(a0, a1);
}
#endif
diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c
index 36dff277de46..11008c098727 100644
--- a/kernel/trace/trace_probe.c
+++ b/kernel/trace/trace_probe.c
@@ -50,6 +50,7 @@ DEFINE_BASIC_PRINT_TYPE_FUNC(x8, u8, "0x%x")
DEFINE_BASIC_PRINT_TYPE_FUNC(x16, u16, "0x%x")
DEFINE_BASIC_PRINT_TYPE_FUNC(x32, u32, "0x%x")
DEFINE_BASIC_PRINT_TYPE_FUNC(x64, u64, "0x%Lx")
+DEFINE_BASIC_PRINT_TYPE_FUNC(char, u8, "'%c'")
int PRINT_TYPE_FUNC_NAME(symbol)(struct trace_seq *s, void *data, void *ent)
{
@@ -76,9 +77,11 @@ const char PRINT_TYPE_FMT_NAME(string)[] = "\\\"%s\\\"";
/* Fetch type information table */
static const struct fetch_type probe_fetch_types[] = {
/* Special types */
- __ASSIGN_FETCH_TYPE("string", string, string, sizeof(u32), 1,
+ __ASSIGN_FETCH_TYPE("string", string, string, sizeof(u32), 1, 1,
"__data_loc char[]"),
- __ASSIGN_FETCH_TYPE("ustring", string, string, sizeof(u32), 1,
+ __ASSIGN_FETCH_TYPE("ustring", string, string, sizeof(u32), 1, 1,
+ "__data_loc char[]"),
+ __ASSIGN_FETCH_TYPE("symstr", string, string, sizeof(u32), 1, 1,
"__data_loc char[]"),
/* Basic types */
ASSIGN_FETCH_TYPE(u8, u8, 0),
@@ -93,15 +96,21 @@ static const struct fetch_type probe_fetch_types[] = {
ASSIGN_FETCH_TYPE_ALIAS(x16, u16, u16, 0),
ASSIGN_FETCH_TYPE_ALIAS(x32, u32, u32, 0),
ASSIGN_FETCH_TYPE_ALIAS(x64, u64, u64, 0),
+ ASSIGN_FETCH_TYPE_ALIAS(char, u8, u8, 0),
ASSIGN_FETCH_TYPE_ALIAS(symbol, ADDR_FETCH_TYPE, ADDR_FETCH_TYPE, 0),
ASSIGN_FETCH_TYPE_END
};
-static const struct fetch_type *find_fetch_type(const char *type)
+static const struct fetch_type *find_fetch_type(const char *type, unsigned long flags)
{
int i;
+ /* Reject the symbol/symstr for uprobes */
+ if (type && (flags & TPARG_FL_USER) &&
+ (!strcmp(type, "symbol") || !strcmp(type, "symstr")))
+ return NULL;
+
if (!type)
type = DEFAULT_FETCH_TYPE_STR;
@@ -119,13 +128,13 @@ static const struct fetch_type *find_fetch_type(const char *type)
switch (bs) {
case 8:
- return find_fetch_type("u8");
+ return find_fetch_type("u8", flags);
case 16:
- return find_fetch_type("u16");
+ return find_fetch_type("u16", flags);
case 32:
- return find_fetch_type("u32");
+ return find_fetch_type("u32", flags);
case 64:
- return find_fetch_type("u64");
+ return find_fetch_type("u64", flags);
default:
goto fail;
}
@@ -246,7 +255,7 @@ int traceprobe_parse_event_name(const char **pevent, const char **pgroup,
return -EINVAL;
}
strlcpy(buf, event, slash - event + 1);
- if (!is_good_name(buf)) {
+ if (!is_good_system_name(buf)) {
trace_probe_log_err(offset, BAD_GROUP_NAME);
return -EINVAL;
}
@@ -478,7 +487,7 @@ parse_probe_arg(char *arg, const struct fetch_type *type,
DEREF_OPEN_BRACE);
return -EINVAL;
} else {
- const struct fetch_type *t2 = find_fetch_type(NULL);
+ const struct fetch_type *t2 = find_fetch_type(NULL, flags);
*tmp = '\0';
ret = parse_probe_arg(arg, t2, &code, end, flags, offs);
@@ -630,9 +639,9 @@ static int traceprobe_parse_probe_arg_body(const char *argv, ssize_t *size,
/* The type of $comm must be "string", and not an array. */
if (parg->count || (t && strcmp(t, "string")))
goto out;
- parg->type = find_fetch_type("string");
+ parg->type = find_fetch_type("string", flags);
} else
- parg->type = find_fetch_type(t);
+ parg->type = find_fetch_type(t, flags);
if (!parg->type) {
trace_probe_log_err(offset + (t ? (t - arg) : 0), BAD_TYPE);
goto out;
@@ -662,16 +671,26 @@ static int traceprobe_parse_probe_arg_body(const char *argv, ssize_t *size,
ret = -EINVAL;
/* Store operation */
- if (!strcmp(parg->type->name, "string") ||
- !strcmp(parg->type->name, "ustring")) {
- if (code->op != FETCH_OP_DEREF && code->op != FETCH_OP_UDEREF &&
- code->op != FETCH_OP_IMM && code->op != FETCH_OP_COMM &&
- code->op != FETCH_OP_DATA && code->op != FETCH_OP_TP_ARG) {
- trace_probe_log_err(offset + (t ? (t - arg) : 0),
- BAD_STRING);
- goto fail;
+ if (parg->type->is_string) {
+ if (!strcmp(parg->type->name, "symstr")) {
+ if (code->op != FETCH_OP_REG && code->op != FETCH_OP_STACK &&
+ code->op != FETCH_OP_RETVAL && code->op != FETCH_OP_ARG &&
+ code->op != FETCH_OP_DEREF && code->op != FETCH_OP_TP_ARG) {
+ trace_probe_log_err(offset + (t ? (t - arg) : 0),
+ BAD_SYMSTRING);
+ goto fail;
+ }
+ } else {
+ if (code->op != FETCH_OP_DEREF && code->op != FETCH_OP_UDEREF &&
+ code->op != FETCH_OP_IMM && code->op != FETCH_OP_COMM &&
+ code->op != FETCH_OP_DATA && code->op != FETCH_OP_TP_ARG) {
+ trace_probe_log_err(offset + (t ? (t - arg) : 0),
+ BAD_STRING);
+ goto fail;
+ }
}
- if ((code->op == FETCH_OP_IMM || code->op == FETCH_OP_COMM ||
+ if (!strcmp(parg->type->name, "symstr") ||
+ (code->op == FETCH_OP_IMM || code->op == FETCH_OP_COMM ||
code->op == FETCH_OP_DATA) || code->op == FETCH_OP_TP_ARG ||
parg->count) {
/*
@@ -679,6 +698,8 @@ static int traceprobe_parse_probe_arg_body(const char *argv, ssize_t *size,
* must be kept, and if parg->count != 0, this is an
* array of string pointers instead of string address
* itself.
+ * For the symstr, it doesn't need to dereference, thus
+ * it just gets the value.
*/
code++;
if (code->op != FETCH_OP_NOP) {
@@ -690,6 +711,8 @@ static int traceprobe_parse_probe_arg_body(const char *argv, ssize_t *size,
if (!strcmp(parg->type->name, "ustring") ||
code->op == FETCH_OP_UDEREF)
code->op = FETCH_OP_ST_USTRING;
+ else if (!strcmp(parg->type->name, "symstr"))
+ code->op = FETCH_OP_ST_SYMSTR;
else
code->op = FETCH_OP_ST_STRING;
code->size = parg->type->size;
@@ -919,8 +942,7 @@ static int __set_print_fmt(struct trace_probe *tp, char *buf, int len,
for (i = 0; i < tp->nr_args; i++) {
parg = tp->args + i;
if (parg->count) {
- if ((strcmp(parg->type->name, "string") == 0) ||
- (strcmp(parg->type->name, "ustring") == 0))
+ if (parg->type->is_string)
fmt = ", __get_str(%s[%d])";
else
fmt = ", REC->%s[%d]";
@@ -928,8 +950,7 @@ static int __set_print_fmt(struct trace_probe *tp, char *buf, int len,
pos += snprintf(buf + pos, LEN_OR_ZERO,
fmt, parg->name, j);
} else {
- if ((strcmp(parg->type->name, "string") == 0) ||
- (strcmp(parg->type->name, "ustring") == 0))
+ if (parg->type->is_string)
fmt = ", __get_str(%s)";
else
fmt = ", REC->%s";
diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h
index de38f1c03776..b4f99553411e 100644
--- a/kernel/trace/trace_probe.h
+++ b/kernel/trace/trace_probe.h
@@ -98,6 +98,7 @@ enum fetch_op {
FETCH_OP_ST_UMEM, /* Mem: .offset, .size */
FETCH_OP_ST_STRING, /* String: .offset, .size */
FETCH_OP_ST_USTRING, /* User String: .offset, .size */
+ FETCH_OP_ST_SYMSTR, /* Kernel Symbol String: .offset, .size */
// Stage 4 (modify) op
FETCH_OP_MOD_BF, /* Bitfield: .basesize, .lshift, .rshift */
// Stage 5 (loop) op
@@ -133,7 +134,8 @@ struct fetch_insn {
struct fetch_type {
const char *name; /* Name of type */
size_t size; /* Byte size of type */
- int is_signed; /* Signed flag */
+ bool is_signed; /* Signed flag */
+ bool is_string; /* String flag */
print_type_func_t print; /* Print functions */
const char *fmt; /* Format string */
const char *fmttype; /* Name in format file */
@@ -164,6 +166,7 @@ DECLARE_BASIC_PRINT_TYPE_FUNC(x16);
DECLARE_BASIC_PRINT_TYPE_FUNC(x32);
DECLARE_BASIC_PRINT_TYPE_FUNC(x64);
+DECLARE_BASIC_PRINT_TYPE_FUNC(char);
DECLARE_BASIC_PRINT_TYPE_FUNC(string);
DECLARE_BASIC_PRINT_TYPE_FUNC(symbol);
@@ -177,16 +180,19 @@ DECLARE_BASIC_PRINT_TYPE_FUNC(symbol);
#define _ADDR_FETCH_TYPE(t) __ADDR_FETCH_TYPE(t)
#define ADDR_FETCH_TYPE _ADDR_FETCH_TYPE(BITS_PER_LONG)
-#define __ASSIGN_FETCH_TYPE(_name, ptype, ftype, _size, sign, _fmttype) \
- {.name = _name, \
+#define __ASSIGN_FETCH_TYPE(_name, ptype, ftype, _size, sign, str, _fmttype) \
+ {.name = _name, \
.size = _size, \
- .is_signed = sign, \
+ .is_signed = (bool)sign, \
+ .is_string = (bool)str, \
.print = PRINT_TYPE_FUNC_NAME(ptype), \
.fmt = PRINT_TYPE_FMT_NAME(ptype), \
.fmttype = _fmttype, \
}
+
+/* Non string types can use these macros */
#define _ASSIGN_FETCH_TYPE(_name, ptype, ftype, _size, sign, _fmttype) \
- __ASSIGN_FETCH_TYPE(_name, ptype, ftype, _size, sign, #_fmttype)
+ __ASSIGN_FETCH_TYPE(_name, ptype, ftype, _size, sign, 0, #_fmttype)
#define ASSIGN_FETCH_TYPE(ptype, ftype, sign) \
_ASSIGN_FETCH_TYPE(#ptype, ptype, ftype, sizeof(ftype), sign, ptype)
@@ -353,7 +359,8 @@ int trace_probe_create(const char *raw_command, int (*createfn)(int, const char
#define TPARG_FL_KERNEL BIT(1)
#define TPARG_FL_FENTRY BIT(2)
#define TPARG_FL_TPOINT BIT(3)
-#define TPARG_FL_MASK GENMASK(3, 0)
+#define TPARG_FL_USER BIT(4)
+#define TPARG_FL_MASK GENMASK(4, 0)
extern int traceprobe_parse_probe_arg(struct trace_probe *tp, int i,
const char *argv, unsigned int flags);
@@ -431,6 +438,7 @@ extern int traceprobe_define_arg_fields(struct trace_event_call *event_call,
C(ARRAY_TOO_BIG, "Array number is too big"), \
C(BAD_TYPE, "Unknown type is specified"), \
C(BAD_STRING, "String accepts only memory argument"), \
+ C(BAD_SYMSTRING, "Symbol String doesn't accept data/userdata"), \
C(BAD_BITFIELD, "Invalid bitfield"), \
C(ARG_NAME_TOO_LONG, "Argument name is too long"), \
C(NO_ARG_NAME, "Argument name is not specified"), \
diff --git a/kernel/trace/trace_probe_tmpl.h b/kernel/trace/trace_probe_tmpl.h
index b3bdb8ddb862..5cea672243f6 100644
--- a/kernel/trace/trace_probe_tmpl.h
+++ b/kernel/trace/trace_probe_tmpl.h
@@ -67,6 +67,37 @@ probe_mem_read(void *dest, void *src, size_t size);
static nokprobe_inline int
probe_mem_read_user(void *dest, void *src, size_t size);
+static nokprobe_inline int
+fetch_store_symstrlen(unsigned long addr)
+{
+ char namebuf[KSYM_SYMBOL_LEN];
+ int ret;
+
+ ret = sprint_symbol(namebuf, addr);
+ if (ret < 0)
+ return 0;
+
+ return ret + 1;
+}
+
+/*
+ * Fetch a null-terminated symbol string + offset. Caller MUST set *(u32 *)buf
+ * with max length and relative data location.
+ */
+static nokprobe_inline int
+fetch_store_symstring(unsigned long addr, void *dest, void *base)
+{
+ int maxlen = get_loc_len(*(u32 *)dest);
+ void *__dest;
+
+ if (unlikely(!maxlen))
+ return -ENOMEM;
+
+ __dest = get_loc_data(dest, base);
+
+ return sprint_symbol(__dest, addr);
+}
+
/* From the 2nd stage, routine is same */
static nokprobe_inline int
process_fetch_insn_bottom(struct fetch_insn *code, unsigned long val,
@@ -99,16 +130,22 @@ stage2:
stage3:
/* 3rd stage: store value to buffer */
if (unlikely(!dest)) {
- if (code->op == FETCH_OP_ST_STRING) {
+ switch (code->op) {
+ case FETCH_OP_ST_STRING:
ret = fetch_store_strlen(val + code->offset);
code++;
goto array;
- } else if (code->op == FETCH_OP_ST_USTRING) {
+ case FETCH_OP_ST_USTRING:
ret += fetch_store_strlen_user(val + code->offset);
code++;
goto array;
- } else
+ case FETCH_OP_ST_SYMSTR:
+ ret += fetch_store_symstrlen(val + code->offset);
+ code++;
+ goto array;
+ default:
return -EILSEQ;
+ }
}
switch (code->op) {
@@ -129,6 +166,10 @@ stage3:
loc = *(u32 *)dest;
ret = fetch_store_string_user(val + code->offset, dest, base);
break;
+ case FETCH_OP_ST_SYMSTR:
+ loc = *(u32 *)dest;
+ ret = fetch_store_symstring(val + code->offset, dest, base);
+ break;
default:
return -EILSEQ;
}
diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c
index a2d301f58ced..ff0536cea968 100644
--- a/kernel/trace/trace_selftest.c
+++ b/kernel/trace/trace_selftest.c
@@ -785,7 +785,14 @@ static struct fgraph_ops fgraph_ops __initdata = {
};
#ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS
-noinline __noclone static void trace_direct_tramp(void) { }
+#ifndef CALL_DEPTH_ACCOUNT
+#define CALL_DEPTH_ACCOUNT ""
+#endif
+
+noinline __noclone static void trace_direct_tramp(void)
+{
+ asm(CALL_DEPTH_ACCOUNT);
+}
#endif
/*
diff --git a/kernel/trace/trace_seq.c b/kernel/trace/trace_seq.c
index 9c90b3a7dce2..e5e299260d0c 100644
--- a/kernel/trace/trace_seq.c
+++ b/kernel/trace/trace_seq.c
@@ -403,3 +403,26 @@ int trace_seq_hex_dump(struct trace_seq *s, const char *prefix_str,
return 1;
}
EXPORT_SYMBOL(trace_seq_hex_dump);
+
+/*
+ * trace_seq_acquire - acquire seq buffer with size len
+ * @s: trace sequence descriptor
+ * @len: size of buffer to be acquired
+ *
+ * Acquire a buffer of @len bytes from the trace_seq for output; the
+ * caller can then fill a string into that buffer.
+ *
+ * Returns the start address of the acquired buffer.
+ *
+ * It can be used multiple times in one trace output function call.
+ */
+char *trace_seq_acquire(struct trace_seq *s, unsigned int len)
+{
+ char *ret = trace_seq_buffer_ptr(s);
+
+ if (!WARN_ON_ONCE(seq_buf_buffer_left(&s->seq) < len))
+ seq_buf_commit(&s->seq, len);
+
+ return ret;
+}
+EXPORT_SYMBOL(trace_seq_acquire);
diff --git a/kernel/trace/trace_synth.h b/kernel/trace/trace_synth.h
index b29595fe3ac5..43f6fb6078db 100644
--- a/kernel/trace/trace_synth.h
+++ b/kernel/trace/trace_synth.h
@@ -18,6 +18,7 @@ struct synth_field {
bool is_signed;
bool is_string;
bool is_dynamic;
+ bool is_stack;
};
struct synth_event {
diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c
index fb58e86dd117..8d64b6553aed 100644
--- a/kernel/trace/trace_uprobe.c
+++ b/kernel/trace/trace_uprobe.c
@@ -691,7 +691,8 @@ static int __trace_uprobe_create(int argc, const char **argv)
for (i = 0; i < argc && i < MAX_TRACE_ARGS; i++) {
trace_probe_log_set_index(i + 2);
ret = traceprobe_parse_probe_arg(&tu->tp, i, argv[i],
- is_return ? TPARG_FL_RETURN : 0);
+ (is_return ? TPARG_FL_RETURN : 0) |
+ TPARG_FL_USER);
if (ret)
goto error;
}
diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c
index f23144af5743..8d1507dd0724 100644
--- a/kernel/tracepoint.c
+++ b/kernel/tracepoint.c
@@ -571,8 +571,8 @@ static void for_each_tracepoint_range(
bool trace_module_has_bad_taint(struct module *mod)
{
return mod->taints & ~((1 << TAINT_OOT_MODULE) | (1 << TAINT_CRAP) |
- (1 << TAINT_UNSIGNED_MODULE) |
- (1 << TAINT_TEST));
+ (1 << TAINT_UNSIGNED_MODULE) | (1 << TAINT_TEST) |
+ (1 << TAINT_LIVEPATCH));
}
static BLOCKING_NOTIFIER_HEAD(tracepoint_notify_list);
diff --git a/kernel/umh.c b/kernel/umh.c
index 850631518665..fbf872c624cb 100644
--- a/kernel/umh.c
+++ b/kernel/umh.c
@@ -438,21 +438,27 @@ int call_usermodehelper_exec(struct subprocess_info *sub_info, int wait)
if (wait == UMH_NO_WAIT) /* task has freed sub_info */
goto unlock;
- if (wait & UMH_KILLABLE)
- state |= TASK_KILLABLE;
-
if (wait & UMH_FREEZABLE)
state |= TASK_FREEZABLE;
- retval = wait_for_completion_state(&done, state);
- if (!retval)
- goto wait_done;
-
if (wait & UMH_KILLABLE) {
+ retval = wait_for_completion_state(&done, state | TASK_KILLABLE);
+ if (!retval)
+ goto wait_done;
+
/* umh_complete() will see NULL and free sub_info */
if (xchg(&sub_info->complete, NULL))
goto unlock;
+
+ /*
+ * fallthrough; in case of -ERESTARTSYS now do uninterruptible
+ * wait_for_completion_state(). Since umh_complete() shall call
+ * complete() in a moment if xchg() above returned NULL, this
+ * uninterruptible wait_for_completion_state() will not block
+ * SIGKILL'ed processes for long.
+ */
}
+ wait_for_completion_state(&done, state);
wait_done:
retval = sub_info->retval;
diff --git a/kernel/user.c b/kernel/user.c
index e2cf8c22b539..d667debeafd6 100644
--- a/kernel/user.c
+++ b/kernel/user.c
@@ -185,6 +185,7 @@ void free_uid(struct user_struct *up)
if (refcount_dec_and_lock_irqsave(&up->__count, &uidhash_lock, &flags))
free_user(up, flags);
}
+EXPORT_SYMBOL_GPL(free_uid);
struct user_struct *alloc_uid(kuid_t uid)
{
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 07895deca271..b8b541caed48 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -169,7 +169,9 @@ struct worker_pool {
struct list_head idle_list; /* L: list of idle workers */
struct timer_list idle_timer; /* L: worker idle timeout */
- struct timer_list mayday_timer; /* L: SOS timer for workers */
+ struct work_struct idle_cull_work; /* L: worker idle cleanup */
+
+ struct timer_list mayday_timer; /* L: SOS timer for workers */
/* a workers is either on busy_hash or idle_list, or the manager */
DECLARE_HASHTABLE(busy_hash, BUSY_WORKER_HASH_ORDER);
@@ -177,6 +179,7 @@ struct worker_pool {
struct worker *manager; /* L: purely informational */
struct list_head workers; /* A: attached workers */
+ struct list_head dying_workers; /* A: workers about to die */
struct completion *detach_completion; /* all workers detached */
struct ida worker_ida; /* worker IDs for task name */
@@ -326,7 +329,7 @@ static struct rcuwait manager_wait = __RCUWAIT_INITIALIZER(manager_wait);
static LIST_HEAD(workqueues); /* PR: list of all workqueues */
static bool workqueue_freezing; /* PL: have wqs started freezing? */
-/* PL: allowable cpus for unbound wqs and work items */
+/* PL&A: allowable cpus for unbound wqs and work items */
static cpumask_var_t wq_unbound_cpumask;
/* CPU where unbound work was last round robin scheduled from this CPU */
@@ -1433,9 +1436,13 @@ static void __queue_work(int cpu, struct workqueue_struct *wq,
lockdep_assert_irqs_disabled();
- /* if draining, only works from the same workqueue are allowed */
- if (unlikely(wq->flags & __WQ_DRAINING) &&
- WARN_ON_ONCE(!is_chained_work(wq)))
+ /*
+ * For a draining wq, only work items from the same workqueue are
+ * allowed. The __WQ_DESTROYING flag helps to catch callers that
+ * queue a new work item to a wq after destroy_workqueue(wq).
+ */
+ if (unlikely(wq->flags & (__WQ_DESTROYING | __WQ_DRAINING) &&
+ WARN_ON_ONCE(!is_chained_work(wq))))
return;
rcu_read_lock();
retry:
@@ -1900,7 +1907,7 @@ static void worker_detach_from_pool(struct worker *worker)
list_del(&worker->node);
worker->pool = NULL;
- if (list_empty(&pool->workers))
+ if (list_empty(&pool->workers) && list_empty(&pool->dying_workers))
detach_completion = pool->detach_completion;
mutex_unlock(&wq_pool_attach_mutex);
@@ -1972,21 +1979,55 @@ fail:
return NULL;
}
+static void unbind_worker(struct worker *worker)
+{
+ lockdep_assert_held(&wq_pool_attach_mutex);
+
+ kthread_set_per_cpu(worker->task, -1);
+ if (cpumask_intersects(wq_unbound_cpumask, cpu_active_mask))
+ WARN_ON_ONCE(set_cpus_allowed_ptr(worker->task, wq_unbound_cpumask) < 0);
+ else
+ WARN_ON_ONCE(set_cpus_allowed_ptr(worker->task, cpu_possible_mask) < 0);
+}
+
+static void wake_dying_workers(struct list_head *cull_list)
+{
+ struct worker *worker, *tmp;
+
+ list_for_each_entry_safe(worker, tmp, cull_list, entry) {
+ list_del_init(&worker->entry);
+ unbind_worker(worker);
+ /*
+ * If the worker was somehow already running, then it had to be
+ * in pool->idle_list when set_worker_dying() happened or we
+ * wouldn't have gotten here.
+ *
+ * Thus, the worker must either have observed the WORKER_DIE
+ * flag, or have set its state to TASK_IDLE. Either way, the
+ * below will be observed by the worker and is safe to do
+ * outside of pool->lock.
+ */
+ wake_up_process(worker->task);
+ }
+}
+
/**
- * destroy_worker - destroy a workqueue worker
+ * set_worker_dying - Tag a worker for destruction
* @worker: worker to be destroyed
+ * @list: transfer worker away from its pool->idle_list and into list
*
- * Destroy @worker and adjust @pool stats accordingly. The worker should
- * be idle.
+ * Tag @worker for destruction and adjust @pool stats accordingly. The worker
+ * should be idle.
*
* CONTEXT:
* raw_spin_lock_irq(pool->lock).
*/
-static void destroy_worker(struct worker *worker)
+static void set_worker_dying(struct worker *worker, struct list_head *list)
{
struct worker_pool *pool = worker->pool;
lockdep_assert_held(&pool->lock);
+ lockdep_assert_held(&wq_pool_attach_mutex);
/* sanity check frenzy */
if (WARN_ON(worker->current_work) ||
@@ -1997,34 +2038,94 @@ static void destroy_worker(struct worker *worker)
pool->nr_workers--;
pool->nr_idle--;
- list_del_init(&worker->entry);
worker->flags |= WORKER_DIE;
- wake_up_process(worker->task);
+
+ list_move(&worker->entry, list);
+ list_move(&worker->node, &pool->dying_workers);
}
+/**
+ * idle_worker_timeout - check if some idle workers can now be deleted.
+ * @t: The pool's idle_timer that just expired
+ *
+ * The timer is armed in worker_enter_idle(). Note that it isn't disarmed in
+ * worker_leave_idle(), as a worker flicking between idle and active while its
+ * pool is at the too_many_workers() tipping point would cause too much timer
+ * housekeeping overhead. Since IDLE_WORKER_TIMEOUT is long enough, we just let
+ * it expire and re-evaluate things from there.
+ */
static void idle_worker_timeout(struct timer_list *t)
{
struct worker_pool *pool = from_timer(pool, t, idle_timer);
+ bool do_cull = false;
+
+ if (work_pending(&pool->idle_cull_work))
+ return;
raw_spin_lock_irq(&pool->lock);
- while (too_many_workers(pool)) {
+ if (too_many_workers(pool)) {
struct worker *worker;
unsigned long expires;
/* idle_list is kept in LIFO order, check the last one */
worker = list_entry(pool->idle_list.prev, struct worker, entry);
expires = worker->last_active + IDLE_WORKER_TIMEOUT;
+ do_cull = !time_before(jiffies, expires);
+
+ if (!do_cull)
+ mod_timer(&pool->idle_timer, expires);
+ }
+ raw_spin_unlock_irq(&pool->lock);
+
+ if (do_cull)
+ queue_work(system_unbound_wq, &pool->idle_cull_work);
+}
+
+/**
+ * idle_cull_fn - cull workers that have been idle for too long.
+ * @work: the pool's work for handling these idle workers
+ *
+ * This goes through a pool's idle workers and gets rid of those that have been
+ * idle for at least IDLE_WORKER_TIMEOUT seconds.
+ *
+ * We don't want to disturb isolated CPUs because of a pcpu kworker being
+ * culled, so this also resets worker affinity. This requires a sleepable
+ * context, hence the split between timer callback and work item.
+ */
+static void idle_cull_fn(struct work_struct *work)
+{
+ struct worker_pool *pool = container_of(work, struct worker_pool, idle_cull_work);
+ struct list_head cull_list;
+
+ INIT_LIST_HEAD(&cull_list);
+ /*
+ * Grabbing wq_pool_attach_mutex here ensures an already-running worker
+ * cannot proceed beyond worker_detach_from_pool() in its self-destruct
+ * path. This is required as a previously-preempted worker could run after
+ * set_worker_dying() has happened but before wake_dying_workers() did.
+ */
+ mutex_lock(&wq_pool_attach_mutex);
+ raw_spin_lock_irq(&pool->lock);
+
+ while (too_many_workers(pool)) {
+ struct worker *worker;
+ unsigned long expires;
+
+ worker = list_entry(pool->idle_list.prev, struct worker, entry);
+ expires = worker->last_active + IDLE_WORKER_TIMEOUT;
if (time_before(jiffies, expires)) {
mod_timer(&pool->idle_timer, expires);
break;
}
- destroy_worker(worker);
+ set_worker_dying(worker, &cull_list);
}
raw_spin_unlock_irq(&pool->lock);
+ wake_dying_workers(&cull_list);
+ mutex_unlock(&wq_pool_attach_mutex);
}
static void send_mayday(struct work_struct *work)
@@ -2388,12 +2489,12 @@ woke_up:
/* am I supposed to die? */
if (unlikely(worker->flags & WORKER_DIE)) {
raw_spin_unlock_irq(&pool->lock);
- WARN_ON_ONCE(!list_empty(&worker->entry));
set_pf_worker(false);
set_task_comm(worker->task, "kworker/dying");
ida_free(&pool->worker_ida, worker->id);
worker_detach_from_pool(worker);
+ WARN_ON_ONCE(!list_empty(&worker->entry));
kfree(worker);
return 0;
}
@@ -3462,10 +3563,12 @@ static int init_worker_pool(struct worker_pool *pool)
hash_init(pool->busy_hash);
timer_setup(&pool->idle_timer, idle_worker_timeout, TIMER_DEFERRABLE);
+ INIT_WORK(&pool->idle_cull_work, idle_cull_fn);
timer_setup(&pool->mayday_timer, pool_mayday_timeout, 0);
INIT_LIST_HEAD(&pool->workers);
+ INIT_LIST_HEAD(&pool->dying_workers);
ida_init(&pool->worker_ida);
INIT_HLIST_NODE(&pool->hash_node);
@@ -3540,18 +3643,6 @@ static void rcu_free_pool(struct rcu_head *rcu)
kfree(pool);
}
-/* This returns with the lock held on success (pool manager is inactive). */
-static bool wq_manager_inactive(struct worker_pool *pool)
-{
- raw_spin_lock_irq(&pool->lock);
-
- if (pool->flags & POOL_MANAGER_ACTIVE) {
- raw_spin_unlock_irq(&pool->lock);
- return false;
- }
- return true;
-}
-
/**
* put_unbound_pool - put a worker_pool
* @pool: worker_pool to put
@@ -3566,8 +3657,11 @@ static bool wq_manager_inactive(struct worker_pool *pool)
static void put_unbound_pool(struct worker_pool *pool)
{
DECLARE_COMPLETION_ONSTACK(detach_completion);
+ struct list_head cull_list;
struct worker *worker;
+ INIT_LIST_HEAD(&cull_list);
+
lockdep_assert_held(&wq_pool_mutex);
if (--pool->refcnt)
@@ -3587,20 +3681,38 @@ static void put_unbound_pool(struct worker_pool *pool)
* Become the manager and destroy all workers. This prevents
* @pool's workers from blocking on attach_mutex. We're the last
* manager and @pool gets freed with the flag set.
- * Because of how wq_manager_inactive() works, we will hold the
- * spinlock after a successful wait.
+ *
+ * Having a concurrent manager is quite unlikely to happen as we can
+ * only get here with
+ * pwq->refcnt == pool->refcnt == 0
+ * which implies no work queued to the pool, which implies no worker can
+ * become the manager. However a worker could have taken the role of
+ * manager before the refcnts dropped to 0, since maybe_create_worker()
+ * drops pool->lock
*/
- rcuwait_wait_event(&manager_wait, wq_manager_inactive(pool),
- TASK_UNINTERRUPTIBLE);
- pool->flags |= POOL_MANAGER_ACTIVE;
+ while (true) {
+ rcuwait_wait_event(&manager_wait,
+ !(pool->flags & POOL_MANAGER_ACTIVE),
+ TASK_UNINTERRUPTIBLE);
+
+ mutex_lock(&wq_pool_attach_mutex);
+ raw_spin_lock_irq(&pool->lock);
+ if (!(pool->flags & POOL_MANAGER_ACTIVE)) {
+ pool->flags |= POOL_MANAGER_ACTIVE;
+ break;
+ }
+ raw_spin_unlock_irq(&pool->lock);
+ mutex_unlock(&wq_pool_attach_mutex);
+ }
while ((worker = first_idle_worker(pool)))
- destroy_worker(worker);
+ set_worker_dying(worker, &cull_list);
WARN_ON(pool->nr_workers || pool->nr_idle);
raw_spin_unlock_irq(&pool->lock);
- mutex_lock(&wq_pool_attach_mutex);
- if (!list_empty(&pool->workers))
+ wake_dying_workers(&cull_list);
+
+ if (!list_empty(&pool->workers) || !list_empty(&pool->dying_workers))
pool->detach_completion = &detach_completion;
mutex_unlock(&wq_pool_attach_mutex);
@@ -3609,6 +3721,7 @@ static void put_unbound_pool(struct worker_pool *pool)
/* shut down the timers */
del_timer_sync(&pool->idle_timer);
+ cancel_work_sync(&pool->idle_cull_work);
del_timer_sync(&pool->mayday_timer);
/* RCU protected to allow dereferences from get_work_pool() */
@@ -3952,7 +4065,8 @@ static void apply_wqattrs_cleanup(struct apply_wqattrs_ctx *ctx)
/* allocate the attrs and pwqs for later installation */
static struct apply_wqattrs_ctx *
apply_wqattrs_prepare(struct workqueue_struct *wq,
- const struct workqueue_attrs *attrs)
+ const struct workqueue_attrs *attrs,
+ const cpumask_var_t unbound_cpumask)
{
struct apply_wqattrs_ctx *ctx;
struct workqueue_attrs *new_attrs, *tmp_attrs;
@@ -3968,14 +4082,15 @@ apply_wqattrs_prepare(struct workqueue_struct *wq,
goto out_free;
/*
- * Calculate the attrs of the default pwq.
+ * Calculate the attrs of the default pwq with unbound_cpumask, which
+ * is either wq_unbound_cpumask or the mask about to replace it.
* If the user configured cpumask doesn't overlap with the
* wq_unbound_cpumask, we fallback to the wq_unbound_cpumask.
*/
copy_workqueue_attrs(new_attrs, attrs);
- cpumask_and(new_attrs->cpumask, new_attrs->cpumask, wq_unbound_cpumask);
+ cpumask_and(new_attrs->cpumask, new_attrs->cpumask, unbound_cpumask);
if (unlikely(cpumask_empty(new_attrs->cpumask)))
- cpumask_copy(new_attrs->cpumask, wq_unbound_cpumask);
+ cpumask_copy(new_attrs->cpumask, unbound_cpumask);
/*
* We may create multiple pwqs with differing cpumasks. Make a
@@ -4072,7 +4187,7 @@ static int apply_workqueue_attrs_locked(struct workqueue_struct *wq,
wq->flags &= ~__WQ_ORDERED;
}
- ctx = apply_wqattrs_prepare(wq, attrs);
+ ctx = apply_wqattrs_prepare(wq, attrs, wq_unbound_cpumask);
if (!ctx)
return -ENOMEM;
@@ -4414,6 +4529,11 @@ void destroy_workqueue(struct workqueue_struct *wq)
*/
workqueue_sysfs_unregister(wq);
+ /* mark that the workqueue destruction is in progress */
+ mutex_lock(&wq->mutex);
+ wq->flags |= __WQ_DESTROYING;
+ mutex_unlock(&wq->mutex);
+
/* drain it before proceeding with destruction */
drain_workqueue(wq);
@@ -4709,22 +4829,53 @@ static void pr_cont_pool_info(struct worker_pool *pool)
pr_cont(" flags=0x%x nice=%d", pool->flags, pool->attrs->nice);
}
-static void pr_cont_work(bool comma, struct work_struct *work)
+struct pr_cont_work_struct {
+ bool comma;
+ work_func_t func;
+ long ctr;
+};
+
+static void pr_cont_work_flush(bool comma, work_func_t func, struct pr_cont_work_struct *pcwsp)
+{
+ if (!pcwsp->ctr)
+ goto out_record;
+ if (func == pcwsp->func) {
+ pcwsp->ctr++;
+ return;
+ }
+ if (pcwsp->ctr == 1)
+ pr_cont("%s %ps", pcwsp->comma ? "," : "", pcwsp->func);
+ else
+ pr_cont("%s %ld*%ps", pcwsp->comma ? "," : "", pcwsp->ctr, pcwsp->func);
+ pcwsp->ctr = 0;
+out_record:
+ if ((long)func == -1L)
+ return;
+ pcwsp->comma = comma;
+ pcwsp->func = func;
+ pcwsp->ctr = 1;
+}
+
+static void pr_cont_work(bool comma, struct work_struct *work, struct pr_cont_work_struct *pcwsp)
{
if (work->func == wq_barrier_func) {
struct wq_barrier *barr;
barr = container_of(work, struct wq_barrier, work);
+ pr_cont_work_flush(comma, (work_func_t)-1, pcwsp);
pr_cont("%s BAR(%d)", comma ? "," : "",
task_pid_nr(barr->task));
} else {
- pr_cont("%s %ps", comma ? "," : "", work->func);
+ if (!comma)
+ pr_cont_work_flush(comma, (work_func_t)-1, pcwsp);
+ pr_cont_work_flush(comma, work->func, pcwsp);
}
}
static void show_pwq(struct pool_workqueue *pwq)
{
+ struct pr_cont_work_struct pcws = { .ctr = 0, };
struct worker_pool *pool = pwq->pool;
struct work_struct *work;
struct worker *worker;
@@ -4757,7 +4908,8 @@ static void show_pwq(struct pool_workqueue *pwq)
worker->rescue_wq ? "(RESCUER)" : "",
worker->current_func);
list_for_each_entry(work, &worker->scheduled, entry)
- pr_cont_work(false, work);
+ pr_cont_work(false, work, &pcws);
+ pr_cont_work_flush(comma, (work_func_t)-1L, &pcws);
comma = true;
}
pr_cont("\n");
@@ -4777,9 +4929,10 @@ static void show_pwq(struct pool_workqueue *pwq)
if (get_work_pwq(work) != pwq)
continue;
- pr_cont_work(comma, work);
+ pr_cont_work(comma, work, &pcws);
comma = !(*work_data_bits(work) & WORK_STRUCT_LINKED);
}
+ pr_cont_work_flush(comma, (work_func_t)-1L, &pcws);
pr_cont("\n");
}
@@ -4788,9 +4941,10 @@ static void show_pwq(struct pool_workqueue *pwq)
pr_info(" inactive:");
list_for_each_entry(work, &pwq->inactive_works, entry) {
- pr_cont_work(comma, work);
+ pr_cont_work(comma, work, &pcws);
comma = !(*work_data_bits(work) & WORK_STRUCT_LINKED);
}
+ pr_cont_work_flush(comma, (work_func_t)-1L, &pcws);
pr_cont("\n");
}
}
@@ -5006,13 +5160,8 @@ static void unbind_workers(int cpu)
raw_spin_unlock_irq(&pool->lock);
- for_each_pool_worker(worker, pool) {
- kthread_set_per_cpu(worker->task, -1);
- if (cpumask_intersects(wq_unbound_cpumask, cpu_active_mask))
- WARN_ON_ONCE(set_cpus_allowed_ptr(worker->task, wq_unbound_cpumask) < 0);
- else
- WARN_ON_ONCE(set_cpus_allowed_ptr(worker->task, cpu_possible_mask) < 0);
- }
+ for_each_pool_worker(worker, pool)
+ unbind_worker(worker);
mutex_unlock(&wq_pool_attach_mutex);
}
@@ -5334,7 +5483,7 @@ out_unlock:
}
#endif /* CONFIG_FREEZER */
-static int workqueue_apply_unbound_cpumask(void)
+static int workqueue_apply_unbound_cpumask(const cpumask_var_t unbound_cpumask)
{
LIST_HEAD(ctxs);
int ret = 0;
@@ -5350,7 +5499,7 @@ static int workqueue_apply_unbound_cpumask(void)
if (wq->flags & __WQ_ORDERED)
continue;
- ctx = apply_wqattrs_prepare(wq, wq->unbound_attrs);
+ ctx = apply_wqattrs_prepare(wq, wq->unbound_attrs, unbound_cpumask);
if (!ctx) {
ret = -ENOMEM;
break;
@@ -5365,6 +5514,11 @@ static int workqueue_apply_unbound_cpumask(void)
apply_wqattrs_cleanup(ctx);
}
+ if (!ret) {
+ mutex_lock(&wq_pool_attach_mutex);
+ cpumask_copy(wq_unbound_cpumask, unbound_cpumask);
+ mutex_unlock(&wq_pool_attach_mutex);
+ }
return ret;
}
@@ -5383,7 +5537,6 @@ static int workqueue_apply_unbound_cpumask(void)
int workqueue_set_unbound_cpumask(cpumask_var_t cpumask)
{
int ret = -EINVAL;
- cpumask_var_t saved_cpumask;
/*
* Not excluding isolated cpus on purpose.
@@ -5397,23 +5550,8 @@ int workqueue_set_unbound_cpumask(cpumask_var_t cpumask)
goto out_unlock;
}
- if (!zalloc_cpumask_var(&saved_cpumask, GFP_KERNEL)) {
- ret = -ENOMEM;
- goto out_unlock;
- }
-
- /* save the old wq_unbound_cpumask. */
- cpumask_copy(saved_cpumask, wq_unbound_cpumask);
-
- /* update wq_unbound_cpumask at first and apply it to wqs. */
- cpumask_copy(wq_unbound_cpumask, cpumask);
- ret = workqueue_apply_unbound_cpumask();
-
- /* restore the wq_unbound_cpumask when failed. */
- if (ret < 0)
- cpumask_copy(wq_unbound_cpumask, saved_cpumask);
+ ret = workqueue_apply_unbound_cpumask(cpumask);
- free_cpumask_var(saved_cpumask);
out_unlock:
apply_wqattrs_unlock();
}