summaryrefslogtreecommitdiffstats
path: root/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'kernel')
-rw-r--r--kernel/Kconfig.preempt12
-rw-r--r--kernel/Makefile4
-rw-r--r--kernel/auditsc.c6
-rw-r--r--kernel/bpf/Kconfig6
-rw-r--r--kernel/bpf/arraymap.c4
-rw-r--r--kernel/bpf/bpf_inode_storage.c9
-rw-r--r--kernel/bpf/bpf_iter.c20
-rw-r--r--kernel/bpf/bpf_local_storage.c60
-rw-r--r--kernel/bpf/bpf_lsm.c21
-rw-r--r--kernel/bpf/bpf_task_storage.c10
-rw-r--r--kernel/bpf/btf.c683
-rw-r--r--kernel/bpf/cgroup.c187
-rw-r--r--kernel/bpf/core.c364
-rw-r--r--kernel/bpf/cpumap.c8
-rw-r--r--kernel/bpf/devmap.c3
-rw-r--r--kernel/bpf/hashtab.c2
-rw-r--r--kernel/bpf/helpers.c45
-rw-r--r--kernel/bpf/inode.c39
-rw-r--r--kernel/bpf/local_storage.c2
-rw-r--r--kernel/bpf/preload/Kconfig7
-rw-r--r--kernel/bpf/preload/Makefile41
-rw-r--r--kernel/bpf/preload/bpf_preload.h8
-rw-r--r--kernel/bpf/preload/bpf_preload_kern.c126
-rw-r--r--kernel/bpf/preload/bpf_preload_umd_blob.S7
-rw-r--r--kernel/bpf/preload/iterators/Makefile6
-rw-r--r--kernel/bpf/preload/iterators/bpf_preload_common.h13
-rw-r--r--kernel/bpf/preload/iterators/iterators.c94
-rw-r--r--kernel/bpf/preload/iterators/iterators.lskel.h425
-rw-r--r--kernel/bpf/preload/iterators/iterators.skel.h412
-rw-r--r--kernel/bpf/reuseport_array.c2
-rw-r--r--kernel/bpf/stackmap.c68
-rw-r--r--kernel/bpf/syscall.c97
-rw-r--r--kernel/bpf/trampoline.c28
-rw-r--r--kernel/bpf/verifier.c468
-rw-r--r--kernel/capability.c1
-rw-r--r--kernel/cgroup/cgroup.c6
-rw-r--r--kernel/cgroup/cpuset.c17
-rw-r--r--kernel/cgroup/freezer.c2
-rw-r--r--kernel/cgroup/rstat.c18
-rw-r--r--kernel/configs/android-recommended.config2
-rw-r--r--kernel/configs/x86_debug.config18
-rw-r--r--kernel/cpu.c65
-rw-r--r--kernel/crash_core.c3
-rw-r--r--kernel/cred.c2
-rw-r--r--kernel/debug/debug_core.c24
-rw-r--r--kernel/debug/kdb/kdb_main.c62
-rw-r--r--kernel/debug/kdb/kdb_support.c2
-rw-r--r--kernel/dma/Kconfig7
-rw-r--r--kernel/dma/Makefile2
-rw-r--r--kernel/dma/debug.c4
-rw-r--r--kernel/dma/direct.c28
-rw-r--r--kernel/dma/direct.h3
-rw-r--r--kernel/dma/map_benchmark.c25
-rw-r--r--kernel/dma/mapping.c4
-rw-r--r--kernel/dma/swiotlb.c87
-rw-r--r--kernel/entry/common.c43
-rw-r--r--kernel/entry/kvm.c9
-rw-r--r--kernel/events/callchain.c4
-rw-r--r--kernel/events/core.c235
-rw-r--r--kernel/events/internal.h5
-rw-r--r--kernel/events/ring_buffer.c5
-rw-r--r--kernel/events/uprobes.c7
-rw-r--r--kernel/exit.c21
-rw-r--r--kernel/extable.c24
-rw-r--r--kernel/fork.c19
-rw-r--r--kernel/futex/pi.c2
-rw-r--r--kernel/irq/affinity.c7
-rw-r--r--kernel/irq/chip.c13
-rw-r--r--kernel/irq/debugfs.c1
-rw-r--r--kernel/irq/internals.h2
-rw-r--r--kernel/irq/irq_sim.c2
-rw-r--r--kernel/irq/irqdesc.c3
-rw-r--r--kernel/irq/manage.c70
-rw-r--r--kernel/irq/matrix.c2
-rw-r--r--kernel/irq/msi.c15
-rw-r--r--kernel/irq_work.c2
-rw-r--r--kernel/kallsyms.c4
-rw-r--r--kernel/kcov.c99
-rw-r--r--kernel/kcsan/kcsan_test.c5
-rw-r--r--kernel/kexec_core.c2
-rw-r--r--kernel/kprobes.c192
-rw-r--r--kernel/ksysfs.c3
-rw-r--r--kernel/kthread.c6
-rw-r--r--kernel/livepatch/core.c4
-rw-r--r--kernel/livepatch/patch.c19
-rw-r--r--kernel/livepatch/transition.c12
-rw-r--r--kernel/locking/lockdep.c23
-rw-r--r--kernel/locking/mutex.c18
-rw-r--r--kernel/locking/percpu-rwsem.c5
-rw-r--r--kernel/locking/qrwlock.c17
-rw-r--r--kernel/locking/qspinlock.c5
-rw-r--r--kernel/locking/rtmutex.c11
-rw-r--r--kernel/locking/rwbase_rt.c7
-rw-r--r--kernel/locking/rwsem.c130
-rw-r--r--kernel/locking/semaphore.c15
-rw-r--r--kernel/panic.c39
-rw-r--r--kernel/power/Makefile6
-rw-r--r--kernel/power/energy_model.c65
-rw-r--r--kernel/power/main.c29
-rw-r--r--kernel/power/process.c3
-rw-r--r--kernel/power/snapshot.c12
-rw-r--r--kernel/ptrace.c47
-rw-r--r--kernel/rcu/Kconfig73
-rw-r--r--kernel/rcu/Kconfig.debug23
-rw-r--r--kernel/rcu/rcu.h13
-rw-r--r--kernel/rcu/rcu_segcblist.c8
-rw-r--r--kernel/rcu/rcuscale.c22
-rw-r--r--kernel/rcu/rcutorture.c129
-rw-r--r--kernel/rcu/refscale.c22
-rw-r--r--kernel/rcu/srcutree.c639
-rw-r--r--kernel/rcu/sync.c2
-rw-r--r--kernel/rcu/tasks.h89
-rw-r--r--kernel/rcu/tree.c146
-rw-r--r--kernel/rcu/tree.h9
-rw-r--r--kernel/rcu/tree_exp.h151
-rw-r--r--kernel/rcu/tree_nocb.h39
-rw-r--r--kernel/rcu/tree_plugin.h28
-rw-r--r--kernel/rcu/tree_stall.h36
-rw-r--r--kernel/rcu/update.c2
-rw-r--r--kernel/resource.c41
-rw-r--r--kernel/scftorture.c5
-rw-r--r--kernel/sched/build_policy.c2
-rw-r--r--kernel/sched/build_utility.c1
-rw-r--r--kernel/sched/clock.c4
-rw-r--r--kernel/sched/core.c53
-rw-r--r--kernel/sched/deadline.c15
-rw-r--r--kernel/sched/fair.c303
-rw-r--r--kernel/sched/idle.c5
-rw-r--r--kernel/sched/pelt.h4
-rw-r--r--kernel/sched/psi.c18
-rw-r--r--kernel/sched/rt.c5
-rw-r--r--kernel/sched/sched.h61
-rw-r--r--kernel/sched/smp.h6
-rw-r--r--kernel/scs.c12
-rw-r--r--kernel/seccomp.c45
-rw-r--r--kernel/signal.c120
-rw-r--r--kernel/smp.c41
-rw-r--r--kernel/smpboot.c7
-rw-r--r--kernel/softirq.c13
-rw-r--r--kernel/stackleak.c105
-rw-r--r--kernel/stacktrace.c3
-rw-r--r--kernel/static_call.c542
-rw-r--r--kernel/static_call_inline.c543
-rw-r--r--kernel/stop_machine.c23
-rw-r--r--kernel/sys.c131
-rw-r--r--kernel/sysctl.c11
-rw-r--r--kernel/task_work.c29
-rw-r--r--kernel/taskstats.c5
-rw-r--r--kernel/time/clockevents.c9
-rw-r--r--kernel/time/clocksource.c2
-rw-r--r--kernel/time/posix-cpu-timers.c13
-rw-r--r--kernel/time/sched_clock.c13
-rw-r--r--kernel/time/tick-sched.c18
-rw-r--r--kernel/time/timekeeping.c58
-rw-r--r--kernel/time/timer.c101
-rw-r--r--kernel/trace/Kconfig43
-rw-r--r--kernel/trace/Makefile3
-rw-r--r--kernel/trace/blktrace.c27
-rw-r--r--kernel/trace/bpf_trace.c357
-rw-r--r--kernel/trace/fgraph.c31
-rw-r--r--kernel/trace/fprobe.c332
-rw-r--r--kernel/trace/ftrace.c112
-rw-r--r--kernel/trace/rethook.c317
-rw-r--r--kernel/trace/trace.c96
-rw-r--r--kernel/trace/trace.h2
-rw-r--r--kernel/trace/trace_events.c98
-rw-r--r--kernel/trace/trace_events_hist.c33
-rw-r--r--kernel/trace/trace_events_synth.c14
-rw-r--r--kernel/trace/trace_events_user.c1628
-rw-r--r--kernel/trace/trace_kprobe.c4
-rw-r--r--kernel/trace/trace_osnoise.c4
-rw-r--r--kernel/trace/trace_preemptirq.c4
-rw-r--r--kernel/trace/trace_sched_switch.c6
-rw-r--r--kernel/trace/trace_sched_wakeup.c4
-rw-r--r--kernel/watch_queue.c1
-rw-r--r--kernel/workqueue.c58
176 files changed, 8334 insertions, 3576 deletions
diff --git a/kernel/Kconfig.preempt b/kernel/Kconfig.preempt
index 8c6de5a9ecc4..c2f1fd95a821 100644
--- a/kernel/Kconfig.preempt
+++ b/kernel/Kconfig.preempt
@@ -133,14 +133,4 @@ config SCHED_CORE
which is the likely usage by Linux distributions, there should
be no measurable impact on performance.
-config ARCH_WANTS_RT_DELAYED_SIGNALS
- bool
- help
- This option is selected by architectures where raising signals
- can happen in atomic contexts on PREEMPT_RT enabled kernels. This
- option delays raising the signal until the return to user space
- loop where it is also delivered. X86 requires this to deliver
- signals from trap handlers which run on IST stacks.
-
-config RT_DELAYED_SIGNALS
- def_bool PREEMPT_RT && ARCH_WANTS_RT_DELAYED_SIGNALS
+
diff --git a/kernel/Makefile b/kernel/Makefile
index 56f4ee97f328..847a82bfe0e3 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -108,12 +108,14 @@ obj-$(CONFIG_TRACING) += trace/
obj-$(CONFIG_TRACE_CLOCK) += trace/
obj-$(CONFIG_RING_BUFFER) += trace/
obj-$(CONFIG_TRACEPOINTS) += trace/
+obj-$(CONFIG_RETHOOK) += trace/
obj-$(CONFIG_IRQ_WORK) += irq_work.o
obj-$(CONFIG_CPU_PM) += cpu_pm.o
obj-$(CONFIG_BPF) += bpf/
obj-$(CONFIG_KCSAN) += kcsan/
obj-$(CONFIG_SHADOW_CALL_STACK) += scs.o
-obj-$(CONFIG_HAVE_STATIC_CALL_INLINE) += static_call.o
+obj-$(CONFIG_HAVE_STATIC_CALL) += static_call.o
+obj-$(CONFIG_HAVE_STATIC_CALL_INLINE) += static_call_inline.o
obj-$(CONFIG_CFI_CLANG) += cfi.o
obj-$(CONFIG_PERF_EVENTS) += events/
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index ea2ee1181921..f3a2abd6d1a1 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -1959,6 +1959,12 @@ void __audit_uring_exit(int success, long code)
{
struct audit_context *ctx = audit_context();
+ if (ctx->dummy) {
+ if (ctx->context != AUDIT_CTX_URING)
+ return;
+ goto out;
+ }
+
if (ctx->context == AUDIT_CTX_SYSCALL) {
/*
* NOTE: See the note in __audit_uring_entry() about the case
diff --git a/kernel/bpf/Kconfig b/kernel/bpf/Kconfig
index d24d518ddd63..2dfe1079f772 100644
--- a/kernel/bpf/Kconfig
+++ b/kernel/bpf/Kconfig
@@ -27,9 +27,11 @@ config BPF_SYSCALL
bool "Enable bpf() system call"
select BPF
select IRQ_WORK
+ select TASKS_RCU if PREEMPTION
select TASKS_TRACE_RCU
select BINARY_PRINTF
select NET_SOCK_MSG if NET
+ select PAGE_POOL if NET
default n
help
Enable the bpf() system call that allows to manipulate BPF programs
@@ -58,6 +60,10 @@ config BPF_JIT_ALWAYS_ON
Enables BPF JIT and removes BPF interpreter to avoid speculative
execution of BPF instructions by the interpreter.
+ When CONFIG_BPF_JIT_ALWAYS_ON is enabled, /proc/sys/net/core/bpf_jit_enable
+ is permanently set to 1 and setting any other value than that will
+ return failure.
+
config BPF_JIT_DEFAULT_ON
def_bool ARCH_WANT_DEFAULT_BPF_JIT || BPF_JIT_ALWAYS_ON
depends on HAVE_EBPF_JIT && BPF_JIT
diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
index c7a5be3bf8be..7f145aefbff8 100644
--- a/kernel/bpf/arraymap.c
+++ b/kernel/bpf/arraymap.c
@@ -837,13 +837,12 @@ static int fd_array_map_delete_elem(struct bpf_map *map, void *key)
static void *prog_fd_array_get_ptr(struct bpf_map *map,
struct file *map_file, int fd)
{
- struct bpf_array *array = container_of(map, struct bpf_array, map);
struct bpf_prog *prog = bpf_prog_get(fd);
if (IS_ERR(prog))
return prog;
- if (!bpf_prog_array_compatible(array, prog)) {
+ if (!bpf_prog_map_compatible(map, prog)) {
bpf_prog_put(prog);
return ERR_PTR(-EINVAL);
}
@@ -1071,7 +1070,6 @@ static struct bpf_map *prog_array_map_alloc(union bpf_attr *attr)
INIT_WORK(&aux->work, prog_array_map_clear_deferred);
INIT_LIST_HEAD(&aux->poke_progs);
mutex_init(&aux->poke_mutex);
- spin_lock_init(&aux->owner.lock);
map = array_map_alloc(attr);
if (IS_ERR(map)) {
diff --git a/kernel/bpf/bpf_inode_storage.c b/kernel/bpf/bpf_inode_storage.c
index e29d9e3d853e..96be8d518885 100644
--- a/kernel/bpf/bpf_inode_storage.c
+++ b/kernel/bpf/bpf_inode_storage.c
@@ -136,7 +136,7 @@ static int bpf_fd_inode_storage_update_elem(struct bpf_map *map, void *key,
sdata = bpf_local_storage_update(f->f_inode,
(struct bpf_local_storage_map *)map,
- value, map_flags);
+ value, map_flags, GFP_ATOMIC);
fput(f);
return PTR_ERR_OR_ZERO(sdata);
}
@@ -169,8 +169,9 @@ static int bpf_fd_inode_storage_delete_elem(struct bpf_map *map, void *key)
return err;
}
-BPF_CALL_4(bpf_inode_storage_get, struct bpf_map *, map, struct inode *, inode,
- void *, value, u64, flags)
+/* *gfp_flags* is a hidden argument provided by the verifier */
+BPF_CALL_5(bpf_inode_storage_get, struct bpf_map *, map, struct inode *, inode,
+ void *, value, u64, flags, gfp_t, gfp_flags)
{
struct bpf_local_storage_data *sdata;
@@ -196,7 +197,7 @@ BPF_CALL_4(bpf_inode_storage_get, struct bpf_map *, map, struct inode *, inode,
if (flags & BPF_LOCAL_STORAGE_GET_F_CREATE) {
sdata = bpf_local_storage_update(
inode, (struct bpf_local_storage_map *)map, value,
- BPF_NOEXIST);
+ BPF_NOEXIST, gfp_flags);
return IS_ERR(sdata) ? (unsigned long)NULL :
(unsigned long)sdata->data;
}
diff --git a/kernel/bpf/bpf_iter.c b/kernel/bpf/bpf_iter.c
index b7aef5b3416d..110029ede71e 100644
--- a/kernel/bpf/bpf_iter.c
+++ b/kernel/bpf/bpf_iter.c
@@ -5,6 +5,7 @@
#include <linux/anon_inodes.h>
#include <linux/filter.h>
#include <linux/bpf.h>
+#include <linux/rcupdate_trace.h>
struct bpf_iter_target_info {
struct list_head list;
@@ -684,11 +685,20 @@ int bpf_iter_run_prog(struct bpf_prog *prog, void *ctx)
{
int ret;
- rcu_read_lock();
- migrate_disable();
- ret = bpf_prog_run(prog, ctx);
- migrate_enable();
- rcu_read_unlock();
+ if (prog->aux->sleepable) {
+ rcu_read_lock_trace();
+ migrate_disable();
+ might_fault();
+ ret = bpf_prog_run(prog, ctx);
+ migrate_enable();
+ rcu_read_unlock_trace();
+ } else {
+ rcu_read_lock();
+ migrate_disable();
+ ret = bpf_prog_run(prog, ctx);
+ migrate_enable();
+ rcu_read_unlock();
+ }
/* bpf program can only return 0 or 1:
* 0 : okay
diff --git a/kernel/bpf/bpf_local_storage.c b/kernel/bpf/bpf_local_storage.c
index 71de2a89869c..01aa2b51ec4d 100644
--- a/kernel/bpf/bpf_local_storage.c
+++ b/kernel/bpf/bpf_local_storage.c
@@ -63,7 +63,7 @@ static bool selem_linked_to_map(const struct bpf_local_storage_elem *selem)
struct bpf_local_storage_elem *
bpf_selem_alloc(struct bpf_local_storage_map *smap, void *owner,
- void *value, bool charge_mem)
+ void *value, bool charge_mem, gfp_t gfp_flags)
{
struct bpf_local_storage_elem *selem;
@@ -71,7 +71,7 @@ bpf_selem_alloc(struct bpf_local_storage_map *smap, void *owner,
return NULL;
selem = bpf_map_kzalloc(&smap->map, smap->elem_size,
- GFP_ATOMIC | __GFP_NOWARN);
+ gfp_flags | __GFP_NOWARN);
if (selem) {
if (value)
memcpy(SDATA(selem)->data, value, smap->map.value_size);
@@ -136,7 +136,7 @@ bool bpf_selem_unlink_storage_nolock(struct bpf_local_storage *local_storage,
* will be done by the caller.
*
* Although the unlock will be done under
- * rcu_read_lock(), it is more intutivie to
+ * rcu_read_lock(), it is more intuitive to
* read if the freeing of the storage is done
* after the raw_spin_unlock_bh(&local_storage->lock).
*
@@ -282,7 +282,8 @@ static int check_flags(const struct bpf_local_storage_data *old_sdata,
int bpf_local_storage_alloc(void *owner,
struct bpf_local_storage_map *smap,
- struct bpf_local_storage_elem *first_selem)
+ struct bpf_local_storage_elem *first_selem,
+ gfp_t gfp_flags)
{
struct bpf_local_storage *prev_storage, *storage;
struct bpf_local_storage **owner_storage_ptr;
@@ -293,7 +294,7 @@ int bpf_local_storage_alloc(void *owner,
return err;
storage = bpf_map_kzalloc(&smap->map, sizeof(*storage),
- GFP_ATOMIC | __GFP_NOWARN);
+ gfp_flags | __GFP_NOWARN);
if (!storage) {
err = -ENOMEM;
goto uncharge;
@@ -350,10 +351,10 @@ uncharge:
*/
struct bpf_local_storage_data *
bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap,
- void *value, u64 map_flags)
+ void *value, u64 map_flags, gfp_t gfp_flags)
{
struct bpf_local_storage_data *old_sdata = NULL;
- struct bpf_local_storage_elem *selem;
+ struct bpf_local_storage_elem *selem = NULL;
struct bpf_local_storage *local_storage;
unsigned long flags;
int err;
@@ -365,6 +366,9 @@ bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap,
!map_value_has_spin_lock(&smap->map)))
return ERR_PTR(-EINVAL);
+ if (gfp_flags == GFP_KERNEL && (map_flags & ~BPF_F_LOCK) != BPF_NOEXIST)
+ return ERR_PTR(-EINVAL);
+
local_storage = rcu_dereference_check(*owner_storage(smap, owner),
bpf_rcu_lock_held());
if (!local_storage || hlist_empty(&local_storage->list)) {
@@ -373,11 +377,11 @@ bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap,
if (err)
return ERR_PTR(err);
- selem = bpf_selem_alloc(smap, owner, value, true);
+ selem = bpf_selem_alloc(smap, owner, value, true, gfp_flags);
if (!selem)
return ERR_PTR(-ENOMEM);
- err = bpf_local_storage_alloc(owner, smap, selem);
+ err = bpf_local_storage_alloc(owner, smap, selem, gfp_flags);
if (err) {
kfree(selem);
mem_uncharge(smap, owner, smap->elem_size);
@@ -404,6 +408,12 @@ bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap,
}
}
+ if (gfp_flags == GFP_KERNEL) {
+ selem = bpf_selem_alloc(smap, owner, value, true, gfp_flags);
+ if (!selem)
+ return ERR_PTR(-ENOMEM);
+ }
+
raw_spin_lock_irqsave(&local_storage->lock, flags);
/* Recheck local_storage->list under local_storage->lock */
@@ -429,19 +439,21 @@ bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap,
goto unlock;
}
- /* local_storage->lock is held. Hence, we are sure
- * we can unlink and uncharge the old_sdata successfully
- * later. Hence, instead of charging the new selem now
- * and then uncharge the old selem later (which may cause
- * a potential but unnecessary charge failure), avoid taking
- * a charge at all here (the "!old_sdata" check) and the
- * old_sdata will not be uncharged later during
- * bpf_selem_unlink_storage_nolock().
- */
- selem = bpf_selem_alloc(smap, owner, value, !old_sdata);
- if (!selem) {
- err = -ENOMEM;
- goto unlock_err;
+ if (gfp_flags != GFP_KERNEL) {
+ /* local_storage->lock is held. Hence, we are sure
+ * we can unlink and uncharge the old_sdata successfully
+ * later. Hence, instead of charging the new selem now
+ * and then uncharge the old selem later (which may cause
+ * a potential but unnecessary charge failure), avoid taking
+ * a charge at all here (the "!old_sdata" check) and the
+ * old_sdata will not be uncharged later during
+ * bpf_selem_unlink_storage_nolock().
+ */
+ selem = bpf_selem_alloc(smap, owner, value, !old_sdata, gfp_flags);
+ if (!selem) {
+ err = -ENOMEM;
+ goto unlock_err;
+ }
}
/* First, link the new selem to the map */
@@ -463,6 +475,10 @@ unlock:
unlock_err:
raw_spin_unlock_irqrestore(&local_storage->lock, flags);
+ if (selem) {
+ mem_uncharge(smap, owner, smap->elem_size);
+ kfree(selem);
+ }
return ERR_PTR(err);
}
diff --git a/kernel/bpf/bpf_lsm.c b/kernel/bpf/bpf_lsm.c
index 9e4ecc990647..064eccba641d 100644
--- a/kernel/bpf/bpf_lsm.c
+++ b/kernel/bpf/bpf_lsm.c
@@ -99,6 +99,24 @@ static const struct bpf_func_proto bpf_ima_inode_hash_proto = {
.allowed = bpf_ima_inode_hash_allowed,
};
+BPF_CALL_3(bpf_ima_file_hash, struct file *, file, void *, dst, u32, size)
+{
+ return ima_file_hash(file, dst, size);
+}
+
+BTF_ID_LIST_SINGLE(bpf_ima_file_hash_btf_ids, struct, file)
+
+static const struct bpf_func_proto bpf_ima_file_hash_proto = {
+ .func = bpf_ima_file_hash,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_BTF_ID,
+ .arg1_btf_id = &bpf_ima_file_hash_btf_ids[0],
+ .arg2_type = ARG_PTR_TO_UNINIT_MEM,
+ .arg3_type = ARG_CONST_SIZE,
+ .allowed = bpf_ima_inode_hash_allowed,
+};
+
static const struct bpf_func_proto *
bpf_lsm_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
{
@@ -121,6 +139,8 @@ bpf_lsm_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
return &bpf_bprm_opts_set_proto;
case BPF_FUNC_ima_inode_hash:
return prog->aux->sleepable ? &bpf_ima_inode_hash_proto : NULL;
+ case BPF_FUNC_ima_file_hash:
+ return prog->aux->sleepable ? &bpf_ima_file_hash_proto : NULL;
default:
return tracing_prog_func_proto(func_id, prog);
}
@@ -167,6 +187,7 @@ BTF_ID(func, bpf_lsm_inode_setxattr)
BTF_ID(func, bpf_lsm_inode_symlink)
BTF_ID(func, bpf_lsm_inode_unlink)
BTF_ID(func, bpf_lsm_kernel_module_request)
+BTF_ID(func, bpf_lsm_kernel_read_file)
BTF_ID(func, bpf_lsm_kernfs_init_security)
#ifdef CONFIG_KEYS
diff --git a/kernel/bpf/bpf_task_storage.c b/kernel/bpf/bpf_task_storage.c
index 5da7bed0f5f6..6638a0ecc3d2 100644
--- a/kernel/bpf/bpf_task_storage.c
+++ b/kernel/bpf/bpf_task_storage.c
@@ -174,7 +174,8 @@ static int bpf_pid_task_storage_update_elem(struct bpf_map *map, void *key,
bpf_task_storage_lock();
sdata = bpf_local_storage_update(
- task, (struct bpf_local_storage_map *)map, value, map_flags);
+ task, (struct bpf_local_storage_map *)map, value, map_flags,
+ GFP_ATOMIC);
bpf_task_storage_unlock();
err = PTR_ERR_OR_ZERO(sdata);
@@ -226,8 +227,9 @@ out:
return err;
}
-BPF_CALL_4(bpf_task_storage_get, struct bpf_map *, map, struct task_struct *,
- task, void *, value, u64, flags)
+/* *gfp_flags* is a hidden argument provided by the verifier */
+BPF_CALL_5(bpf_task_storage_get, struct bpf_map *, map, struct task_struct *,
+ task, void *, value, u64, flags, gfp_t, gfp_flags)
{
struct bpf_local_storage_data *sdata;
@@ -250,7 +252,7 @@ BPF_CALL_4(bpf_task_storage_get, struct bpf_map *, map, struct task_struct *,
(flags & BPF_LOCAL_STORAGE_GET_F_CREATE))
sdata = bpf_local_storage_update(
task, (struct bpf_local_storage_map *)map, value,
- BPF_NOEXIST);
+ BPF_NOEXIST, gfp_flags);
unlock:
bpf_task_storage_unlock();
diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index 3e23b3fa79ff..0918a39279f6 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -1,4 +1,4 @@
-/* SPDX-License-Identifier: GPL-2.0 */
+// SPDX-License-Identifier: GPL-2.0
/* Copyright (c) 2018 Facebook */
#include <uapi/linux/btf.h>
@@ -198,6 +198,21 @@
DEFINE_IDR(btf_idr);
DEFINE_SPINLOCK(btf_idr_lock);
+enum btf_kfunc_hook {
+ BTF_KFUNC_HOOK_XDP,
+ BTF_KFUNC_HOOK_TC,
+ BTF_KFUNC_HOOK_STRUCT_OPS,
+ BTF_KFUNC_HOOK_MAX,
+};
+
+enum {
+ BTF_KFUNC_SET_MAX_CNT = 32,
+};
+
+struct btf_kfunc_set_tab {
+ struct btf_id_set *sets[BTF_KFUNC_HOOK_MAX][BTF_KFUNC_TYPE_MAX];
+};
+
struct btf {
void *data;
struct btf_type **types;
@@ -212,6 +227,7 @@ struct btf {
refcount_t refcnt;
u32 id;
struct rcu_head rcu;
+ struct btf_kfunc_set_tab *kfunc_set_tab;
/* split BTF support */
struct btf *base_btf;
@@ -403,6 +419,9 @@ static struct btf_type btf_void;
static int btf_resolve(struct btf_verifier_env *env,
const struct btf_type *t, u32 type_id);
+static int btf_func_check(struct btf_verifier_env *env,
+ const struct btf_type *t);
+
static bool btf_type_is_modifier(const struct btf_type *t)
{
/* Some of them is not strictly a C modifier
@@ -506,6 +525,50 @@ s32 btf_find_by_name_kind(const struct btf *btf, const char *name, u8 kind)
return -ENOENT;
}
+static s32 bpf_find_btf_id(const char *name, u32 kind, struct btf **btf_p)
+{
+ struct btf *btf;
+ s32 ret;
+ int id;
+
+ btf = bpf_get_btf_vmlinux();
+ if (IS_ERR(btf))
+ return PTR_ERR(btf);
+ if (!btf)
+ return -EINVAL;
+
+ ret = btf_find_by_name_kind(btf, name, kind);
+ /* ret is never zero, since btf_find_by_name_kind returns
+ * positive btf_id or negative error.
+ */
+ if (ret > 0) {
+ btf_get(btf);
+ *btf_p = btf;
+ return ret;
+ }
+
+ /* If name is not found in vmlinux's BTF then search in module's BTFs */
+ spin_lock_bh(&btf_idr_lock);
+ idr_for_each_entry(&btf_idr, btf, id) {
+ if (!btf_is_module(btf))
+ continue;
+ /* linear search could be slow hence unlock/lock
+ * the IDR to avoiding holding it for too long
+ */
+ btf_get(btf);
+ spin_unlock_bh(&btf_idr_lock);
+ ret = btf_find_by_name_kind(btf, name, kind);
+ if (ret > 0) {
+ *btf_p = btf;
+ return ret;
+ }
+ spin_lock_bh(&btf_idr_lock);
+ btf_put(btf);
+ }
+ spin_unlock_bh(&btf_idr_lock);
+ return ret;
+}
+
const struct btf_type *btf_type_skip_modifiers(const struct btf *btf,
u32 id, u32 *res_id)
{
@@ -579,6 +642,7 @@ static bool btf_type_needs_resolve(const struct btf_type *t)
btf_type_is_struct(t) ||
btf_type_is_array(t) ||
btf_type_is_var(t) ||
+ btf_type_is_func(t) ||
btf_type_is_decl_tag(t) ||
btf_type_is_datasec(t);
}
@@ -1531,8 +1595,30 @@ static void btf_free_id(struct btf *btf)
spin_unlock_irqrestore(&btf_idr_lock, flags);
}
+static void btf_free_kfunc_set_tab(struct btf *btf)
+{
+ struct btf_kfunc_set_tab *tab = btf->kfunc_set_tab;
+ int hook, type;
+
+ if (!tab)
+ return;
+ /* For module BTF, we directly assign the sets being registered, so
+ * there is nothing to free except kfunc_set_tab.
+ */
+ if (btf_is_module(btf))
+ goto free_tab;
+ for (hook = 0; hook < ARRAY_SIZE(tab->sets); hook++) {
+ for (type = 0; type < ARRAY_SIZE(tab->sets[0]); type++)
+ kfree(tab->sets[hook][type]);
+ }
+free_tab:
+ kfree(tab);
+ btf->kfunc_set_tab = NULL;
+}
+
static void btf_free(struct btf *btf)
{
+ btf_free_kfunc_set_tab(btf);
kvfree(btf->types);
kvfree(btf->resolved_sizes);
kvfree(btf->resolved_ids);
@@ -2505,7 +2591,7 @@ static int btf_ptr_resolve(struct btf_verifier_env *env,
*
* We now need to continue from the last-resolved-ptr to
* ensure the last-resolved-ptr will not referring back to
- * the currenct ptr (t).
+ * the current ptr (t).
*/
if (btf_type_is_modifier(next_type)) {
const struct btf_type *resolved_type;
@@ -3533,9 +3619,24 @@ static s32 btf_func_check_meta(struct btf_verifier_env *env,
return 0;
}
+static int btf_func_resolve(struct btf_verifier_env *env,
+ const struct resolve_vertex *v)
+{
+ const struct btf_type *t = v->t;
+ u32 next_type_id = t->type;
+ int err;
+
+ err = btf_func_check(env, t);
+ if (err)
+ return err;
+
+ env_stack_pop_resolved(env, next_type_id, 0);
+ return 0;
+}
+
static struct btf_kind_operations func_ops = {
.check_meta = btf_func_check_meta,
- .resolve = btf_df_resolve,
+ .resolve = btf_func_resolve,
.check_member = btf_df_check_member,
.check_kflag_member = btf_df_check_kflag_member,
.log_details = btf_ref_type_log,
@@ -4156,7 +4257,7 @@ static bool btf_resolve_valid(struct btf_verifier_env *env,
return !btf_resolved_type_id(btf, type_id) &&
!btf_resolved_type_size(btf, type_id);
- if (btf_type_is_decl_tag(t))
+ if (btf_type_is_decl_tag(t) || btf_type_is_func(t))
return btf_resolved_type_id(btf, type_id) &&
!btf_resolved_type_size(btf, type_id);
@@ -4246,12 +4347,6 @@ static int btf_check_all_types(struct btf_verifier_env *env)
if (err)
return err;
}
-
- if (btf_type_is_func(t)) {
- err = btf_func_check(env, t);
- if (err)
- return err;
- }
}
return 0;
@@ -4387,8 +4482,7 @@ static int btf_parse_hdr(struct btf_verifier_env *env)
btf = env->btf;
btf_data_size = btf->data_size;
- if (btf_data_size <
- offsetof(struct btf_header, hdr_len) + sizeof(hdr->hdr_len)) {
+ if (btf_data_size < offsetofend(struct btf_header, hdr_len)) {
btf_verifier_log(env, "hdr_len not found");
return -EINVAL;
}
@@ -4848,6 +4942,7 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type,
const char *tname = prog->aux->attach_func_name;
struct bpf_verifier_log *log = info->log;
const struct btf_param *args;
+ const char *tag_value;
u32 nr_args, arg;
int i, ret;
@@ -5000,6 +5095,15 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type,
info->btf = btf;
info->btf_id = t->type;
t = btf_type_by_id(btf, t->type);
+
+ if (btf_type_is_type_tag(t)) {
+ tag_value = __btf_name_by_offset(btf, t->name_off);
+ if (strcmp(tag_value, "user") == 0)
+ info->reg_type |= MEM_USER;
+ if (strcmp(tag_value, "percpu") == 0)
+ info->reg_type |= MEM_PERCPU;
+ }
+
/* skip modifiers */
while (btf_type_is_modifier(t)) {
info->btf_id = t->type;
@@ -5026,12 +5130,12 @@ enum bpf_struct_walk_result {
static int btf_struct_walk(struct bpf_verifier_log *log, const struct btf *btf,
const struct btf_type *t, int off, int size,
- u32 *next_btf_id)
+ u32 *next_btf_id, enum bpf_type_flag *flag)
{
u32 i, moff, mtrue_end, msize = 0, total_nelems = 0;
const struct btf_type *mtype, *elem_type = NULL;
const struct btf_member *member;
- const char *tname, *mname;
+ const char *tname, *mname, *tag_value;
u32 vlen, elem_id, mid;
again:
@@ -5215,7 +5319,8 @@ error:
}
if (btf_type_is_ptr(mtype)) {
- const struct btf_type *stype;
+ const struct btf_type *stype, *t;
+ enum bpf_type_flag tmp_flag = 0;
u32 id;
if (msize != size || off != moff) {
@@ -5224,9 +5329,23 @@ error:
mname, moff, tname, off, size);
return -EACCES;
}
+
+ /* check type tag */
+ t = btf_type_by_id(btf, mtype->type);
+ if (btf_type_is_type_tag(t)) {
+ tag_value = __btf_name_by_offset(btf, t->name_off);
+ /* check __user tag */
+ if (strcmp(tag_value, "user") == 0)
+ tmp_flag = MEM_USER;
+ /* check __percpu tag */
+ if (strcmp(tag_value, "percpu") == 0)
+ tmp_flag = MEM_PERCPU;
+ }
+
stype = btf_type_skip_modifiers(btf, mtype->type, &id);
if (btf_type_is_struct(stype)) {
*next_btf_id = id;
+ *flag = tmp_flag;
return WALK_PTR;
}
}
@@ -5253,13 +5372,14 @@ error:
int btf_struct_access(struct bpf_verifier_log *log, const struct btf *btf,
const struct btf_type *t, int off, int size,
enum bpf_access_type atype __maybe_unused,
- u32 *next_btf_id)
+ u32 *next_btf_id, enum bpf_type_flag *flag)
{
+ enum bpf_type_flag tmp_flag = 0;
int err;
u32 id;
do {
- err = btf_struct_walk(log, btf, t, off, size, &id);
+ err = btf_struct_walk(log, btf, t, off, size, &id, &tmp_flag);
switch (err) {
case WALK_PTR:
@@ -5267,6 +5387,7 @@ int btf_struct_access(struct bpf_verifier_log *log, const struct btf *btf,
* we're done.
*/
*next_btf_id = id;
+ *flag = tmp_flag;
return PTR_TO_BTF_ID;
case WALK_SCALAR:
return SCALAR_VALUE;
@@ -5311,6 +5432,7 @@ bool btf_struct_ids_match(struct bpf_verifier_log *log,
const struct btf *need_btf, u32 need_type_id)
{
const struct btf_type *type;
+ enum bpf_type_flag flag;
int err;
/* Are we already done? */
@@ -5321,7 +5443,7 @@ again:
type = btf_type_by_id(btf, id);
if (!type)
return false;
- err = btf_struct_walk(log, btf, type, off, 1, &id);
+ err = btf_struct_walk(log, btf, type, off, 1, &id, &flag);
if (err != WALK_STRUCT)
return false;
@@ -5385,7 +5507,7 @@ int btf_distill_func_proto(struct bpf_verifier_log *log,
}
args = (const struct btf_param *)(func + 1);
nargs = btf_type_vlen(func);
- if (nargs >= MAX_BPF_FUNC_ARGS) {
+ if (nargs > MAX_BPF_FUNC_ARGS) {
bpf_log(log,
"The function %s has %d arguments. Too many.\n",
tname, nargs);
@@ -5616,17 +5738,45 @@ static bool __btf_type_is_scalar_struct(struct bpf_verifier_log *log,
return true;
}
+static bool is_kfunc_arg_mem_size(const struct btf *btf,
+ const struct btf_param *arg,
+ const struct bpf_reg_state *reg)
+{
+ int len, sfx_len = sizeof("__sz") - 1;
+ const struct btf_type *t;
+ const char *param_name;
+
+ t = btf_type_skip_modifiers(btf, arg->type, NULL);
+ if (!btf_type_is_scalar(t) || reg->type != SCALAR_VALUE)
+ return false;
+
+ /* In the future, this can be ported to use BTF tagging */
+ param_name = btf_name_by_offset(btf, arg->name_off);
+ if (str_is_empty(param_name))
+ return false;
+ len = strlen(param_name);
+ if (len < sfx_len)
+ return false;
+ param_name += len - sfx_len;
+ if (strncmp(param_name, "__sz", sfx_len))
+ return false;
+
+ return true;
+}
+
static int btf_check_func_arg_match(struct bpf_verifier_env *env,
const struct btf *btf, u32 func_id,
struct bpf_reg_state *regs,
bool ptr_to_mem_ok)
{
struct bpf_verifier_log *log = &env->log;
+ u32 i, nargs, ref_id, ref_obj_id = 0;
bool is_kfunc = btf_is_kernel(btf);
const char *func_name, *ref_tname;
const struct btf_type *t, *ref_t;
const struct btf_param *args;
- u32 i, nargs, ref_id;
+ int ref_regno = 0, ret;
+ bool rel = false;
t = btf_type_by_id(btf, func_id);
if (!t || !btf_type_is_func(t)) {
@@ -5652,6 +5802,10 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env,
return -EINVAL;
}
+ /* Only kfunc can be release func */
+ if (is_kfunc)
+ rel = btf_kfunc_id_set_contains(btf, resolve_prog_type(env->prog),
+ BTF_KFUNC_TYPE_RELEASE, func_id);
/* check that BTF function arguments match actual types that the
* verifier sees.
*/
@@ -5675,6 +5829,11 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env,
ref_t = btf_type_skip_modifiers(btf, t->type, &ref_id);
ref_tname = btf_name_by_offset(btf, ref_t->name_off);
+
+ ret = check_func_arg_reg_off(env, reg, regno, ARG_DONTCARE, rel);
+ if (ret < 0)
+ return ret;
+
if (btf_get_prog_ctx_type(log, btf, t,
env->prog->type, i)) {
/* If function expects ctx type in BTF check that caller
@@ -5686,8 +5845,6 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env,
i, btf_type_str(t));
return -EINVAL;
}
- if (check_ptr_off_reg(env, reg, regno))
- return -EINVAL;
} else if (is_kfunc && (reg->type == PTR_TO_BTF_ID ||
(reg2btf_ids[base_type(reg->type)] && !type_flag(reg->type)))) {
const struct btf_type *reg_ref_t;
@@ -5705,6 +5862,20 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env,
if (reg->type == PTR_TO_BTF_ID) {
reg_btf = reg->btf;
reg_ref_id = reg->btf_id;
+ /* Ensure only one argument is referenced
+ * PTR_TO_BTF_ID, check_func_arg_reg_off relies
+ * on only one referenced register being allowed
+ * for kfuncs.
+ */
+ if (reg->ref_obj_id) {
+ if (ref_obj_id) {
+ bpf_log(log, "verifier internal error: more than one arg with ref_obj_id R%d %u %u\n",
+ regno, reg->ref_obj_id, ref_obj_id);
+ return -EFAULT;
+ }
+ ref_regno = regno;
+ ref_obj_id = reg->ref_obj_id;
+ }
} else {
reg_btf = btf_vmlinux;
reg_ref_id = *reg2btf_ids[base_type(reg->type)];
@@ -5728,17 +5899,33 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env,
u32 type_size;
if (is_kfunc) {
+ bool arg_mem_size = i + 1 < nargs && is_kfunc_arg_mem_size(btf, &args[i + 1], &regs[regno + 1]);
+
/* Permit pointer to mem, but only when argument
* type is pointer to scalar, or struct composed
* (recursively) of scalars.
+ * When arg_mem_size is true, the pointer can be
+ * void *.
*/
if (!btf_type_is_scalar(ref_t) &&
- !__btf_type_is_scalar_struct(log, btf, ref_t, 0)) {
+ !__btf_type_is_scalar_struct(log, btf, ref_t, 0) &&
+ (arg_mem_size ? !btf_type_is_void(ref_t) : 1)) {
bpf_log(log,
- "arg#%d pointer type %s %s must point to scalar or struct with scalar\n",
- i, btf_type_str(ref_t), ref_tname);
+ "arg#%d pointer type %s %s must point to %sscalar, or struct with scalar\n",
+ i, btf_type_str(ref_t), ref_tname, arg_mem_size ? "void, " : "");
return -EINVAL;
}
+
+ /* Check for mem, len pair */
+ if (arg_mem_size) {
+ if (check_kfunc_mem_size_reg(env, &regs[regno + 1], regno + 1)) {
+ bpf_log(log, "arg#%d arg#%d memory, len pair leads to invalid memory access\n",
+ i, i + 1);
+ return -EINVAL;
+ }
+ i++;
+ continue;
+ }
}
resolve_ret = btf_resolve_size(btf, ref_t, &type_size);
@@ -5759,7 +5946,20 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env,
}
}
- return 0;
+ /* Either both are set, or neither */
+ WARN_ON_ONCE((ref_obj_id && !ref_regno) || (!ref_obj_id && ref_regno));
+ /* We already made sure ref_obj_id is set only for one argument. We do
+ * allow (!rel && ref_obj_id), so that passing such referenced
+ * PTR_TO_BTF_ID to other kfuncs works. Note that rel is only true when
+ * is_kfunc is true.
+ */
+ if (rel && !ref_obj_id) {
+ bpf_log(log, "release kernel function %s expects refcounted PTR_TO_BTF_ID\n",
+ func_name);
+ return -EINVAL;
+ }
+ /* returns argument register number > 0 in case of reference release kfunc */
+ return rel ? ref_regno : 0;
}
/* Compare BTF of a function with given bpf_reg_state.
@@ -6005,7 +6205,7 @@ int btf_type_snprintf_show(const struct btf *btf, u32 type_id, void *obj,
btf_type_show(btf, type_id, obj, (struct btf_show *)&ssnprintf);
- /* If we encontered an error, return it. */
+ /* If we encountered an error, return it. */
if (ssnprintf.show.state.status)
return ssnprintf.show.state.status;
@@ -6201,12 +6401,17 @@ bool btf_id_set_contains(const struct btf_id_set *set, u32 id)
return bsearch(&id, set->ids, set->cnt, sizeof(u32), btf_id_cmp_func) != NULL;
}
+enum {
+ BTF_MODULE_F_LIVE = (1 << 0),
+};
+
#ifdef CONFIG_DEBUG_INFO_BTF_MODULES
struct btf_module {
struct list_head list;
struct module *module;
struct btf *btf;
struct bin_attribute *sysfs_attr;
+ int flags;
};
static LIST_HEAD(btf_modules);
@@ -6234,7 +6439,8 @@ static int btf_module_notify(struct notifier_block *nb, unsigned long op,
int err = 0;
if (mod->btf_data_size == 0 ||
- (op != MODULE_STATE_COMING && op != MODULE_STATE_GOING))
+ (op != MODULE_STATE_COMING && op != MODULE_STATE_LIVE &&
+ op != MODULE_STATE_GOING))
goto out;
switch (op) {
@@ -6249,7 +6455,8 @@ static int btf_module_notify(struct notifier_block *nb, unsigned long op,
pr_warn("failed to validate module [%s] BTF: %ld\n",
mod->name, PTR_ERR(btf));
kfree(btf_mod);
- err = PTR_ERR(btf);
+ if (!IS_ENABLED(CONFIG_MODULE_ALLOW_BTF_MISMATCH))
+ err = PTR_ERR(btf);
goto out;
}
err = btf_alloc_id(btf);
@@ -6293,6 +6500,17 @@ static int btf_module_notify(struct notifier_block *nb, unsigned long op,
}
break;
+ case MODULE_STATE_LIVE:
+ mutex_lock(&btf_module_mutex);
+ list_for_each_entry_safe(btf_mod, tmp, &btf_modules, list) {
+ if (btf_mod->module != module)
+ continue;
+
+ btf_mod->flags |= BTF_MODULE_F_LIVE;
+ break;
+ }
+ mutex_unlock(&btf_module_mutex);
+ break;
case MODULE_STATE_GOING:
mutex_lock(&btf_module_mutex);
list_for_each_entry_safe(btf_mod, tmp, &btf_modules, list) {
@@ -6339,7 +6557,12 @@ struct module *btf_try_get_module(const struct btf *btf)
if (btf_mod->btf != btf)
continue;
- if (try_module_get(btf_mod->module))
+ /* We must only consider module whose __init routine has
+ * finished, hence we must check for BTF_MODULE_F_LIVE flag,
+ * which is set from the notifier callback for
+ * MODULE_STATE_LIVE.
+ */
+ if ((btf_mod->flags & BTF_MODULE_F_LIVE) && try_module_get(btf_mod->module))
res = btf_mod->module;
break;
@@ -6350,9 +6573,43 @@ struct module *btf_try_get_module(const struct btf *btf)
return res;
}
+/* Returns struct btf corresponding to the struct module.
+ * This function can return NULL or ERR_PTR.
+ */
+static struct btf *btf_get_module_btf(const struct module *module)
+{
+#ifdef CONFIG_DEBUG_INFO_BTF_MODULES
+ struct btf_module *btf_mod, *tmp;
+#endif
+ struct btf *btf = NULL;
+
+ if (!module) {
+ btf = bpf_get_btf_vmlinux();
+ if (!IS_ERR_OR_NULL(btf))
+ btf_get(btf);
+ return btf;
+ }
+
+#ifdef CONFIG_DEBUG_INFO_BTF_MODULES
+ mutex_lock(&btf_module_mutex);
+ list_for_each_entry_safe(btf_mod, tmp, &btf_modules, list) {
+ if (btf_mod->module != module)
+ continue;
+
+ btf_get(btf_mod->btf);
+ btf = btf_mod->btf;
+ break;
+ }
+ mutex_unlock(&btf_module_mutex);
+#endif
+
+ return btf;
+}
+
BPF_CALL_4(bpf_btf_find_by_name_kind, char *, name, int, name_sz, u32, kind, int, flags)
{
- struct btf *btf;
+ struct btf *btf = NULL;
+ int btf_obj_fd = 0;
long ret;
if (flags)
@@ -6361,44 +6618,17 @@ BPF_CALL_4(bpf_btf_find_by_name_kind, char *, name, int, name_sz, u32, kind, int
if (name_sz <= 1 || name[name_sz - 1])
return -EINVAL;
- btf = bpf_get_btf_vmlinux();
- if (IS_ERR(btf))
- return PTR_ERR(btf);
-
- ret = btf_find_by_name_kind(btf, name, kind);
- /* ret is never zero, since btf_find_by_name_kind returns
- * positive btf_id or negative error.
- */
- if (ret < 0) {
- struct btf *mod_btf;
- int id;
-
- /* If name is not found in vmlinux's BTF then search in module's BTFs */
- spin_lock_bh(&btf_idr_lock);
- idr_for_each_entry(&btf_idr, mod_btf, id) {
- if (!btf_is_module(mod_btf))
- continue;
- /* linear search could be slow hence unlock/lock
- * the IDR to avoiding holding it for too long
- */
- btf_get(mod_btf);
- spin_unlock_bh(&btf_idr_lock);
- ret = btf_find_by_name_kind(mod_btf, name, kind);
- if (ret > 0) {
- int btf_obj_fd;
-
- btf_obj_fd = __btf_new_fd(mod_btf);
- if (btf_obj_fd < 0) {
- btf_put(mod_btf);
- return btf_obj_fd;
- }
- return ret | (((u64)btf_obj_fd) << 32);
- }
- spin_lock_bh(&btf_idr_lock);
- btf_put(mod_btf);
+ ret = bpf_find_btf_id(name, kind, &btf);
+ if (ret > 0 && btf_is_module(btf)) {
+ btf_obj_fd = __btf_new_fd(btf);
+ if (btf_obj_fd < 0) {
+ btf_put(btf);
+ return btf_obj_fd;
}
- spin_unlock_bh(&btf_idr_lock);
+ return ret | (((u64)btf_obj_fd) << 32);
}
+ if (ret > 0)
+ btf_put(btf);
return ret;
}
@@ -6417,58 +6647,298 @@ BTF_ID_LIST_GLOBAL(btf_tracing_ids, MAX_BTF_TRACING_TYPE)
BTF_TRACING_TYPE_xxx
#undef BTF_TRACING_TYPE
-/* BTF ID set registration API for modules */
+/* Kernel Function (kfunc) BTF ID set registration API */
-#ifdef CONFIG_DEBUG_INFO_BTF_MODULES
+static int __btf_populate_kfunc_set(struct btf *btf, enum btf_kfunc_hook hook,
+ enum btf_kfunc_type type,
+ struct btf_id_set *add_set, bool vmlinux_set)
+{
+ struct btf_kfunc_set_tab *tab;
+ struct btf_id_set *set;
+ u32 set_cnt;
+ int ret;
-void register_kfunc_btf_id_set(struct kfunc_btf_id_list *l,
- struct kfunc_btf_id_set *s)
+ if (hook >= BTF_KFUNC_HOOK_MAX || type >= BTF_KFUNC_TYPE_MAX) {
+ ret = -EINVAL;
+ goto end;
+ }
+
+ if (!add_set->cnt)
+ return 0;
+
+ tab = btf->kfunc_set_tab;
+ if (!tab) {
+ tab = kzalloc(sizeof(*tab), GFP_KERNEL | __GFP_NOWARN);
+ if (!tab)
+ return -ENOMEM;
+ btf->kfunc_set_tab = tab;
+ }
+
+ set = tab->sets[hook][type];
+ /* Warn when register_btf_kfunc_id_set is called twice for the same hook
+ * for module sets.
+ */
+ if (WARN_ON_ONCE(set && !vmlinux_set)) {
+ ret = -EINVAL;
+ goto end;
+ }
+
+ /* We don't need to allocate, concatenate, and sort module sets, because
+ * only one is allowed per hook. Hence, we can directly assign the
+ * pointer and return.
+ */
+ if (!vmlinux_set) {
+ tab->sets[hook][type] = add_set;
+ return 0;
+ }
+
+ /* In case of vmlinux sets, there may be more than one set being
+ * registered per hook. To create a unified set, we allocate a new set
+ * and concatenate all individual sets being registered. While each set
+ * is individually sorted, they may become unsorted when concatenated,
+ * hence re-sorting the final set again is required to make binary
+ * searching the set using btf_id_set_contains function work.
+ */
+ set_cnt = set ? set->cnt : 0;
+
+ if (set_cnt > U32_MAX - add_set->cnt) {
+ ret = -EOVERFLOW;
+ goto end;
+ }
+
+ if (set_cnt + add_set->cnt > BTF_KFUNC_SET_MAX_CNT) {
+ ret = -E2BIG;
+ goto end;
+ }
+
+ /* Grow set */
+ set = krealloc(tab->sets[hook][type],
+ offsetof(struct btf_id_set, ids[set_cnt + add_set->cnt]),
+ GFP_KERNEL | __GFP_NOWARN);
+ if (!set) {
+ ret = -ENOMEM;
+ goto end;
+ }
+
+ /* For newly allocated set, initialize set->cnt to 0 */
+ if (!tab->sets[hook][type])
+ set->cnt = 0;
+ tab->sets[hook][type] = set;
+
+ /* Concatenate the two sets */
+ memcpy(set->ids + set->cnt, add_set->ids, add_set->cnt * sizeof(set->ids[0]));
+ set->cnt += add_set->cnt;
+
+ sort(set->ids, set->cnt, sizeof(set->ids[0]), btf_id_cmp_func, NULL);
+
+ return 0;
+end:
+ btf_free_kfunc_set_tab(btf);
+ return ret;
+}
+
+static int btf_populate_kfunc_set(struct btf *btf, enum btf_kfunc_hook hook,
+ const struct btf_kfunc_id_set *kset)
{
- mutex_lock(&l->mutex);
- list_add(&s->list, &l->list);
- mutex_unlock(&l->mutex);
+ bool vmlinux_set = !btf_is_module(btf);
+ int type, ret = 0;
+
+ for (type = 0; type < ARRAY_SIZE(kset->sets); type++) {
+ if (!kset->sets[type])
+ continue;
+
+ ret = __btf_populate_kfunc_set(btf, hook, type, kset->sets[type], vmlinux_set);
+ if (ret)
+ break;
+ }
+ return ret;
}
-EXPORT_SYMBOL_GPL(register_kfunc_btf_id_set);
-void unregister_kfunc_btf_id_set(struct kfunc_btf_id_list *l,
- struct kfunc_btf_id_set *s)
+static bool __btf_kfunc_id_set_contains(const struct btf *btf,
+ enum btf_kfunc_hook hook,
+ enum btf_kfunc_type type,
+ u32 kfunc_btf_id)
{
- mutex_lock(&l->mutex);
- list_del_init(&s->list);
- mutex_unlock(&l->mutex);
+ struct btf_id_set *set;
+
+ if (hook >= BTF_KFUNC_HOOK_MAX || type >= BTF_KFUNC_TYPE_MAX)
+ return false;
+ if (!btf->kfunc_set_tab)
+ return false;
+ set = btf->kfunc_set_tab->sets[hook][type];
+ if (!set)
+ return false;
+ return btf_id_set_contains(set, kfunc_btf_id);
}
-EXPORT_SYMBOL_GPL(unregister_kfunc_btf_id_set);
-bool bpf_check_mod_kfunc_call(struct kfunc_btf_id_list *klist, u32 kfunc_id,
- struct module *owner)
+static int bpf_prog_type_to_kfunc_hook(enum bpf_prog_type prog_type)
{
- struct kfunc_btf_id_set *s;
+ switch (prog_type) {
+ case BPF_PROG_TYPE_XDP:
+ return BTF_KFUNC_HOOK_XDP;
+ case BPF_PROG_TYPE_SCHED_CLS:
+ return BTF_KFUNC_HOOK_TC;
+ case BPF_PROG_TYPE_STRUCT_OPS:
+ return BTF_KFUNC_HOOK_STRUCT_OPS;
+ default:
+ return BTF_KFUNC_HOOK_MAX;
+ }
+}
- mutex_lock(&klist->mutex);
- list_for_each_entry(s, &klist->list, list) {
- if (s->owner == owner && btf_id_set_contains(s->set, kfunc_id)) {
- mutex_unlock(&klist->mutex);
- return true;
+/* Caution:
+ * Reference to the module (obtained using btf_try_get_module) corresponding to
+ * the struct btf *MUST* be held when calling this function from verifier
+ * context. This is usually true as we stash references in prog's kfunc_btf_tab;
+ * keeping the reference for the duration of the call provides the necessary
+ * protection for looking up a well-formed btf->kfunc_set_tab.
+ */
+bool btf_kfunc_id_set_contains(const struct btf *btf,
+ enum bpf_prog_type prog_type,
+ enum btf_kfunc_type type, u32 kfunc_btf_id)
+{
+ enum btf_kfunc_hook hook;
+
+ hook = bpf_prog_type_to_kfunc_hook(prog_type);
+ return __btf_kfunc_id_set_contains(btf, hook, type, kfunc_btf_id);
+}
+
+/* This function must be invoked only from initcalls/module init functions */
+int register_btf_kfunc_id_set(enum bpf_prog_type prog_type,
+ const struct btf_kfunc_id_set *kset)
+{
+ enum btf_kfunc_hook hook;
+ struct btf *btf;
+ int ret;
+
+ btf = btf_get_module_btf(kset->owner);
+ if (!btf) {
+ if (!kset->owner && IS_ENABLED(CONFIG_DEBUG_INFO_BTF)) {
+ pr_err("missing vmlinux BTF, cannot register kfuncs\n");
+ return -ENOENT;
+ }
+ if (kset->owner && IS_ENABLED(CONFIG_DEBUG_INFO_BTF_MODULES)) {
+ pr_err("missing module BTF, cannot register kfuncs\n");
+ return -ENOENT;
}
+ return 0;
}
- mutex_unlock(&klist->mutex);
- return false;
+ if (IS_ERR(btf))
+ return PTR_ERR(btf);
+
+ hook = bpf_prog_type_to_kfunc_hook(prog_type);
+ ret = btf_populate_kfunc_set(btf, hook, kset);
+ btf_put(btf);
+ return ret;
}
+EXPORT_SYMBOL_GPL(register_btf_kfunc_id_set);
+
+#define MAX_TYPES_ARE_COMPAT_DEPTH 2
-#define DEFINE_KFUNC_BTF_ID_LIST(name) \
- struct kfunc_btf_id_list name = { LIST_HEAD_INIT(name.list), \
- __MUTEX_INITIALIZER(name.mutex) }; \
- EXPORT_SYMBOL_GPL(name)
+static
+int __bpf_core_types_are_compat(const struct btf *local_btf, __u32 local_id,
+ const struct btf *targ_btf, __u32 targ_id,
+ int level)
+{
+ const struct btf_type *local_type, *targ_type;
+ int depth = 32; /* max recursion depth */
-DEFINE_KFUNC_BTF_ID_LIST(bpf_tcp_ca_kfunc_list);
-DEFINE_KFUNC_BTF_ID_LIST(prog_test_kfunc_list);
+ /* caller made sure that names match (ignoring flavor suffix) */
+ local_type = btf_type_by_id(local_btf, local_id);
+ targ_type = btf_type_by_id(targ_btf, targ_id);
+ if (btf_kind(local_type) != btf_kind(targ_type))
+ return 0;
-#endif
+recur:
+ depth--;
+ if (depth < 0)
+ return -EINVAL;
+
+ local_type = btf_type_skip_modifiers(local_btf, local_id, &local_id);
+ targ_type = btf_type_skip_modifiers(targ_btf, targ_id, &targ_id);
+ if (!local_type || !targ_type)
+ return -EINVAL;
+
+ if (btf_kind(local_type) != btf_kind(targ_type))
+ return 0;
+
+ switch (btf_kind(local_type)) {
+ case BTF_KIND_UNKN:
+ case BTF_KIND_STRUCT:
+ case BTF_KIND_UNION:
+ case BTF_KIND_ENUM:
+ case BTF_KIND_FWD:
+ return 1;
+ case BTF_KIND_INT:
+ /* just reject deprecated bitfield-like integers; all other
+ * integers are by default compatible between each other
+ */
+ return btf_int_offset(local_type) == 0 && btf_int_offset(targ_type) == 0;
+ case BTF_KIND_PTR:
+ local_id = local_type->type;
+ targ_id = targ_type->type;
+ goto recur;
+ case BTF_KIND_ARRAY:
+ local_id = btf_array(local_type)->type;
+ targ_id = btf_array(targ_type)->type;
+ goto recur;
+ case BTF_KIND_FUNC_PROTO: {
+ struct btf_param *local_p = btf_params(local_type);
+ struct btf_param *targ_p = btf_params(targ_type);
+ __u16 local_vlen = btf_vlen(local_type);
+ __u16 targ_vlen = btf_vlen(targ_type);
+ int i, err;
+
+ if (local_vlen != targ_vlen)
+ return 0;
+
+ for (i = 0; i < local_vlen; i++, local_p++, targ_p++) {
+ if (level <= 0)
+ return -EINVAL;
+ btf_type_skip_modifiers(local_btf, local_p->type, &local_id);
+ btf_type_skip_modifiers(targ_btf, targ_p->type, &targ_id);
+ err = __bpf_core_types_are_compat(local_btf, local_id,
+ targ_btf, targ_id,
+ level - 1);
+ if (err <= 0)
+ return err;
+ }
+
+ /* tail recurse for return type check */
+ btf_type_skip_modifiers(local_btf, local_type->type, &local_id);
+ btf_type_skip_modifiers(targ_btf, targ_type->type, &targ_id);
+ goto recur;
+ }
+ default:
+ return 0;
+ }
+}
+
+/* Check local and target types for compatibility. This check is used for
+ * type-based CO-RE relocations and follow slightly different rules than
+ * field-based relocations. This function assumes that root types were already
+ * checked for name match. Beyond that initial root-level name check, names
+ * are completely ignored. Compatibility rules are as follows:
+ * - any two STRUCTs/UNIONs/FWDs/ENUMs/INTs are considered compatible, but
+ * kind should match for local and target types (i.e., STRUCT is not
+ * compatible with UNION);
+ * - for ENUMs, the size is ignored;
+ * - for INT, size and signedness are ignored;
+ * - for ARRAY, dimensionality is ignored, element types are checked for
+ * compatibility recursively;
+ * - CONST/VOLATILE/RESTRICT modifiers are ignored;
+ * - TYPEDEFs/PTRs are compatible if types they pointing to are compatible;
+ * - FUNC_PROTOs are compatible if they have compatible signature: same
+ * number of input args and compatible return and argument types.
+ * These rules are not set in stone and probably will be adjusted as we get
+ * more experience with using BPF CO-RE relocations.
+ */
int bpf_core_types_are_compat(const struct btf *local_btf, __u32 local_id,
const struct btf *targ_btf, __u32 targ_id)
{
- return -EOPNOTSUPP;
+ return __bpf_core_types_are_compat(local_btf, local_id,
+ targ_btf, targ_id,
+ MAX_TYPES_ARE_COMPAT_DEPTH);
}
static bool bpf_core_is_flavor_sep(const char *s)
@@ -6711,6 +7181,8 @@ bpf_core_find_cands(struct bpf_core_ctx *ctx, u32 local_type_id)
main_btf = bpf_get_btf_vmlinux();
if (IS_ERR(main_btf))
return ERR_CAST(main_btf);
+ if (!main_btf)
+ return ERR_PTR(-EINVAL);
local_type = btf_type_by_id(local_btf, local_type_id);
if (!local_type)
@@ -6789,6 +7261,7 @@ int bpf_core_apply(struct bpf_core_ctx *ctx, const struct bpf_core_relo *relo,
{
bool need_cands = relo->kind != BPF_CORE_TYPE_ID_LOCAL;
struct bpf_core_cand_list cands = {};
+ struct bpf_core_relo_res targ_res;
struct bpf_core_spec *specs;
int err;
@@ -6828,13 +7301,19 @@ int bpf_core_apply(struct bpf_core_ctx *ctx, const struct bpf_core_relo *relo,
cands.len = cc->cnt;
/* cand_cache_mutex needs to span the cache lookup and
* copy of btf pointer into bpf_core_cand_list,
- * since module can be unloaded while bpf_core_apply_relo_insn
+ * since module can be unloaded while bpf_core_calc_relo_insn
* is working with module's btf.
*/
}
- err = bpf_core_apply_relo_insn((void *)ctx->log, insn, relo->insn_off / 8,
- relo, relo_idx, ctx->btf, &cands, specs);
+ err = bpf_core_calc_relo_insn((void *)ctx->log, relo, relo_idx, ctx->btf, &cands, specs,
+ &targ_res);
+ if (err)
+ goto out;
+
+ err = bpf_core_patch_insn((void *)ctx->log, insn, relo->insn_off / 8, relo, relo_idx,
+ &targ_res);
+
out:
kfree(specs);
if (need_cands) {
diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c
index 514b4681a90a..128028efda64 100644
--- a/kernel/bpf/cgroup.c
+++ b/kernel/bpf/cgroup.c
@@ -1031,7 +1031,7 @@ int cgroup_bpf_prog_query(const union bpf_attr *attr,
* __cgroup_bpf_run_filter_skb() - Run a program for packet filtering
* @sk: The socket sending or receiving traffic
* @skb: The skb that is being sent or received
- * @type: The type of program to be exectuted
+ * @type: The type of program to be executed
*
* If no socket is passed, or the socket is not of type INET or INET6,
* this function does nothing and returns 0.
@@ -1044,7 +1044,7 @@ int cgroup_bpf_prog_query(const union bpf_attr *attr,
* NET_XMIT_DROP (1) - drop packet and notify TCP to call cwr
* NET_XMIT_CN (2) - continue with packet output and notify TCP
* to call cwr
- * -EPERM - drop packet
+ * -err - drop packet
*
* For ingress packets, this function will return -EPERM if any
* attached program was found and if it returned != 1 during execution.
@@ -1079,8 +1079,9 @@ int __cgroup_bpf_run_filter_skb(struct sock *sk,
cgrp->bpf.effective[atype], skb, __bpf_prog_run_save_cb);
} else {
ret = BPF_PROG_RUN_ARRAY_CG(cgrp->bpf.effective[atype], skb,
- __bpf_prog_run_save_cb);
- ret = (ret == 1 ? 0 : -EPERM);
+ __bpf_prog_run_save_cb, 0);
+ if (ret && !IS_ERR_VALUE((long)ret))
+ ret = -EFAULT;
}
bpf_restore_data_end(skb, saved_data_end);
__skb_pull(skb, offset);
@@ -1093,7 +1094,7 @@ EXPORT_SYMBOL(__cgroup_bpf_run_filter_skb);
/**
* __cgroup_bpf_run_filter_sk() - Run a program on a sock
* @sk: sock structure to manipulate
- * @type: The type of program to be exectuted
+ * @type: The type of program to be executed
*
* socket is passed is expected to be of type INET or INET6.
*
@@ -1107,10 +1108,9 @@ int __cgroup_bpf_run_filter_sk(struct sock *sk,
enum cgroup_bpf_attach_type atype)
{
struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
- int ret;
- ret = BPF_PROG_RUN_ARRAY_CG(cgrp->bpf.effective[atype], sk, bpf_prog_run);
- return ret == 1 ? 0 : -EPERM;
+ return BPF_PROG_RUN_ARRAY_CG(cgrp->bpf.effective[atype], sk,
+ bpf_prog_run, 0);
}
EXPORT_SYMBOL(__cgroup_bpf_run_filter_sk);
@@ -1119,7 +1119,7 @@ EXPORT_SYMBOL(__cgroup_bpf_run_filter_sk);
* provided by user sockaddr
* @sk: sock struct that will use sockaddr
* @uaddr: sockaddr struct provided by user
- * @type: The type of program to be exectuted
+ * @type: The type of program to be executed
* @t_ctx: Pointer to attach type specific context
* @flags: Pointer to u32 which contains higher bits of BPF program
* return value (OR'ed together).
@@ -1142,7 +1142,6 @@ int __cgroup_bpf_run_filter_sock_addr(struct sock *sk,
};
struct sockaddr_storage unspec;
struct cgroup *cgrp;
- int ret;
/* Check socket family since not all sockets represent network
* endpoint (e.g. AF_UNIX).
@@ -1156,10 +1155,8 @@ int __cgroup_bpf_run_filter_sock_addr(struct sock *sk,
}
cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
- ret = BPF_PROG_RUN_ARRAY_CG_FLAGS(cgrp->bpf.effective[atype], &ctx,
- bpf_prog_run, flags);
-
- return ret == 1 ? 0 : -EPERM;
+ return BPF_PROG_RUN_ARRAY_CG_FLAGS(cgrp->bpf.effective[atype], &ctx,
+ bpf_prog_run, 0, flags);
}
EXPORT_SYMBOL(__cgroup_bpf_run_filter_sock_addr);
@@ -1169,7 +1166,7 @@ EXPORT_SYMBOL(__cgroup_bpf_run_filter_sock_addr);
* @sock_ops: bpf_sock_ops_kern struct to pass to program. Contains
* sk with connection information (IP addresses, etc.) May not contain
* cgroup info if it is a req sock.
- * @type: The type of program to be exectuted
+ * @type: The type of program to be executed
*
* socket passed is expected to be of type INET or INET6.
*
@@ -1184,11 +1181,9 @@ int __cgroup_bpf_run_filter_sock_ops(struct sock *sk,
enum cgroup_bpf_attach_type atype)
{
struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
- int ret;
- ret = BPF_PROG_RUN_ARRAY_CG(cgrp->bpf.effective[atype], sock_ops,
- bpf_prog_run);
- return ret == 1 ? 0 : -EPERM;
+ return BPF_PROG_RUN_ARRAY_CG(cgrp->bpf.effective[atype], sock_ops,
+ bpf_prog_run, 0);
}
EXPORT_SYMBOL(__cgroup_bpf_run_filter_sock_ops);
@@ -1201,17 +1196,47 @@ int __cgroup_bpf_check_dev_permission(short dev_type, u32 major, u32 minor,
.major = major,
.minor = minor,
};
- int allow;
+ int ret;
rcu_read_lock();
cgrp = task_dfl_cgroup(current);
- allow = BPF_PROG_RUN_ARRAY_CG(cgrp->bpf.effective[atype], &ctx,
- bpf_prog_run);
+ ret = BPF_PROG_RUN_ARRAY_CG(cgrp->bpf.effective[atype], &ctx,
+ bpf_prog_run, 0);
rcu_read_unlock();
- return !allow;
+ return ret;
+}
+
+BPF_CALL_0(bpf_get_retval)
+{
+ struct bpf_cg_run_ctx *ctx =
+ container_of(current->bpf_ctx, struct bpf_cg_run_ctx, run_ctx);
+
+ return ctx->retval;
+}
+
+static const struct bpf_func_proto bpf_get_retval_proto = {
+ .func = bpf_get_retval,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+};
+
+BPF_CALL_1(bpf_set_retval, int, retval)
+{
+ struct bpf_cg_run_ctx *ctx =
+ container_of(current->bpf_ctx, struct bpf_cg_run_ctx, run_ctx);
+
+ ctx->retval = retval;
+ return 0;
}
+static const struct bpf_func_proto bpf_set_retval_proto = {
+ .func = bpf_set_retval,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_ANYTHING,
+};
+
static const struct bpf_func_proto *
cgroup_base_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
{
@@ -1224,6 +1249,10 @@ cgroup_base_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
return &bpf_get_current_cgroup_id_proto;
case BPF_FUNC_perf_event_output:
return &bpf_event_output_data_proto;
+ case BPF_FUNC_get_retval:
+ return &bpf_get_retval_proto;
+ case BPF_FUNC_set_retval:
+ return &bpf_set_retval_proto;
default:
return bpf_base_func_proto(func_id);
}
@@ -1337,7 +1366,8 @@ int __cgroup_bpf_run_filter_sysctl(struct ctl_table_header *head,
rcu_read_lock();
cgrp = task_dfl_cgroup(current);
- ret = BPF_PROG_RUN_ARRAY_CG(cgrp->bpf.effective[atype], &ctx, bpf_prog_run);
+ ret = BPF_PROG_RUN_ARRAY_CG(cgrp->bpf.effective[atype], &ctx,
+ bpf_prog_run, 0);
rcu_read_unlock();
kfree(ctx.cur_val);
@@ -1350,24 +1380,10 @@ int __cgroup_bpf_run_filter_sysctl(struct ctl_table_header *head,
kfree(ctx.new_val);
}
- return ret == 1 ? 0 : -EPERM;
+ return ret;
}
#ifdef CONFIG_NET
-static bool __cgroup_bpf_prog_array_is_empty(struct cgroup *cgrp,
- enum cgroup_bpf_attach_type attach_type)
-{
- struct bpf_prog_array *prog_array;
- bool empty;
-
- rcu_read_lock();
- prog_array = rcu_dereference(cgrp->bpf.effective[attach_type]);
- empty = bpf_prog_array_is_empty(prog_array);
- rcu_read_unlock();
-
- return empty;
-}
-
static int sockopt_alloc_buf(struct bpf_sockopt_kern *ctx, int max_optlen,
struct bpf_sockopt_buf *buf)
{
@@ -1426,19 +1442,11 @@ int __cgroup_bpf_run_filter_setsockopt(struct sock *sk, int *level,
};
int ret, max_optlen;
- /* Opportunistic check to see whether we have any BPF program
- * attached to the hook so we don't waste time allocating
- * memory and locking the socket.
- */
- if (__cgroup_bpf_prog_array_is_empty(cgrp, CGROUP_SETSOCKOPT))
- return 0;
-
/* Allocate a bit more than the initial user buffer for
* BPF program. The canonical use case is overriding
* TCP_CONGESTION(nv) to TCP_CONGESTION(cubic).
*/
max_optlen = max_t(int, 16, *optlen);
-
max_optlen = sockopt_alloc_buf(&ctx, max_optlen, &buf);
if (max_optlen < 0)
return max_optlen;
@@ -1452,13 +1460,11 @@ int __cgroup_bpf_run_filter_setsockopt(struct sock *sk, int *level,
lock_sock(sk);
ret = BPF_PROG_RUN_ARRAY_CG(cgrp->bpf.effective[CGROUP_SETSOCKOPT],
- &ctx, bpf_prog_run);
+ &ctx, bpf_prog_run, 0);
release_sock(sk);
- if (!ret) {
- ret = -EPERM;
+ if (ret)
goto out;
- }
if (ctx.optlen == -1) {
/* optlen set to -1, bypass kernel */
@@ -1518,19 +1524,11 @@ int __cgroup_bpf_run_filter_getsockopt(struct sock *sk, int level,
.sk = sk,
.level = level,
.optname = optname,
- .retval = retval,
+ .current_task = current,
};
int ret;
- /* Opportunistic check to see whether we have any BPF program
- * attached to the hook so we don't waste time allocating
- * memory and locking the socket.
- */
- if (__cgroup_bpf_prog_array_is_empty(cgrp, CGROUP_GETSOCKOPT))
- return retval;
-
ctx.optlen = max_optlen;
-
max_optlen = sockopt_alloc_buf(&ctx, max_optlen, &buf);
if (max_optlen < 0)
return max_optlen;
@@ -1562,27 +1560,17 @@ int __cgroup_bpf_run_filter_getsockopt(struct sock *sk, int level,
lock_sock(sk);
ret = BPF_PROG_RUN_ARRAY_CG(cgrp->bpf.effective[CGROUP_GETSOCKOPT],
- &ctx, bpf_prog_run);
+ &ctx, bpf_prog_run, retval);
release_sock(sk);
- if (!ret) {
- ret = -EPERM;
+ if (ret < 0)
goto out;
- }
if (ctx.optlen > max_optlen || ctx.optlen < 0) {
ret = -EFAULT;
goto out;
}
- /* BPF programs only allowed to set retval to 0, not some
- * arbitrary value.
- */
- if (ctx.retval != 0 && ctx.retval != retval) {
- ret = -EFAULT;
- goto out;
- }
-
if (ctx.optlen != 0) {
if (copy_to_user(optval, ctx.optval, ctx.optlen) ||
put_user(ctx.optlen, optlen)) {
@@ -1591,8 +1579,6 @@ int __cgroup_bpf_run_filter_getsockopt(struct sock *sk, int level,
}
}
- ret = ctx.retval;
-
out:
sockopt_free_buf(&ctx, &buf);
return ret;
@@ -1607,10 +1593,10 @@ int __cgroup_bpf_run_filter_getsockopt_kern(struct sock *sk, int level,
.sk = sk,
.level = level,
.optname = optname,
- .retval = retval,
.optlen = *optlen,
.optval = optval,
.optval_end = optval + *optlen,
+ .current_task = current,
};
int ret;
@@ -1623,25 +1609,19 @@ int __cgroup_bpf_run_filter_getsockopt_kern(struct sock *sk, int level,
*/
ret = BPF_PROG_RUN_ARRAY_CG(cgrp->bpf.effective[CGROUP_GETSOCKOPT],
- &ctx, bpf_prog_run);
- if (!ret)
- return -EPERM;
+ &ctx, bpf_prog_run, retval);
+ if (ret < 0)
+ return ret;
if (ctx.optlen > *optlen)
return -EFAULT;
- /* BPF programs only allowed to set retval to 0, not some
- * arbitrary value.
- */
- if (ctx.retval != 0 && ctx.retval != retval)
- return -EFAULT;
-
/* BPF programs can shrink the buffer, export the modifications.
*/
if (ctx.optlen != 0)
*optlen = ctx.optlen;
- return ctx.retval;
+ return ret;
}
#endif
@@ -2057,10 +2037,39 @@ static u32 cg_sockopt_convert_ctx_access(enum bpf_access_type type,
*insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_LDX_MEM, optlen);
break;
case offsetof(struct bpf_sockopt, retval):
- if (type == BPF_WRITE)
- *insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_STX_MEM, retval);
- else
- *insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_LDX_MEM, retval);
+ BUILD_BUG_ON(offsetof(struct bpf_cg_run_ctx, run_ctx) != 0);
+
+ if (type == BPF_WRITE) {
+ int treg = BPF_REG_9;
+
+ if (si->src_reg == treg || si->dst_reg == treg)
+ --treg;
+ if (si->src_reg == treg || si->dst_reg == treg)
+ --treg;
+ *insn++ = BPF_STX_MEM(BPF_DW, si->dst_reg, treg,
+ offsetof(struct bpf_sockopt_kern, tmp_reg));
+ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_sockopt_kern, current_task),
+ treg, si->dst_reg,
+ offsetof(struct bpf_sockopt_kern, current_task));
+ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct task_struct, bpf_ctx),
+ treg, treg,
+ offsetof(struct task_struct, bpf_ctx));
+ *insn++ = BPF_STX_MEM(BPF_FIELD_SIZEOF(struct bpf_cg_run_ctx, retval),
+ treg, si->src_reg,
+ offsetof(struct bpf_cg_run_ctx, retval));
+ *insn++ = BPF_LDX_MEM(BPF_DW, treg, si->dst_reg,
+ offsetof(struct bpf_sockopt_kern, tmp_reg));
+ } else {
+ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_sockopt_kern, current_task),
+ si->dst_reg, si->src_reg,
+ offsetof(struct bpf_sockopt_kern, current_task));
+ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct task_struct, bpf_ctx),
+ si->dst_reg, si->dst_reg,
+ offsetof(struct task_struct, bpf_ctx));
+ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_cg_run_ctx, retval),
+ si->dst_reg, si->dst_reg,
+ offsetof(struct bpf_cg_run_ctx, retval));
+ }
break;
case offsetof(struct bpf_sockopt, optval):
*insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_LDX_MEM, optval);
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index de3e5bc6781f..13e9dbeeedf3 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -33,6 +33,7 @@
#include <linux/extable.h>
#include <linux/log2.h>
#include <linux/bpf_verifier.h>
+#include <linux/nodemask.h>
#include <asm/barrier.h>
#include <asm/unaligned.h>
@@ -105,6 +106,7 @@ struct bpf_prog *bpf_prog_alloc_no_stats(unsigned int size, gfp_t gfp_extra_flag
fp->aux = aux;
fp->aux->prog = fp;
fp->jit_requested = ebpf_jit_enabled();
+ fp->blinding_requested = bpf_jit_blinding_enabled(fp);
INIT_LIST_HEAD_RCU(&fp->aux->ksym.lnode);
mutex_init(&fp->aux->used_maps_mutex);
@@ -537,13 +539,10 @@ long bpf_jit_limit_max __read_mostly;
static void
bpf_prog_ksym_set_addr(struct bpf_prog *prog)
{
- const struct bpf_binary_header *hdr = bpf_jit_binary_hdr(prog);
- unsigned long addr = (unsigned long)hdr;
-
WARN_ON_ONCE(!bpf_prog_ebpf_jited(prog));
prog->aux->ksym.start = (unsigned long) prog->bpf_func;
- prog->aux->ksym.end = addr + hdr->pages * PAGE_SIZE;
+ prog->aux->ksym.end = prog->aux->ksym.start + prog->jited_len;
}
static void
@@ -808,6 +807,176 @@ int bpf_jit_add_poke_descriptor(struct bpf_prog *prog,
return slot;
}
+/*
+ * BPF program pack allocator.
+ *
+ * Most BPF programs are pretty small. Allocating a hole page for each
+ * program is sometime a waste. Many small bpf program also adds pressure
+ * to instruction TLB. To solve this issue, we introduce a BPF program pack
+ * allocator. The prog_pack allocator uses HPAGE_PMD_SIZE page (2MB on x86)
+ * to host BPF programs.
+ */
+#define BPF_PROG_CHUNK_SHIFT 6
+#define BPF_PROG_CHUNK_SIZE (1 << BPF_PROG_CHUNK_SHIFT)
+#define BPF_PROG_CHUNK_MASK (~(BPF_PROG_CHUNK_SIZE - 1))
+
+struct bpf_prog_pack {
+ struct list_head list;
+ void *ptr;
+ unsigned long bitmap[];
+};
+
+#define BPF_PROG_SIZE_TO_NBITS(size) (round_up(size, BPF_PROG_CHUNK_SIZE) / BPF_PROG_CHUNK_SIZE)
+
+static size_t bpf_prog_pack_size = -1;
+static size_t bpf_prog_pack_mask = -1;
+
+static int bpf_prog_chunk_count(void)
+{
+ WARN_ON_ONCE(bpf_prog_pack_size == -1);
+ return bpf_prog_pack_size / BPF_PROG_CHUNK_SIZE;
+}
+
+static DEFINE_MUTEX(pack_mutex);
+static LIST_HEAD(pack_list);
+
+/* PMD_SIZE is not available in some special config, e.g. ARCH=arm with
+ * CONFIG_MMU=n. Use PAGE_SIZE in these cases.
+ */
+#ifdef PMD_SIZE
+#define BPF_HPAGE_SIZE PMD_SIZE
+#define BPF_HPAGE_MASK PMD_MASK
+#else
+#define BPF_HPAGE_SIZE PAGE_SIZE
+#define BPF_HPAGE_MASK PAGE_MASK
+#endif
+
+static size_t select_bpf_prog_pack_size(void)
+{
+ size_t size;
+ void *ptr;
+
+ size = BPF_HPAGE_SIZE * num_online_nodes();
+ ptr = module_alloc(size);
+
+ /* Test whether we can get huge pages. If not just use PAGE_SIZE
+ * packs.
+ */
+ if (!ptr || !is_vm_area_hugepages(ptr)) {
+ size = PAGE_SIZE;
+ bpf_prog_pack_mask = PAGE_MASK;
+ } else {
+ bpf_prog_pack_mask = BPF_HPAGE_MASK;
+ }
+
+ vfree(ptr);
+ return size;
+}
+
+static struct bpf_prog_pack *alloc_new_pack(void)
+{
+ struct bpf_prog_pack *pack;
+
+ pack = kzalloc(struct_size(pack, bitmap, BITS_TO_LONGS(bpf_prog_chunk_count())),
+ GFP_KERNEL);
+ if (!pack)
+ return NULL;
+ pack->ptr = module_alloc(bpf_prog_pack_size);
+ if (!pack->ptr) {
+ kfree(pack);
+ return NULL;
+ }
+ bitmap_zero(pack->bitmap, bpf_prog_pack_size / BPF_PROG_CHUNK_SIZE);
+ list_add_tail(&pack->list, &pack_list);
+
+ set_vm_flush_reset_perms(pack->ptr);
+ set_memory_ro((unsigned long)pack->ptr, bpf_prog_pack_size / PAGE_SIZE);
+ set_memory_x((unsigned long)pack->ptr, bpf_prog_pack_size / PAGE_SIZE);
+ return pack;
+}
+
+static void *bpf_prog_pack_alloc(u32 size)
+{
+ unsigned int nbits = BPF_PROG_SIZE_TO_NBITS(size);
+ struct bpf_prog_pack *pack;
+ unsigned long pos;
+ void *ptr = NULL;
+
+ mutex_lock(&pack_mutex);
+ if (bpf_prog_pack_size == -1)
+ bpf_prog_pack_size = select_bpf_prog_pack_size();
+
+ if (size > bpf_prog_pack_size) {
+ size = round_up(size, PAGE_SIZE);
+ ptr = module_alloc(size);
+ if (ptr) {
+ set_vm_flush_reset_perms(ptr);
+ set_memory_ro((unsigned long)ptr, size / PAGE_SIZE);
+ set_memory_x((unsigned long)ptr, size / PAGE_SIZE);
+ }
+ goto out;
+ }
+ list_for_each_entry(pack, &pack_list, list) {
+ pos = bitmap_find_next_zero_area(pack->bitmap, bpf_prog_chunk_count(), 0,
+ nbits, 0);
+ if (pos < bpf_prog_chunk_count())
+ goto found_free_area;
+ }
+
+ pack = alloc_new_pack();
+ if (!pack)
+ goto out;
+
+ pos = 0;
+
+found_free_area:
+ bitmap_set(pack->bitmap, pos, nbits);
+ ptr = (void *)(pack->ptr) + (pos << BPF_PROG_CHUNK_SHIFT);
+
+out:
+ mutex_unlock(&pack_mutex);
+ return ptr;
+}
+
+static void bpf_prog_pack_free(struct bpf_binary_header *hdr)
+{
+ struct bpf_prog_pack *pack = NULL, *tmp;
+ unsigned int nbits;
+ unsigned long pos;
+ void *pack_ptr;
+
+ mutex_lock(&pack_mutex);
+ if (hdr->size > bpf_prog_pack_size) {
+ module_memfree(hdr);
+ goto out;
+ }
+
+ pack_ptr = (void *)((unsigned long)hdr & bpf_prog_pack_mask);
+
+ list_for_each_entry(tmp, &pack_list, list) {
+ if (tmp->ptr == pack_ptr) {
+ pack = tmp;
+ break;
+ }
+ }
+
+ if (WARN_ONCE(!pack, "bpf_prog_pack bug\n"))
+ goto out;
+
+ nbits = BPF_PROG_SIZE_TO_NBITS(hdr->size);
+ pos = ((unsigned long)hdr - (unsigned long)pack_ptr) >> BPF_PROG_CHUNK_SHIFT;
+
+ bitmap_clear(pack->bitmap, pos, nbits);
+ if (bitmap_find_next_zero_area(pack->bitmap, bpf_prog_chunk_count(), 0,
+ bpf_prog_chunk_count(), 0) == 0) {
+ list_del(&pack->list);
+ module_memfree(pack->ptr);
+ kfree(pack);
+ }
+out:
+ mutex_unlock(&pack_mutex);
+}
+
static atomic_long_t bpf_jit_current;
/* Can be overridden by an arch's JIT compiler if it has a custom,
@@ -833,12 +1002,11 @@ static int __init bpf_jit_charge_init(void)
}
pure_initcall(bpf_jit_charge_init);
-int bpf_jit_charge_modmem(u32 pages)
+int bpf_jit_charge_modmem(u32 size)
{
- if (atomic_long_add_return(pages, &bpf_jit_current) >
- (bpf_jit_limit >> PAGE_SHIFT)) {
+ if (atomic_long_add_return(size, &bpf_jit_current) > bpf_jit_limit) {
if (!bpf_capable()) {
- atomic_long_sub(pages, &bpf_jit_current);
+ atomic_long_sub(size, &bpf_jit_current);
return -EPERM;
}
}
@@ -846,9 +1014,9 @@ int bpf_jit_charge_modmem(u32 pages)
return 0;
}
-void bpf_jit_uncharge_modmem(u32 pages)
+void bpf_jit_uncharge_modmem(u32 size)
{
- atomic_long_sub(pages, &bpf_jit_current);
+ atomic_long_sub(size, &bpf_jit_current);
}
void *__weak bpf_jit_alloc_exec(unsigned long size)
@@ -867,7 +1035,7 @@ bpf_jit_binary_alloc(unsigned int proglen, u8 **image_ptr,
bpf_jit_fill_hole_t bpf_fill_ill_insns)
{
struct bpf_binary_header *hdr;
- u32 size, hole, start, pages;
+ u32 size, hole, start;
WARN_ON_ONCE(!is_power_of_2(alignment) ||
alignment > BPF_IMAGE_ALIGNMENT);
@@ -877,20 +1045,19 @@ bpf_jit_binary_alloc(unsigned int proglen, u8 **image_ptr,
* random section of illegal instructions.
*/
size = round_up(proglen + sizeof(*hdr) + 128, PAGE_SIZE);
- pages = size / PAGE_SIZE;
- if (bpf_jit_charge_modmem(pages))
+ if (bpf_jit_charge_modmem(size))
return NULL;
hdr = bpf_jit_alloc_exec(size);
if (!hdr) {
- bpf_jit_uncharge_modmem(pages);
+ bpf_jit_uncharge_modmem(size);
return NULL;
}
/* Fill space with illegal/arch-dep instructions. */
bpf_fill_ill_insns(hdr, size);
- hdr->pages = pages;
+ hdr->size = size;
hole = min_t(unsigned int, size - (proglen + sizeof(*hdr)),
PAGE_SIZE - sizeof(*hdr));
start = (get_random_int() % hole) & ~(alignment - 1);
@@ -903,10 +1070,117 @@ bpf_jit_binary_alloc(unsigned int proglen, u8 **image_ptr,
void bpf_jit_binary_free(struct bpf_binary_header *hdr)
{
- u32 pages = hdr->pages;
+ u32 size = hdr->size;
bpf_jit_free_exec(hdr);
- bpf_jit_uncharge_modmem(pages);
+ bpf_jit_uncharge_modmem(size);
+}
+
+/* Allocate jit binary from bpf_prog_pack allocator.
+ * Since the allocated memory is RO+X, the JIT engine cannot write directly
+ * to the memory. To solve this problem, a RW buffer is also allocated at
+ * as the same time. The JIT engine should calculate offsets based on the
+ * RO memory address, but write JITed program to the RW buffer. Once the
+ * JIT engine finishes, it calls bpf_jit_binary_pack_finalize, which copies
+ * the JITed program to the RO memory.
+ */
+struct bpf_binary_header *
+bpf_jit_binary_pack_alloc(unsigned int proglen, u8 **image_ptr,
+ unsigned int alignment,
+ struct bpf_binary_header **rw_header,
+ u8 **rw_image,
+ bpf_jit_fill_hole_t bpf_fill_ill_insns)
+{
+ struct bpf_binary_header *ro_header;
+ u32 size, hole, start;
+
+ WARN_ON_ONCE(!is_power_of_2(alignment) ||
+ alignment > BPF_IMAGE_ALIGNMENT);
+
+ /* add 16 bytes for a random section of illegal instructions */
+ size = round_up(proglen + sizeof(*ro_header) + 16, BPF_PROG_CHUNK_SIZE);
+
+ if (bpf_jit_charge_modmem(size))
+ return NULL;
+ ro_header = bpf_prog_pack_alloc(size);
+ if (!ro_header) {
+ bpf_jit_uncharge_modmem(size);
+ return NULL;
+ }
+
+ *rw_header = kvmalloc(size, GFP_KERNEL);
+ if (!*rw_header) {
+ bpf_arch_text_copy(&ro_header->size, &size, sizeof(size));
+ bpf_prog_pack_free(ro_header);
+ bpf_jit_uncharge_modmem(size);
+ return NULL;
+ }
+
+ /* Fill space with illegal/arch-dep instructions. */
+ bpf_fill_ill_insns(*rw_header, size);
+ (*rw_header)->size = size;
+
+ hole = min_t(unsigned int, size - (proglen + sizeof(*ro_header)),
+ BPF_PROG_CHUNK_SIZE - sizeof(*ro_header));
+ start = (get_random_int() % hole) & ~(alignment - 1);
+
+ *image_ptr = &ro_header->image[start];
+ *rw_image = &(*rw_header)->image[start];
+
+ return ro_header;
+}
+
+/* Copy JITed text from rw_header to its final location, the ro_header. */
+int bpf_jit_binary_pack_finalize(struct bpf_prog *prog,
+ struct bpf_binary_header *ro_header,
+ struct bpf_binary_header *rw_header)
+{
+ void *ptr;
+
+ ptr = bpf_arch_text_copy(ro_header, rw_header, rw_header->size);
+
+ kvfree(rw_header);
+
+ if (IS_ERR(ptr)) {
+ bpf_prog_pack_free(ro_header);
+ return PTR_ERR(ptr);
+ }
+ prog->aux->use_bpf_prog_pack = true;
+ return 0;
+}
+
+/* bpf_jit_binary_pack_free is called in two different scenarios:
+ * 1) when the program is freed after;
+ * 2) when the JIT engine fails (before bpf_jit_binary_pack_finalize).
+ * For case 2), we need to free both the RO memory and the RW buffer.
+ *
+ * bpf_jit_binary_pack_free requires proper ro_header->size. However,
+ * bpf_jit_binary_pack_alloc does not set it. Therefore, ro_header->size
+ * must be set with either bpf_jit_binary_pack_finalize (normal path) or
+ * bpf_arch_text_copy (when jit fails).
+ */
+void bpf_jit_binary_pack_free(struct bpf_binary_header *ro_header,
+ struct bpf_binary_header *rw_header)
+{
+ u32 size = ro_header->size;
+
+ bpf_prog_pack_free(ro_header);
+ kvfree(rw_header);
+ bpf_jit_uncharge_modmem(size);
+}
+
+static inline struct bpf_binary_header *
+bpf_jit_binary_hdr(const struct bpf_prog *fp)
+{
+ unsigned long real_start = (unsigned long)fp->bpf_func;
+ unsigned long addr;
+
+ if (fp->aux->use_bpf_prog_pack)
+ addr = real_start & BPF_PROG_CHUNK_MASK;
+ else
+ addr = real_start & PAGE_MASK;
+
+ return (void *)addr;
}
/* This symbol is only overridden by archs that have different
@@ -918,7 +1192,10 @@ void __weak bpf_jit_free(struct bpf_prog *fp)
if (fp->jited) {
struct bpf_binary_header *hdr = bpf_jit_binary_hdr(fp);
- bpf_jit_binary_free(hdr);
+ if (fp->aux->use_bpf_prog_pack)
+ bpf_jit_binary_pack_free(hdr, NULL /* rw_buffer */);
+ else
+ bpf_jit_binary_free(hdr);
WARN_ON_ONCE(!bpf_prog_kallsyms_verify_off(fp));
}
@@ -1146,7 +1423,7 @@ struct bpf_prog *bpf_jit_blind_constants(struct bpf_prog *prog)
struct bpf_insn *insn;
int i, rewritten;
- if (!bpf_jit_blinding_enabled(prog) || prog->blinded)
+ if (!prog->blinding_requested || prog->blinded)
return prog;
clone = bpf_prog_clone_create(prog, GFP_USER);
@@ -1829,28 +2106,30 @@ static unsigned int __bpf_prog_ret0_warn(const void *ctx,
}
#endif
-bool bpf_prog_array_compatible(struct bpf_array *array,
- const struct bpf_prog *fp)
+bool bpf_prog_map_compatible(struct bpf_map *map,
+ const struct bpf_prog *fp)
{
bool ret;
if (fp->kprobe_override)
return false;
- spin_lock(&array->aux->owner.lock);
-
- if (!array->aux->owner.type) {
+ spin_lock(&map->owner.lock);
+ if (!map->owner.type) {
/* There's no owner yet where we could check for
* compatibility.
*/
- array->aux->owner.type = fp->type;
- array->aux->owner.jited = fp->jited;
+ map->owner.type = fp->type;
+ map->owner.jited = fp->jited;
+ map->owner.xdp_has_frags = fp->aux->xdp_has_frags;
ret = true;
} else {
- ret = array->aux->owner.type == fp->type &&
- array->aux->owner.jited == fp->jited;
+ ret = map->owner.type == fp->type &&
+ map->owner.jited == fp->jited &&
+ map->owner.xdp_has_frags == fp->aux->xdp_has_frags;
}
- spin_unlock(&array->aux->owner.lock);
+ spin_unlock(&map->owner.lock);
+
return ret;
}
@@ -1862,13 +2141,11 @@ static int bpf_check_tail_call(const struct bpf_prog *fp)
mutex_lock(&aux->used_maps_mutex);
for (i = 0; i < aux->used_map_cnt; i++) {
struct bpf_map *map = aux->used_maps[i];
- struct bpf_array *array;
- if (map->map_type != BPF_MAP_TYPE_PROG_ARRAY)
+ if (!map_type_contains_progs(map))
continue;
- array = container_of(map, struct bpf_array, map);
- if (!bpf_prog_array_compatible(array, fp)) {
+ if (!bpf_prog_map_compatible(map, fp)) {
ret = -EINVAL;
goto out;
}
@@ -1968,18 +2245,10 @@ static struct bpf_prog_dummy {
},
};
-/* to avoid allocating empty bpf_prog_array for cgroups that
- * don't have bpf program attached use one global 'empty_prog_array'
- * It will not be modified the caller of bpf_prog_array_alloc()
- * (since caller requested prog_cnt == 0)
- * that pointer should be 'freed' by bpf_prog_array_free()
- */
-static struct {
- struct bpf_prog_array hdr;
- struct bpf_prog *null_prog;
-} empty_prog_array = {
+struct bpf_empty_prog_array bpf_empty_prog_array = {
.null_prog = NULL,
};
+EXPORT_SYMBOL(bpf_empty_prog_array);
struct bpf_prog_array *bpf_prog_array_alloc(u32 prog_cnt, gfp_t flags)
{
@@ -1989,12 +2258,12 @@ struct bpf_prog_array *bpf_prog_array_alloc(u32 prog_cnt, gfp_t flags)
(prog_cnt + 1),
flags);
- return &empty_prog_array.hdr;
+ return &bpf_empty_prog_array.hdr;
}
void bpf_prog_array_free(struct bpf_prog_array *progs)
{
- if (!progs || progs == &empty_prog_array.hdr)
+ if (!progs || progs == &bpf_empty_prog_array.hdr)
return;
kfree_rcu(progs, rcu);
}
@@ -2453,6 +2722,11 @@ int __weak bpf_arch_text_poke(void *ip, enum bpf_text_poke_type t,
return -ENOTSUPP;
}
+void * __weak bpf_arch_text_copy(void *dst, void *src, size_t len)
+{
+ return ERR_PTR(-ENOTSUPP);
+}
+
DEFINE_STATIC_KEY_FALSE(bpf_stats_enabled_key);
EXPORT_SYMBOL(bpf_stats_enabled_key);
diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c
index b3e6b9422238..650e5d21f90d 100644
--- a/kernel/bpf/cpumap.c
+++ b/kernel/bpf/cpumap.c
@@ -397,7 +397,8 @@ static int cpu_map_kthread_run(void *data)
return 0;
}
-static int __cpu_map_load_bpf_program(struct bpf_cpu_map_entry *rcpu, int fd)
+static int __cpu_map_load_bpf_program(struct bpf_cpu_map_entry *rcpu,
+ struct bpf_map *map, int fd)
{
struct bpf_prog *prog;
@@ -405,7 +406,8 @@ static int __cpu_map_load_bpf_program(struct bpf_cpu_map_entry *rcpu, int fd)
if (IS_ERR(prog))
return PTR_ERR(prog);
- if (prog->expected_attach_type != BPF_XDP_CPUMAP) {
+ if (prog->expected_attach_type != BPF_XDP_CPUMAP ||
+ !bpf_prog_map_compatible(map, prog)) {
bpf_prog_put(prog);
return -EINVAL;
}
@@ -457,7 +459,7 @@ __cpu_map_entry_alloc(struct bpf_map *map, struct bpf_cpumap_val *value,
rcpu->map_id = map->id;
rcpu->value.qsize = value->qsize;
- if (fd > 0 && __cpu_map_load_bpf_program(rcpu, fd))
+ if (fd > 0 && __cpu_map_load_bpf_program(rcpu, map, fd))
goto free_ptr_ring;
/* Setup kthread */
diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c
index fe019dbdb3f0..038f6d7a83e4 100644
--- a/kernel/bpf/devmap.c
+++ b/kernel/bpf/devmap.c
@@ -858,7 +858,8 @@ static struct bpf_dtab_netdev *__dev_map_alloc_node(struct net *net,
BPF_PROG_TYPE_XDP, false);
if (IS_ERR(prog))
goto err_put_dev;
- if (prog->expected_attach_type != BPF_XDP_DEVMAP)
+ if (prog->expected_attach_type != BPF_XDP_DEVMAP ||
+ !bpf_prog_map_compatible(&dtab->map, prog))
goto err_put_prog;
}
diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
index d29af9988f37..65877967f414 100644
--- a/kernel/bpf/hashtab.c
+++ b/kernel/bpf/hashtab.c
@@ -1636,7 +1636,7 @@ __htab_map_lookup_and_delete_batch(struct bpf_map *map,
value_size = size * num_possible_cpus();
total = 0;
/* while experimenting with hash tables with sizes ranging from 10 to
- * 1000, it was observed that a bucket can have upto 5 entries.
+ * 1000, it was observed that a bucket can have up to 5 entries.
*/
bucket_size = 5;
diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index 55c084251fab..315053ef6a75 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -17,6 +17,7 @@
#include <linux/pid_namespace.h>
#include <linux/proc_ns.h>
#include <linux/security.h>
+#include <linux/btf_ids.h>
#include "../../lib/kstrtox.h"
@@ -224,13 +225,8 @@ BPF_CALL_2(bpf_get_current_comm, char *, buf, u32, size)
if (unlikely(!task))
goto err_clear;
- strncpy(buf, task->comm, size);
-
- /* Verifier guarantees that size > 0. For task->comm exceeding
- * size, guarantee that buf is %NUL-terminated. Unconditionally
- * done here to save the size test.
- */
- buf[size - 1] = 0;
+ /* Verifier guarantees that size > 0 */
+ strscpy(buf, task->comm, size);
return 0;
err_clear:
memset(buf, 0, size);
@@ -672,6 +668,39 @@ const struct bpf_func_proto bpf_copy_from_user_proto = {
.arg3_type = ARG_ANYTHING,
};
+BPF_CALL_5(bpf_copy_from_user_task, void *, dst, u32, size,
+ const void __user *, user_ptr, struct task_struct *, tsk, u64, flags)
+{
+ int ret;
+
+ /* flags is not used yet */
+ if (unlikely(flags))
+ return -EINVAL;
+
+ if (unlikely(!size))
+ return 0;
+
+ ret = access_process_vm(tsk, (unsigned long)user_ptr, dst, size, 0);
+ if (ret == size)
+ return 0;
+
+ memset(dst, 0, size);
+ /* Return -EFAULT for partial read */
+ return ret < 0 ? ret : -EFAULT;
+}
+
+const struct bpf_func_proto bpf_copy_from_user_task_proto = {
+ .func = bpf_copy_from_user_task,
+ .gpl_only = true,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_UNINIT_MEM,
+ .arg2_type = ARG_CONST_SIZE_OR_ZERO,
+ .arg3_type = ARG_ANYTHING,
+ .arg4_type = ARG_PTR_TO_BTF_ID,
+ .arg4_btf_id = &btf_tracing_ids[BTF_TRACING_TYPE_TASK],
+ .arg5_type = ARG_ANYTHING
+};
+
BPF_CALL_2(bpf_per_cpu_ptr, const void *, ptr, u32, cpu)
{
if (cpu >= nr_cpu_ids)
@@ -1059,7 +1088,7 @@ struct bpf_hrtimer {
struct bpf_timer_kern {
struct bpf_hrtimer *timer;
/* bpf_spin_lock is used here instead of spinlock_t to make
- * sure that it always fits into space resereved by struct bpf_timer
+ * sure that it always fits into space reserved by struct bpf_timer
* regardless of LOCKDEP and spinlock debug flags.
*/
struct bpf_spin_lock lock;
diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c
index 5a8d9f7467bf..4f841e16779e 100644
--- a/kernel/bpf/inode.c
+++ b/kernel/bpf/inode.c
@@ -710,11 +710,10 @@ static DEFINE_MUTEX(bpf_preload_lock);
static int populate_bpffs(struct dentry *parent)
{
struct bpf_preload_info objs[BPF_PRELOAD_LINKS] = {};
- struct bpf_link *links[BPF_PRELOAD_LINKS] = {};
int err = 0, i;
/* grab the mutex to make sure the kernel interactions with bpf_preload
- * UMD are serialized
+ * are serialized
*/
mutex_lock(&bpf_preload_lock);
@@ -722,40 +721,22 @@ static int populate_bpffs(struct dentry *parent)
if (!bpf_preload_mod_get())
goto out;
- if (!bpf_preload_ops->info.tgid) {
- /* preload() will start UMD that will load BPF iterator programs */
- err = bpf_preload_ops->preload(objs);
- if (err)
+ err = bpf_preload_ops->preload(objs);
+ if (err)
+ goto out_put;
+ for (i = 0; i < BPF_PRELOAD_LINKS; i++) {
+ bpf_link_inc(objs[i].link);
+ err = bpf_iter_link_pin_kernel(parent,
+ objs[i].link_name, objs[i].link);
+ if (err) {
+ bpf_link_put(objs[i].link);
goto out_put;
- for (i = 0; i < BPF_PRELOAD_LINKS; i++) {
- links[i] = bpf_link_by_id(objs[i].link_id);
- if (IS_ERR(links[i])) {
- err = PTR_ERR(links[i]);
- goto out_put;
- }
}
- for (i = 0; i < BPF_PRELOAD_LINKS; i++) {
- err = bpf_iter_link_pin_kernel(parent,
- objs[i].link_name, links[i]);
- if (err)
- goto out_put;
- /* do not unlink successfully pinned links even
- * if later link fails to pin
- */
- links[i] = NULL;
- }
- /* finish() will tell UMD process to exit */
- err = bpf_preload_ops->finish();
- if (err)
- goto out_put;
}
out_put:
bpf_preload_mod_put();
out:
mutex_unlock(&bpf_preload_lock);
- for (i = 0; i < BPF_PRELOAD_LINKS && err; i++)
- if (!IS_ERR_OR_NULL(links[i]))
- bpf_link_put(links[i]);
return err;
}
diff --git a/kernel/bpf/local_storage.c b/kernel/bpf/local_storage.c
index 23f7f9d08a62..497916060ac7 100644
--- a/kernel/bpf/local_storage.c
+++ b/kernel/bpf/local_storage.c
@@ -1,4 +1,4 @@
-//SPDX-License-Identifier: GPL-2.0
+// SPDX-License-Identifier: GPL-2.0
#include <linux/bpf-cgroup.h>
#include <linux/bpf.h>
#include <linux/bpf_local_storage.h>
diff --git a/kernel/bpf/preload/Kconfig b/kernel/bpf/preload/Kconfig
index 26bced262473..c9d45c9d6918 100644
--- a/kernel/bpf/preload/Kconfig
+++ b/kernel/bpf/preload/Kconfig
@@ -18,10 +18,9 @@ menuconfig BPF_PRELOAD
if BPF_PRELOAD
config BPF_PRELOAD_UMD
- tristate "bpf_preload kernel module with user mode driver"
- depends on CC_CAN_LINK
- depends on m || CC_CAN_LINK_STATIC
+ tristate "bpf_preload kernel module"
default m
help
- This builds bpf_preload kernel module with embedded user mode driver.
+ This builds bpf_preload kernel module with embedded BPF programs for
+ introspection in bpffs.
endif
diff --git a/kernel/bpf/preload/Makefile b/kernel/bpf/preload/Makefile
index 1400ac58178e..20f89cc0a0a6 100644
--- a/kernel/bpf/preload/Makefile
+++ b/kernel/bpf/preload/Makefile
@@ -1,42 +1,7 @@
# SPDX-License-Identifier: GPL-2.0
-LIBBPF_SRCS = $(srctree)/tools/lib/bpf/
-LIBBPF_OUT = $(abspath $(obj))/libbpf
-LIBBPF_A = $(LIBBPF_OUT)/libbpf.a
-LIBBPF_DESTDIR = $(LIBBPF_OUT)
-LIBBPF_INCLUDE = $(LIBBPF_DESTDIR)/include
-
-# Although not in use by libbpf's Makefile, set $(O) so that the "dummy" test
-# in tools/scripts/Makefile.include always succeeds when building the kernel
-# with $(O) pointing to a relative path, as in "make O=build bindeb-pkg".
-$(LIBBPF_A): | $(LIBBPF_OUT)
- $(Q)$(MAKE) -C $(LIBBPF_SRCS) O=$(LIBBPF_OUT)/ OUTPUT=$(LIBBPF_OUT)/ \
- DESTDIR=$(LIBBPF_DESTDIR) prefix= \
- $(LIBBPF_OUT)/libbpf.a install_headers
-
-libbpf_hdrs: $(LIBBPF_A)
-
-.PHONY: libbpf_hdrs
-
-$(LIBBPF_OUT):
- $(call msg,MKDIR,$@)
- $(Q)mkdir -p $@
-
-userccflags += -I $(srctree)/tools/include/ -I $(srctree)/tools/include/uapi \
- -I $(LIBBPF_INCLUDE) -Wno-unused-result
-
-userprogs := bpf_preload_umd
-
-clean-files := libbpf/
-
-$(obj)/iterators/iterators.o: | libbpf_hdrs
-
-bpf_preload_umd-objs := iterators/iterators.o
-bpf_preload_umd-userldlibs := $(LIBBPF_A) -lelf -lz
-
-$(obj)/bpf_preload_umd: $(LIBBPF_A)
-
-$(obj)/bpf_preload_umd_blob.o: $(obj)/bpf_preload_umd
+LIBBPF_INCLUDE = $(srctree)/tools/lib
obj-$(CONFIG_BPF_PRELOAD_UMD) += bpf_preload.o
-bpf_preload-objs += bpf_preload_kern.o bpf_preload_umd_blob.o
+CFLAGS_bpf_preload_kern.o += -I$(LIBBPF_INCLUDE)
+bpf_preload-objs += bpf_preload_kern.o
diff --git a/kernel/bpf/preload/bpf_preload.h b/kernel/bpf/preload/bpf_preload.h
index 2f9932276f2e..f065c91213a0 100644
--- a/kernel/bpf/preload/bpf_preload.h
+++ b/kernel/bpf/preload/bpf_preload.h
@@ -2,13 +2,13 @@
#ifndef _BPF_PRELOAD_H
#define _BPF_PRELOAD_H
-#include <linux/usermode_driver.h>
-#include "iterators/bpf_preload_common.h"
+struct bpf_preload_info {
+ char link_name[16];
+ struct bpf_link *link;
+};
struct bpf_preload_ops {
- struct umd_info info;
int (*preload)(struct bpf_preload_info *);
- int (*finish)(void);
struct module *owner;
};
extern struct bpf_preload_ops *bpf_preload_ops;
diff --git a/kernel/bpf/preload/bpf_preload_kern.c b/kernel/bpf/preload/bpf_preload_kern.c
index 53736e52c1df..5106b5372f0c 100644
--- a/kernel/bpf/preload/bpf_preload_kern.c
+++ b/kernel/bpf/preload/bpf_preload_kern.c
@@ -2,101 +2,87 @@
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/init.h>
#include <linux/module.h>
-#include <linux/pid.h>
-#include <linux/fs.h>
-#include <linux/sched/signal.h>
#include "bpf_preload.h"
+#include "iterators/iterators.lskel.h"
-extern char bpf_preload_umd_start;
-extern char bpf_preload_umd_end;
+static struct bpf_link *maps_link, *progs_link;
+static struct iterators_bpf *skel;
-static int preload(struct bpf_preload_info *obj);
-static int finish(void);
+static void free_links_and_skel(void)
+{
+ if (!IS_ERR_OR_NULL(maps_link))
+ bpf_link_put(maps_link);
+ if (!IS_ERR_OR_NULL(progs_link))
+ bpf_link_put(progs_link);
+ iterators_bpf__destroy(skel);
+}
+
+static int preload(struct bpf_preload_info *obj)
+{
+ strlcpy(obj[0].link_name, "maps.debug", sizeof(obj[0].link_name));
+ obj[0].link = maps_link;
+ strlcpy(obj[1].link_name, "progs.debug", sizeof(obj[1].link_name));
+ obj[1].link = progs_link;
+ return 0;
+}
-static struct bpf_preload_ops umd_ops = {
- .info.driver_name = "bpf_preload",
+static struct bpf_preload_ops ops = {
.preload = preload,
- .finish = finish,
.owner = THIS_MODULE,
};
-static int preload(struct bpf_preload_info *obj)
+static int load_skel(void)
{
- int magic = BPF_PRELOAD_START;
- loff_t pos = 0;
- int i, err;
- ssize_t n;
+ int err;
- err = fork_usermode_driver(&umd_ops.info);
+ skel = iterators_bpf__open();
+ if (!skel)
+ return -ENOMEM;
+ err = iterators_bpf__load(skel);
if (err)
- return err;
-
- /* send the start magic to let UMD proceed with loading BPF progs */
- n = kernel_write(umd_ops.info.pipe_to_umh,
- &magic, sizeof(magic), &pos);
- if (n != sizeof(magic))
- return -EPIPE;
-
- /* receive bpf_link IDs and names from UMD */
- pos = 0;
- for (i = 0; i < BPF_PRELOAD_LINKS; i++) {
- n = kernel_read(umd_ops.info.pipe_from_umh,
- &obj[i], sizeof(*obj), &pos);
- if (n != sizeof(*obj))
- return -EPIPE;
+ goto out;
+ err = iterators_bpf__attach(skel);
+ if (err)
+ goto out;
+ maps_link = bpf_link_get_from_fd(skel->links.dump_bpf_map_fd);
+ if (IS_ERR(maps_link)) {
+ err = PTR_ERR(maps_link);
+ goto out;
}
- return 0;
-}
-
-static int finish(void)
-{
- int magic = BPF_PRELOAD_END;
- struct pid *tgid;
- loff_t pos = 0;
- ssize_t n;
-
- /* send the last magic to UMD. It will do a normal exit. */
- n = kernel_write(umd_ops.info.pipe_to_umh,
- &magic, sizeof(magic), &pos);
- if (n != sizeof(magic))
- return -EPIPE;
-
- tgid = umd_ops.info.tgid;
- if (tgid) {
- wait_event(tgid->wait_pidfd, thread_group_exited(tgid));
- umd_cleanup_helper(&umd_ops.info);
+ progs_link = bpf_link_get_from_fd(skel->links.dump_bpf_prog_fd);
+ if (IS_ERR(progs_link)) {
+ err = PTR_ERR(progs_link);
+ goto out;
}
+ /* Avoid taking over stdin/stdout/stderr of init process. Zeroing out
+ * makes skel_closenz() a no-op later in iterators_bpf__destroy().
+ */
+ close_fd(skel->links.dump_bpf_map_fd);
+ skel->links.dump_bpf_map_fd = 0;
+ close_fd(skel->links.dump_bpf_prog_fd);
+ skel->links.dump_bpf_prog_fd = 0;
return 0;
+out:
+ free_links_and_skel();
+ return err;
}
-static int __init load_umd(void)
+static int __init load(void)
{
int err;
- err = umd_load_blob(&umd_ops.info, &bpf_preload_umd_start,
- &bpf_preload_umd_end - &bpf_preload_umd_start);
+ err = load_skel();
if (err)
return err;
- bpf_preload_ops = &umd_ops;
+ bpf_preload_ops = &ops;
return err;
}
-static void __exit fini_umd(void)
+static void __exit fini(void)
{
- struct pid *tgid;
-
bpf_preload_ops = NULL;
-
- /* kill UMD in case it's still there due to earlier error */
- tgid = umd_ops.info.tgid;
- if (tgid) {
- kill_pid(tgid, SIGKILL, 1);
-
- wait_event(tgid->wait_pidfd, thread_group_exited(tgid));
- umd_cleanup_helper(&umd_ops.info);
- }
- umd_unload_blob(&umd_ops.info);
+ free_links_and_skel();
}
-late_initcall(load_umd);
-module_exit(fini_umd);
+late_initcall(load);
+module_exit(fini);
MODULE_LICENSE("GPL");
diff --git a/kernel/bpf/preload/bpf_preload_umd_blob.S b/kernel/bpf/preload/bpf_preload_umd_blob.S
deleted file mode 100644
index f1f40223b5c3..000000000000
--- a/kernel/bpf/preload/bpf_preload_umd_blob.S
+++ /dev/null
@@ -1,7 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
- .section .init.rodata, "a"
- .global bpf_preload_umd_start
-bpf_preload_umd_start:
- .incbin "kernel/bpf/preload/bpf_preload_umd"
- .global bpf_preload_umd_end
-bpf_preload_umd_end:
diff --git a/kernel/bpf/preload/iterators/Makefile b/kernel/bpf/preload/iterators/Makefile
index b8bd60511227..bfe24f8c5a20 100644
--- a/kernel/bpf/preload/iterators/Makefile
+++ b/kernel/bpf/preload/iterators/Makefile
@@ -35,15 +35,15 @@ endif
.PHONY: all clean
-all: iterators.skel.h
+all: iterators.lskel.h
clean:
$(call msg,CLEAN)
$(Q)rm -rf $(OUTPUT) iterators
-iterators.skel.h: $(OUTPUT)/iterators.bpf.o | $(BPFTOOL)
+iterators.lskel.h: $(OUTPUT)/iterators.bpf.o | $(BPFTOOL)
$(call msg,GEN-SKEL,$@)
- $(Q)$(BPFTOOL) gen skeleton $< > $@
+ $(Q)$(BPFTOOL) gen skeleton -L $< > $@
$(OUTPUT)/iterators.bpf.o: iterators.bpf.c $(BPFOBJ) | $(OUTPUT)
diff --git a/kernel/bpf/preload/iterators/bpf_preload_common.h b/kernel/bpf/preload/iterators/bpf_preload_common.h
deleted file mode 100644
index 8464d1a48c05..000000000000
--- a/kernel/bpf/preload/iterators/bpf_preload_common.h
+++ /dev/null
@@ -1,13 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BPF_PRELOAD_COMMON_H
-#define _BPF_PRELOAD_COMMON_H
-
-#define BPF_PRELOAD_START 0x5555
-#define BPF_PRELOAD_END 0xAAAA
-
-struct bpf_preload_info {
- char link_name[16];
- int link_id;
-};
-
-#endif
diff --git a/kernel/bpf/preload/iterators/iterators.c b/kernel/bpf/preload/iterators/iterators.c
deleted file mode 100644
index 5d872a705470..000000000000
--- a/kernel/bpf/preload/iterators/iterators.c
+++ /dev/null
@@ -1,94 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/* Copyright (c) 2020 Facebook */
-#include <errno.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include <unistd.h>
-#include <fcntl.h>
-#include <sys/resource.h>
-#include <bpf/libbpf.h>
-#include <bpf/bpf.h>
-#include <sys/mount.h>
-#include "iterators.skel.h"
-#include "bpf_preload_common.h"
-
-int to_kernel = -1;
-int from_kernel = 0;
-
-static int send_link_to_kernel(struct bpf_link *link, const char *link_name)
-{
- struct bpf_preload_info obj = {};
- struct bpf_link_info info = {};
- __u32 info_len = sizeof(info);
- int err;
-
- err = bpf_obj_get_info_by_fd(bpf_link__fd(link), &info, &info_len);
- if (err)
- return err;
- obj.link_id = info.id;
- if (strlen(link_name) >= sizeof(obj.link_name))
- return -E2BIG;
- strcpy(obj.link_name, link_name);
- if (write(to_kernel, &obj, sizeof(obj)) != sizeof(obj))
- return -EPIPE;
- return 0;
-}
-
-int main(int argc, char **argv)
-{
- struct rlimit rlim = { RLIM_INFINITY, RLIM_INFINITY };
- struct iterators_bpf *skel;
- int err, magic;
- int debug_fd;
-
- debug_fd = open("/dev/console", O_WRONLY | O_NOCTTY | O_CLOEXEC);
- if (debug_fd < 0)
- return 1;
- to_kernel = dup(1);
- close(1);
- dup(debug_fd);
- /* now stdin and stderr point to /dev/console */
-
- read(from_kernel, &magic, sizeof(magic));
- if (magic != BPF_PRELOAD_START) {
- printf("bad start magic %d\n", magic);
- return 1;
- }
- setrlimit(RLIMIT_MEMLOCK, &rlim);
- /* libbpf opens BPF object and loads it into the kernel */
- skel = iterators_bpf__open_and_load();
- if (!skel) {
- /* iterators.skel.h is little endian.
- * libbpf doesn't support automatic little->big conversion
- * of BPF bytecode yet.
- * The program load will fail in such case.
- */
- printf("Failed load could be due to wrong endianness\n");
- return 1;
- }
- err = iterators_bpf__attach(skel);
- if (err)
- goto cleanup;
-
- /* send two bpf_link IDs with names to the kernel */
- err = send_link_to_kernel(skel->links.dump_bpf_map, "maps.debug");
- if (err)
- goto cleanup;
- err = send_link_to_kernel(skel->links.dump_bpf_prog, "progs.debug");
- if (err)
- goto cleanup;
-
- /* The kernel will proceed with pinnging the links in bpffs.
- * UMD will wait on read from pipe.
- */
- read(from_kernel, &magic, sizeof(magic));
- if (magic != BPF_PRELOAD_END) {
- printf("bad final magic %d\n", magic);
- err = -EINVAL;
- }
-cleanup:
- iterators_bpf__destroy(skel);
-
- return err != 0;
-}
diff --git a/kernel/bpf/preload/iterators/iterators.lskel.h b/kernel/bpf/preload/iterators/iterators.lskel.h
new file mode 100644
index 000000000000..70f236a82fe1
--- /dev/null
+++ b/kernel/bpf/preload/iterators/iterators.lskel.h
@@ -0,0 +1,425 @@
+/* SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) */
+/* THIS FILE IS AUTOGENERATED! */
+#ifndef __ITERATORS_BPF_SKEL_H__
+#define __ITERATORS_BPF_SKEL_H__
+
+#include <bpf/skel_internal.h>
+
+struct iterators_bpf {
+ struct bpf_loader_ctx ctx;
+ struct {
+ struct bpf_map_desc rodata;
+ } maps;
+ struct {
+ struct bpf_prog_desc dump_bpf_map;
+ struct bpf_prog_desc dump_bpf_prog;
+ } progs;
+ struct {
+ int dump_bpf_map_fd;
+ int dump_bpf_prog_fd;
+ } links;
+ struct iterators_bpf__rodata {
+ } *rodata;
+};
+
+static inline int
+iterators_bpf__dump_bpf_map__attach(struct iterators_bpf *skel)
+{
+ int prog_fd = skel->progs.dump_bpf_map.prog_fd;
+ int fd = skel_link_create(prog_fd, 0, BPF_TRACE_ITER);
+
+ if (fd > 0)
+ skel->links.dump_bpf_map_fd = fd;
+ return fd;
+}
+
+static inline int
+iterators_bpf__dump_bpf_prog__attach(struct iterators_bpf *skel)
+{
+ int prog_fd = skel->progs.dump_bpf_prog.prog_fd;
+ int fd = skel_link_create(prog_fd, 0, BPF_TRACE_ITER);
+
+ if (fd > 0)
+ skel->links.dump_bpf_prog_fd = fd;
+ return fd;
+}
+
+static inline int
+iterators_bpf__attach(struct iterators_bpf *skel)
+{
+ int ret = 0;
+
+ ret = ret < 0 ? ret : iterators_bpf__dump_bpf_map__attach(skel);
+ ret = ret < 0 ? ret : iterators_bpf__dump_bpf_prog__attach(skel);
+ return ret < 0 ? ret : 0;
+}
+
+static inline void
+iterators_bpf__detach(struct iterators_bpf *skel)
+{
+ skel_closenz(skel->links.dump_bpf_map_fd);
+ skel_closenz(skel->links.dump_bpf_prog_fd);
+}
+static void
+iterators_bpf__destroy(struct iterators_bpf *skel)
+{
+ if (!skel)
+ return;
+ iterators_bpf__detach(skel);
+ skel_closenz(skel->progs.dump_bpf_map.prog_fd);
+ skel_closenz(skel->progs.dump_bpf_prog.prog_fd);
+ skel_free_map_data(skel->rodata, skel->maps.rodata.initial_value, 4096);
+ skel_closenz(skel->maps.rodata.map_fd);
+ skel_free(skel);
+}
+static inline struct iterators_bpf *
+iterators_bpf__open(void)
+{
+ struct iterators_bpf *skel;
+
+ skel = skel_alloc(sizeof(*skel));
+ if (!skel)
+ goto cleanup;
+ skel->ctx.sz = (void *)&skel->links - (void *)skel;
+ skel->rodata = skel_prep_map_data((void *)"\
+\x20\x20\x69\x64\x20\x6e\x61\x6d\x65\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\
+\x20\x20\x20\x6d\x61\x78\x5f\x65\x6e\x74\x72\x69\x65\x73\x0a\0\x25\x34\x75\x20\
+\x25\x2d\x31\x36\x73\x25\x36\x64\x0a\0\x20\x20\x69\x64\x20\x6e\x61\x6d\x65\x20\
+\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x61\x74\x74\x61\x63\x68\x65\
+\x64\x0a\0\x25\x34\x75\x20\x25\x2d\x31\x36\x73\x20\x25\x73\x20\x25\x73\x0a\0", 4096, 98);
+ if (!skel->rodata)
+ goto cleanup;
+ skel->maps.rodata.initial_value = (__u64) (long) skel->rodata;
+ return skel;
+cleanup:
+ iterators_bpf__destroy(skel);
+ return NULL;
+}
+
+static inline int
+iterators_bpf__load(struct iterators_bpf *skel)
+{
+ struct bpf_load_and_run_opts opts = {};
+ int err;
+
+ opts.ctx = (struct bpf_loader_ctx *)skel;
+ opts.data_sz = 6056;
+ opts.data = (void *)"\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x9f\xeb\x01\0\
+\x18\0\0\0\0\0\0\0\x1c\x04\0\0\x1c\x04\0\0\xf9\x04\0\0\0\0\0\0\0\0\0\x02\x02\0\
+\0\0\x01\0\0\0\x02\0\0\x04\x10\0\0\0\x13\0\0\0\x03\0\0\0\0\0\0\0\x18\0\0\0\x04\
+\0\0\0\x40\0\0\0\0\0\0\0\0\0\0\x02\x08\0\0\0\0\0\0\0\0\0\0\x02\x0d\0\0\0\0\0\0\
+\0\x01\0\0\x0d\x06\0\0\0\x1c\0\0\0\x01\0\0\0\x20\0\0\0\0\0\0\x01\x04\0\0\0\x20\
+\0\0\x01\x24\0\0\0\x01\0\0\x0c\x05\0\0\0\xa3\0\0\0\x03\0\0\x04\x18\0\0\0\xb1\0\
+\0\0\x09\0\0\0\0\0\0\0\xb5\0\0\0\x0b\0\0\0\x40\0\0\0\xc0\0\0\0\x0b\0\0\0\x80\0\
+\0\0\0\0\0\0\0\0\0\x02\x0a\0\0\0\xc8\0\0\0\0\0\0\x07\0\0\0\0\xd1\0\0\0\0\0\0\
+\x08\x0c\0\0\0\xd7\0\0\0\0\0\0\x01\x08\0\0\0\x40\0\0\0\x94\x01\0\0\x03\0\0\x04\
+\x18\0\0\0\x9c\x01\0\0\x0e\0\0\0\0\0\0\0\x9f\x01\0\0\x11\0\0\0\x20\0\0\0\xa4\
+\x01\0\0\x0e\0\0\0\xa0\0\0\0\xb0\x01\0\0\0\0\0\x08\x0f\0\0\0\xb6\x01\0\0\0\0\0\
+\x01\x04\0\0\0\x20\0\0\0\xc3\x01\0\0\0\0\0\x01\x01\0\0\0\x08\0\0\x01\0\0\0\0\0\
+\0\0\x03\0\0\0\0\x10\0\0\0\x12\0\0\0\x10\0\0\0\xc8\x01\0\0\0\0\0\x01\x04\0\0\0\
+\x20\0\0\0\0\0\0\0\0\0\0\x02\x14\0\0\0\x2c\x02\0\0\x02\0\0\x04\x10\0\0\0\x13\0\
+\0\0\x03\0\0\0\0\0\0\0\x3f\x02\0\0\x15\0\0\0\x40\0\0\0\0\0\0\0\0\0\0\x02\x18\0\
+\0\0\0\0\0\0\x01\0\0\x0d\x06\0\0\0\x1c\0\0\0\x13\0\0\0\x44\x02\0\0\x01\0\0\x0c\
+\x16\0\0\0\x90\x02\0\0\x01\0\0\x04\x08\0\0\0\x99\x02\0\0\x19\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\x02\x1a\0\0\0\xea\x02\0\0\x06\0\0\x04\x38\0\0\0\x9c\x01\0\0\x0e\0\0\
+\0\0\0\0\0\x9f\x01\0\0\x11\0\0\0\x20\0\0\0\xf7\x02\0\0\x1b\0\0\0\xc0\0\0\0\x08\
+\x03\0\0\x15\0\0\0\0\x01\0\0\x11\x03\0\0\x1d\0\0\0\x40\x01\0\0\x1b\x03\0\0\x1e\
+\0\0\0\x80\x01\0\0\0\0\0\0\0\0\0\x02\x1c\0\0\0\0\0\0\0\0\0\0\x0a\x10\0\0\0\0\0\
+\0\0\0\0\0\x02\x1f\0\0\0\0\0\0\0\0\0\0\x02\x20\0\0\0\x65\x03\0\0\x02\0\0\x04\
+\x08\0\0\0\x73\x03\0\0\x0e\0\0\0\0\0\0\0\x7c\x03\0\0\x0e\0\0\0\x20\0\0\0\x1b\
+\x03\0\0\x03\0\0\x04\x18\0\0\0\x86\x03\0\0\x1b\0\0\0\0\0\0\0\x8e\x03\0\0\x21\0\
+\0\0\x40\0\0\0\x94\x03\0\0\x23\0\0\0\x80\0\0\0\0\0\0\0\0\0\0\x02\x22\0\0\0\0\0\
+\0\0\0\0\0\x02\x24\0\0\0\x98\x03\0\0\x01\0\0\x04\x04\0\0\0\xa3\x03\0\0\x0e\0\0\
+\0\0\0\0\0\x0c\x04\0\0\x01\0\0\x04\x04\0\0\0\x15\x04\0\0\x0e\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\x03\0\0\0\0\x1c\0\0\0\x12\0\0\0\x23\0\0\0\x8b\x04\0\0\0\0\0\x0e\x25\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x03\0\0\0\0\x1c\0\0\0\x12\0\0\0\x0e\0\0\0\x9f\x04\
+\0\0\0\0\0\x0e\x27\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x03\0\0\0\0\x1c\0\0\0\x12\0\0\0\
+\x20\0\0\0\xb5\x04\0\0\0\0\0\x0e\x29\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x03\0\0\0\0\
+\x1c\0\0\0\x12\0\0\0\x11\0\0\0\xca\x04\0\0\0\0\0\x0e\x2b\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\x03\0\0\0\0\x10\0\0\0\x12\0\0\0\x04\0\0\0\xe1\x04\0\0\0\0\0\x0e\x2d\0\0\
+\0\x01\0\0\0\xe9\x04\0\0\x04\0\0\x0f\x62\0\0\0\x26\0\0\0\0\0\0\0\x23\0\0\0\x28\
+\0\0\0\x23\0\0\0\x0e\0\0\0\x2a\0\0\0\x31\0\0\0\x20\0\0\0\x2c\0\0\0\x51\0\0\0\
+\x11\0\0\0\xf1\x04\0\0\x01\0\0\x0f\x04\0\0\0\x2e\0\0\0\0\0\0\0\x04\0\0\0\0\x62\
+\x70\x66\x5f\x69\x74\x65\x72\x5f\x5f\x62\x70\x66\x5f\x6d\x61\x70\0\x6d\x65\x74\
+\x61\0\x6d\x61\x70\0\x63\x74\x78\0\x69\x6e\x74\0\x64\x75\x6d\x70\x5f\x62\x70\
+\x66\x5f\x6d\x61\x70\0\x69\x74\x65\x72\x2f\x62\x70\x66\x5f\x6d\x61\x70\0\x30\
+\x3a\x30\0\x2f\x77\x2f\x6e\x65\x74\x2d\x6e\x65\x78\x74\x2f\x6b\x65\x72\x6e\x65\
+\x6c\x2f\x62\x70\x66\x2f\x70\x72\x65\x6c\x6f\x61\x64\x2f\x69\x74\x65\x72\x61\
+\x74\x6f\x72\x73\x2f\x69\x74\x65\x72\x61\x74\x6f\x72\x73\x2e\x62\x70\x66\x2e\
+\x63\0\x09\x73\x74\x72\x75\x63\x74\x20\x73\x65\x71\x5f\x66\x69\x6c\x65\x20\x2a\
+\x73\x65\x71\x20\x3d\x20\x63\x74\x78\x2d\x3e\x6d\x65\x74\x61\x2d\x3e\x73\x65\
+\x71\x3b\0\x62\x70\x66\x5f\x69\x74\x65\x72\x5f\x6d\x65\x74\x61\0\x73\x65\x71\0\
+\x73\x65\x73\x73\x69\x6f\x6e\x5f\x69\x64\0\x73\x65\x71\x5f\x6e\x75\x6d\0\x73\
+\x65\x71\x5f\x66\x69\x6c\x65\0\x5f\x5f\x75\x36\x34\0\x75\x6e\x73\x69\x67\x6e\
+\x65\x64\x20\x6c\x6f\x6e\x67\x20\x6c\x6f\x6e\x67\0\x30\x3a\x31\0\x09\x73\x74\
+\x72\x75\x63\x74\x20\x62\x70\x66\x5f\x6d\x61\x70\x20\x2a\x6d\x61\x70\x20\x3d\
+\x20\x63\x74\x78\x2d\x3e\x6d\x61\x70\x3b\0\x09\x69\x66\x20\x28\x21\x6d\x61\x70\
+\x29\0\x09\x5f\x5f\x75\x36\x34\x20\x73\x65\x71\x5f\x6e\x75\x6d\x20\x3d\x20\x63\
+\x74\x78\x2d\x3e\x6d\x65\x74\x61\x2d\x3e\x73\x65\x71\x5f\x6e\x75\x6d\x3b\0\x30\
+\x3a\x32\0\x09\x69\x66\x20\x28\x73\x65\x71\x5f\x6e\x75\x6d\x20\x3d\x3d\x20\x30\
+\x29\0\x09\x09\x42\x50\x46\x5f\x53\x45\x51\x5f\x50\x52\x49\x4e\x54\x46\x28\x73\
+\x65\x71\x2c\x20\x22\x20\x20\x69\x64\x20\x6e\x61\x6d\x65\x20\x20\x20\x20\x20\
+\x20\x20\x20\x20\x20\x20\x20\x20\x6d\x61\x78\x5f\x65\x6e\x74\x72\x69\x65\x73\
+\x5c\x6e\x22\x29\x3b\0\x62\x70\x66\x5f\x6d\x61\x70\0\x69\x64\0\x6e\x61\x6d\x65\
+\0\x6d\x61\x78\x5f\x65\x6e\x74\x72\x69\x65\x73\0\x5f\x5f\x75\x33\x32\0\x75\x6e\
+\x73\x69\x67\x6e\x65\x64\x20\x69\x6e\x74\0\x63\x68\x61\x72\0\x5f\x5f\x41\x52\
+\x52\x41\x59\x5f\x53\x49\x5a\x45\x5f\x54\x59\x50\x45\x5f\x5f\0\x09\x42\x50\x46\
+\x5f\x53\x45\x51\x5f\x50\x52\x49\x4e\x54\x46\x28\x73\x65\x71\x2c\x20\x22\x25\
+\x34\x75\x20\x25\x2d\x31\x36\x73\x25\x36\x64\x5c\x6e\x22\x2c\x20\x6d\x61\x70\
+\x2d\x3e\x69\x64\x2c\x20\x6d\x61\x70\x2d\x3e\x6e\x61\x6d\x65\x2c\x20\x6d\x61\
+\x70\x2d\x3e\x6d\x61\x78\x5f\x65\x6e\x74\x72\x69\x65\x73\x29\x3b\0\x7d\0\x62\
+\x70\x66\x5f\x69\x74\x65\x72\x5f\x5f\x62\x70\x66\x5f\x70\x72\x6f\x67\0\x70\x72\
+\x6f\x67\0\x64\x75\x6d\x70\x5f\x62\x70\x66\x5f\x70\x72\x6f\x67\0\x69\x74\x65\
+\x72\x2f\x62\x70\x66\x5f\x70\x72\x6f\x67\0\x09\x73\x74\x72\x75\x63\x74\x20\x62\
+\x70\x66\x5f\x70\x72\x6f\x67\x20\x2a\x70\x72\x6f\x67\x20\x3d\x20\x63\x74\x78\
+\x2d\x3e\x70\x72\x6f\x67\x3b\0\x09\x69\x66\x20\x28\x21\x70\x72\x6f\x67\x29\0\
+\x62\x70\x66\x5f\x70\x72\x6f\x67\0\x61\x75\x78\0\x09\x61\x75\x78\x20\x3d\x20\
+\x70\x72\x6f\x67\x2d\x3e\x61\x75\x78\x3b\0\x09\x09\x42\x50\x46\x5f\x53\x45\x51\
+\x5f\x50\x52\x49\x4e\x54\x46\x28\x73\x65\x71\x2c\x20\x22\x20\x20\x69\x64\x20\
+\x6e\x61\x6d\x65\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x61\x74\
+\x74\x61\x63\x68\x65\x64\x5c\x6e\x22\x29\x3b\0\x62\x70\x66\x5f\x70\x72\x6f\x67\
+\x5f\x61\x75\x78\0\x61\x74\x74\x61\x63\x68\x5f\x66\x75\x6e\x63\x5f\x6e\x61\x6d\
+\x65\0\x64\x73\x74\x5f\x70\x72\x6f\x67\0\x66\x75\x6e\x63\x5f\x69\x6e\x66\x6f\0\
+\x62\x74\x66\0\x09\x42\x50\x46\x5f\x53\x45\x51\x5f\x50\x52\x49\x4e\x54\x46\x28\
+\x73\x65\x71\x2c\x20\x22\x25\x34\x75\x20\x25\x2d\x31\x36\x73\x20\x25\x73\x20\
+\x25\x73\x5c\x6e\x22\x2c\x20\x61\x75\x78\x2d\x3e\x69\x64\x2c\0\x30\x3a\x34\0\
+\x30\x3a\x35\0\x09\x69\x66\x20\x28\x21\x62\x74\x66\x29\0\x62\x70\x66\x5f\x66\
+\x75\x6e\x63\x5f\x69\x6e\x66\x6f\0\x69\x6e\x73\x6e\x5f\x6f\x66\x66\0\x74\x79\
+\x70\x65\x5f\x69\x64\0\x30\0\x73\x74\x72\x69\x6e\x67\x73\0\x74\x79\x70\x65\x73\
+\0\x68\x64\x72\0\x62\x74\x66\x5f\x68\x65\x61\x64\x65\x72\0\x73\x74\x72\x5f\x6c\
+\x65\x6e\0\x09\x74\x79\x70\x65\x73\x20\x3d\x20\x62\x74\x66\x2d\x3e\x74\x79\x70\
+\x65\x73\x3b\0\x09\x62\x70\x66\x5f\x70\x72\x6f\x62\x65\x5f\x72\x65\x61\x64\x5f\
+\x6b\x65\x72\x6e\x65\x6c\x28\x26\x74\x2c\x20\x73\x69\x7a\x65\x6f\x66\x28\x74\
+\x29\x2c\x20\x74\x79\x70\x65\x73\x20\x2b\x20\x62\x74\x66\x5f\x69\x64\x29\x3b\0\
+\x09\x73\x74\x72\x20\x3d\x20\x62\x74\x66\x2d\x3e\x73\x74\x72\x69\x6e\x67\x73\
+\x3b\0\x62\x74\x66\x5f\x74\x79\x70\x65\0\x6e\x61\x6d\x65\x5f\x6f\x66\x66\0\x09\
+\x6e\x61\x6d\x65\x5f\x6f\x66\x66\x20\x3d\x20\x42\x50\x46\x5f\x43\x4f\x52\x45\
+\x5f\x52\x45\x41\x44\x28\x74\x2c\x20\x6e\x61\x6d\x65\x5f\x6f\x66\x66\x29\x3b\0\
+\x30\x3a\x32\x3a\x30\0\x09\x69\x66\x20\x28\x6e\x61\x6d\x65\x5f\x6f\x66\x66\x20\
+\x3e\x3d\x20\x62\x74\x66\x2d\x3e\x68\x64\x72\x2e\x73\x74\x72\x5f\x6c\x65\x6e\
+\x29\0\x09\x72\x65\x74\x75\x72\x6e\x20\x73\x74\x72\x20\x2b\x20\x6e\x61\x6d\x65\
+\x5f\x6f\x66\x66\x3b\0\x30\x3a\x33\0\x64\x75\x6d\x70\x5f\x62\x70\x66\x5f\x6d\
+\x61\x70\x2e\x5f\x5f\x5f\x66\x6d\x74\0\x64\x75\x6d\x70\x5f\x62\x70\x66\x5f\x6d\
+\x61\x70\x2e\x5f\x5f\x5f\x66\x6d\x74\x2e\x31\0\x64\x75\x6d\x70\x5f\x62\x70\x66\
+\x5f\x70\x72\x6f\x67\x2e\x5f\x5f\x5f\x66\x6d\x74\0\x64\x75\x6d\x70\x5f\x62\x70\
+\x66\x5f\x70\x72\x6f\x67\x2e\x5f\x5f\x5f\x66\x6d\x74\x2e\x32\0\x4c\x49\x43\x45\
+\x4e\x53\x45\0\x2e\x72\x6f\x64\x61\x74\x61\0\x6c\x69\x63\x65\x6e\x73\x65\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x2d\x09\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x02\0\0\
+\0\x04\0\0\0\x62\0\0\0\x01\0\0\0\x80\x04\0\0\0\0\0\0\0\0\0\0\x69\x74\x65\x72\
+\x61\x74\x6f\x72\x2e\x72\x6f\x64\x61\x74\x61\0\0\0\0\0\0\0\0\0\0\0\0\0\x2f\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\x20\x20\x69\x64\x20\x6e\x61\x6d\x65\x20\x20\x20\x20\
+\x20\x20\x20\x20\x20\x20\x20\x20\x20\x6d\x61\x78\x5f\x65\x6e\x74\x72\x69\x65\
+\x73\x0a\0\x25\x34\x75\x20\x25\x2d\x31\x36\x73\x25\x36\x64\x0a\0\x20\x20\x69\
+\x64\x20\x6e\x61\x6d\x65\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\
+\x61\x74\x74\x61\x63\x68\x65\x64\x0a\0\x25\x34\x75\x20\x25\x2d\x31\x36\x73\x20\
+\x25\x73\x20\x25\x73\x0a\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x47\x50\x4c\0\0\0\0\0\
+\x79\x12\0\0\0\0\0\0\x79\x26\0\0\0\0\0\0\x79\x17\x08\0\0\0\0\0\x15\x07\x1b\0\0\
+\0\0\0\x79\x11\0\0\0\0\0\0\x79\x11\x10\0\0\0\0\0\x55\x01\x08\0\0\0\0\0\xbf\xa4\
+\0\0\0\0\0\0\x07\x04\0\0\xe8\xff\xff\xff\xbf\x61\0\0\0\0\0\0\x18\x62\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\xb7\x03\0\0\x23\0\0\0\xb7\x05\0\0\0\0\0\0\x85\0\0\0\x7e\0\0\
+\0\x61\x71\0\0\0\0\0\0\x7b\x1a\xe8\xff\0\0\0\0\xb7\x01\0\0\x04\0\0\0\xbf\x72\0\
+\0\0\0\0\0\x0f\x12\0\0\0\0\0\0\x7b\x2a\xf0\xff\0\0\0\0\x61\x71\x14\0\0\0\0\0\
+\x7b\x1a\xf8\xff\0\0\0\0\xbf\xa4\0\0\0\0\0\0\x07\x04\0\0\xe8\xff\xff\xff\xbf\
+\x61\0\0\0\0\0\0\x18\x62\0\0\0\0\0\0\0\0\0\0\x23\0\0\0\xb7\x03\0\0\x0e\0\0\0\
+\xb7\x05\0\0\x18\0\0\0\x85\0\0\0\x7e\0\0\0\xb7\0\0\0\0\0\0\0\x95\0\0\0\0\0\0\0\
+\0\0\0\0\x07\0\0\0\0\0\0\0\x42\0\0\0\x7b\0\0\0\x1e\x3c\x01\0\x01\0\0\0\x42\0\0\
+\0\x7b\0\0\0\x24\x3c\x01\0\x02\0\0\0\x42\0\0\0\xee\0\0\0\x1d\x44\x01\0\x03\0\0\
+\0\x42\0\0\0\x0f\x01\0\0\x06\x4c\x01\0\x04\0\0\0\x42\0\0\0\x1a\x01\0\0\x17\x40\
+\x01\0\x05\0\0\0\x42\0\0\0\x1a\x01\0\0\x1d\x40\x01\0\x06\0\0\0\x42\0\0\0\x43\
+\x01\0\0\x06\x58\x01\0\x08\0\0\0\x42\0\0\0\x56\x01\0\0\x03\x5c\x01\0\x0f\0\0\0\
+\x42\0\0\0\xdc\x01\0\0\x02\x64\x01\0\x1f\0\0\0\x42\0\0\0\x2a\x02\0\0\x01\x6c\
+\x01\0\0\0\0\0\x02\0\0\0\x3e\0\0\0\0\0\0\0\x08\0\0\0\x08\0\0\0\x3e\0\0\0\0\0\0\
+\0\x10\0\0\0\x02\0\0\0\xea\0\0\0\0\0\0\0\x20\0\0\0\x02\0\0\0\x3e\0\0\0\0\0\0\0\
+\x28\0\0\0\x08\0\0\0\x3f\x01\0\0\0\0\0\0\x78\0\0\0\x0d\0\0\0\x3e\0\0\0\0\0\0\0\
+\x88\0\0\0\x0d\0\0\0\xea\0\0\0\0\0\0\0\xa8\0\0\0\x0d\0\0\0\x3f\x01\0\0\0\0\0\0\
+\x1a\0\0\0\x21\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\x64\x75\x6d\x70\x5f\x62\x70\x66\x5f\x6d\x61\x70\0\0\0\0\
+\0\0\0\0\x1c\0\0\0\0\0\0\0\x08\0\0\0\0\0\0\0\0\0\0\0\x01\0\0\0\x10\0\0\0\0\0\0\
+\0\0\0\0\0\x0a\0\0\0\x01\0\0\0\0\0\0\0\x08\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\x10\0\0\0\0\0\0\0\x62\x70\x66\x5f\x69\x74\x65\x72\x5f\x62\x70\x66\x5f\x6d\
+\x61\x70\0\0\0\0\0\0\0\0\x47\x50\x4c\0\0\0\0\0\x79\x12\0\0\0\0\0\0\x79\x26\0\0\
+\0\0\0\0\x79\x12\x08\0\0\0\0\0\x15\x02\x3c\0\0\0\0\0\x79\x11\0\0\0\0\0\0\x79\
+\x27\0\0\0\0\0\0\x79\x11\x10\0\0\0\0\0\x55\x01\x08\0\0\0\0\0\xbf\xa4\0\0\0\0\0\
+\0\x07\x04\0\0\xd0\xff\xff\xff\xbf\x61\0\0\0\0\0\0\x18\x62\0\0\0\0\0\0\0\0\0\0\
+\x31\0\0\0\xb7\x03\0\0\x20\0\0\0\xb7\x05\0\0\0\0\0\0\x85\0\0\0\x7e\0\0\0\x7b\
+\x6a\xc8\xff\0\0\0\0\x61\x71\0\0\0\0\0\0\x7b\x1a\xd0\xff\0\0\0\0\xb7\x03\0\0\
+\x04\0\0\0\xbf\x79\0\0\0\0\0\0\x0f\x39\0\0\0\0\0\0\x79\x71\x28\0\0\0\0\0\x79\
+\x78\x30\0\0\0\0\0\x15\x08\x18\0\0\0\0\0\xb7\x02\0\0\0\0\0\0\x0f\x21\0\0\0\0\0\
+\0\x61\x11\x04\0\0\0\0\0\x79\x83\x08\0\0\0\0\0\x67\x01\0\0\x03\0\0\0\x0f\x13\0\
+\0\0\0\0\0\x79\x86\0\0\0\0\0\0\xbf\xa1\0\0\0\0\0\0\x07\x01\0\0\xf8\xff\xff\xff\
+\xb7\x02\0\0\x08\0\0\0\x85\0\0\0\x71\0\0\0\xb7\x01\0\0\0\0\0\0\x79\xa3\xf8\xff\
+\0\0\0\0\x0f\x13\0\0\0\0\0\0\xbf\xa1\0\0\0\0\0\0\x07\x01\0\0\xf4\xff\xff\xff\
+\xb7\x02\0\0\x04\0\0\0\x85\0\0\0\x71\0\0\0\xb7\x03\0\0\x04\0\0\0\x61\xa1\xf4\
+\xff\0\0\0\0\x61\x82\x10\0\0\0\0\0\x3d\x21\x02\0\0\0\0\0\x0f\x16\0\0\0\0\0\0\
+\xbf\x69\0\0\0\0\0\0\x7b\x9a\xd8\xff\0\0\0\0\x79\x71\x18\0\0\0\0\0\x7b\x1a\xe0\
+\xff\0\0\0\0\x79\x71\x20\0\0\0\0\0\x79\x11\0\0\0\0\0\0\x0f\x31\0\0\0\0\0\0\x7b\
+\x1a\xe8\xff\0\0\0\0\xbf\xa4\0\0\0\0\0\0\x07\x04\0\0\xd0\xff\xff\xff\x79\xa1\
+\xc8\xff\0\0\0\0\x18\x62\0\0\0\0\0\0\0\0\0\0\x51\0\0\0\xb7\x03\0\0\x11\0\0\0\
+\xb7\x05\0\0\x20\0\0\0\x85\0\0\0\x7e\0\0\0\xb7\0\0\0\0\0\0\0\x95\0\0\0\0\0\0\0\
+\0\0\0\0\x17\0\0\0\0\0\0\0\x42\0\0\0\x7b\0\0\0\x1e\x80\x01\0\x01\0\0\0\x42\0\0\
+\0\x7b\0\0\0\x24\x80\x01\0\x02\0\0\0\x42\0\0\0\x60\x02\0\0\x1f\x88\x01\0\x03\0\
+\0\0\x42\0\0\0\x84\x02\0\0\x06\x94\x01\0\x04\0\0\0\x42\0\0\0\x1a\x01\0\0\x17\
+\x84\x01\0\x05\0\0\0\x42\0\0\0\x9d\x02\0\0\x0e\xa0\x01\0\x06\0\0\0\x42\0\0\0\
+\x1a\x01\0\0\x1d\x84\x01\0\x07\0\0\0\x42\0\0\0\x43\x01\0\0\x06\xa4\x01\0\x09\0\
+\0\0\x42\0\0\0\xaf\x02\0\0\x03\xa8\x01\0\x11\0\0\0\x42\0\0\0\x1f\x03\0\0\x02\
+\xb0\x01\0\x18\0\0\0\x42\0\0\0\x5a\x03\0\0\x06\x04\x01\0\x1b\0\0\0\x42\0\0\0\0\
+\0\0\0\0\0\0\0\x1c\0\0\0\x42\0\0\0\xab\x03\0\0\x0f\x10\x01\0\x1d\0\0\0\x42\0\0\
+\0\xc0\x03\0\0\x2d\x14\x01\0\x1f\0\0\0\x42\0\0\0\xf7\x03\0\0\x0d\x0c\x01\0\x21\
+\0\0\0\x42\0\0\0\0\0\0\0\0\0\0\0\x22\0\0\0\x42\0\0\0\xc0\x03\0\0\x02\x14\x01\0\
+\x25\0\0\0\x42\0\0\0\x1e\x04\0\0\x0d\x18\x01\0\x28\0\0\0\x42\0\0\0\0\0\0\0\0\0\
+\0\0\x29\0\0\0\x42\0\0\0\x1e\x04\0\0\x0d\x18\x01\0\x2c\0\0\0\x42\0\0\0\x1e\x04\
+\0\0\x0d\x18\x01\0\x2d\0\0\0\x42\0\0\0\x4c\x04\0\0\x1b\x1c\x01\0\x2e\0\0\0\x42\
+\0\0\0\x4c\x04\0\0\x06\x1c\x01\0\x2f\0\0\0\x42\0\0\0\x6f\x04\0\0\x0d\x24\x01\0\
+\x31\0\0\0\x42\0\0\0\x1f\x03\0\0\x02\xb0\x01\0\x40\0\0\0\x42\0\0\0\x2a\x02\0\0\
+\x01\xc0\x01\0\0\0\0\0\x14\0\0\0\x3e\0\0\0\0\0\0\0\x08\0\0\0\x08\0\0\0\x3e\0\0\
+\0\0\0\0\0\x10\0\0\0\x14\0\0\0\xea\0\0\0\0\0\0\0\x20\0\0\0\x14\0\0\0\x3e\0\0\0\
+\0\0\0\0\x28\0\0\0\x18\0\0\0\x3e\0\0\0\0\0\0\0\x30\0\0\0\x08\0\0\0\x3f\x01\0\0\
+\0\0\0\0\x88\0\0\0\x1a\0\0\0\x3e\0\0\0\0\0\0\0\x98\0\0\0\x1a\0\0\0\xea\0\0\0\0\
+\0\0\0\xb0\0\0\0\x1a\0\0\0\x52\x03\0\0\0\0\0\0\xb8\0\0\0\x1a\0\0\0\x56\x03\0\0\
+\0\0\0\0\xc8\0\0\0\x1f\0\0\0\x84\x03\0\0\0\0\0\0\xe0\0\0\0\x20\0\0\0\xea\0\0\0\
+\0\0\0\0\xf8\0\0\0\x20\0\0\0\x3e\0\0\0\0\0\0\0\x20\x01\0\0\x24\0\0\0\x3e\0\0\0\
+\0\0\0\0\x58\x01\0\0\x1a\0\0\0\xea\0\0\0\0\0\0\0\x68\x01\0\0\x20\0\0\0\x46\x04\
+\0\0\0\0\0\0\x90\x01\0\0\x1a\0\0\0\x3f\x01\0\0\0\0\0\0\xa0\x01\0\0\x1a\0\0\0\
+\x87\x04\0\0\0\0\0\0\xa8\x01\0\0\x18\0\0\0\x3e\0\0\0\0\0\0\0\x1a\0\0\0\x42\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\x64\x75\x6d\x70\x5f\x62\x70\x66\x5f\x70\x72\x6f\x67\0\0\0\0\0\0\0\x1c\0\0\
+\0\0\0\0\0\x08\0\0\0\0\0\0\0\0\0\0\0\x01\0\0\0\x10\0\0\0\0\0\0\0\0\0\0\0\x1a\0\
+\0\0\x01\0\0\0\0\0\0\0\x13\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x10\0\0\0\0\0\
+\0\0\x62\x70\x66\x5f\x69\x74\x65\x72\x5f\x62\x70\x66\x5f\x70\x72\x6f\x67\0\0\0\
+\0\0\0\0";
+ opts.insns_sz = 2216;
+ opts.insns = (void *)"\
+\xbf\x16\0\0\0\0\0\0\xbf\xa1\0\0\0\0\0\0\x07\x01\0\0\x78\xff\xff\xff\xb7\x02\0\
+\0\x88\0\0\0\xb7\x03\0\0\0\0\0\0\x85\0\0\0\x71\0\0\0\x05\0\x14\0\0\0\0\0\x61\
+\xa1\x78\xff\0\0\0\0\xd5\x01\x01\0\0\0\0\0\x85\0\0\0\xa8\0\0\0\x61\xa1\x7c\xff\
+\0\0\0\0\xd5\x01\x01\0\0\0\0\0\x85\0\0\0\xa8\0\0\0\x61\xa1\x80\xff\0\0\0\0\xd5\
+\x01\x01\0\0\0\0\0\x85\0\0\0\xa8\0\0\0\x61\xa1\x84\xff\0\0\0\0\xd5\x01\x01\0\0\
+\0\0\0\x85\0\0\0\xa8\0\0\0\x18\x60\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x61\x01\0\0\0\0\
+\0\0\xd5\x01\x02\0\0\0\0\0\xbf\x19\0\0\0\0\0\0\x85\0\0\0\xa8\0\0\0\xbf\x70\0\0\
+\0\0\0\0\x95\0\0\0\0\0\0\0\x61\x60\x08\0\0\0\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\
+\x48\x0e\0\0\x63\x01\0\0\0\0\0\0\x61\x60\x0c\0\0\0\0\0\x18\x61\0\0\0\0\0\0\0\0\
+\0\0\x44\x0e\0\0\x63\x01\0\0\0\0\0\0\x79\x60\x10\0\0\0\0\0\x18\x61\0\0\0\0\0\0\
+\0\0\0\0\x38\x0e\0\0\x7b\x01\0\0\0\0\0\0\x18\x60\0\0\0\0\0\0\0\0\0\0\0\x05\0\0\
+\x18\x61\0\0\0\0\0\0\0\0\0\0\x30\x0e\0\0\x7b\x01\0\0\0\0\0\0\xb7\x01\0\0\x12\0\
+\0\0\x18\x62\0\0\0\0\0\0\0\0\0\0\x30\x0e\0\0\xb7\x03\0\0\x1c\0\0\0\x85\0\0\0\
+\xa6\0\0\0\xbf\x07\0\0\0\0\0\0\xc5\x07\xd4\xff\0\0\0\0\x63\x7a\x78\xff\0\0\0\0\
+\x61\xa0\x78\xff\0\0\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\x80\x0e\0\0\x63\x01\0\0\0\
+\0\0\0\x61\x60\x1c\0\0\0\0\0\x15\0\x03\0\0\0\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\
+\x5c\x0e\0\0\x63\x01\0\0\0\0\0\0\xb7\x01\0\0\0\0\0\0\x18\x62\0\0\0\0\0\0\0\0\0\
+\0\x50\x0e\0\0\xb7\x03\0\0\x48\0\0\0\x85\0\0\0\xa6\0\0\0\xbf\x07\0\0\0\0\0\0\
+\xc5\x07\xc3\xff\0\0\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x63\x71\0\0\0\0\0\
+\0\x79\x63\x20\0\0\0\0\0\x15\x03\x08\0\0\0\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\x98\
+\x0e\0\0\xb7\x02\0\0\x62\0\0\0\x61\x60\x04\0\0\0\0\0\x45\0\x02\0\x01\0\0\0\x85\
+\0\0\0\x94\0\0\0\x05\0\x01\0\0\0\0\0\x85\0\0\0\x71\0\0\0\x18\x62\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\x61\x20\0\0\0\0\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\x08\x0f\0\0\x63\
+\x01\0\0\0\0\0\0\x18\x60\0\0\0\0\0\0\0\0\0\0\0\x0f\0\0\x18\x61\0\0\0\0\0\0\0\0\
+\0\0\x10\x0f\0\0\x7b\x01\0\0\0\0\0\0\x18\x60\0\0\0\0\0\0\0\0\0\0\x98\x0e\0\0\
+\x18\x61\0\0\0\0\0\0\0\0\0\0\x18\x0f\0\0\x7b\x01\0\0\0\0\0\0\xb7\x01\0\0\x02\0\
+\0\0\x18\x62\0\0\0\0\0\0\0\0\0\0\x08\x0f\0\0\xb7\x03\0\0\x20\0\0\0\x85\0\0\0\
+\xa6\0\0\0\xbf\x07\0\0\0\0\0\0\xc5\x07\x9f\xff\0\0\0\0\x18\x62\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\x61\x20\0\0\0\0\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\x28\x0f\0\0\x63\
+\x01\0\0\0\0\0\0\xb7\x01\0\0\x16\0\0\0\x18\x62\0\0\0\0\0\0\0\0\0\0\x28\x0f\0\0\
+\xb7\x03\0\0\x04\0\0\0\x85\0\0\0\xa6\0\0\0\xbf\x07\0\0\0\0\0\0\xc5\x07\x92\xff\
+\0\0\0\0\x18\x60\0\0\0\0\0\0\0\0\0\0\x30\x0f\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\
+\x78\x11\0\0\x7b\x01\0\0\0\0\0\0\x18\x60\0\0\0\0\0\0\0\0\0\0\x38\x0f\0\0\x18\
+\x61\0\0\0\0\0\0\0\0\0\0\x70\x11\0\0\x7b\x01\0\0\0\0\0\0\x18\x60\0\0\0\0\0\0\0\
+\0\0\0\x40\x10\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\xb8\x11\0\0\x7b\x01\0\0\0\0\0\0\
+\x18\x60\0\0\0\0\0\0\0\0\0\0\x48\x10\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\xc8\x11\0\
+\0\x7b\x01\0\0\0\0\0\0\x18\x60\0\0\0\0\0\0\0\0\0\0\xe8\x10\0\0\x18\x61\0\0\0\0\
+\0\0\0\0\0\0\xe8\x11\0\0\x7b\x01\0\0\0\0\0\0\x18\x60\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\x18\x61\0\0\0\0\0\0\0\0\0\0\xe0\x11\0\0\x7b\x01\0\0\0\0\0\0\x61\x60\x08\0\0\
+\0\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\x80\x11\0\0\x63\x01\0\0\0\0\0\0\x61\x60\x0c\
+\0\0\0\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\x84\x11\0\0\x63\x01\0\0\0\0\0\0\x79\x60\
+\x10\0\0\0\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\x88\x11\0\0\x7b\x01\0\0\0\0\0\0\x61\
+\xa0\x78\xff\0\0\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\xb0\x11\0\0\x63\x01\0\0\0\0\0\
+\0\x18\x61\0\0\0\0\0\0\0\0\0\0\xf8\x11\0\0\xb7\x02\0\0\x11\0\0\0\xb7\x03\0\0\
+\x0c\0\0\0\xb7\x04\0\0\0\0\0\0\x85\0\0\0\xa7\0\0\0\xbf\x07\0\0\0\0\0\0\xc5\x07\
+\x5c\xff\0\0\0\0\x18\x60\0\0\0\0\0\0\0\0\0\0\x68\x11\0\0\x63\x70\x6c\0\0\0\0\0\
+\x77\x07\0\0\x20\0\0\0\x63\x70\x70\0\0\0\0\0\xb7\x01\0\0\x05\0\0\0\x18\x62\0\0\
+\0\0\0\0\0\0\0\0\x68\x11\0\0\xb7\x03\0\0\x8c\0\0\0\x85\0\0\0\xa6\0\0\0\xbf\x07\
+\0\0\0\0\0\0\x18\x60\0\0\0\0\0\0\0\0\0\0\xd8\x11\0\0\x61\x01\0\0\0\0\0\0\xd5\
+\x01\x02\0\0\0\0\0\xbf\x19\0\0\0\0\0\0\x85\0\0\0\xa8\0\0\0\xc5\x07\x4a\xff\0\0\
+\0\0\x63\x7a\x80\xff\0\0\0\0\x18\x60\0\0\0\0\0\0\0\0\0\0\x10\x12\0\0\x18\x61\0\
+\0\0\0\0\0\0\0\0\0\x10\x17\0\0\x7b\x01\0\0\0\0\0\0\x18\x60\0\0\0\0\0\0\0\0\0\0\
+\x18\x12\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\x08\x17\0\0\x7b\x01\0\0\0\0\0\0\x18\
+\x60\0\0\0\0\0\0\0\0\0\0\x28\x14\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\x50\x17\0\0\
+\x7b\x01\0\0\0\0\0\0\x18\x60\0\0\0\0\0\0\0\0\0\0\x30\x14\0\0\x18\x61\0\0\0\0\0\
+\0\0\0\0\0\x60\x17\0\0\x7b\x01\0\0\0\0\0\0\x18\x60\0\0\0\0\0\0\0\0\0\0\xd0\x15\
+\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\x80\x17\0\0\x7b\x01\0\0\0\0\0\0\x18\x60\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\x78\x17\0\0\x7b\x01\0\0\0\0\
+\0\0\x61\x60\x08\0\0\0\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\x18\x17\0\0\x63\x01\0\0\
+\0\0\0\0\x61\x60\x0c\0\0\0\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\x1c\x17\0\0\x63\x01\
+\0\0\0\0\0\0\x79\x60\x10\0\0\0\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\x20\x17\0\0\x7b\
+\x01\0\0\0\0\0\0\x61\xa0\x78\xff\0\0\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\x48\x17\0\
+\0\x63\x01\0\0\0\0\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\x90\x17\0\0\xb7\x02\0\0\x12\
+\0\0\0\xb7\x03\0\0\x0c\0\0\0\xb7\x04\0\0\0\0\0\0\x85\0\0\0\xa7\0\0\0\xbf\x07\0\
+\0\0\0\0\0\xc5\x07\x13\xff\0\0\0\0\x18\x60\0\0\0\0\0\0\0\0\0\0\0\x17\0\0\x63\
+\x70\x6c\0\0\0\0\0\x77\x07\0\0\x20\0\0\0\x63\x70\x70\0\0\0\0\0\xb7\x01\0\0\x05\
+\0\0\0\x18\x62\0\0\0\0\0\0\0\0\0\0\0\x17\0\0\xb7\x03\0\0\x8c\0\0\0\x85\0\0\0\
+\xa6\0\0\0\xbf\x07\0\0\0\0\0\0\x18\x60\0\0\0\0\0\0\0\0\0\0\x70\x17\0\0\x61\x01\
+\0\0\0\0\0\0\xd5\x01\x02\0\0\0\0\0\xbf\x19\0\0\0\0\0\0\x85\0\0\0\xa8\0\0\0\xc5\
+\x07\x01\xff\0\0\0\0\x63\x7a\x84\xff\0\0\0\0\x61\xa1\x78\xff\0\0\0\0\xd5\x01\
+\x02\0\0\0\0\0\xbf\x19\0\0\0\0\0\0\x85\0\0\0\xa8\0\0\0\x61\xa0\x80\xff\0\0\0\0\
+\x63\x06\x28\0\0\0\0\0\x61\xa0\x84\xff\0\0\0\0\x63\x06\x2c\0\0\0\0\0\x18\x61\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\x61\x10\0\0\0\0\0\0\x63\x06\x18\0\0\0\0\0\xb7\0\0\0\
+\0\0\0\0\x95\0\0\0\0\0\0\0";
+ err = bpf_load_and_run(&opts);
+ if (err < 0)
+ return err;
+ skel->rodata = skel_finalize_map_data(&skel->maps.rodata.initial_value,
+ 4096, PROT_READ, skel->maps.rodata.map_fd);
+ if (!skel->rodata)
+ return -ENOMEM;
+ return 0;
+}
+
+static inline struct iterators_bpf *
+iterators_bpf__open_and_load(void)
+{
+ struct iterators_bpf *skel;
+
+ skel = iterators_bpf__open();
+ if (!skel)
+ return NULL;
+ if (iterators_bpf__load(skel)) {
+ iterators_bpf__destroy(skel);
+ return NULL;
+ }
+ return skel;
+}
+
+#endif /* __ITERATORS_BPF_SKEL_H__ */
diff --git a/kernel/bpf/preload/iterators/iterators.skel.h b/kernel/bpf/preload/iterators/iterators.skel.h
deleted file mode 100644
index cf9a6a94b3a4..000000000000
--- a/kernel/bpf/preload/iterators/iterators.skel.h
+++ /dev/null
@@ -1,412 +0,0 @@
-/* SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) */
-
-/* THIS FILE IS AUTOGENERATED! */
-#ifndef __ITERATORS_BPF_SKEL_H__
-#define __ITERATORS_BPF_SKEL_H__
-
-#include <stdlib.h>
-#include <bpf/libbpf.h>
-
-struct iterators_bpf {
- struct bpf_object_skeleton *skeleton;
- struct bpf_object *obj;
- struct {
- struct bpf_map *rodata;
- } maps;
- struct {
- struct bpf_program *dump_bpf_map;
- struct bpf_program *dump_bpf_prog;
- } progs;
- struct {
- struct bpf_link *dump_bpf_map;
- struct bpf_link *dump_bpf_prog;
- } links;
- struct iterators_bpf__rodata {
- char dump_bpf_map____fmt[35];
- char dump_bpf_map____fmt_1[14];
- char dump_bpf_prog____fmt[32];
- char dump_bpf_prog____fmt_2[17];
- } *rodata;
-};
-
-static void
-iterators_bpf__destroy(struct iterators_bpf *obj)
-{
- if (!obj)
- return;
- if (obj->skeleton)
- bpf_object__destroy_skeleton(obj->skeleton);
- free(obj);
-}
-
-static inline int
-iterators_bpf__create_skeleton(struct iterators_bpf *obj);
-
-static inline struct iterators_bpf *
-iterators_bpf__open_opts(const struct bpf_object_open_opts *opts)
-{
- struct iterators_bpf *obj;
-
- obj = (struct iterators_bpf *)calloc(1, sizeof(*obj));
- if (!obj)
- return NULL;
- if (iterators_bpf__create_skeleton(obj))
- goto err;
- if (bpf_object__open_skeleton(obj->skeleton, opts))
- goto err;
-
- return obj;
-err:
- iterators_bpf__destroy(obj);
- return NULL;
-}
-
-static inline struct iterators_bpf *
-iterators_bpf__open(void)
-{
- return iterators_bpf__open_opts(NULL);
-}
-
-static inline int
-iterators_bpf__load(struct iterators_bpf *obj)
-{
- return bpf_object__load_skeleton(obj->skeleton);
-}
-
-static inline struct iterators_bpf *
-iterators_bpf__open_and_load(void)
-{
- struct iterators_bpf *obj;
-
- obj = iterators_bpf__open();
- if (!obj)
- return NULL;
- if (iterators_bpf__load(obj)) {
- iterators_bpf__destroy(obj);
- return NULL;
- }
- return obj;
-}
-
-static inline int
-iterators_bpf__attach(struct iterators_bpf *obj)
-{
- return bpf_object__attach_skeleton(obj->skeleton);
-}
-
-static inline void
-iterators_bpf__detach(struct iterators_bpf *obj)
-{
- return bpf_object__detach_skeleton(obj->skeleton);
-}
-
-static inline int
-iterators_bpf__create_skeleton(struct iterators_bpf *obj)
-{
- struct bpf_object_skeleton *s;
-
- s = (struct bpf_object_skeleton *)calloc(1, sizeof(*s));
- if (!s)
- return -1;
- obj->skeleton = s;
-
- s->sz = sizeof(*s);
- s->name = "iterators_bpf";
- s->obj = &obj->obj;
-
- /* maps */
- s->map_cnt = 1;
- s->map_skel_sz = sizeof(*s->maps);
- s->maps = (struct bpf_map_skeleton *)calloc(s->map_cnt, s->map_skel_sz);
- if (!s->maps)
- goto err;
-
- s->maps[0].name = "iterator.rodata";
- s->maps[0].map = &obj->maps.rodata;
- s->maps[0].mmaped = (void **)&obj->rodata;
-
- /* programs */
- s->prog_cnt = 2;
- s->prog_skel_sz = sizeof(*s->progs);
- s->progs = (struct bpf_prog_skeleton *)calloc(s->prog_cnt, s->prog_skel_sz);
- if (!s->progs)
- goto err;
-
- s->progs[0].name = "dump_bpf_map";
- s->progs[0].prog = &obj->progs.dump_bpf_map;
- s->progs[0].link = &obj->links.dump_bpf_map;
-
- s->progs[1].name = "dump_bpf_prog";
- s->progs[1].prog = &obj->progs.dump_bpf_prog;
- s->progs[1].link = &obj->links.dump_bpf_prog;
-
- s->data_sz = 7176;
- s->data = (void *)"\
-\x7f\x45\x4c\x46\x02\x01\x01\0\0\0\0\0\0\0\0\0\x01\0\xf7\0\x01\0\0\0\0\0\0\0\0\
-\0\0\0\0\0\0\0\0\0\0\0\x48\x18\0\0\0\0\0\0\0\0\0\0\x40\0\0\0\0\0\x40\0\x0f\0\
-\x0e\0\x79\x12\0\0\0\0\0\0\x79\x26\0\0\0\0\0\0\x79\x17\x08\0\0\0\0\0\x15\x07\
-\x1a\0\0\0\0\0\x79\x21\x10\0\0\0\0\0\x55\x01\x08\0\0\0\0\0\xbf\xa4\0\0\0\0\0\0\
-\x07\x04\0\0\xe8\xff\xff\xff\xbf\x61\0\0\0\0\0\0\x18\x02\0\0\0\0\0\0\0\0\0\0\0\
-\0\0\0\xb7\x03\0\0\x23\0\0\0\xb7\x05\0\0\0\0\0\0\x85\0\0\0\x7e\0\0\0\x61\x71\0\
-\0\0\0\0\0\x7b\x1a\xe8\xff\0\0\0\0\xb7\x01\0\0\x04\0\0\0\xbf\x72\0\0\0\0\0\0\
-\x0f\x12\0\0\0\0\0\0\x7b\x2a\xf0\xff\0\0\0\0\x61\x71\x14\0\0\0\0\0\x7b\x1a\xf8\
-\xff\0\0\0\0\xbf\xa4\0\0\0\0\0\0\x07\x04\0\0\xe8\xff\xff\xff\xbf\x61\0\0\0\0\0\
-\0\x18\x02\0\0\x23\0\0\0\0\0\0\0\0\0\0\0\xb7\x03\0\0\x0e\0\0\0\xb7\x05\0\0\x18\
-\0\0\0\x85\0\0\0\x7e\0\0\0\xb7\0\0\0\0\0\0\0\x95\0\0\0\0\0\0\0\x79\x12\0\0\0\0\
-\0\0\x79\x26\0\0\0\0\0\0\x79\x11\x08\0\0\0\0\0\x15\x01\x3b\0\0\0\0\0\x79\x17\0\
-\0\0\0\0\0\x79\x21\x10\0\0\0\0\0\x55\x01\x08\0\0\0\0\0\xbf\xa4\0\0\0\0\0\0\x07\
-\x04\0\0\xd0\xff\xff\xff\xbf\x61\0\0\0\0\0\0\x18\x02\0\0\x31\0\0\0\0\0\0\0\0\0\
-\0\0\xb7\x03\0\0\x20\0\0\0\xb7\x05\0\0\0\0\0\0\x85\0\0\0\x7e\0\0\0\x7b\x6a\xc8\
-\xff\0\0\0\0\x61\x71\0\0\0\0\0\0\x7b\x1a\xd0\xff\0\0\0\0\xb7\x03\0\0\x04\0\0\0\
-\xbf\x79\0\0\0\0\0\0\x0f\x39\0\0\0\0\0\0\x79\x71\x28\0\0\0\0\0\x79\x78\x30\0\0\
-\0\0\0\x15\x08\x18\0\0\0\0\0\xb7\x02\0\0\0\0\0\0\x0f\x21\0\0\0\0\0\0\x61\x11\
-\x04\0\0\0\0\0\x79\x83\x08\0\0\0\0\0\x67\x01\0\0\x03\0\0\0\x0f\x13\0\0\0\0\0\0\
-\x79\x86\0\0\0\0\0\0\xbf\xa1\0\0\0\0\0\0\x07\x01\0\0\xf8\xff\xff\xff\xb7\x02\0\
-\0\x08\0\0\0\x85\0\0\0\x71\0\0\0\xb7\x01\0\0\0\0\0\0\x79\xa3\xf8\xff\0\0\0\0\
-\x0f\x13\0\0\0\0\0\0\xbf\xa1\0\0\0\0\0\0\x07\x01\0\0\xf4\xff\xff\xff\xb7\x02\0\
-\0\x04\0\0\0\x85\0\0\0\x71\0\0\0\xb7\x03\0\0\x04\0\0\0\x61\xa1\xf4\xff\0\0\0\0\
-\x61\x82\x10\0\0\0\0\0\x3d\x21\x02\0\0\0\0\0\x0f\x16\0\0\0\0\0\0\xbf\x69\0\0\0\
-\0\0\0\x7b\x9a\xd8\xff\0\0\0\0\x79\x71\x18\0\0\0\0\0\x7b\x1a\xe0\xff\0\0\0\0\
-\x79\x71\x20\0\0\0\0\0\x79\x11\0\0\0\0\0\0\x0f\x31\0\0\0\0\0\0\x7b\x1a\xe8\xff\
-\0\0\0\0\xbf\xa4\0\0\0\0\0\0\x07\x04\0\0\xd0\xff\xff\xff\x79\xa1\xc8\xff\0\0\0\
-\0\x18\x02\0\0\x51\0\0\0\0\0\0\0\0\0\0\0\xb7\x03\0\0\x11\0\0\0\xb7\x05\0\0\x20\
-\0\0\0\x85\0\0\0\x7e\0\0\0\xb7\0\0\0\0\0\0\0\x95\0\0\0\0\0\0\0\x20\x20\x69\x64\
-\x20\x6e\x61\x6d\x65\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x6d\
-\x61\x78\x5f\x65\x6e\x74\x72\x69\x65\x73\x0a\0\x25\x34\x75\x20\x25\x2d\x31\x36\
-\x73\x25\x36\x64\x0a\0\x20\x20\x69\x64\x20\x6e\x61\x6d\x65\x20\x20\x20\x20\x20\
-\x20\x20\x20\x20\x20\x20\x20\x20\x61\x74\x74\x61\x63\x68\x65\x64\x0a\0\x25\x34\
-\x75\x20\x25\x2d\x31\x36\x73\x20\x25\x73\x20\x25\x73\x0a\0\x47\x50\x4c\0\x9f\
-\xeb\x01\0\x18\0\0\0\0\0\0\0\x1c\x04\0\0\x1c\x04\0\0\x09\x05\0\0\0\0\0\0\0\0\0\
-\x02\x02\0\0\0\x01\0\0\0\x02\0\0\x04\x10\0\0\0\x13\0\0\0\x03\0\0\0\0\0\0\0\x18\
-\0\0\0\x04\0\0\0\x40\0\0\0\0\0\0\0\0\0\0\x02\x08\0\0\0\0\0\0\0\0\0\0\x02\x0d\0\
-\0\0\0\0\0\0\x01\0\0\x0d\x06\0\0\0\x1c\0\0\0\x01\0\0\0\x20\0\0\0\0\0\0\x01\x04\
-\0\0\0\x20\0\0\x01\x24\0\0\0\x01\0\0\x0c\x05\0\0\0\xaf\0\0\0\x03\0\0\x04\x18\0\
-\0\0\xbd\0\0\0\x09\0\0\0\0\0\0\0\xc1\0\0\0\x0b\0\0\0\x40\0\0\0\xcc\0\0\0\x0b\0\
-\0\0\x80\0\0\0\0\0\0\0\0\0\0\x02\x0a\0\0\0\xd4\0\0\0\0\0\0\x07\0\0\0\0\xdd\0\0\
-\0\0\0\0\x08\x0c\0\0\0\xe3\0\0\0\0\0\0\x01\x08\0\0\0\x40\0\0\0\xa4\x01\0\0\x03\
-\0\0\x04\x18\0\0\0\xac\x01\0\0\x0e\0\0\0\0\0\0\0\xaf\x01\0\0\x11\0\0\0\x20\0\0\
-\0\xb4\x01\0\0\x0e\0\0\0\xa0\0\0\0\xc0\x01\0\0\0\0\0\x08\x0f\0\0\0\xc6\x01\0\0\
-\0\0\0\x01\x04\0\0\0\x20\0\0\0\xd3\x01\0\0\0\0\0\x01\x01\0\0\0\x08\0\0\x01\0\0\
-\0\0\0\0\0\x03\0\0\0\0\x10\0\0\0\x12\0\0\0\x10\0\0\0\xd8\x01\0\0\0\0\0\x01\x04\
-\0\0\0\x20\0\0\0\0\0\0\0\0\0\0\x02\x14\0\0\0\x3c\x02\0\0\x02\0\0\x04\x10\0\0\0\
-\x13\0\0\0\x03\0\0\0\0\0\0\0\x4f\x02\0\0\x15\0\0\0\x40\0\0\0\0\0\0\0\0\0\0\x02\
-\x18\0\0\0\0\0\0\0\x01\0\0\x0d\x06\0\0\0\x1c\0\0\0\x13\0\0\0\x54\x02\0\0\x01\0\
-\0\x0c\x16\0\0\0\xa0\x02\0\0\x01\0\0\x04\x08\0\0\0\xa9\x02\0\0\x19\0\0\0\0\0\0\
-\0\0\0\0\0\0\0\0\x02\x1a\0\0\0\xfa\x02\0\0\x06\0\0\x04\x38\0\0\0\xac\x01\0\0\
-\x0e\0\0\0\0\0\0\0\xaf\x01\0\0\x11\0\0\0\x20\0\0\0\x07\x03\0\0\x1b\0\0\0\xc0\0\
-\0\0\x18\x03\0\0\x15\0\0\0\0\x01\0\0\x21\x03\0\0\x1d\0\0\0\x40\x01\0\0\x2b\x03\
-\0\0\x1e\0\0\0\x80\x01\0\0\0\0\0\0\0\0\0\x02\x1c\0\0\0\0\0\0\0\0\0\0\x0a\x10\0\
-\0\0\0\0\0\0\0\0\0\x02\x1f\0\0\0\0\0\0\0\0\0\0\x02\x20\0\0\0\x75\x03\0\0\x02\0\
-\0\x04\x08\0\0\0\x83\x03\0\0\x0e\0\0\0\0\0\0\0\x8c\x03\0\0\x0e\0\0\0\x20\0\0\0\
-\x2b\x03\0\0\x03\0\0\x04\x18\0\0\0\x96\x03\0\0\x1b\0\0\0\0\0\0\0\x9e\x03\0\0\
-\x21\0\0\0\x40\0\0\0\xa4\x03\0\0\x23\0\0\0\x80\0\0\0\0\0\0\0\0\0\0\x02\x22\0\0\
-\0\0\0\0\0\0\0\0\x02\x24\0\0\0\xa8\x03\0\0\x01\0\0\x04\x04\0\0\0\xb3\x03\0\0\
-\x0e\0\0\0\0\0\0\0\x1c\x04\0\0\x01\0\0\x04\x04\0\0\0\x25\x04\0\0\x0e\0\0\0\0\0\
-\0\0\0\0\0\0\0\0\0\x03\0\0\0\0\x1c\0\0\0\x12\0\0\0\x23\0\0\0\x9b\x04\0\0\0\0\0\
-\x0e\x25\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x03\0\0\0\0\x1c\0\0\0\x12\0\0\0\x0e\0\0\0\
-\xaf\x04\0\0\0\0\0\x0e\x27\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x03\0\0\0\0\x1c\0\0\0\
-\x12\0\0\0\x20\0\0\0\xc5\x04\0\0\0\0\0\x0e\x29\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x03\
-\0\0\0\0\x1c\0\0\0\x12\0\0\0\x11\0\0\0\xda\x04\0\0\0\0\0\x0e\x2b\0\0\0\0\0\0\0\
-\0\0\0\0\0\0\0\x03\0\0\0\0\x10\0\0\0\x12\0\0\0\x04\0\0\0\xf1\x04\0\0\0\0\0\x0e\
-\x2d\0\0\0\x01\0\0\0\xf9\x04\0\0\x04\0\0\x0f\0\0\0\0\x26\0\0\0\0\0\0\0\x23\0\0\
-\0\x28\0\0\0\x23\0\0\0\x0e\0\0\0\x2a\0\0\0\x31\0\0\0\x20\0\0\0\x2c\0\0\0\x51\0\
-\0\0\x11\0\0\0\x01\x05\0\0\x01\0\0\x0f\0\0\0\0\x2e\0\0\0\0\0\0\0\x04\0\0\0\0\
-\x62\x70\x66\x5f\x69\x74\x65\x72\x5f\x5f\x62\x70\x66\x5f\x6d\x61\x70\0\x6d\x65\
-\x74\x61\0\x6d\x61\x70\0\x63\x74\x78\0\x69\x6e\x74\0\x64\x75\x6d\x70\x5f\x62\
-\x70\x66\x5f\x6d\x61\x70\0\x69\x74\x65\x72\x2f\x62\x70\x66\x5f\x6d\x61\x70\0\
-\x30\x3a\x30\0\x2f\x68\x6f\x6d\x65\x2f\x61\x6c\x72\x75\x61\x2f\x62\x75\x69\x6c\
-\x64\x2f\x6c\x69\x6e\x75\x78\x2f\x6b\x65\x72\x6e\x65\x6c\x2f\x62\x70\x66\x2f\
-\x70\x72\x65\x6c\x6f\x61\x64\x2f\x69\x74\x65\x72\x61\x74\x6f\x72\x73\x2f\x69\
-\x74\x65\x72\x61\x74\x6f\x72\x73\x2e\x62\x70\x66\x2e\x63\0\x09\x73\x74\x72\x75\
-\x63\x74\x20\x73\x65\x71\x5f\x66\x69\x6c\x65\x20\x2a\x73\x65\x71\x20\x3d\x20\
-\x63\x74\x78\x2d\x3e\x6d\x65\x74\x61\x2d\x3e\x73\x65\x71\x3b\0\x62\x70\x66\x5f\
-\x69\x74\x65\x72\x5f\x6d\x65\x74\x61\0\x73\x65\x71\0\x73\x65\x73\x73\x69\x6f\
-\x6e\x5f\x69\x64\0\x73\x65\x71\x5f\x6e\x75\x6d\0\x73\x65\x71\x5f\x66\x69\x6c\
-\x65\0\x5f\x5f\x75\x36\x34\0\x6c\x6f\x6e\x67\x20\x6c\x6f\x6e\x67\x20\x75\x6e\
-\x73\x69\x67\x6e\x65\x64\x20\x69\x6e\x74\0\x30\x3a\x31\0\x09\x73\x74\x72\x75\
-\x63\x74\x20\x62\x70\x66\x5f\x6d\x61\x70\x20\x2a\x6d\x61\x70\x20\x3d\x20\x63\
-\x74\x78\x2d\x3e\x6d\x61\x70\x3b\0\x09\x69\x66\x20\x28\x21\x6d\x61\x70\x29\0\
-\x30\x3a\x32\0\x09\x5f\x5f\x75\x36\x34\x20\x73\x65\x71\x5f\x6e\x75\x6d\x20\x3d\
-\x20\x63\x74\x78\x2d\x3e\x6d\x65\x74\x61\x2d\x3e\x73\x65\x71\x5f\x6e\x75\x6d\
-\x3b\0\x09\x69\x66\x20\x28\x73\x65\x71\x5f\x6e\x75\x6d\x20\x3d\x3d\x20\x30\x29\
-\0\x09\x09\x42\x50\x46\x5f\x53\x45\x51\x5f\x50\x52\x49\x4e\x54\x46\x28\x73\x65\
-\x71\x2c\x20\x22\x20\x20\x69\x64\x20\x6e\x61\x6d\x65\x20\x20\x20\x20\x20\x20\
-\x20\x20\x20\x20\x20\x20\x20\x6d\x61\x78\x5f\x65\x6e\x74\x72\x69\x65\x73\x5c\
-\x6e\x22\x29\x3b\0\x62\x70\x66\x5f\x6d\x61\x70\0\x69\x64\0\x6e\x61\x6d\x65\0\
-\x6d\x61\x78\x5f\x65\x6e\x74\x72\x69\x65\x73\0\x5f\x5f\x75\x33\x32\0\x75\x6e\
-\x73\x69\x67\x6e\x65\x64\x20\x69\x6e\x74\0\x63\x68\x61\x72\0\x5f\x5f\x41\x52\
-\x52\x41\x59\x5f\x53\x49\x5a\x45\x5f\x54\x59\x50\x45\x5f\x5f\0\x09\x42\x50\x46\
-\x5f\x53\x45\x51\x5f\x50\x52\x49\x4e\x54\x46\x28\x73\x65\x71\x2c\x20\x22\x25\
-\x34\x75\x20\x25\x2d\x31\x36\x73\x25\x36\x64\x5c\x6e\x22\x2c\x20\x6d\x61\x70\
-\x2d\x3e\x69\x64\x2c\x20\x6d\x61\x70\x2d\x3e\x6e\x61\x6d\x65\x2c\x20\x6d\x61\
-\x70\x2d\x3e\x6d\x61\x78\x5f\x65\x6e\x74\x72\x69\x65\x73\x29\x3b\0\x7d\0\x62\
-\x70\x66\x5f\x69\x74\x65\x72\x5f\x5f\x62\x70\x66\x5f\x70\x72\x6f\x67\0\x70\x72\
-\x6f\x67\0\x64\x75\x6d\x70\x5f\x62\x70\x66\x5f\x70\x72\x6f\x67\0\x69\x74\x65\
-\x72\x2f\x62\x70\x66\x5f\x70\x72\x6f\x67\0\x09\x73\x74\x72\x75\x63\x74\x20\x62\
-\x70\x66\x5f\x70\x72\x6f\x67\x20\x2a\x70\x72\x6f\x67\x20\x3d\x20\x63\x74\x78\
-\x2d\x3e\x70\x72\x6f\x67\x3b\0\x09\x69\x66\x20\x28\x21\x70\x72\x6f\x67\x29\0\
-\x62\x70\x66\x5f\x70\x72\x6f\x67\0\x61\x75\x78\0\x09\x61\x75\x78\x20\x3d\x20\
-\x70\x72\x6f\x67\x2d\x3e\x61\x75\x78\x3b\0\x09\x09\x42\x50\x46\x5f\x53\x45\x51\
-\x5f\x50\x52\x49\x4e\x54\x46\x28\x73\x65\x71\x2c\x20\x22\x20\x20\x69\x64\x20\
-\x6e\x61\x6d\x65\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x61\x74\
-\x74\x61\x63\x68\x65\x64\x5c\x6e\x22\x29\x3b\0\x62\x70\x66\x5f\x70\x72\x6f\x67\
-\x5f\x61\x75\x78\0\x61\x74\x74\x61\x63\x68\x5f\x66\x75\x6e\x63\x5f\x6e\x61\x6d\
-\x65\0\x64\x73\x74\x5f\x70\x72\x6f\x67\0\x66\x75\x6e\x63\x5f\x69\x6e\x66\x6f\0\
-\x62\x74\x66\0\x09\x42\x50\x46\x5f\x53\x45\x51\x5f\x50\x52\x49\x4e\x54\x46\x28\
-\x73\x65\x71\x2c\x20\x22\x25\x34\x75\x20\x25\x2d\x31\x36\x73\x20\x25\x73\x20\
-\x25\x73\x5c\x6e\x22\x2c\x20\x61\x75\x78\x2d\x3e\x69\x64\x2c\0\x30\x3a\x34\0\
-\x30\x3a\x35\0\x09\x69\x66\x20\x28\x21\x62\x74\x66\x29\0\x62\x70\x66\x5f\x66\
-\x75\x6e\x63\x5f\x69\x6e\x66\x6f\0\x69\x6e\x73\x6e\x5f\x6f\x66\x66\0\x74\x79\
-\x70\x65\x5f\x69\x64\0\x30\0\x73\x74\x72\x69\x6e\x67\x73\0\x74\x79\x70\x65\x73\
-\0\x68\x64\x72\0\x62\x74\x66\x5f\x68\x65\x61\x64\x65\x72\0\x73\x74\x72\x5f\x6c\
-\x65\x6e\0\x09\x74\x79\x70\x65\x73\x20\x3d\x20\x62\x74\x66\x2d\x3e\x74\x79\x70\
-\x65\x73\x3b\0\x09\x62\x70\x66\x5f\x70\x72\x6f\x62\x65\x5f\x72\x65\x61\x64\x5f\
-\x6b\x65\x72\x6e\x65\x6c\x28\x26\x74\x2c\x20\x73\x69\x7a\x65\x6f\x66\x28\x74\
-\x29\x2c\x20\x74\x79\x70\x65\x73\x20\x2b\x20\x62\x74\x66\x5f\x69\x64\x29\x3b\0\
-\x09\x73\x74\x72\x20\x3d\x20\x62\x74\x66\x2d\x3e\x73\x74\x72\x69\x6e\x67\x73\
-\x3b\0\x62\x74\x66\x5f\x74\x79\x70\x65\0\x6e\x61\x6d\x65\x5f\x6f\x66\x66\0\x09\
-\x6e\x61\x6d\x65\x5f\x6f\x66\x66\x20\x3d\x20\x42\x50\x46\x5f\x43\x4f\x52\x45\
-\x5f\x52\x45\x41\x44\x28\x74\x2c\x20\x6e\x61\x6d\x65\x5f\x6f\x66\x66\x29\x3b\0\
-\x30\x3a\x32\x3a\x30\0\x09\x69\x66\x20\x28\x6e\x61\x6d\x65\x5f\x6f\x66\x66\x20\
-\x3e\x3d\x20\x62\x74\x66\x2d\x3e\x68\x64\x72\x2e\x73\x74\x72\x5f\x6c\x65\x6e\
-\x29\0\x09\x72\x65\x74\x75\x72\x6e\x20\x73\x74\x72\x20\x2b\x20\x6e\x61\x6d\x65\
-\x5f\x6f\x66\x66\x3b\0\x30\x3a\x33\0\x64\x75\x6d\x70\x5f\x62\x70\x66\x5f\x6d\
-\x61\x70\x2e\x5f\x5f\x5f\x66\x6d\x74\0\x64\x75\x6d\x70\x5f\x62\x70\x66\x5f\x6d\
-\x61\x70\x2e\x5f\x5f\x5f\x66\x6d\x74\x2e\x31\0\x64\x75\x6d\x70\x5f\x62\x70\x66\
-\x5f\x70\x72\x6f\x67\x2e\x5f\x5f\x5f\x66\x6d\x74\0\x64\x75\x6d\x70\x5f\x62\x70\
-\x66\x5f\x70\x72\x6f\x67\x2e\x5f\x5f\x5f\x66\x6d\x74\x2e\x32\0\x4c\x49\x43\x45\
-\x4e\x53\x45\0\x2e\x72\x6f\x64\x61\x74\x61\0\x6c\x69\x63\x65\x6e\x73\x65\0\x9f\
-\xeb\x01\0\x20\0\0\0\0\0\0\0\x24\0\0\0\x24\0\0\0\x44\x02\0\0\x68\x02\0\0\xa4\
-\x01\0\0\x08\0\0\0\x31\0\0\0\x01\0\0\0\0\0\0\0\x07\0\0\0\x62\x02\0\0\x01\0\0\0\
-\0\0\0\0\x17\0\0\0\x10\0\0\0\x31\0\0\0\x09\0\0\0\0\0\0\0\x42\0\0\0\x87\0\0\0\
-\x1e\x40\x01\0\x08\0\0\0\x42\0\0\0\x87\0\0\0\x24\x40\x01\0\x10\0\0\0\x42\0\0\0\
-\xfe\0\0\0\x1d\x48\x01\0\x18\0\0\0\x42\0\0\0\x1f\x01\0\0\x06\x50\x01\0\x20\0\0\
-\0\x42\0\0\0\x2e\x01\0\0\x1d\x44\x01\0\x28\0\0\0\x42\0\0\0\x53\x01\0\0\x06\x5c\
-\x01\0\x38\0\0\0\x42\0\0\0\x66\x01\0\0\x03\x60\x01\0\x70\0\0\0\x42\0\0\0\xec\
-\x01\0\0\x02\x68\x01\0\xf0\0\0\0\x42\0\0\0\x3a\x02\0\0\x01\x70\x01\0\x62\x02\0\
-\0\x1a\0\0\0\0\0\0\0\x42\0\0\0\x87\0\0\0\x1e\x84\x01\0\x08\0\0\0\x42\0\0\0\x87\
-\0\0\0\x24\x84\x01\0\x10\0\0\0\x42\0\0\0\x70\x02\0\0\x1f\x8c\x01\0\x18\0\0\0\
-\x42\0\0\0\x94\x02\0\0\x06\x98\x01\0\x20\0\0\0\x42\0\0\0\xad\x02\0\0\x0e\xa4\
-\x01\0\x28\0\0\0\x42\0\0\0\x2e\x01\0\0\x1d\x88\x01\0\x30\0\0\0\x42\0\0\0\x53\
-\x01\0\0\x06\xa8\x01\0\x40\0\0\0\x42\0\0\0\xbf\x02\0\0\x03\xac\x01\0\x80\0\0\0\
-\x42\0\0\0\x2f\x03\0\0\x02\xb4\x01\0\xb8\0\0\0\x42\0\0\0\x6a\x03\0\0\x06\x08\
-\x01\0\xd0\0\0\0\x42\0\0\0\0\0\0\0\0\0\0\0\xd8\0\0\0\x42\0\0\0\xbb\x03\0\0\x0f\
-\x14\x01\0\xe0\0\0\0\x42\0\0\0\xd0\x03\0\0\x2d\x18\x01\0\xf0\0\0\0\x42\0\0\0\
-\x07\x04\0\0\x0d\x10\x01\0\0\x01\0\0\x42\0\0\0\0\0\0\0\0\0\0\0\x08\x01\0\0\x42\
-\0\0\0\xd0\x03\0\0\x02\x18\x01\0\x20\x01\0\0\x42\0\0\0\x2e\x04\0\0\x0d\x1c\x01\
-\0\x38\x01\0\0\x42\0\0\0\0\0\0\0\0\0\0\0\x40\x01\0\0\x42\0\0\0\x2e\x04\0\0\x0d\
-\x1c\x01\0\x58\x01\0\0\x42\0\0\0\x2e\x04\0\0\x0d\x1c\x01\0\x60\x01\0\0\x42\0\0\
-\0\x5c\x04\0\0\x1b\x20\x01\0\x68\x01\0\0\x42\0\0\0\x5c\x04\0\0\x06\x20\x01\0\
-\x70\x01\0\0\x42\0\0\0\x7f\x04\0\0\x0d\x28\x01\0\x78\x01\0\0\x42\0\0\0\0\0\0\0\
-\0\0\0\0\x80\x01\0\0\x42\0\0\0\x2f\x03\0\0\x02\xb4\x01\0\xf8\x01\0\0\x42\0\0\0\
-\x3a\x02\0\0\x01\xc4\x01\0\x10\0\0\0\x31\0\0\0\x07\0\0\0\0\0\0\0\x02\0\0\0\x3e\
-\0\0\0\0\0\0\0\x08\0\0\0\x08\0\0\0\x3e\0\0\0\0\0\0\0\x10\0\0\0\x02\0\0\0\xfa\0\
-\0\0\0\0\0\0\x20\0\0\0\x08\0\0\0\x2a\x01\0\0\0\0\0\0\x70\0\0\0\x0d\0\0\0\x3e\0\
-\0\0\0\0\0\0\x80\0\0\0\x0d\0\0\0\xfa\0\0\0\0\0\0\0\xa0\0\0\0\x0d\0\0\0\x2a\x01\
-\0\0\0\0\0\0\x62\x02\0\0\x12\0\0\0\0\0\0\0\x14\0\0\0\x3e\0\0\0\0\0\0\0\x08\0\0\
-\0\x08\0\0\0\x3e\0\0\0\0\0\0\0\x10\0\0\0\x14\0\0\0\xfa\0\0\0\0\0\0\0\x20\0\0\0\
-\x18\0\0\0\x3e\0\0\0\0\0\0\0\x28\0\0\0\x08\0\0\0\x2a\x01\0\0\0\0\0\0\x80\0\0\0\
-\x1a\0\0\0\x3e\0\0\0\0\0\0\0\x90\0\0\0\x1a\0\0\0\xfa\0\0\0\0\0\0\0\xa8\0\0\0\
-\x1a\0\0\0\x62\x03\0\0\0\0\0\0\xb0\0\0\0\x1a\0\0\0\x66\x03\0\0\0\0\0\0\xc0\0\0\
-\0\x1f\0\0\0\x94\x03\0\0\0\0\0\0\xd8\0\0\0\x20\0\0\0\xfa\0\0\0\0\0\0\0\xf0\0\0\
-\0\x20\0\0\0\x3e\0\0\0\0\0\0\0\x18\x01\0\0\x24\0\0\0\x3e\0\0\0\0\0\0\0\x50\x01\
-\0\0\x1a\0\0\0\xfa\0\0\0\0\0\0\0\x60\x01\0\0\x20\0\0\0\x56\x04\0\0\0\0\0\0\x88\
-\x01\0\0\x1a\0\0\0\x2a\x01\0\0\0\0\0\0\x98\x01\0\0\x1a\0\0\0\x97\x04\0\0\0\0\0\
-\0\xa0\x01\0\0\x18\0\0\0\x3e\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
-\0\0\0\0\0\0\0\x91\0\0\0\x04\0\xf1\xff\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\xe6\0\0\
-\0\0\0\x02\0\x70\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\xd8\0\0\0\0\0\x02\0\xf0\0\0\0\0\
-\0\0\0\0\0\0\0\0\0\0\0\xdf\0\0\0\0\0\x03\0\x78\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
-\xd1\0\0\0\0\0\x03\0\x80\x01\0\0\0\0\0\0\0\0\0\0\0\0\0\0\xca\0\0\0\0\0\x03\0\
-\xf8\x01\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x14\0\0\0\x01\0\x04\0\0\0\0\0\0\0\0\0\x23\
-\0\0\0\0\0\0\0\x04\x01\0\0\x01\0\x04\0\x23\0\0\0\0\0\0\0\x0e\0\0\0\0\0\0\0\x28\
-\0\0\0\x01\0\x04\0\x31\0\0\0\0\0\0\0\x20\0\0\0\0\0\0\0\xed\0\0\0\x01\0\x04\0\
-\x51\0\0\0\0\0\0\0\x11\0\0\0\0\0\0\0\0\0\0\0\x03\0\x02\0\0\0\0\0\0\0\0\0\0\0\0\
-\0\0\0\0\0\0\0\0\0\x03\0\x03\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x03\0\
-\x04\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\xc2\0\0\0\x11\0\x05\0\0\0\0\0\0\0\0\0\
-\x04\0\0\0\0\0\0\0\x3d\0\0\0\x12\0\x02\0\0\0\0\0\0\0\0\0\0\x01\0\0\0\0\0\0\x5b\
-\0\0\0\x12\0\x03\0\0\0\0\0\0\0\0\0\x08\x02\0\0\0\0\0\0\x48\0\0\0\0\0\0\0\x01\0\
-\0\0\x0d\0\0\0\xc8\0\0\0\0\0\0\0\x01\0\0\0\x0d\0\0\0\x50\0\0\0\0\0\0\0\x01\0\0\
-\0\x0d\0\0\0\xd0\x01\0\0\0\0\0\0\x01\0\0\0\x0d\0\0\0\xf0\x03\0\0\0\0\0\0\x0a\0\
-\0\0\x0d\0\0\0\xfc\x03\0\0\0\0\0\0\x0a\0\0\0\x0d\0\0\0\x08\x04\0\0\0\0\0\0\x0a\
-\0\0\0\x0d\0\0\0\x14\x04\0\0\0\0\0\0\x0a\0\0\0\x0d\0\0\0\x2c\x04\0\0\0\0\0\0\0\
-\0\0\0\x0e\0\0\0\x2c\0\0\0\0\0\0\0\0\0\0\0\x0b\0\0\0\x3c\0\0\0\0\0\0\0\0\0\0\0\
-\x0c\0\0\0\x50\0\0\0\0\0\0\0\0\0\0\0\x0b\0\0\0\x60\0\0\0\0\0\0\0\0\0\0\0\x0b\0\
-\0\0\x70\0\0\0\0\0\0\0\0\0\0\0\x0b\0\0\0\x80\0\0\0\0\0\0\0\0\0\0\0\x0b\0\0\0\
-\x90\0\0\0\0\0\0\0\0\0\0\0\x0b\0\0\0\xa0\0\0\0\0\0\0\0\0\0\0\0\x0b\0\0\0\xb0\0\
-\0\0\0\0\0\0\0\0\0\0\x0b\0\0\0\xc0\0\0\0\0\0\0\0\0\0\0\0\x0b\0\0\0\xd0\0\0\0\0\
-\0\0\0\0\0\0\0\x0b\0\0\0\xe8\0\0\0\0\0\0\0\0\0\0\0\x0c\0\0\0\xf8\0\0\0\0\0\0\0\
-\0\0\0\0\x0c\0\0\0\x08\x01\0\0\0\0\0\0\0\0\0\0\x0c\0\0\0\x18\x01\0\0\0\0\0\0\0\
-\0\0\0\x0c\0\0\0\x28\x01\0\0\0\0\0\0\0\0\0\0\x0c\0\0\0\x38\x01\0\0\0\0\0\0\0\0\
-\0\0\x0c\0\0\0\x48\x01\0\0\0\0\0\0\0\0\0\0\x0c\0\0\0\x58\x01\0\0\0\0\0\0\0\0\0\
-\0\x0c\0\0\0\x68\x01\0\0\0\0\0\0\0\0\0\0\x0c\0\0\0\x78\x01\0\0\0\0\0\0\0\0\0\0\
-\x0c\0\0\0\x88\x01\0\0\0\0\0\0\0\0\0\0\x0c\0\0\0\x98\x01\0\0\0\0\0\0\0\0\0\0\
-\x0c\0\0\0\xa8\x01\0\0\0\0\0\0\0\0\0\0\x0c\0\0\0\xb8\x01\0\0\0\0\0\0\0\0\0\0\
-\x0c\0\0\0\xc8\x01\0\0\0\0\0\0\0\0\0\0\x0c\0\0\0\xd8\x01\0\0\0\0\0\0\0\0\0\0\
-\x0c\0\0\0\xe8\x01\0\0\0\0\0\0\0\0\0\0\x0c\0\0\0\xf8\x01\0\0\0\0\0\0\0\0\0\0\
-\x0c\0\0\0\x08\x02\0\0\0\0\0\0\0\0\0\0\x0c\0\0\0\x18\x02\0\0\0\0\0\0\0\0\0\0\
-\x0c\0\0\0\x28\x02\0\0\0\0\0\0\0\0\0\0\x0c\0\0\0\x38\x02\0\0\0\0\0\0\0\0\0\0\
-\x0c\0\0\0\x48\x02\0\0\0\0\0\0\0\0\0\0\x0c\0\0\0\x58\x02\0\0\0\0\0\0\0\0\0\0\
-\x0c\0\0\0\x68\x02\0\0\0\0\0\0\0\0\0\0\x0c\0\0\0\x78\x02\0\0\0\0\0\0\0\0\0\0\
-\x0c\0\0\0\x94\x02\0\0\0\0\0\0\0\0\0\0\x0b\0\0\0\xa4\x02\0\0\0\0\0\0\0\0\0\0\
-\x0b\0\0\0\xb4\x02\0\0\0\0\0\0\0\0\0\0\x0b\0\0\0\xc4\x02\0\0\0\0\0\0\0\0\0\0\
-\x0b\0\0\0\xd4\x02\0\0\0\0\0\0\0\0\0\0\x0b\0\0\0\xe4\x02\0\0\0\0\0\0\0\0\0\0\
-\x0b\0\0\0\xf4\x02\0\0\0\0\0\0\0\0\0\0\x0b\0\0\0\x0c\x03\0\0\0\0\0\0\0\0\0\0\
-\x0c\0\0\0\x1c\x03\0\0\0\0\0\0\0\0\0\0\x0c\0\0\0\x2c\x03\0\0\0\0\0\0\0\0\0\0\
-\x0c\0\0\0\x3c\x03\0\0\0\0\0\0\0\0\0\0\x0c\0\0\0\x4c\x03\0\0\0\0\0\0\0\0\0\0\
-\x0c\0\0\0\x5c\x03\0\0\0\0\0\0\0\0\0\0\x0c\0\0\0\x6c\x03\0\0\0\0\0\0\0\0\0\0\
-\x0c\0\0\0\x7c\x03\0\0\0\0\0\0\0\0\0\0\x0c\0\0\0\x8c\x03\0\0\0\0\0\0\0\0\0\0\
-\x0c\0\0\0\x9c\x03\0\0\0\0\0\0\0\0\0\0\x0c\0\0\0\xac\x03\0\0\0\0\0\0\0\0\0\0\
-\x0c\0\0\0\xbc\x03\0\0\0\0\0\0\0\0\0\0\x0c\0\0\0\xcc\x03\0\0\0\0\0\0\0\0\0\0\
-\x0c\0\0\0\xdc\x03\0\0\0\0\0\0\0\0\0\0\x0c\0\0\0\xec\x03\0\0\0\0\0\0\0\0\0\0\
-\x0c\0\0\0\xfc\x03\0\0\0\0\0\0\0\0\0\0\x0c\0\0\0\x0c\x04\0\0\0\0\0\0\0\0\0\0\
-\x0c\0\0\0\x1c\x04\0\0\0\0\0\0\0\0\0\0\x0c\0\0\0\x4d\x4e\x40\x41\x42\x43\x4c\0\
-\x2e\x74\x65\x78\x74\0\x2e\x72\x65\x6c\x2e\x42\x54\x46\x2e\x65\x78\x74\0\x64\
-\x75\x6d\x70\x5f\x62\x70\x66\x5f\x6d\x61\x70\x2e\x5f\x5f\x5f\x66\x6d\x74\0\x64\
-\x75\x6d\x70\x5f\x62\x70\x66\x5f\x70\x72\x6f\x67\x2e\x5f\x5f\x5f\x66\x6d\x74\0\
-\x64\x75\x6d\x70\x5f\x62\x70\x66\x5f\x6d\x61\x70\0\x2e\x72\x65\x6c\x69\x74\x65\
-\x72\x2f\x62\x70\x66\x5f\x6d\x61\x70\0\x64\x75\x6d\x70\x5f\x62\x70\x66\x5f\x70\
-\x72\x6f\x67\0\x2e\x72\x65\x6c\x69\x74\x65\x72\x2f\x62\x70\x66\x5f\x70\x72\x6f\
-\x67\0\x2e\x6c\x6c\x76\x6d\x5f\x61\x64\x64\x72\x73\x69\x67\0\x6c\x69\x63\x65\
-\x6e\x73\x65\0\x69\x74\x65\x72\x61\x74\x6f\x72\x73\x2e\x62\x70\x66\x2e\x63\0\
-\x2e\x73\x74\x72\x74\x61\x62\0\x2e\x73\x79\x6d\x74\x61\x62\0\x2e\x72\x6f\x64\
-\x61\x74\x61\0\x2e\x72\x65\x6c\x2e\x42\x54\x46\0\x4c\x49\x43\x45\x4e\x53\x45\0\
-\x4c\x42\x42\x31\x5f\x37\0\x4c\x42\x42\x31\x5f\x36\0\x4c\x42\x42\x30\x5f\x34\0\
-\x4c\x42\x42\x31\x5f\x33\0\x4c\x42\x42\x30\x5f\x33\0\x64\x75\x6d\x70\x5f\x62\
-\x70\x66\x5f\x70\x72\x6f\x67\x2e\x5f\x5f\x5f\x66\x6d\x74\x2e\x32\0\x64\x75\x6d\
-\x70\x5f\x62\x70\x66\x5f\x6d\x61\x70\x2e\x5f\x5f\x5f\x66\x6d\x74\x2e\x31\0\0\0\
-\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
-\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x01\0\0\0\x01\0\0\
-\0\x06\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x40\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
-\0\0\0\0\x04\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x4e\0\0\0\x01\0\0\0\x06\0\0\0\0\0\0\
-\0\0\0\0\0\0\0\0\0\x40\0\0\0\0\0\0\0\0\x01\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x08\0\0\
-\0\0\0\0\0\0\0\0\0\0\0\0\0\x6d\0\0\0\x01\0\0\0\x06\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
-\0\x40\x01\0\0\0\0\0\0\x08\x02\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x08\0\0\0\0\0\0\0\0\
-\0\0\0\0\0\0\0\xb1\0\0\0\x01\0\0\0\x02\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x48\x03\0\
-\0\0\0\0\0\x62\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x01\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
-\x89\0\0\0\x01\0\0\0\x03\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\xaa\x03\0\0\0\0\0\0\x04\
-\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x01\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\xbd\0\0\0\x01\
-\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\xae\x03\0\0\0\0\0\0\x3d\x09\0\0\0\0\0\0\
-\0\0\0\0\0\0\0\0\x01\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x0b\0\0\0\x01\0\0\0\0\0\0\0\
-\0\0\0\0\0\0\0\0\0\0\0\0\xeb\x0c\0\0\0\0\0\0\x2c\x04\0\0\0\0\0\0\0\0\0\0\0\0\0\
-\0\x01\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\xa9\0\0\0\x02\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
-\0\0\0\0\0\x18\x11\0\0\0\0\0\0\x98\x01\0\0\0\0\0\0\x0e\0\0\0\x0e\0\0\0\x08\0\0\
-\0\0\0\0\0\x18\0\0\0\0\0\0\0\x4a\0\0\0\x09\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
-\0\xb0\x12\0\0\0\0\0\0\x20\0\0\0\0\0\0\0\x08\0\0\0\x02\0\0\0\x08\0\0\0\0\0\0\0\
-\x10\0\0\0\0\0\0\0\x69\0\0\0\x09\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\xd0\x12\
-\0\0\0\0\0\0\x20\0\0\0\0\0\0\0\x08\0\0\0\x03\0\0\0\x08\0\0\0\0\0\0\0\x10\0\0\0\
-\0\0\0\0\xb9\0\0\0\x09\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\xf0\x12\0\0\0\0\0\
-\0\x50\0\0\0\0\0\0\0\x08\0\0\0\x06\0\0\0\x08\0\0\0\0\0\0\0\x10\0\0\0\0\0\0\0\
-\x07\0\0\0\x09\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x40\x13\0\0\0\0\0\0\xe0\
-\x03\0\0\0\0\0\0\x08\0\0\0\x07\0\0\0\x08\0\0\0\0\0\0\0\x10\0\0\0\0\0\0\0\x7b\0\
-\0\0\x03\x4c\xff\x6f\0\0\0\x80\0\0\0\0\0\0\0\0\0\0\0\0\x20\x17\0\0\0\0\0\0\x07\
-\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x01\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\xa1\0\0\0\x03\
-\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x27\x17\0\0\0\0\0\0\x1a\x01\0\0\0\0\0\0\
-\0\0\0\0\0\0\0\0\x01\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0";
-
- return 0;
-err:
- bpf_object__destroy_skeleton(s);
- return -1;
-}
-
-#endif /* __ITERATORS_BPF_SKEL_H__ */
diff --git a/kernel/bpf/reuseport_array.c b/kernel/bpf/reuseport_array.c
index 556a769b5b80..8251243022a2 100644
--- a/kernel/bpf/reuseport_array.c
+++ b/kernel/bpf/reuseport_array.c
@@ -143,7 +143,7 @@ static void reuseport_array_free(struct bpf_map *map)
/*
* Once reaching here, all sk->sk_user_data is not
- * referenceing this "array". "array" can be freed now.
+ * referencing this "array". "array" can be freed now.
*/
bpf_map_area_free(array);
}
diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c
index 22c8ae94e4c1..34725bfa1e97 100644
--- a/kernel/bpf/stackmap.c
+++ b/kernel/bpf/stackmap.c
@@ -132,7 +132,8 @@ static void stack_map_get_build_id_offset(struct bpf_stack_build_id *id_offs,
int i;
struct mmap_unlock_irq_work *work = NULL;
bool irq_work_busy = bpf_mmap_unlock_get_irq_work(&work);
- struct vm_area_struct *vma;
+ struct vm_area_struct *vma, *prev_vma = NULL;
+ const char *prev_build_id;
/* If the irq_work is in use, fall back to report ips. Same
* fallback is used for kernel stack (!user) on a stackmap with
@@ -150,6 +151,12 @@ static void stack_map_get_build_id_offset(struct bpf_stack_build_id *id_offs,
}
for (i = 0; i < trace_nr; i++) {
+ if (range_in_vma(prev_vma, ips[i], ips[i])) {
+ vma = prev_vma;
+ memcpy(id_offs[i].build_id, prev_build_id,
+ BUILD_ID_SIZE_MAX);
+ goto build_id_valid;
+ }
vma = find_vma(current->mm, ips[i]);
if (!vma || build_id_parse(vma, id_offs[i].build_id, NULL)) {
/* per entry fall back to ips */
@@ -158,15 +165,18 @@ static void stack_map_get_build_id_offset(struct bpf_stack_build_id *id_offs,
memset(id_offs[i].build_id, 0, BUILD_ID_SIZE_MAX);
continue;
}
+build_id_valid:
id_offs[i].offset = (vma->vm_pgoff << PAGE_SHIFT) + ips[i]
- vma->vm_start;
id_offs[i].status = BPF_STACK_BUILD_ID_VALID;
+ prev_vma = vma;
+ prev_build_id = id_offs[i].build_id;
}
bpf_mmap_unlock_mm(work, current->mm);
}
static struct perf_callchain_entry *
-get_callchain_entry_for_task(struct task_struct *task, u32 init_nr)
+get_callchain_entry_for_task(struct task_struct *task, u32 max_depth)
{
#ifdef CONFIG_STACKTRACE
struct perf_callchain_entry *entry;
@@ -177,9 +187,8 @@ get_callchain_entry_for_task(struct task_struct *task, u32 init_nr)
if (!entry)
return NULL;
- entry->nr = init_nr +
- stack_trace_save_tsk(task, (unsigned long *)(entry->ip + init_nr),
- sysctl_perf_event_max_stack - init_nr, 0);
+ entry->nr = stack_trace_save_tsk(task, (unsigned long *)entry->ip,
+ max_depth, 0);
/* stack_trace_save_tsk() works on unsigned long array, while
* perf_callchain_entry uses u64 array. For 32-bit systems, it is
@@ -191,7 +200,7 @@ get_callchain_entry_for_task(struct task_struct *task, u32 init_nr)
int i;
/* copy data from the end to avoid using extra buffer */
- for (i = entry->nr - 1; i >= (int)init_nr; i--)
+ for (i = entry->nr - 1; i >= 0; i--)
to[i] = (u64)(from[i]);
}
@@ -208,27 +217,19 @@ static long __bpf_get_stackid(struct bpf_map *map,
{
struct bpf_stack_map *smap = container_of(map, struct bpf_stack_map, map);
struct stack_map_bucket *bucket, *new_bucket, *old_bucket;
- u32 max_depth = map->value_size / stack_map_data_size(map);
- /* stack_map_alloc() checks that max_depth <= sysctl_perf_event_max_stack */
- u32 init_nr = sysctl_perf_event_max_stack - max_depth;
u32 skip = flags & BPF_F_SKIP_FIELD_MASK;
u32 hash, id, trace_nr, trace_len;
bool user = flags & BPF_F_USER_STACK;
u64 *ips;
bool hash_matches;
- /* get_perf_callchain() guarantees that trace->nr >= init_nr
- * and trace-nr <= sysctl_perf_event_max_stack, so trace_nr <= max_depth
- */
- trace_nr = trace->nr - init_nr;
-
- if (trace_nr <= skip)
+ if (trace->nr <= skip)
/* skipping more than usable stack trace */
return -EFAULT;
- trace_nr -= skip;
+ trace_nr = trace->nr - skip;
trace_len = trace_nr * sizeof(u64);
- ips = trace->ip + skip + init_nr;
+ ips = trace->ip + skip;
hash = jhash2((u32 *)ips, trace_len / sizeof(u32), 0);
id = hash & (smap->n_buckets - 1);
bucket = READ_ONCE(smap->buckets[id]);
@@ -285,8 +286,7 @@ BPF_CALL_3(bpf_get_stackid, struct pt_regs *, regs, struct bpf_map *, map,
u64, flags)
{
u32 max_depth = map->value_size / stack_map_data_size(map);
- /* stack_map_alloc() checks that max_depth <= sysctl_perf_event_max_stack */
- u32 init_nr = sysctl_perf_event_max_stack - max_depth;
+ u32 skip = flags & BPF_F_SKIP_FIELD_MASK;
bool user = flags & BPF_F_USER_STACK;
struct perf_callchain_entry *trace;
bool kernel = !user;
@@ -295,8 +295,12 @@ BPF_CALL_3(bpf_get_stackid, struct pt_regs *, regs, struct bpf_map *, map,
BPF_F_FAST_STACK_CMP | BPF_F_REUSE_STACKID)))
return -EINVAL;
- trace = get_perf_callchain(regs, init_nr, kernel, user,
- sysctl_perf_event_max_stack, false, false);
+ max_depth += skip;
+ if (max_depth > sysctl_perf_event_max_stack)
+ max_depth = sysctl_perf_event_max_stack;
+
+ trace = get_perf_callchain(regs, 0, kernel, user, max_depth,
+ false, false);
if (unlikely(!trace))
/* couldn't fetch the stack trace */
@@ -387,7 +391,7 @@ static long __bpf_get_stack(struct pt_regs *regs, struct task_struct *task,
struct perf_callchain_entry *trace_in,
void *buf, u32 size, u64 flags)
{
- u32 init_nr, trace_nr, copy_len, elem_size, num_elem;
+ u32 trace_nr, copy_len, elem_size, num_elem, max_depth;
bool user_build_id = flags & BPF_F_USER_BUILD_ID;
u32 skip = flags & BPF_F_SKIP_FIELD_MASK;
bool user = flags & BPF_F_USER_STACK;
@@ -412,30 +416,28 @@ static long __bpf_get_stack(struct pt_regs *regs, struct task_struct *task,
goto err_fault;
num_elem = size / elem_size;
- if (sysctl_perf_event_max_stack < num_elem)
- init_nr = 0;
- else
- init_nr = sysctl_perf_event_max_stack - num_elem;
+ max_depth = num_elem + skip;
+ if (sysctl_perf_event_max_stack < max_depth)
+ max_depth = sysctl_perf_event_max_stack;
if (trace_in)
trace = trace_in;
else if (kernel && task)
- trace = get_callchain_entry_for_task(task, init_nr);
+ trace = get_callchain_entry_for_task(task, max_depth);
else
- trace = get_perf_callchain(regs, init_nr, kernel, user,
- sysctl_perf_event_max_stack,
+ trace = get_perf_callchain(regs, 0, kernel, user, max_depth,
false, false);
if (unlikely(!trace))
goto err_fault;
- trace_nr = trace->nr - init_nr;
- if (trace_nr < skip)
+ if (trace->nr < skip)
goto err_fault;
- trace_nr -= skip;
+ trace_nr = trace->nr - skip;
trace_nr = (trace_nr <= num_elem) ? trace_nr : num_elem;
copy_len = trace_nr * elem_size;
- ips = trace->ip + skip + init_nr;
+
+ ips = trace->ip + skip;
if (user && user_build_id)
stack_map_get_build_id_offset(buf, ips, trace_nr, user);
else
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index ca70fe6fba38..cdaa1152436a 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -32,6 +32,7 @@
#include <linux/bpf-netns.h>
#include <linux/rcupdate_trace.h>
#include <linux/memcontrol.h>
+#include <linux/trace_events.h>
#define IS_FD_ARRAY(map) ((map)->map_type == BPF_MAP_TYPE_PERF_EVENT_ARRAY || \
(map)->map_type == BPF_MAP_TYPE_CGROUP_ARRAY || \
@@ -556,16 +557,14 @@ static unsigned long bpf_map_memory_footprint(const struct bpf_map *map)
static void bpf_map_show_fdinfo(struct seq_file *m, struct file *filp)
{
- const struct bpf_map *map = filp->private_data;
- const struct bpf_array *array;
+ struct bpf_map *map = filp->private_data;
u32 type = 0, jited = 0;
- if (map->map_type == BPF_MAP_TYPE_PROG_ARRAY) {
- array = container_of(map, struct bpf_array, map);
- spin_lock(&array->aux->owner.lock);
- type = array->aux->owner.type;
- jited = array->aux->owner.jited;
- spin_unlock(&array->aux->owner.lock);
+ if (map_type_contains_progs(map)) {
+ spin_lock(&map->owner.lock);
+ type = map->owner.type;
+ jited = map->owner.jited;
+ spin_unlock(&map->owner.lock);
}
seq_printf(m,
@@ -874,6 +873,7 @@ static int map_create(union bpf_attr *attr)
atomic64_set(&map->refcnt, 1);
atomic64_set(&map->usercnt, 1);
mutex_init(&map->freeze_mutex);
+ spin_lock_init(&map->owner.lock);
map->spin_lock_off = -EINVAL;
map->timer_off = -EINVAL;
@@ -986,6 +986,7 @@ struct bpf_map *bpf_map_get(u32 ufd)
return map;
}
+EXPORT_SYMBOL(bpf_map_get);
struct bpf_map *bpf_map_get_with_uref(u32 ufd)
{
@@ -1352,7 +1353,6 @@ int generic_map_delete_batch(struct bpf_map *map,
err = map->ops->map_delete_elem(map, key);
rcu_read_unlock();
bpf_enable_instrumentation();
- maybe_wait_bpf_programs(map);
if (err)
break;
cond_resched();
@@ -1361,6 +1361,8 @@ int generic_map_delete_batch(struct bpf_map *map,
err = -EFAULT;
kvfree(key);
+
+ maybe_wait_bpf_programs(map);
return err;
}
@@ -2220,7 +2222,8 @@ static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr)
BPF_F_ANY_ALIGNMENT |
BPF_F_TEST_STATE_FREQ |
BPF_F_SLEEPABLE |
- BPF_F_TEST_RND_HI32))
+ BPF_F_TEST_RND_HI32 |
+ BPF_F_XDP_HAS_FRAGS))
return -EINVAL;
if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) &&
@@ -2306,6 +2309,7 @@ static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr)
prog->aux->dst_prog = dst_prog;
prog->aux->offload_requested = !!attr->prog_ifindex;
prog->aux->sleepable = attr->prog_flags & BPF_F_SLEEPABLE;
+ prog->aux->xdp_has_frags = attr->prog_flags & BPF_F_XDP_HAS_FRAGS;
err = security_bpf_prog_alloc(prog->aux);
if (err)
@@ -2491,6 +2495,7 @@ void bpf_link_put(struct bpf_link *link)
bpf_link_free(link);
}
}
+EXPORT_SYMBOL(bpf_link_put);
static int bpf_link_release(struct inode *inode, struct file *filp)
{
@@ -2562,7 +2567,7 @@ static int bpf_link_alloc_id(struct bpf_link *link)
* pre-allocated resources are to be freed with bpf_cleanup() call. All the
* transient state is passed around in struct bpf_link_primer.
* This is preferred way to create and initialize bpf_link, especially when
- * there are complicated and expensive operations inbetween creating bpf_link
+ * there are complicated and expensive operations in between creating bpf_link
* itself and attaching it to BPF hook. By using bpf_link_prime() and
* bpf_link_settle() kernel code using bpf_link doesn't have to perform
* expensive (and potentially failing) roll back operations in a rare case
@@ -2633,6 +2638,7 @@ struct bpf_link *bpf_link_get_from_fd(u32 ufd)
return link;
}
+EXPORT_SYMBOL(bpf_link_get_from_fd);
struct bpf_tracing_link {
struct bpf_link link;
@@ -3017,6 +3023,11 @@ out_put_file:
fput(perf_file);
return err;
}
+#else
+static int bpf_perf_link_attach(const union bpf_attr *attr, struct bpf_prog *prog)
+{
+ return -EOPNOTSUPP;
+}
#endif /* CONFIG_PERF_EVENTS */
#define BPF_RAW_TRACEPOINT_OPEN_LAST_FIELD raw_tracepoint.prog_fd
@@ -3321,12 +3332,17 @@ static int bpf_prog_query(const union bpf_attr *attr,
case BPF_FLOW_DISSECTOR:
case BPF_SK_LOOKUP:
return netns_bpf_prog_query(attr, uattr);
+ case BPF_SK_SKB_STREAM_PARSER:
+ case BPF_SK_SKB_STREAM_VERDICT:
+ case BPF_SK_MSG_VERDICT:
+ case BPF_SK_SKB_VERDICT:
+ return sock_map_bpf_prog_query(attr, uattr);
default:
return -EINVAL;
}
}
-#define BPF_PROG_TEST_RUN_LAST_FIELD test.cpu
+#define BPF_PROG_TEST_RUN_LAST_FIELD test.batch_size
static int bpf_prog_test_run(const union bpf_attr *attr,
union bpf_attr __user *uattr)
@@ -4245,7 +4261,7 @@ static int tracing_bpf_link_attach(const union bpf_attr *attr, bpfptr_t uattr,
return -EINVAL;
}
-#define BPF_LINK_CREATE_LAST_FIELD link_create.iter_info_len
+#define BPF_LINK_CREATE_LAST_FIELD link_create.kprobe_multi.cookies
static int link_create(union bpf_attr *attr, bpfptr_t uattr)
{
enum bpf_prog_type ptype;
@@ -4269,7 +4285,6 @@ static int link_create(union bpf_attr *attr, bpfptr_t uattr)
ret = tracing_bpf_link_attach(attr, uattr, prog);
goto out;
case BPF_PROG_TYPE_PERF_EVENT:
- case BPF_PROG_TYPE_KPROBE:
case BPF_PROG_TYPE_TRACEPOINT:
if (attr->link_create.attach_type != BPF_PERF_EVENT) {
ret = -EINVAL;
@@ -4277,6 +4292,14 @@ static int link_create(union bpf_attr *attr, bpfptr_t uattr)
}
ptype = prog->type;
break;
+ case BPF_PROG_TYPE_KPROBE:
+ if (attr->link_create.attach_type != BPF_PERF_EVENT &&
+ attr->link_create.attach_type != BPF_TRACE_KPROBE_MULTI) {
+ ret = -EINVAL;
+ goto out;
+ }
+ ptype = prog->type;
+ break;
default:
ptype = attach_type_to_prog_type(attr->link_create.attach_type);
if (ptype == BPF_PROG_TYPE_UNSPEC || ptype != prog->type) {
@@ -4308,13 +4331,16 @@ static int link_create(union bpf_attr *attr, bpfptr_t uattr)
ret = bpf_xdp_link_attach(attr, prog);
break;
#endif
-#ifdef CONFIG_PERF_EVENTS
case BPF_PROG_TYPE_PERF_EVENT:
case BPF_PROG_TYPE_TRACEPOINT:
- case BPF_PROG_TYPE_KPROBE:
ret = bpf_perf_link_attach(attr, prog);
break;
-#endif
+ case BPF_PROG_TYPE_KPROBE:
+ if (attr->link_create.attach_type == BPF_PERF_EVENT)
+ ret = bpf_perf_link_attach(attr, prog);
+ else
+ ret = bpf_kprobe_multi_link_attach(attr, prog);
+ break;
default:
ret = -EINVAL;
}
@@ -4753,23 +4779,52 @@ static bool syscall_prog_is_valid_access(int off, int size,
return true;
}
-BPF_CALL_3(bpf_sys_bpf, int, cmd, void *, attr, u32, attr_size)
+BPF_CALL_3(bpf_sys_bpf, int, cmd, union bpf_attr *, attr, u32, attr_size)
{
+ struct bpf_prog * __maybe_unused prog;
+
switch (cmd) {
case BPF_MAP_CREATE:
case BPF_MAP_UPDATE_ELEM:
case BPF_MAP_FREEZE:
case BPF_PROG_LOAD:
case BPF_BTF_LOAD:
+ case BPF_LINK_CREATE:
+ case BPF_RAW_TRACEPOINT_OPEN:
break;
- /* case BPF_PROG_TEST_RUN:
- * is not part of this list to prevent recursive test_run
- */
+#ifdef CONFIG_BPF_JIT /* __bpf_prog_enter_sleepable used by trampoline and JIT */
+ case BPF_PROG_TEST_RUN:
+ if (attr->test.data_in || attr->test.data_out ||
+ attr->test.ctx_out || attr->test.duration ||
+ attr->test.repeat || attr->test.flags)
+ return -EINVAL;
+
+ prog = bpf_prog_get_type(attr->test.prog_fd, BPF_PROG_TYPE_SYSCALL);
+ if (IS_ERR(prog))
+ return PTR_ERR(prog);
+
+ if (attr->test.ctx_size_in < prog->aux->max_ctx_offset ||
+ attr->test.ctx_size_in > U16_MAX) {
+ bpf_prog_put(prog);
+ return -EINVAL;
+ }
+
+ if (!__bpf_prog_enter_sleepable(prog)) {
+ /* recursion detected */
+ bpf_prog_put(prog);
+ return -EBUSY;
+ }
+ attr->test.retval = bpf_prog_run(prog, (void *) (long) attr->test.ctx_in);
+ __bpf_prog_exit_sleepable(prog, 0 /* bpf_prog_run does runtime stats */);
+ bpf_prog_put(prog);
+ return 0;
+#endif
default:
return -EINVAL;
}
return __sys_bpf(cmd, KERNEL_BPFPTR(attr), attr_size);
}
+EXPORT_SYMBOL(bpf_sys_bpf);
static const struct bpf_func_proto bpf_sys_bpf_proto = {
.func = bpf_sys_bpf,
diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c
index 5e7edf913060..ada97751ae1b 100644
--- a/kernel/bpf/trampoline.c
+++ b/kernel/bpf/trampoline.c
@@ -45,7 +45,7 @@ void *bpf_jit_alloc_exec_page(void)
set_vm_flush_reset_perms(image);
/* Keep image as writeable. The alternative is to keep flipping ro/rw
- * everytime new program is attached or detached.
+ * every time new program is attached or detached.
*/
set_memory_x((long)image, 1);
return image;
@@ -117,18 +117,6 @@ static void bpf_trampoline_module_put(struct bpf_trampoline *tr)
tr->mod = NULL;
}
-static int is_ftrace_location(void *ip)
-{
- long addr;
-
- addr = ftrace_location((long)ip);
- if (!addr)
- return 0;
- if (WARN_ON_ONCE(addr != (long)ip))
- return -EFAULT;
- return 1;
-}
-
static int unregister_fentry(struct bpf_trampoline *tr, void *old_addr)
{
void *ip = tr->func.addr;
@@ -160,12 +148,12 @@ static int modify_fentry(struct bpf_trampoline *tr, void *old_addr, void *new_ad
static int register_fentry(struct bpf_trampoline *tr, void *new_addr)
{
void *ip = tr->func.addr;
+ unsigned long faddr;
int ret;
- ret = is_ftrace_location(ip);
- if (ret < 0)
- return ret;
- tr->func.ftrace_managed = ret;
+ faddr = ftrace_location((unsigned long)ip);
+ if (faddr)
+ tr->func.ftrace_managed = true;
if (bpf_trampoline_module_get(tr))
return -ENOENT;
@@ -213,7 +201,7 @@ static void __bpf_tramp_image_put_deferred(struct work_struct *work)
im = container_of(work, struct bpf_tramp_image, work);
bpf_image_ksym_del(&im->ksym);
bpf_jit_free_exec(im->image);
- bpf_jit_uncharge_modmem(1);
+ bpf_jit_uncharge_modmem(PAGE_SIZE);
percpu_ref_exit(&im->pcref);
kfree_rcu(im, rcu);
}
@@ -310,7 +298,7 @@ static struct bpf_tramp_image *bpf_tramp_image_alloc(u64 key, u32 idx)
if (!im)
goto out;
- err = bpf_jit_charge_modmem(1);
+ err = bpf_jit_charge_modmem(PAGE_SIZE);
if (err)
goto out_free_im;
@@ -332,7 +320,7 @@ static struct bpf_tramp_image *bpf_tramp_image_alloc(u64 key, u32 idx)
out_free_image:
bpf_jit_free_exec(im->image);
out_uncharge:
- bpf_jit_uncharge_modmem(1);
+ bpf_jit_uncharge_modmem(PAGE_SIZE);
out_free_im:
kfree(im);
out:
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index a39eedecc93a..d175b70067b3 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -452,7 +452,8 @@ static bool reg_type_may_be_refcounted_or_null(enum bpf_reg_type type)
{
return base_type(type) == PTR_TO_SOCKET ||
base_type(type) == PTR_TO_TCP_SOCK ||
- base_type(type) == PTR_TO_MEM;
+ base_type(type) == PTR_TO_MEM ||
+ base_type(type) == PTR_TO_BTF_ID;
}
static bool type_is_rdonly_mem(u32 type)
@@ -535,10 +536,10 @@ static bool is_cmpxchg_insn(const struct bpf_insn *insn)
static const char *reg_type_str(struct bpf_verifier_env *env,
enum bpf_reg_type type)
{
- char postfix[16] = {0}, prefix[16] = {0};
+ char postfix[16] = {0}, prefix[32] = {0};
static const char * const str[] = {
[NOT_INIT] = "?",
- [SCALAR_VALUE] = "inv",
+ [SCALAR_VALUE] = "scalar",
[PTR_TO_CTX] = "ctx",
[CONST_PTR_TO_MAP] = "map_ptr",
[PTR_TO_MAP_VALUE] = "map_value",
@@ -553,7 +554,6 @@ static const char *reg_type_str(struct bpf_verifier_env *env,
[PTR_TO_TP_BUFFER] = "tp_buffer",
[PTR_TO_XDP_SOCK] = "xdp_sock",
[PTR_TO_BTF_ID] = "ptr_",
- [PTR_TO_PERCPU_BTF_ID] = "percpu_ptr_",
[PTR_TO_MEM] = "mem",
[PTR_TO_BUF] = "buf",
[PTR_TO_FUNC] = "func",
@@ -561,17 +561,20 @@ static const char *reg_type_str(struct bpf_verifier_env *env,
};
if (type & PTR_MAYBE_NULL) {
- if (base_type(type) == PTR_TO_BTF_ID ||
- base_type(type) == PTR_TO_PERCPU_BTF_ID)
+ if (base_type(type) == PTR_TO_BTF_ID)
strncpy(postfix, "or_null_", 16);
else
strncpy(postfix, "_or_null", 16);
}
if (type & MEM_RDONLY)
- strncpy(prefix, "rdonly_", 16);
+ strncpy(prefix, "rdonly_", 32);
if (type & MEM_ALLOC)
- strncpy(prefix, "alloc_", 16);
+ strncpy(prefix, "alloc_", 32);
+ if (type & MEM_USER)
+ strncpy(prefix, "user_", 32);
+ if (type & MEM_PERCPU)
+ strncpy(prefix, "percpu_", 32);
snprintf(env->type_str_buf, TYPE_STR_BUF_LEN, "%s%s%s",
prefix, str[base_type(type)], postfix);
@@ -682,74 +685,79 @@ static void print_verifier_state(struct bpf_verifier_env *env,
continue;
verbose(env, " R%d", i);
print_liveness(env, reg->live);
- verbose(env, "=%s", reg_type_str(env, t));
+ verbose(env, "=");
if (t == SCALAR_VALUE && reg->precise)
verbose(env, "P");
if ((t == SCALAR_VALUE || t == PTR_TO_STACK) &&
tnum_is_const(reg->var_off)) {
/* reg->off should be 0 for SCALAR_VALUE */
+ verbose(env, "%s", t == SCALAR_VALUE ? "" : reg_type_str(env, t));
verbose(env, "%lld", reg->var_off.value + reg->off);
} else {
- if (base_type(t) == PTR_TO_BTF_ID ||
- base_type(t) == PTR_TO_PERCPU_BTF_ID)
+ const char *sep = "";
+
+ verbose(env, "%s", reg_type_str(env, t));
+ if (base_type(t) == PTR_TO_BTF_ID)
verbose(env, "%s", kernel_type_name(reg->btf, reg->btf_id));
- verbose(env, "(id=%d", reg->id);
- if (reg_type_may_be_refcounted_or_null(t))
- verbose(env, ",ref_obj_id=%d", reg->ref_obj_id);
+ verbose(env, "(");
+/*
+ * _a stands for append, was shortened to avoid multiline statements below.
+ * This macro is used to output a comma separated list of attributes.
+ */
+#define verbose_a(fmt, ...) ({ verbose(env, "%s" fmt, sep, __VA_ARGS__); sep = ","; })
+
+ if (reg->id)
+ verbose_a("id=%d", reg->id);
+ if (reg_type_may_be_refcounted_or_null(t) && reg->ref_obj_id)
+ verbose_a("ref_obj_id=%d", reg->ref_obj_id);
if (t != SCALAR_VALUE)
- verbose(env, ",off=%d", reg->off);
+ verbose_a("off=%d", reg->off);
if (type_is_pkt_pointer(t))
- verbose(env, ",r=%d", reg->range);
+ verbose_a("r=%d", reg->range);
else if (base_type(t) == CONST_PTR_TO_MAP ||
base_type(t) == PTR_TO_MAP_KEY ||
base_type(t) == PTR_TO_MAP_VALUE)
- verbose(env, ",ks=%d,vs=%d",
- reg->map_ptr->key_size,
- reg->map_ptr->value_size);
+ verbose_a("ks=%d,vs=%d",
+ reg->map_ptr->key_size,
+ reg->map_ptr->value_size);
if (tnum_is_const(reg->var_off)) {
/* Typically an immediate SCALAR_VALUE, but
* could be a pointer whose offset is too big
* for reg->off
*/
- verbose(env, ",imm=%llx", reg->var_off.value);
+ verbose_a("imm=%llx", reg->var_off.value);
} else {
if (reg->smin_value != reg->umin_value &&
reg->smin_value != S64_MIN)
- verbose(env, ",smin_value=%lld",
- (long long)reg->smin_value);
+ verbose_a("smin=%lld", (long long)reg->smin_value);
if (reg->smax_value != reg->umax_value &&
reg->smax_value != S64_MAX)
- verbose(env, ",smax_value=%lld",
- (long long)reg->smax_value);
+ verbose_a("smax=%lld", (long long)reg->smax_value);
if (reg->umin_value != 0)
- verbose(env, ",umin_value=%llu",
- (unsigned long long)reg->umin_value);
+ verbose_a("umin=%llu", (unsigned long long)reg->umin_value);
if (reg->umax_value != U64_MAX)
- verbose(env, ",umax_value=%llu",
- (unsigned long long)reg->umax_value);
+ verbose_a("umax=%llu", (unsigned long long)reg->umax_value);
if (!tnum_is_unknown(reg->var_off)) {
char tn_buf[48];
tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off);
- verbose(env, ",var_off=%s", tn_buf);
+ verbose_a("var_off=%s", tn_buf);
}
if (reg->s32_min_value != reg->smin_value &&
reg->s32_min_value != S32_MIN)
- verbose(env, ",s32_min_value=%d",
- (int)(reg->s32_min_value));
+ verbose_a("s32_min=%d", (int)(reg->s32_min_value));
if (reg->s32_max_value != reg->smax_value &&
reg->s32_max_value != S32_MAX)
- verbose(env, ",s32_max_value=%d",
- (int)(reg->s32_max_value));
+ verbose_a("s32_max=%d", (int)(reg->s32_max_value));
if (reg->u32_min_value != reg->umin_value &&
reg->u32_min_value != U32_MIN)
- verbose(env, ",u32_min_value=%d",
- (int)(reg->u32_min_value));
+ verbose_a("u32_min=%d", (int)(reg->u32_min_value));
if (reg->u32_max_value != reg->umax_value &&
reg->u32_max_value != U32_MAX)
- verbose(env, ",u32_max_value=%d",
- (int)(reg->u32_max_value));
+ verbose_a("u32_max=%d", (int)(reg->u32_max_value));
}
+#undef verbose_a
+
verbose(env, ")");
}
}
@@ -774,7 +782,7 @@ static void print_verifier_state(struct bpf_verifier_env *env,
if (is_spilled_reg(&state->stack[i])) {
reg = &state->stack[i].spilled_ptr;
t = reg->type;
- verbose(env, "=%s", reg_type_str(env, t));
+ verbose(env, "=%s", t == SCALAR_VALUE ? "" : reg_type_str(env, t));
if (t == SCALAR_VALUE && reg->precise)
verbose(env, "P");
if (t == SCALAR_VALUE && tnum_is_const(reg->var_off))
@@ -1546,14 +1554,15 @@ static void mark_reg_not_init(struct bpf_verifier_env *env,
static void mark_btf_ld_reg(struct bpf_verifier_env *env,
struct bpf_reg_state *regs, u32 regno,
enum bpf_reg_type reg_type,
- struct btf *btf, u32 btf_id)
+ struct btf *btf, u32 btf_id,
+ enum bpf_type_flag flag)
{
if (reg_type == SCALAR_VALUE) {
mark_reg_unknown(env, regs, regno);
return;
}
mark_reg_known_zero(env, regs, regno);
- regs[regno].type = PTR_TO_BTF_ID;
+ regs[regno].type = PTR_TO_BTF_ID | flag;
regs[regno].btf = btf;
regs[regno].btf_id = btf_id;
}
@@ -1743,7 +1752,7 @@ find_kfunc_desc(const struct bpf_prog *prog, u32 func_id, u16 offset)
}
static struct btf *__find_kfunc_desc_btf(struct bpf_verifier_env *env,
- s16 offset, struct module **btf_modp)
+ s16 offset)
{
struct bpf_kfunc_btf kf_btf = { .offset = offset };
struct bpf_kfunc_btf_tab *tab;
@@ -1797,8 +1806,6 @@ static struct btf *__find_kfunc_desc_btf(struct bpf_verifier_env *env,
sort(tab->descs, tab->nr_descs, sizeof(tab->descs[0]),
kfunc_btf_cmp_by_off, NULL);
}
- if (btf_modp)
- *btf_modp = b->module;
return b->btf;
}
@@ -1815,8 +1822,7 @@ void bpf_free_kfunc_btf_tab(struct bpf_kfunc_btf_tab *tab)
}
static struct btf *find_kfunc_desc_btf(struct bpf_verifier_env *env,
- u32 func_id, s16 offset,
- struct module **btf_modp)
+ u32 func_id, s16 offset)
{
if (offset) {
if (offset < 0) {
@@ -1827,7 +1833,7 @@ static struct btf *find_kfunc_desc_btf(struct bpf_verifier_env *env,
return ERR_PTR(-EINVAL);
}
- return __find_kfunc_desc_btf(env, offset, btf_modp);
+ return __find_kfunc_desc_btf(env, offset);
}
return btf_vmlinux ?: ERR_PTR(-ENOENT);
}
@@ -1841,6 +1847,7 @@ static int add_kfunc_call(struct bpf_verifier_env *env, u32 func_id, s16 offset)
struct bpf_kfunc_desc *desc;
const char *func_name;
struct btf *desc_btf;
+ unsigned long call_imm;
unsigned long addr;
int err;
@@ -1890,7 +1897,7 @@ static int add_kfunc_call(struct bpf_verifier_env *env, u32 func_id, s16 offset)
prog_aux->kfunc_btf_tab = btf_tab;
}
- desc_btf = find_kfunc_desc_btf(env, func_id, offset, NULL);
+ desc_btf = find_kfunc_desc_btf(env, func_id, offset);
if (IS_ERR(desc_btf)) {
verbose(env, "failed to find BTF for kernel function\n");
return PTR_ERR(desc_btf);
@@ -1925,9 +1932,17 @@ static int add_kfunc_call(struct bpf_verifier_env *env, u32 func_id, s16 offset)
return -EINVAL;
}
+ call_imm = BPF_CALL_IMM(addr);
+ /* Check whether or not the relative offset overflows desc->imm */
+ if ((unsigned long)(s32)call_imm != call_imm) {
+ verbose(env, "address of kernel function %s is out of range\n",
+ func_name);
+ return -EINVAL;
+ }
+
desc = &tab->descs[tab->nr_descs++];
desc->func_id = func_id;
- desc->imm = BPF_CALL_IMM(addr);
+ desc->imm = call_imm;
desc->offset = offset;
err = btf_distill_func_proto(&env->log, desc_btf,
func_proto, func_name,
@@ -2351,7 +2366,7 @@ static const char *disasm_kfunc_name(void *data, const struct bpf_insn *insn)
if (insn->src_reg != BPF_PSEUDO_KFUNC_CALL)
return NULL;
- desc_btf = find_kfunc_desc_btf(data, insn->imm, insn->off, NULL);
+ desc_btf = find_kfunc_desc_btf(data, insn->imm, insn->off);
if (IS_ERR(desc_btf))
return "<error>";
@@ -2767,7 +2782,6 @@ static bool is_spillable_regtype(enum bpf_reg_type type)
case PTR_TO_XDP_SOCK:
case PTR_TO_BTF_ID:
case PTR_TO_BUF:
- case PTR_TO_PERCPU_BTF_ID:
case PTR_TO_MEM:
case PTR_TO_FUNC:
case PTR_TO_MAP_KEY:
@@ -3498,11 +3512,6 @@ static int check_map_access(struct bpf_verifier_env *env, u32 regno,
#define MAX_PACKET_OFF 0xffff
-static enum bpf_prog_type resolve_prog_type(struct bpf_prog *prog)
-{
- return prog->aux->dst_prog ? prog->aux->dst_prog->type : prog->type;
-}
-
static bool may_access_direct_pkt_data(struct bpf_verifier_env *env,
const struct bpf_call_arg_meta *meta,
enum bpf_access_type t)
@@ -3979,6 +3988,12 @@ static int __check_ptr_off_reg(struct bpf_verifier_env *env,
* is only allowed in its original, unmodified form.
*/
+ if (reg->off < 0) {
+ verbose(env, "negative offset %s ptr R%d off=%d disallowed\n",
+ reg_type_str(env, reg->type), regno, reg->off);
+ return -EACCES;
+ }
+
if (!fixed_off_ok && reg->off) {
verbose(env, "dereference of modified %s ptr R%d off=%d disallowed\n",
reg_type_str(env, reg->type), regno, reg->off);
@@ -4047,9 +4062,9 @@ static int check_buffer_access(struct bpf_verifier_env *env,
const struct bpf_reg_state *reg,
int regno, int off, int size,
bool zero_size_allowed,
- const char *buf_info,
u32 *max_access)
{
+ const char *buf_info = type_is_rdonly_mem(reg->type) ? "rdonly" : "rdwr";
int err;
err = __check_buffer_access(env, buf_info, reg, regno, off, size);
@@ -4159,6 +4174,7 @@ static int check_ptr_to_btf_access(struct bpf_verifier_env *env,
struct bpf_reg_state *reg = regs + regno;
const struct btf_type *t = btf_type_by_id(reg->btf, reg->btf_id);
const char *tname = btf_name_by_offset(reg->btf, t->name_off);
+ enum bpf_type_flag flag = 0;
u32 btf_id;
int ret;
@@ -4178,9 +4194,23 @@ static int check_ptr_to_btf_access(struct bpf_verifier_env *env,
return -EACCES;
}
+ if (reg->type & MEM_USER) {
+ verbose(env,
+ "R%d is ptr_%s access user memory: off=%d\n",
+ regno, tname, off);
+ return -EACCES;
+ }
+
+ if (reg->type & MEM_PERCPU) {
+ verbose(env,
+ "R%d is ptr_%s access percpu memory: off=%d\n",
+ regno, tname, off);
+ return -EACCES;
+ }
+
if (env->ops->btf_struct_access) {
ret = env->ops->btf_struct_access(&env->log, reg->btf, t,
- off, size, atype, &btf_id);
+ off, size, atype, &btf_id, &flag);
} else {
if (atype != BPF_READ) {
verbose(env, "only read is supported\n");
@@ -4188,14 +4218,14 @@ static int check_ptr_to_btf_access(struct bpf_verifier_env *env,
}
ret = btf_struct_access(&env->log, reg->btf, t, off, size,
- atype, &btf_id);
+ atype, &btf_id, &flag);
}
if (ret < 0)
return ret;
if (atype == BPF_READ && value_regno >= 0)
- mark_btf_ld_reg(env, regs, value_regno, ret, reg->btf, btf_id);
+ mark_btf_ld_reg(env, regs, value_regno, ret, reg->btf, btf_id, flag);
return 0;
}
@@ -4208,6 +4238,7 @@ static int check_ptr_to_map_access(struct bpf_verifier_env *env,
{
struct bpf_reg_state *reg = regs + regno;
struct bpf_map *map = reg->map_ptr;
+ enum bpf_type_flag flag = 0;
const struct btf_type *t;
const char *tname;
u32 btf_id;
@@ -4245,12 +4276,12 @@ static int check_ptr_to_map_access(struct bpf_verifier_env *env,
return -EACCES;
}
- ret = btf_struct_access(&env->log, btf_vmlinux, t, off, size, atype, &btf_id);
+ ret = btf_struct_access(&env->log, btf_vmlinux, t, off, size, atype, &btf_id, &flag);
if (ret < 0)
return ret;
if (value_regno >= 0)
- mark_btf_ld_reg(env, regs, value_regno, ret, btf_vmlinux, btf_id);
+ mark_btf_ld_reg(env, regs, value_regno, ret, btf_vmlinux, btf_id, flag);
return 0;
}
@@ -4451,7 +4482,8 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
if (err < 0)
return err;
- err = check_ctx_access(env, insn_idx, off, size, t, &reg_type, &btf, &btf_id);
+ err = check_ctx_access(env, insn_idx, off, size, t, &reg_type, &btf,
+ &btf_id);
if (err)
verbose_linfo(env, insn_idx, "; ");
if (!err && t == BPF_READ && value_regno >= 0) {
@@ -4535,7 +4567,8 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
err = check_tp_buffer_access(env, reg, regno, off, size);
if (!err && t == BPF_READ && value_regno >= 0)
mark_reg_unknown(env, regs, value_regno);
- } else if (reg->type == PTR_TO_BTF_ID) {
+ } else if (base_type(reg->type) == PTR_TO_BTF_ID &&
+ !type_may_be_null(reg->type)) {
err = check_ptr_to_btf_access(env, regs, regno, off, size, t,
value_regno);
} else if (reg->type == CONST_PTR_TO_MAP) {
@@ -4543,7 +4576,6 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
value_regno);
} else if (base_type(reg->type) == PTR_TO_BUF) {
bool rdonly_mem = type_is_rdonly_mem(reg->type);
- const char *buf_info;
u32 *max_access;
if (rdonly_mem) {
@@ -4552,15 +4584,13 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
regno, reg_type_str(env, reg->type));
return -EACCES;
}
- buf_info = "rdonly";
max_access = &env->prog->aux->max_rdonly_access;
} else {
- buf_info = "rdwr";
max_access = &env->prog->aux->max_rdwr_access;
}
err = check_buffer_access(env, reg, regno, off, size, false,
- buf_info, max_access);
+ max_access);
if (!err && value_regno >= 0 && (rdonly_mem || t == BPF_READ))
mark_reg_unknown(env, regs, value_regno);
@@ -4781,7 +4811,7 @@ static int check_stack_range_initialized(
}
if (is_spilled_reg(&state->stack[spi]) &&
- state->stack[spi].spilled_ptr.type == PTR_TO_BTF_ID)
+ base_type(state->stack[spi].spilled_ptr.type) == PTR_TO_BTF_ID)
goto mark;
if (is_spilled_reg(&state->stack[spi]) &&
@@ -4823,7 +4853,6 @@ static int check_helper_mem_access(struct bpf_verifier_env *env, int regno,
struct bpf_call_arg_meta *meta)
{
struct bpf_reg_state *regs = cur_regs(env), *reg = &regs[regno];
- const char *buf_info;
u32 *max_access;
switch (base_type(reg->type)) {
@@ -4850,15 +4879,13 @@ static int check_helper_mem_access(struct bpf_verifier_env *env, int regno,
if (meta && meta->raw_mode)
return -EACCES;
- buf_info = "rdonly";
max_access = &env->prog->aux->max_rdonly_access;
} else {
- buf_info = "rdwr";
max_access = &env->prog->aux->max_rdwr_access;
}
return check_buffer_access(env, reg, regno, reg->off,
access_size, zero_size_allowed,
- buf_info, max_access);
+ max_access);
case PTR_TO_STACK:
return check_stack_range_initialized(
env,
@@ -4877,6 +4904,62 @@ static int check_helper_mem_access(struct bpf_verifier_env *env, int regno,
}
}
+static int check_mem_size_reg(struct bpf_verifier_env *env,
+ struct bpf_reg_state *reg, u32 regno,
+ bool zero_size_allowed,
+ struct bpf_call_arg_meta *meta)
+{
+ int err;
+
+ /* This is used to refine r0 return value bounds for helpers
+ * that enforce this value as an upper bound on return values.
+ * See do_refine_retval_range() for helpers that can refine
+ * the return value. C type of helper is u32 so we pull register
+ * bound from umax_value however, if negative verifier errors
+ * out. Only upper bounds can be learned because retval is an
+ * int type and negative retvals are allowed.
+ */
+ if (meta)
+ meta->msize_max_value = reg->umax_value;
+
+ /* The register is SCALAR_VALUE; the access check
+ * happens using its boundaries.
+ */
+ if (!tnum_is_const(reg->var_off))
+ /* For unprivileged variable accesses, disable raw
+ * mode so that the program is required to
+ * initialize all the memory that the helper could
+ * just partially fill up.
+ */
+ meta = NULL;
+
+ if (reg->smin_value < 0) {
+ verbose(env, "R%d min value is negative, either use unsigned or 'var &= const'\n",
+ regno);
+ return -EACCES;
+ }
+
+ if (reg->umin_value == 0) {
+ err = check_helper_mem_access(env, regno - 1, 0,
+ zero_size_allowed,
+ meta);
+ if (err)
+ return err;
+ }
+
+ if (reg->umax_value >= BPF_MAX_VAR_SIZ) {
+ verbose(env, "R%d unbounded memory access, use 'var &= const' or 'if (var < const)'\n",
+ regno);
+ return -EACCES;
+ }
+ err = check_helper_mem_access(env, regno - 1,
+ reg->umax_value,
+ zero_size_allowed, meta);
+ if (!err)
+ err = mark_chain_precision(env, regno);
+ return err;
+}
+
int check_mem_reg(struct bpf_verifier_env *env, struct bpf_reg_state *reg,
u32 regno, u32 mem_size)
{
@@ -4900,6 +4983,28 @@ int check_mem_reg(struct bpf_verifier_env *env, struct bpf_reg_state *reg,
return check_helper_mem_access(env, regno, mem_size, true, NULL);
}
+int check_kfunc_mem_size_reg(struct bpf_verifier_env *env, struct bpf_reg_state *reg,
+ u32 regno)
+{
+ struct bpf_reg_state *mem_reg = &cur_regs(env)[regno - 1];
+ bool may_be_null = type_may_be_null(mem_reg->type);
+ struct bpf_reg_state saved_reg;
+ int err;
+
+ WARN_ON_ONCE(regno < BPF_REG_2 || regno > BPF_REG_5);
+
+ if (may_be_null) {
+ saved_reg = *mem_reg;
+ mark_ptr_not_null_reg(mem_reg);
+ }
+
+ err = check_mem_size_reg(env, reg, regno, true, NULL);
+
+ if (may_be_null)
+ *mem_reg = saved_reg;
+ return err;
+}
+
/* Implementation details:
* bpf_map_lookup returns PTR_TO_MAP_VALUE_OR_NULL
* Two bpf_map_lookups (even with the same key) will have different reg->id.
@@ -5159,7 +5264,7 @@ static const struct bpf_reg_types alloc_mem_types = { .types = { PTR_TO_MEM | ME
static const struct bpf_reg_types const_map_ptr_types = { .types = { CONST_PTR_TO_MAP } };
static const struct bpf_reg_types btf_ptr_types = { .types = { PTR_TO_BTF_ID } };
static const struct bpf_reg_types spin_lock_types = { .types = { PTR_TO_MAP_VALUE } };
-static const struct bpf_reg_types percpu_btf_ptr_types = { .types = { PTR_TO_PERCPU_BTF_ID } };
+static const struct bpf_reg_types percpu_btf_ptr_types = { .types = { PTR_TO_BTF_ID | MEM_PERCPU } };
static const struct bpf_reg_types func_ptr_types = { .types = { PTR_TO_FUNC } };
static const struct bpf_reg_types stack_ptr_types = { .types = { PTR_TO_STACK } };
static const struct bpf_reg_types const_str_ptr_types = { .types = { PTR_TO_MAP_VALUE } };
@@ -5260,6 +5365,60 @@ found:
return 0;
}
+int check_func_arg_reg_off(struct bpf_verifier_env *env,
+ const struct bpf_reg_state *reg, int regno,
+ enum bpf_arg_type arg_type,
+ bool is_release_func)
+{
+ bool fixed_off_ok = false, release_reg;
+ enum bpf_reg_type type = reg->type;
+
+ switch ((u32)type) {
+ case SCALAR_VALUE:
+ /* Pointer types where reg offset is explicitly allowed: */
+ case PTR_TO_PACKET:
+ case PTR_TO_PACKET_META:
+ case PTR_TO_MAP_KEY:
+ case PTR_TO_MAP_VALUE:
+ case PTR_TO_MEM:
+ case PTR_TO_MEM | MEM_RDONLY:
+ case PTR_TO_MEM | MEM_ALLOC:
+ case PTR_TO_BUF:
+ case PTR_TO_BUF | MEM_RDONLY:
+ case PTR_TO_STACK:
+ /* Some of the argument types nevertheless require a
+ * zero register offset.
+ */
+ if (arg_type != ARG_PTR_TO_ALLOC_MEM)
+ return 0;
+ break;
+ /* All the rest must be rejected, except PTR_TO_BTF_ID which allows
+ * fixed offset.
+ */
+ case PTR_TO_BTF_ID:
+ /* When referenced PTR_TO_BTF_ID is passed to release function,
+ * it's fixed offset must be 0. We rely on the property that
+ * only one referenced register can be passed to BPF helpers and
+ * kfuncs. In the other cases, fixed offset can be non-zero.
+ */
+ release_reg = is_release_func && reg->ref_obj_id;
+ if (release_reg && reg->off) {
+ verbose(env, "R%d must have zero offset when passed to release func\n",
+ regno);
+ return -EINVAL;
+ }
+ /* For release_reg == true, fixed_off_ok must be false, but we
+ * already checked and rejected reg->off != 0 above, so set to
+ * true to allow fixed offset for all other cases.
+ */
+ fixed_off_ok = true;
+ break;
+ default:
+ break;
+ }
+ return __check_ptr_off_reg(env, reg, regno, fixed_off_ok);
+}
+
static int check_func_arg(struct bpf_verifier_env *env, u32 arg,
struct bpf_call_arg_meta *meta,
const struct bpf_func_proto *fn)
@@ -5309,36 +5468,14 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 arg,
if (err)
return err;
- switch ((u32)type) {
- case SCALAR_VALUE:
- /* Pointer types where reg offset is explicitly allowed: */
- case PTR_TO_PACKET:
- case PTR_TO_PACKET_META:
- case PTR_TO_MAP_KEY:
- case PTR_TO_MAP_VALUE:
- case PTR_TO_MEM:
- case PTR_TO_MEM | MEM_RDONLY:
- case PTR_TO_MEM | MEM_ALLOC:
- case PTR_TO_BUF:
- case PTR_TO_BUF | MEM_RDONLY:
- case PTR_TO_STACK:
- /* Some of the argument types nevertheless require a
- * zero register offset.
- */
- if (arg_type == ARG_PTR_TO_ALLOC_MEM)
- goto force_off_check;
- break;
- /* All the rest must be rejected: */
- default:
-force_off_check:
- err = __check_ptr_off_reg(env, reg, regno,
- type == PTR_TO_BTF_ID);
- if (err < 0)
- return err;
- break;
- }
+ err = check_func_arg_reg_off(env, reg, regno, arg_type, is_release_function(meta->func_id));
+ if (err)
+ return err;
skip_type_check:
+ /* check_func_arg_reg_off relies on only one referenced register being
+ * allowed for BPF helpers.
+ */
if (reg->ref_obj_id) {
if (meta->ref_obj_id) {
verbose(env, "verifier internal error: more than one arg with ref_obj_id R%d %u %u\n",
@@ -5439,51 +5576,7 @@ skip_type_check:
} else if (arg_type_is_mem_size(arg_type)) {
bool zero_size_allowed = (arg_type == ARG_CONST_SIZE_OR_ZERO);
- /* This is used to refine r0 return value bounds for helpers
- * that enforce this value as an upper bound on return values.
- * See do_refine_retval_range() for helpers that can refine
- * the return value. C type of helper is u32 so we pull register
- * bound from umax_value however, if negative verifier errors
- * out. Only upper bounds can be learned because retval is an
- * int type and negative retvals are allowed.
- */
- meta->msize_max_value = reg->umax_value;
-
- /* The register is SCALAR_VALUE; the access check
- * happens using its boundaries.
- */
- if (!tnum_is_const(reg->var_off))
- /* For unprivileged variable accesses, disable raw
- * mode so that the program is required to
- * initialize all the memory that the helper could
- * just partially fill up.
- */
- meta = NULL;
-
- if (reg->smin_value < 0) {
- verbose(env, "R%d min value is negative, either use unsigned or 'var &= const'\n",
- regno);
- return -EACCES;
- }
-
- if (reg->umin_value == 0) {
- err = check_helper_mem_access(env, regno - 1, 0,
- zero_size_allowed,
- meta);
- if (err)
- return err;
- }
-
- if (reg->umax_value >= BPF_MAX_VAR_SIZ) {
- verbose(env, "R%d unbounded memory access, use 'var &= const' or 'if (var < const)'\n",
- regno);
- return -EACCES;
- }
- err = check_helper_mem_access(env, regno - 1,
- reg->umax_value,
- zero_size_allowed, meta);
- if (!err)
- err = mark_chain_precision(env, regno);
+ err = check_mem_size_reg(env, reg, regno, zero_size_allowed, meta);
} else if (arg_type_is_alloc_size(arg_type)) {
if (!tnum_is_const(reg->var_off)) {
verbose(env, "R%d is not a known constant'\n",
@@ -6842,22 +6935,23 @@ static void mark_btf_func_reg_size(struct bpf_verifier_env *env, u32 regno,
}
}
-static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn)
+static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
+ int *insn_idx_p)
{
const struct btf_type *t, *func, *func_proto, *ptr_type;
struct bpf_reg_state *regs = cur_regs(env);
const char *func_name, *ptr_type_name;
u32 i, nargs, func_id, ptr_type_id;
- struct module *btf_mod = NULL;
+ int err, insn_idx = *insn_idx_p;
const struct btf_param *args;
struct btf *desc_btf;
- int err;
+ bool acq;
/* skip for now, but return error when we find this in fixup_kfunc_call */
if (!insn->imm)
return 0;
- desc_btf = find_kfunc_desc_btf(env, insn->imm, insn->off, &btf_mod);
+ desc_btf = find_kfunc_desc_btf(env, insn->imm, insn->off);
if (IS_ERR(desc_btf))
return PTR_ERR(desc_btf);
@@ -6866,23 +6960,43 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn)
func_name = btf_name_by_offset(desc_btf, func->name_off);
func_proto = btf_type_by_id(desc_btf, func->type);
- if (!env->ops->check_kfunc_call ||
- !env->ops->check_kfunc_call(func_id, btf_mod)) {
+ if (!btf_kfunc_id_set_contains(desc_btf, resolve_prog_type(env->prog),
+ BTF_KFUNC_TYPE_CHECK, func_id)) {
verbose(env, "calling kernel function %s is not allowed\n",
func_name);
return -EACCES;
}
+ acq = btf_kfunc_id_set_contains(desc_btf, resolve_prog_type(env->prog),
+ BTF_KFUNC_TYPE_ACQUIRE, func_id);
+
/* Check the arguments */
err = btf_check_kfunc_arg_match(env, desc_btf, func_id, regs);
- if (err)
+ if (err < 0)
return err;
+ /* In case of release function, we get register number of refcounted
+ * PTR_TO_BTF_ID back from btf_check_kfunc_arg_match, do the release now
+ */
+ if (err) {
+ err = release_reference(env, regs[err].ref_obj_id);
+ if (err) {
+ verbose(env, "kfunc %s#%d reference has not been acquired before\n",
+ func_name, func_id);
+ return err;
+ }
+ }
for (i = 0; i < CALLER_SAVED_REGS; i++)
mark_reg_not_init(env, regs, caller_saved[i]);
/* Check return type */
t = btf_type_skip_modifiers(desc_btf, func_proto->type, NULL);
+
+ if (acq && !btf_type_is_ptr(t)) {
+ verbose(env, "acquire kernel function does not return PTR_TO_BTF_ID\n");
+ return -EINVAL;
+ }
+
if (btf_type_is_scalar(t)) {
mark_reg_unknown(env, regs, BPF_REG_0);
mark_btf_func_reg_size(env, BPF_REG_0, t->size);
@@ -6901,7 +7015,21 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn)
regs[BPF_REG_0].btf = desc_btf;
regs[BPF_REG_0].type = PTR_TO_BTF_ID;
regs[BPF_REG_0].btf_id = ptr_type_id;
+ if (btf_kfunc_id_set_contains(desc_btf, resolve_prog_type(env->prog),
+ BTF_KFUNC_TYPE_RET_NULL, func_id)) {
+ regs[BPF_REG_0].type |= PTR_MAYBE_NULL;
+ /* For mark_ptr_or_null_reg, see 93c230e3f5bd6 */
+ regs[BPF_REG_0].id = ++env->id_gen;
+ }
mark_btf_func_reg_size(env, BPF_REG_0, sizeof(void *));
+ if (acq) {
+ int id = acquire_reference_state(env, insn_idx);
+
+ if (id < 0)
+ return id;
+ regs[BPF_REG_0].id = id;
+ regs[BPF_REG_0].ref_obj_id = id;
+ }
} /* else { add_kfunc_call() ensures it is btf_type_is_void(t) } */
nargs = btf_type_vlen(func_proto);
@@ -9548,7 +9676,6 @@ static int check_ld_imm(struct bpf_verifier_env *env, struct bpf_insn *insn)
dst_reg->mem_size = aux->btf_var.mem_size;
break;
case PTR_TO_BTF_ID:
- case PTR_TO_PERCPU_BTF_ID:
dst_reg->btf = aux->btf_var.btf;
dst_reg->btf_id = aux->btf_var.btf_id;
break;
@@ -10273,8 +10400,7 @@ static void adjust_btf_func(struct bpf_verifier_env *env)
aux->func_info[i].insn_off = env->subprog_info[i].start;
}
-#define MIN_BPF_LINEINFO_SIZE (offsetof(struct bpf_line_info, line_col) + \
- sizeof(((struct bpf_line_info *)(0))->line_col))
+#define MIN_BPF_LINEINFO_SIZE offsetofend(struct bpf_line_info, line_col)
#define MAX_LINEINFO_REC_SIZE MAX_FUNCINFO_REC_SIZE
static int check_btf_line(struct bpf_verifier_env *env,
@@ -11549,7 +11675,7 @@ static int do_check(struct bpf_verifier_env *env)
if (insn->src_reg == BPF_PSEUDO_CALL)
err = check_func_call(env, insn, &env->insn_idx);
else if (insn->src_reg == BPF_PSEUDO_KFUNC_CALL)
- err = check_kfunc_call(env, insn);
+ err = check_kfunc_call(env, insn, &env->insn_idx);
else
err = check_helper_call(env, insn, &env->insn_idx);
if (err)
@@ -11748,7 +11874,7 @@ static int check_pseudo_btf_id(struct bpf_verifier_env *env,
type = t->type;
t = btf_type_skip_modifiers(btf, type, NULL);
if (percpu) {
- aux->btf_var.reg_type = PTR_TO_PERCPU_BTF_ID;
+ aux->btf_var.reg_type = PTR_TO_BTF_ID | MEM_PERCPU;
aux->btf_var.btf = btf;
aux->btf_var.btf_id = type;
} else if (!btf_type_is_struct(t)) {
@@ -12897,6 +13023,7 @@ static int jit_subprogs(struct bpf_verifier_env *env)
func[i]->aux->name[0] = 'F';
func[i]->aux->stack_depth = env->subprog_info[i].stack_depth;
func[i]->jit_requested = 1;
+ func[i]->blinding_requested = prog->blinding_requested;
func[i]->aux->kfunc_tab = prog->aux->kfunc_tab;
func[i]->aux->kfunc_btf_tab = prog->aux->kfunc_btf_tab;
func[i]->aux->linfo = prog->aux->linfo;
@@ -12992,6 +13119,7 @@ static int jit_subprogs(struct bpf_verifier_env *env)
prog->jited = 1;
prog->bpf_func = func[0]->bpf_func;
+ prog->jited_len = func[0]->jited_len;
prog->aux->func = func;
prog->aux->func_cnt = env->subprog_cnt;
bpf_prog_jit_attempt_done(prog);
@@ -13019,6 +13147,7 @@ out_free:
out_undo_insn:
/* cleanup main prog to be interpreted */
prog->jit_requested = 0;
+ prog->blinding_requested = 0;
for (i = 0, insn = prog->insnsi; i < prog->len; i++, insn++) {
if (!bpf_pseudo_call(insn))
continue;
@@ -13112,7 +13241,6 @@ static int do_misc_fixups(struct bpf_verifier_env *env)
{
struct bpf_prog *prog = env->prog;
enum bpf_attach_type eatype = prog->expected_attach_type;
- bool expect_blinding = bpf_jit_blinding_enabled(prog);
enum bpf_prog_type prog_type = resolve_prog_type(prog);
struct bpf_insn *insn = prog->insnsi;
const struct bpf_func_proto *fn;
@@ -13276,7 +13404,7 @@ static int do_misc_fixups(struct bpf_verifier_env *env)
insn->code = BPF_JMP | BPF_TAIL_CALL;
aux = &env->insn_aux_data[i + delta];
- if (env->bpf_capable && !expect_blinding &&
+ if (env->bpf_capable && !prog->blinding_requested &&
prog->jit_requested &&
!bpf_map_key_poisoned(aux) &&
!bpf_map_ptr_poisoned(aux) &&
@@ -13364,6 +13492,26 @@ static int do_misc_fixups(struct bpf_verifier_env *env)
goto patch_call_imm;
}
+ if (insn->imm == BPF_FUNC_task_storage_get ||
+ insn->imm == BPF_FUNC_sk_storage_get ||
+ insn->imm == BPF_FUNC_inode_storage_get) {
+ if (env->prog->aux->sleepable)
+ insn_buf[0] = BPF_MOV64_IMM(BPF_REG_5, (__force __s32)GFP_KERNEL);
+ else
+ insn_buf[0] = BPF_MOV64_IMM(BPF_REG_5, (__force __s32)GFP_ATOMIC);
+ insn_buf[1] = *insn;
+ cnt = 2;
+
+ new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt);
+ if (!new_prog)
+ return -ENOMEM;
+
+ delta += cnt - 1;
+ env->prog = prog = new_prog;
+ insn = new_prog->insnsi + i + delta;
+ goto patch_call_imm;
+ }
+
/* BPF_EMIT_CALL() assumptions in some of the map_gen_lookup
* and other inlining handlers are currently limited to 64 bit
* only.
diff --git a/kernel/capability.c b/kernel/capability.c
index 46a361dde042..765194f5d678 100644
--- a/kernel/capability.c
+++ b/kernel/capability.c
@@ -360,6 +360,7 @@ bool has_capability_noaudit(struct task_struct *t, int cap)
{
return has_ns_capability_noaudit(t, &init_user_ns, cap);
}
+EXPORT_SYMBOL(has_capability_noaudit);
static bool ns_capable_common(struct user_namespace *ns,
int cap,
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index a557eea7166f..adb820e98f24 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -1302,7 +1302,7 @@ static struct css_set *find_css_set(struct css_set *old_cset,
struct cgroup_root *cgroup_root_from_kf(struct kernfs_root *kf_root)
{
- struct cgroup *root_cgrp = kf_root->kn->priv;
+ struct cgroup *root_cgrp = kernfs_root_to_node(kf_root)->priv;
return root_cgrp->root;
}
@@ -2025,7 +2025,7 @@ int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask)
ret = PTR_ERR(root->kf_root);
goto exit_root_id;
}
- root_cgrp->kn = root->kf_root->kn;
+ root_cgrp->kn = kernfs_root_to_node(root->kf_root);
WARN_ON_ONCE(cgroup_ino(root_cgrp) != 1);
root_cgrp->ancestor_ids[0] = cgroup_id(root_cgrp);
@@ -6243,6 +6243,7 @@ static void cgroup_css_set_put_fork(struct kernel_clone_args *kargs)
/**
* cgroup_can_fork - called on a new task before the process is exposed
* @child: the child process
+ * @kargs: the arguments passed to create the child process
*
* This prepares a new css_set for the child process which the child will
* be attached to in cgroup_post_fork().
@@ -6305,6 +6306,7 @@ void cgroup_cancel_fork(struct task_struct *child,
/**
* cgroup_post_fork - finalize cgroup setup for the child process
* @child: the child process
+ * @kargs: the arguments passed to create the child process
*
* Attach the child process to its css_set calling the subsystem fork()
* callbacks.
diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
index ef88cc366bb8..71a418858a5e 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -71,7 +71,7 @@ DEFINE_STATIC_KEY_FALSE(cpusets_enabled_key);
/*
* There could be abnormal cpuset configurations for cpu or memory
- * node binding, add this key to provide a quick low-cost judgement
+ * node binding, add this key to provide a quick low-cost judgment
* of the situation.
*/
DEFINE_STATIC_KEY_FALSE(cpusets_insane_config_key);
@@ -1181,7 +1181,7 @@ enum subparts_cmd {
* effective_cpus. The function will return 0 if all the CPUs listed in
* cpus_allowed can be granted or an error code will be returned.
*
- * For partcmd_disable, the cpuset is being transofrmed from a partition
+ * For partcmd_disable, the cpuset is being transformed from a partition
* root back to a non-partition root. Any CPUs in cpus_allowed that are in
* parent's subparts_cpus will be taken away from that cpumask and put back
* into parent's effective_cpus. 0 should always be returned.
@@ -2027,7 +2027,7 @@ out:
}
/*
- * update_prstate - update partititon_root_state
+ * update_prstate - update partition_root_state
* cs: the cpuset to update
* new_prs: new partition root state
*
@@ -2879,7 +2879,7 @@ static int cpuset_css_online(struct cgroup_subsys_state *css)
/*
* Clone @parent's configuration if CGRP_CPUSET_CLONE_CHILDREN is
* set. This flag handling is implemented in cgroup core for
- * histrical reasons - the flag may be specified during mount.
+ * historical reasons - the flag may be specified during mount.
*
* Currently, if any sibling cpusets have exclusive cpus or mem, we
* refuse to clone the configuration - thereby refusing the task to
@@ -3076,7 +3076,7 @@ hotplug_update_tasks_legacy(struct cpuset *cs,
/*
* Don't call update_tasks_cpumask() if the cpuset becomes empty,
- * as the tasks will be migratecd to an ancestor.
+ * as the tasks will be migrated to an ancestor.
*/
if (cpus_updated && !cpumask_empty(cs->cpus_allowed))
update_tasks_cpumask(cs);
@@ -3390,8 +3390,11 @@ static struct notifier_block cpuset_track_online_nodes_nb = {
*/
void __init cpuset_init_smp(void)
{
- cpumask_copy(top_cpuset.cpus_allowed, cpu_active_mask);
- top_cpuset.mems_allowed = node_states[N_MEMORY];
+ /*
+ * cpus_allowd/mems_allowed set to v2 values in the initial
+ * cpuset_bind() call will be reset to v1 values in another
+ * cpuset_bind() call when v1 cpuset is mounted.
+ */
top_cpuset.old_mems_allowed = top_cpuset.mems_allowed;
cpumask_copy(top_cpuset.effective_cpus, cpu_active_mask);
diff --git a/kernel/cgroup/freezer.c b/kernel/cgroup/freezer.c
index 3984dd6b8ddb..617861a54793 100644
--- a/kernel/cgroup/freezer.c
+++ b/kernel/cgroup/freezer.c
@@ -1,4 +1,4 @@
-//SPDX-License-Identifier: GPL-2.0
+// SPDX-License-Identifier: GPL-2.0
#include <linux/cgroup.h>
#include <linux/sched.h>
#include <linux/sched/task.h>
diff --git a/kernel/cgroup/rstat.c b/kernel/cgroup/rstat.c
index 9d331ba44870..24b5c2ab5598 100644
--- a/kernel/cgroup/rstat.c
+++ b/kernel/cgroup/rstat.c
@@ -153,8 +153,17 @@ static void cgroup_rstat_flush_locked(struct cgroup *cgrp, bool may_sleep)
raw_spinlock_t *cpu_lock = per_cpu_ptr(&cgroup_rstat_cpu_lock,
cpu);
struct cgroup *pos = NULL;
+ unsigned long flags;
- raw_spin_lock(cpu_lock);
+ /*
+ * The _irqsave() is needed because cgroup_rstat_lock is
+ * spinlock_t which is a sleeping lock on PREEMPT_RT. Acquiring
+ * this lock with the _irq() suffix only disables interrupts on
+ * a non-PREEMPT_RT kernel. The raw_spinlock_t below disables
+ * interrupts on both configurations. The _irqsave() ensures
+ * that interrupts are always disabled and later restored.
+ */
+ raw_spin_lock_irqsave(cpu_lock, flags);
while ((pos = cgroup_rstat_cpu_pop_updated(pos, cgrp, cpu))) {
struct cgroup_subsys_state *css;
@@ -166,7 +175,7 @@ static void cgroup_rstat_flush_locked(struct cgroup *cgrp, bool may_sleep)
css->ss->css_rstat_flush(css, cpu);
rcu_read_unlock();
}
- raw_spin_unlock(cpu_lock);
+ raw_spin_unlock_irqrestore(cpu_lock, flags);
/* if @may_sleep, play nice and yield if necessary */
if (may_sleep && (need_resched() ||
@@ -315,7 +324,7 @@ static void cgroup_base_stat_flush(struct cgroup *cgrp, int cpu)
{
struct cgroup_rstat_cpu *rstatc = cgroup_rstat_cpu(cgrp, cpu);
struct cgroup *parent = cgroup_parent(cgrp);
- struct cgroup_base_stat cur, delta;
+ struct cgroup_base_stat delta;
unsigned seq;
/* Root-level stats are sourced from system-wide CPU stats */
@@ -325,11 +334,10 @@ static void cgroup_base_stat_flush(struct cgroup *cgrp, int cpu)
/* fetch the current per-cpu values */
do {
seq = __u64_stats_fetch_begin(&rstatc->bsync);
- cur.cputime = rstatc->bstat.cputime;
+ delta = rstatc->bstat;
} while (__u64_stats_fetch_retry(&rstatc->bsync, seq));
/* propagate percpu delta to global */
- delta = cur;
cgroup_base_stat_sub(&delta, &rstatc->last_bstat);
cgroup_base_stat_add(&cgrp->bstat, &delta);
cgroup_base_stat_add(&rstatc->last_bstat, &delta);
diff --git a/kernel/configs/android-recommended.config b/kernel/configs/android-recommended.config
index eb0029c9a6a6..e400fbbc8aba 100644
--- a/kernel/configs/android-recommended.config
+++ b/kernel/configs/android-recommended.config
@@ -1,5 +1,5 @@
# KEEP ALPHABETICALLY SORTED
-# CONFIG_AIO is not set
+# CONFIG_BPF_UNPRIV_DEFAULT_OFF is not set
# CONFIG_CORE_DUMP_DEFAULT_ELF_HEADERS is not set
# CONFIG_INPUT_MOUSE is not set
# CONFIG_LEGACY_PTYS is not set
diff --git a/kernel/configs/x86_debug.config b/kernel/configs/x86_debug.config
new file mode 100644
index 000000000000..dcd86f32f4ed
--- /dev/null
+++ b/kernel/configs/x86_debug.config
@@ -0,0 +1,18 @@
+CONFIG_X86_DEBUG_FPU=y
+CONFIG_LOCK_STAT=y
+CONFIG_DEBUG_VM=y
+CONFIG_DEBUG_VM_VMACACHE=y
+CONFIG_DEBUG_VM_RB=y
+CONFIG_DEBUG_SLAB=y
+CONFIG_DEBUG_KMEMLEAK=y
+CONFIG_DEBUG_PAGEALLOC=y
+CONFIG_SLUB_DEBUG_ON=y
+CONFIG_KMEMCHECK=y
+CONFIG_DEBUG_OBJECTS=y
+CONFIG_DEBUG_OBJECTS_ENABLE_DEFAULT=1
+CONFIG_GCOV_KERNEL=y
+CONFIG_LOCKDEP=y
+CONFIG_PROVE_LOCKING=y
+CONFIG_SCHEDSTATS=y
+CONFIG_VMLINUX_VALIDATION=y
+CONFIG_DEBUG_INFO_DWARF_TOOLCHAIN_DEFAULT=y
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 5797c2a7a93f..bbad5e375d3b 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -35,6 +35,7 @@
#include <linux/percpu-rwsem.h>
#include <linux/cpuset.h>
#include <linux/random.h>
+#include <linux/cc_platform.h>
#include <trace/events/power.h>
#define CREATE_TRACE_POINTS
@@ -71,7 +72,6 @@ struct cpuhp_cpu_state {
bool rollback;
bool single;
bool bringup;
- int cpu;
struct hlist_node *node;
struct hlist_node *last;
enum cpuhp_state cb_state;
@@ -475,7 +475,7 @@ static inline bool cpu_smt_allowed(unsigned int cpu) { return true; }
#endif
static inline enum cpuhp_state
-cpuhp_set_state(struct cpuhp_cpu_state *st, enum cpuhp_state target)
+cpuhp_set_state(int cpu, struct cpuhp_cpu_state *st, enum cpuhp_state target)
{
enum cpuhp_state prev_state = st->state;
bool bringup = st->state < target;
@@ -486,14 +486,15 @@ cpuhp_set_state(struct cpuhp_cpu_state *st, enum cpuhp_state target)
st->target = target;
st->single = false;
st->bringup = bringup;
- if (cpu_dying(st->cpu) != !bringup)
- set_cpu_dying(st->cpu, !bringup);
+ if (cpu_dying(cpu) != !bringup)
+ set_cpu_dying(cpu, !bringup);
return prev_state;
}
static inline void
-cpuhp_reset_state(struct cpuhp_cpu_state *st, enum cpuhp_state prev_state)
+cpuhp_reset_state(int cpu, struct cpuhp_cpu_state *st,
+ enum cpuhp_state prev_state)
{
bool bringup = !st->bringup;
@@ -520,8 +521,8 @@ cpuhp_reset_state(struct cpuhp_cpu_state *st, enum cpuhp_state prev_state)
}
st->bringup = bringup;
- if (cpu_dying(st->cpu) != !bringup)
- set_cpu_dying(st->cpu, !bringup);
+ if (cpu_dying(cpu) != !bringup)
+ set_cpu_dying(cpu, !bringup);
}
/* Regular hotplug invocation of the AP hotplug thread */
@@ -541,15 +542,16 @@ static void __cpuhp_kick_ap(struct cpuhp_cpu_state *st)
wait_for_ap_thread(st, st->bringup);
}
-static int cpuhp_kick_ap(struct cpuhp_cpu_state *st, enum cpuhp_state target)
+static int cpuhp_kick_ap(int cpu, struct cpuhp_cpu_state *st,
+ enum cpuhp_state target)
{
enum cpuhp_state prev_state;
int ret;
- prev_state = cpuhp_set_state(st, target);
+ prev_state = cpuhp_set_state(cpu, st, target);
__cpuhp_kick_ap(st);
if ((ret = st->result)) {
- cpuhp_reset_state(st, prev_state);
+ cpuhp_reset_state(cpu, st, prev_state);
__cpuhp_kick_ap(st);
}
@@ -581,7 +583,7 @@ static int bringup_wait_for_ap(unsigned int cpu)
if (st->target <= CPUHP_AP_ONLINE_IDLE)
return 0;
- return cpuhp_kick_ap(st, st->target);
+ return cpuhp_kick_ap(cpu, st, st->target);
}
static int bringup_cpu(unsigned int cpu)
@@ -704,7 +706,7 @@ static int cpuhp_up_callbacks(unsigned int cpu, struct cpuhp_cpu_state *st,
ret, cpu, cpuhp_get_step(st->state)->name,
st->state);
- cpuhp_reset_state(st, prev_state);
+ cpuhp_reset_state(cpu, st, prev_state);
if (can_rollback_cpu(st))
WARN_ON(cpuhp_invoke_callback_range(false, cpu, st,
prev_state));
@@ -715,15 +717,6 @@ static int cpuhp_up_callbacks(unsigned int cpu, struct cpuhp_cpu_state *st,
/*
* The cpu hotplug threads manage the bringup and teardown of the cpus
*/
-static void cpuhp_create(unsigned int cpu)
-{
- struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu);
-
- init_completion(&st->done_up);
- init_completion(&st->done_down);
- st->cpu = cpu;
-}
-
static int cpuhp_should_run(unsigned int cpu)
{
struct cpuhp_cpu_state *st = this_cpu_ptr(&cpuhp_state);
@@ -875,7 +868,7 @@ static int cpuhp_kick_ap_work(unsigned int cpu)
cpuhp_lock_release(true);
trace_cpuhp_enter(cpu, st->target, prev_state, cpuhp_kick_ap_work);
- ret = cpuhp_kick_ap(st, st->target);
+ ret = cpuhp_kick_ap(cpu, st, st->target);
trace_cpuhp_exit(cpu, st->state, prev_state, ret);
return ret;
@@ -883,15 +876,27 @@ static int cpuhp_kick_ap_work(unsigned int cpu)
static struct smp_hotplug_thread cpuhp_threads = {
.store = &cpuhp_state.thread,
- .create = &cpuhp_create,
.thread_should_run = cpuhp_should_run,
.thread_fn = cpuhp_thread_fun,
.thread_comm = "cpuhp/%u",
.selfparking = true,
};
+static __init void cpuhp_init_state(void)
+{
+ struct cpuhp_cpu_state *st;
+ int cpu;
+
+ for_each_possible_cpu(cpu) {
+ st = per_cpu_ptr(&cpuhp_state, cpu);
+ init_completion(&st->done_up);
+ init_completion(&st->done_down);
+ }
+}
+
void __init cpuhp_threads_init(void)
{
+ cpuhp_init_state();
BUG_ON(smpboot_register_percpu_thread(&cpuhp_threads));
kthread_unpark(this_cpu_read(cpuhp_state.thread));
}
@@ -1107,7 +1112,7 @@ static int cpuhp_down_callbacks(unsigned int cpu, struct cpuhp_cpu_state *st,
ret, cpu, cpuhp_get_step(st->state)->name,
st->state);
- cpuhp_reset_state(st, prev_state);
+ cpuhp_reset_state(cpu, st, prev_state);
if (st->state < prev_state)
WARN_ON(cpuhp_invoke_callback_range(true, cpu, st,
@@ -1134,7 +1139,7 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen,
cpuhp_tasks_frozen = tasks_frozen;
- prev_state = cpuhp_set_state(st, target);
+ prev_state = cpuhp_set_state(cpu, st, target);
/*
* If the current CPU state is in the range of the AP hotplug thread,
* then we need to kick the thread.
@@ -1165,7 +1170,7 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen,
ret = cpuhp_down_callbacks(cpu, st, target);
if (ret && st->state < prev_state) {
if (st->state == CPUHP_TEARDOWN_CPU) {
- cpuhp_reset_state(st, prev_state);
+ cpuhp_reset_state(cpu, st, prev_state);
__cpuhp_kick_ap(st);
} else {
WARN(1, "DEAD callback error for CPU%d", cpu);
@@ -1186,6 +1191,12 @@ out:
static int cpu_down_maps_locked(unsigned int cpu, enum cpuhp_state target)
{
+ /*
+ * If the platform does not support hotplug, report it explicitly to
+ * differentiate it from a transient offlining failure.
+ */
+ if (cc_platform_has(CC_ATTR_HOTPLUG_DISABLED))
+ return -EOPNOTSUPP;
if (cpu_hotplug_disabled)
return -EBUSY;
return _cpu_down(cpu, 0, target);
@@ -1352,7 +1363,7 @@ static int _cpu_up(unsigned int cpu, int tasks_frozen, enum cpuhp_state target)
cpuhp_tasks_frozen = tasks_frozen;
- cpuhp_set_state(st, target);
+ cpuhp_set_state(cpu, st, target);
/*
* If the current CPU state is in the range of the AP hotplug thread,
* then we need to kick the thread once more.
diff --git a/kernel/crash_core.c b/kernel/crash_core.c
index 256cf6db573c..4d57c03714f4 100644
--- a/kernel/crash_core.c
+++ b/kernel/crash_core.c
@@ -243,9 +243,8 @@ static int __init __parse_crashkernel(char *cmdline,
*crash_base = 0;
ck_cmdline = get_last_crashkernel(cmdline, name, suffix);
-
if (!ck_cmdline)
- return -EINVAL;
+ return -ENOENT;
ck_cmdline += strlen(name);
diff --git a/kernel/cred.c b/kernel/cred.c
index 933155c96922..e10c15f51c1f 100644
--- a/kernel/cred.c
+++ b/kernel/cred.c
@@ -870,7 +870,7 @@ static void dump_invalid_creds(const struct cred *cred, const char *label,
/*
* report use of invalid credentials
*/
-void __invalid_creds(const struct cred *cred, const char *file, unsigned line)
+void __noreturn __invalid_creds(const struct cred *cred, const char *file, unsigned line)
{
printk(KERN_ERR "CRED: Invalid credentials\n");
printk(KERN_ERR "CRED: At %s:%u\n", file, line);
diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c
index da06a5553835..7beceb447211 100644
--- a/kernel/debug/debug_core.c
+++ b/kernel/debug/debug_core.c
@@ -53,6 +53,7 @@
#include <linux/vmacache.h>
#include <linux/rcupdate.h>
#include <linux/irq.h>
+#include <linux/security.h>
#include <asm/cacheflush.h>
#include <asm/byteorder.h>
@@ -752,6 +753,29 @@ cpu_master_loop:
continue;
kgdb_connected = 0;
} else {
+ /*
+ * This is a brutal way to interfere with the debugger
+ * and prevent gdb being used to poke at kernel memory.
+ * This could cause trouble if lockdown is applied when
+ * there is already an active gdb session. For now the
+ * answer is simply "don't do that". Typically lockdown
+ * *will* be applied before the debug core gets started
+ * so only developers using kgdb for fairly advanced
+ * early kernel debug can be biten by this. Hopefully
+ * they are sophisticated enough to take care of
+ * themselves, especially with help from the lockdown
+ * message printed on the console!
+ */
+ if (security_locked_down(LOCKDOWN_DBG_WRITE_KERNEL)) {
+ if (IS_ENABLED(CONFIG_KGDB_KDB)) {
+ /* Switch back to kdb if possible... */
+ dbg_kdb_mode = 1;
+ continue;
+ } else {
+ /* ... otherwise just bail */
+ break;
+ }
+ }
error = gdb_serial_stub(ks);
}
diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c
index 0852a537dad4..ead4da947127 100644
--- a/kernel/debug/kdb/kdb_main.c
+++ b/kernel/debug/kdb/kdb_main.c
@@ -45,6 +45,7 @@
#include <linux/proc_fs.h>
#include <linux/uaccess.h>
#include <linux/slab.h>
+#include <linux/security.h>
#include "kdb_private.h"
#undef MODULE_PARAM_PREFIX
@@ -166,10 +167,62 @@ struct task_struct *kdb_curr_task(int cpu)
}
/*
- * Check whether the flags of the current command and the permissions
- * of the kdb console has allow a command to be run.
+ * Update the permissions flags (kdb_cmd_enabled) to match the
+ * current lockdown state.
+ *
+ * Within this function the calls to security_locked_down() are "lazy". We
+ * avoid calling them if the current value of kdb_cmd_enabled already excludes
+ * flags that might be subject to lockdown. Additionally we deliberately check
+ * the lockdown flags independently (even though read lockdown implies write
+ * lockdown) since that results in both simpler code and clearer messages to
+ * the user on first-time debugger entry.
+ *
+ * The permission masks during a read+write lockdown permits the following
+ * flags: INSPECT, SIGNAL, REBOOT (and ALWAYS_SAFE).
+ *
+ * The INSPECT commands are not blocked during lockdown because they are
+ * not arbitrary memory reads. INSPECT covers the backtrace family (sometimes
+ * forcing them to have no arguments) and lsmod. These commands do expose
+ * some kernel state but do not allow the developer seated at the console to
+ * choose what state is reported. SIGNAL and REBOOT should not be controversial,
+ * given these are allowed for root during lockdown already.
+ */
+static void kdb_check_for_lockdown(void)
+{
+ const int write_flags = KDB_ENABLE_MEM_WRITE |
+ KDB_ENABLE_REG_WRITE |
+ KDB_ENABLE_FLOW_CTRL;
+ const int read_flags = KDB_ENABLE_MEM_READ |
+ KDB_ENABLE_REG_READ;
+
+ bool need_to_lockdown_write = false;
+ bool need_to_lockdown_read = false;
+
+ if (kdb_cmd_enabled & (KDB_ENABLE_ALL | write_flags))
+ need_to_lockdown_write =
+ security_locked_down(LOCKDOWN_DBG_WRITE_KERNEL);
+
+ if (kdb_cmd_enabled & (KDB_ENABLE_ALL | read_flags))
+ need_to_lockdown_read =
+ security_locked_down(LOCKDOWN_DBG_READ_KERNEL);
+
+ /* De-compose KDB_ENABLE_ALL if required */
+ if (need_to_lockdown_write || need_to_lockdown_read)
+ if (kdb_cmd_enabled & KDB_ENABLE_ALL)
+ kdb_cmd_enabled = KDB_ENABLE_MASK & ~KDB_ENABLE_ALL;
+
+ if (need_to_lockdown_write)
+ kdb_cmd_enabled &= ~write_flags;
+
+ if (need_to_lockdown_read)
+ kdb_cmd_enabled &= ~read_flags;
+}
+
+/*
+ * Check whether the flags of the current command, the permissions of the kdb
+ * console and the lockdown state allow a command to be run.
*/
-static inline bool kdb_check_flags(kdb_cmdflags_t flags, int permissions,
+static bool kdb_check_flags(kdb_cmdflags_t flags, int permissions,
bool no_args)
{
/* permissions comes from userspace so needs massaging slightly */
@@ -1180,6 +1233,9 @@ static int kdb_local(kdb_reason_t reason, int error, struct pt_regs *regs,
kdb_curr_task(raw_smp_processor_id());
KDB_DEBUG_STATE("kdb_local 1", reason);
+
+ kdb_check_for_lockdown();
+
kdb_go_count = 0;
if (reason == KDB_REASON_DEBUG) {
/* special case below */
diff --git a/kernel/debug/kdb/kdb_support.c b/kernel/debug/kdb/kdb_support.c
index df2bface866e..85cb51c4a17e 100644
--- a/kernel/debug/kdb/kdb_support.c
+++ b/kernel/debug/kdb/kdb_support.c
@@ -291,7 +291,7 @@ int kdb_getarea_size(void *res, unsigned long addr, size_t size)
*/
int kdb_putarea_size(unsigned long addr, void *res, size_t size)
{
- int ret = copy_from_kernel_nofault((char *)addr, (char *)res, size);
+ int ret = copy_to_kernel_nofault((char *)addr, (char *)res, size);
if (ret) {
if (!KDB_STATE(SUPPRESS)) {
kdb_func_printf("Bad address 0x%lx\n", addr);
diff --git a/kernel/dma/Kconfig b/kernel/dma/Kconfig
index 1b02179758cb..56866aaa2ae1 100644
--- a/kernel/dma/Kconfig
+++ b/kernel/dma/Kconfig
@@ -110,15 +110,10 @@ config DMA_GLOBAL_POOL
select DMA_DECLARE_COHERENT
bool
-config DMA_REMAP
- bool
- depends on MMU
- select DMA_NONCOHERENT_MMAP
-
config DMA_DIRECT_REMAP
bool
- select DMA_REMAP
select DMA_COHERENT_POOL
+ select DMA_NONCOHERENT_MMAP
config DMA_CMA
bool "DMA Contiguous Memory Allocator"
diff --git a/kernel/dma/Makefile b/kernel/dma/Makefile
index 0dd65ec1d234..21926e46ef4f 100644
--- a/kernel/dma/Makefile
+++ b/kernel/dma/Makefile
@@ -8,5 +8,5 @@ obj-$(CONFIG_DMA_DECLARE_COHERENT) += coherent.o
obj-$(CONFIG_DMA_API_DEBUG) += debug.o
obj-$(CONFIG_SWIOTLB) += swiotlb.o
obj-$(CONFIG_DMA_COHERENT_POOL) += pool.o
-obj-$(CONFIG_DMA_REMAP) += remap.o
+obj-$(CONFIG_MMU) += remap.o
obj-$(CONFIG_DMA_MAP_BENCHMARK) += map_benchmark.o
diff --git a/kernel/dma/debug.c b/kernel/dma/debug.c
index 7a14ca29c377..f8ff598596b8 100644
--- a/kernel/dma/debug.c
+++ b/kernel/dma/debug.c
@@ -927,7 +927,7 @@ static __init int dma_debug_cmdline(char *str)
global_disable = true;
}
- return 0;
+ return 1;
}
static __init int dma_debug_entries_cmdline(char *str)
@@ -936,7 +936,7 @@ static __init int dma_debug_entries_cmdline(char *str)
return -EINVAL;
if (!get_option(&str, &nr_prealloc_entries))
nr_prealloc_entries = PREALLOC_DMA_DEBUG_ENTRIES;
- return 0;
+ return 1;
}
__setup("dma_debug=", dma_debug_cmdline);
diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c
index 50f48e9e4598..9743c6ccce1a 100644
--- a/kernel/dma/direct.c
+++ b/kernel/dma/direct.c
@@ -265,28 +265,28 @@ void *dma_direct_alloc(struct device *dev, size_t size,
page = __dma_direct_alloc_pages(dev, size, gfp & ~__GFP_ZERO);
if (!page)
return NULL;
+
+ /*
+ * dma_alloc_contiguous can return highmem pages depending on a
+ * combination the cma= arguments and per-arch setup. These need to be
+ * remapped to return a kernel virtual address.
+ */
if (PageHighMem(page)) {
- /*
- * Depending on the cma= arguments and per-arch setup,
- * dma_alloc_contiguous could return highmem pages.
- * Without remapping there is no way to return them here, so
- * log an error and fail.
- */
- if (!IS_ENABLED(CONFIG_DMA_REMAP)) {
- dev_info(dev, "Rejecting highmem page from CMA.\n");
- goto out_free_pages;
- }
remap = true;
set_uncached = false;
}
if (remap) {
+ pgprot_t prot = dma_pgprot(dev, PAGE_KERNEL, attrs);
+
+ if (force_dma_unencrypted(dev))
+ prot = pgprot_decrypted(prot);
+
/* remove any dirty cache lines on the kernel alias */
arch_dma_prep_coherent(page, size);
/* create a coherent mapping */
- ret = dma_common_contiguous_remap(page, size,
- dma_pgprot(dev, PAGE_KERNEL, attrs),
+ ret = dma_common_contiguous_remap(page, size, prot,
__builtin_return_address(0));
if (!ret)
goto out_free_pages;
@@ -349,7 +349,7 @@ void dma_direct_free(struct device *dev, size_t size,
dma_free_from_pool(dev, cpu_addr, PAGE_ALIGN(size)))
return;
- if (IS_ENABLED(CONFIG_DMA_REMAP) && is_vmalloc_addr(cpu_addr)) {
+ if (is_vmalloc_addr(cpu_addr)) {
vunmap(cpu_addr);
} else {
if (IS_ENABLED(CONFIG_ARCH_HAS_DMA_CLEAR_UNCACHED))
@@ -539,6 +539,8 @@ int dma_direct_mmap(struct device *dev, struct vm_area_struct *vma,
int ret = -ENXIO;
vma->vm_page_prot = dma_pgprot(dev, vma->vm_page_prot, attrs);
+ if (force_dma_unencrypted(dev))
+ vma->vm_page_prot = pgprot_decrypted(vma->vm_page_prot);
if (dma_mmap_from_dev_coherent(dev, vma, cpu_addr, size, &ret))
return ret;
diff --git a/kernel/dma/direct.h b/kernel/dma/direct.h
index 4632b0f4f72e..8a6cd53dbe8c 100644
--- a/kernel/dma/direct.h
+++ b/kernel/dma/direct.h
@@ -114,6 +114,7 @@ static inline void dma_direct_unmap_page(struct device *dev, dma_addr_t addr,
dma_direct_sync_single_for_cpu(dev, addr, size, dir);
if (unlikely(is_swiotlb_buffer(dev, phys)))
- swiotlb_tbl_unmap_single(dev, phys, size, dir, attrs);
+ swiotlb_tbl_unmap_single(dev, phys, size, dir,
+ attrs | DMA_ATTR_SKIP_CPU_SYNC);
}
#endif /* _KERNEL_DMA_DIRECT_H */
diff --git a/kernel/dma/map_benchmark.c b/kernel/dma/map_benchmark.c
index 9b9af1bd6be3..0520a8f4fb1d 100644
--- a/kernel/dma/map_benchmark.c
+++ b/kernel/dma/map_benchmark.c
@@ -11,6 +11,7 @@
#include <linux/dma-mapping.h>
#include <linux/kernel.h>
#include <linux/kthread.h>
+#include <linux/map_benchmark.h>
#include <linux/math64.h>
#include <linux/module.h>
#include <linux/pci.h>
@@ -18,30 +19,6 @@
#include <linux/slab.h>
#include <linux/timekeeping.h>
-#define DMA_MAP_BENCHMARK _IOWR('d', 1, struct map_benchmark)
-#define DMA_MAP_MAX_THREADS 1024
-#define DMA_MAP_MAX_SECONDS 300
-#define DMA_MAP_MAX_TRANS_DELAY (10 * NSEC_PER_MSEC)
-
-#define DMA_MAP_BIDIRECTIONAL 0
-#define DMA_MAP_TO_DEVICE 1
-#define DMA_MAP_FROM_DEVICE 2
-
-struct map_benchmark {
- __u64 avg_map_100ns; /* average map latency in 100ns */
- __u64 map_stddev; /* standard deviation of map latency */
- __u64 avg_unmap_100ns; /* as above */
- __u64 unmap_stddev;
- __u32 threads; /* how many threads will do map/unmap in parallel */
- __u32 seconds; /* how long the test will last */
- __s32 node; /* which numa node this benchmark will run on */
- __u32 dma_bits; /* DMA addressing capability */
- __u32 dma_dir; /* DMA data direction */
- __u32 dma_trans_ns; /* time for DMA transmission in ns */
- __u32 granule; /* how many PAGE_SIZE will do map/unmap once a time */
- __u8 expansion[76]; /* For future use */
-};
-
struct map_benchmark_data {
struct map_benchmark bparam;
struct device *dev;
diff --git a/kernel/dma/mapping.c b/kernel/dma/mapping.c
index 9478eccd1c8e..db7244291b74 100644
--- a/kernel/dma/mapping.c
+++ b/kernel/dma/mapping.c
@@ -407,8 +407,6 @@ EXPORT_SYMBOL(dma_get_sgtable_attrs);
*/
pgprot_t dma_pgprot(struct device *dev, pgprot_t prot, unsigned long attrs)
{
- if (force_dma_unencrypted(dev))
- prot = pgprot_decrypted(prot);
if (dev_is_dma_coherent(dev))
return prot;
#ifdef CONFIG_ARCH_HAS_DMA_WRITE_COMBINE
@@ -745,7 +743,6 @@ int dma_set_mask(struct device *dev, u64 mask)
}
EXPORT_SYMBOL(dma_set_mask);
-#ifndef CONFIG_ARCH_HAS_DMA_SET_COHERENT_MASK
int dma_set_coherent_mask(struct device *dev, u64 mask)
{
/*
@@ -761,7 +758,6 @@ int dma_set_coherent_mask(struct device *dev, u64 mask)
return 0;
}
EXPORT_SYMBOL(dma_set_coherent_mask);
-#endif
size_t dma_max_mapping_size(struct device *dev)
{
diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c
index 6db1c475ec82..73a41cec9e38 100644
--- a/kernel/dma/swiotlb.c
+++ b/kernel/dma/swiotlb.c
@@ -21,40 +21,33 @@
#define pr_fmt(fmt) "software IO TLB: " fmt
#include <linux/cache.h>
+#include <linux/cc_platform.h>
+#include <linux/ctype.h>
+#include <linux/debugfs.h>
#include <linux/dma-direct.h>
#include <linux/dma-map-ops.h>
-#include <linux/mm.h>
#include <linux/export.h>
+#include <linux/gfp.h>
+#include <linux/highmem.h>
+#include <linux/io.h>
+#include <linux/iommu-helper.h>
+#include <linux/init.h>
+#include <linux/memblock.h>
+#include <linux/mm.h>
+#include <linux/pfn.h>
+#include <linux/scatterlist.h>
+#include <linux/set_memory.h>
#include <linux/spinlock.h>
#include <linux/string.h>
#include <linux/swiotlb.h>
-#include <linux/pfn.h>
#include <linux/types.h>
-#include <linux/ctype.h>
-#include <linux/highmem.h>
-#include <linux/gfp.h>
-#include <linux/scatterlist.h>
-#include <linux/cc_platform.h>
-#include <linux/set_memory.h>
-#ifdef CONFIG_DEBUG_FS
-#include <linux/debugfs.h>
-#endif
#ifdef CONFIG_DMA_RESTRICTED_POOL
-#include <linux/io.h>
#include <linux/of.h>
#include <linux/of_fdt.h>
#include <linux/of_reserved_mem.h>
#include <linux/slab.h>
#endif
-#include <asm/io.h>
-#include <asm/dma.h>
-
-#include <linux/io.h>
-#include <linux/init.h>
-#include <linux/memblock.h>
-#include <linux/iommu-helper.h>
-
#define CREATE_TRACE_POINTS
#include <trace/events/swiotlb.h>
@@ -207,8 +200,6 @@ void __init swiotlb_update_mem_attributes(void)
mem->vaddr = swiotlb_mem_remap(mem, bytes);
if (!mem->vaddr)
mem->vaddr = vaddr;
-
- memset(mem->vaddr, 0, bytes);
}
static void swiotlb_init_io_tlb_mem(struct io_tlb_mem *mem, phys_addr_t start,
@@ -701,13 +692,10 @@ void swiotlb_tbl_unmap_single(struct device *dev, phys_addr_t tlb_addr,
void swiotlb_sync_single_for_device(struct device *dev, phys_addr_t tlb_addr,
size_t size, enum dma_data_direction dir)
{
- /*
- * Unconditional bounce is necessary to avoid corruption on
- * sync_*_for_cpu or dma_ummap_* when the device didn't overwrite
- * the whole lengt of the bounce buffer.
- */
- swiotlb_bounce(dev, tlb_addr, size, DMA_TO_DEVICE);
- BUG_ON(!valid_dma_direction(dir));
+ if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL)
+ swiotlb_bounce(dev, tlb_addr, size, DMA_TO_DEVICE);
+ else
+ BUG_ON(dir != DMA_FROM_DEVICE);
}
void swiotlb_sync_single_for_cpu(struct device *dev, phys_addr_t tlb_addr,
@@ -766,47 +754,29 @@ bool is_swiotlb_active(struct device *dev)
}
EXPORT_SYMBOL_GPL(is_swiotlb_active);
-#ifdef CONFIG_DEBUG_FS
-static struct dentry *debugfs_dir;
-
-static void swiotlb_create_debugfs_files(struct io_tlb_mem *mem)
+static void swiotlb_create_debugfs_files(struct io_tlb_mem *mem,
+ const char *dirname)
{
+ mem->debugfs = debugfs_create_dir(dirname, io_tlb_default_mem.debugfs);
+ if (!mem->nslabs)
+ return;
+
debugfs_create_ulong("io_tlb_nslabs", 0400, mem->debugfs, &mem->nslabs);
debugfs_create_ulong("io_tlb_used", 0400, mem->debugfs, &mem->used);
}
-static int __init swiotlb_create_default_debugfs(void)
+static int __init __maybe_unused swiotlb_create_default_debugfs(void)
{
- struct io_tlb_mem *mem = &io_tlb_default_mem;
-
- debugfs_dir = debugfs_create_dir("swiotlb", NULL);
- if (mem->nslabs) {
- mem->debugfs = debugfs_dir;
- swiotlb_create_debugfs_files(mem);
- }
+ swiotlb_create_debugfs_files(&io_tlb_default_mem, "swiotlb");
return 0;
}
+#ifdef CONFIG_DEBUG_FS
late_initcall(swiotlb_create_default_debugfs);
-
#endif
#ifdef CONFIG_DMA_RESTRICTED_POOL
-#ifdef CONFIG_DEBUG_FS
-static void rmem_swiotlb_debugfs_init(struct reserved_mem *rmem)
-{
- struct io_tlb_mem *mem = rmem->priv;
-
- mem->debugfs = debugfs_create_dir(rmem->name, debugfs_dir);
- swiotlb_create_debugfs_files(mem);
-}
-#else
-static void rmem_swiotlb_debugfs_init(struct reserved_mem *rmem)
-{
-}
-#endif
-
struct page *swiotlb_alloc(struct device *dev, size_t size)
{
struct io_tlb_mem *mem = dev->dma_io_tlb_mem;
@@ -853,8 +823,7 @@ static int rmem_swiotlb_device_init(struct reserved_mem *rmem,
if (!mem)
return -ENOMEM;
- mem->slots = kzalloc(array_size(sizeof(*mem->slots), nslabs),
- GFP_KERNEL);
+ mem->slots = kcalloc(nslabs, sizeof(*mem->slots), GFP_KERNEL);
if (!mem->slots) {
kfree(mem);
return -ENOMEM;
@@ -868,7 +837,7 @@ static int rmem_swiotlb_device_init(struct reserved_mem *rmem,
rmem->priv = mem;
- rmem_swiotlb_debugfs_init(rmem);
+ swiotlb_create_debugfs_files(mem, rmem->name);
}
dev->dma_io_tlb_mem = mem;
diff --git a/kernel/entry/common.c b/kernel/entry/common.c
index ed10a95a6b1d..032f164abe7c 100644
--- a/kernel/entry/common.c
+++ b/kernel/entry/common.c
@@ -2,6 +2,7 @@
#include <linux/context_tracking.h>
#include <linux/entry-common.h>
+#include <linux/resume_user_mode.h>
#include <linux/highmem.h>
#include <linux/jump_label.h>
#include <linux/livepatch.h>
@@ -16,7 +17,7 @@
/* See comment for enter_from_user_mode() in entry-common.h */
static __always_inline void __enter_from_user_mode(struct pt_regs *regs)
{
- arch_check_user_regs(regs);
+ arch_enter_from_user_mode(regs);
lockdep_hardirqs_off(CALLER_ADDR0);
CT_WARN_ON(ct_state() != CONTEXT_USER);
@@ -59,7 +60,7 @@ static long syscall_trace_enter(struct pt_regs *regs, long syscall,
/* Handle ptrace */
if (work & (SYSCALL_WORK_SYSCALL_TRACE | SYSCALL_WORK_SYSCALL_EMU)) {
- ret = arch_syscall_enter_tracehook(regs);
+ ret = ptrace_report_syscall_entry(regs);
if (ret || (work & SYSCALL_WORK_SYSCALL_EMU))
return -1L;
}
@@ -125,7 +126,7 @@ static __always_inline void __exit_to_user_mode(void)
{
instrumentation_begin();
trace_hardirqs_on_prepare();
- lockdep_hardirqs_on_prepare(CALLER_ADDR0);
+ lockdep_hardirqs_on_prepare();
instrumentation_end();
user_enter_irqoff();
@@ -139,27 +140,7 @@ void noinstr exit_to_user_mode(void)
}
/* Workaround to allow gradual conversion of architecture code */
-void __weak arch_do_signal_or_restart(struct pt_regs *regs, bool has_signal) { }
-
-static void handle_signal_work(struct pt_regs *regs, unsigned long ti_work)
-{
- if (ti_work & _TIF_NOTIFY_SIGNAL)
- tracehook_notify_signal();
-
- arch_do_signal_or_restart(regs, ti_work & _TIF_SIGPENDING);
-}
-
-#ifdef CONFIG_RT_DELAYED_SIGNALS
-static inline void raise_delayed_signal(void)
-{
- if (unlikely(current->forced_info.si_signo)) {
- force_sig_info(&current->forced_info);
- current->forced_info.si_signo = 0;
- }
-}
-#else
-static inline void raise_delayed_signal(void) { }
-#endif
+void __weak arch_do_signal_or_restart(struct pt_regs *regs) { }
static unsigned long exit_to_user_mode_loop(struct pt_regs *regs,
unsigned long ti_work)
@@ -175,8 +156,6 @@ static unsigned long exit_to_user_mode_loop(struct pt_regs *regs,
if (ti_work & _TIF_NEED_RESCHED)
schedule();
- raise_delayed_signal();
-
if (ti_work & _TIF_UPROBE)
uprobe_notify_resume(regs);
@@ -184,10 +163,10 @@ static unsigned long exit_to_user_mode_loop(struct pt_regs *regs,
klp_update_patch_state(current);
if (ti_work & (_TIF_SIGPENDING | _TIF_NOTIFY_SIGNAL))
- handle_signal_work(regs, ti_work);
+ arch_do_signal_or_restart(regs);
if (ti_work & _TIF_NOTIFY_RESUME)
- tracehook_notify_resume(regs);
+ resume_user_mode_work(regs);
/* Architecture specific TIF work */
arch_exit_to_user_mode_work(regs, ti_work);
@@ -267,7 +246,7 @@ static void syscall_exit_work(struct pt_regs *regs, unsigned long work)
step = report_single_step(work);
if (step || work & SYSCALL_WORK_SYSCALL_TRACE)
- arch_syscall_exit_tracehook(regs, step);
+ ptrace_report_syscall_exit(regs, step);
}
/*
@@ -413,7 +392,7 @@ DEFINE_STATIC_CALL(irqentry_exit_cond_resched, raw_irqentry_exit_cond_resched);
DEFINE_STATIC_KEY_TRUE(sk_dynamic_irqentry_exit_cond_resched);
void dynamic_irqentry_exit_cond_resched(void)
{
- if (!static_key_unlikely(&sk_dynamic_irqentry_exit_cond_resched))
+ if (!static_branch_unlikely(&sk_dynamic_irqentry_exit_cond_resched))
return;
raw_irqentry_exit_cond_resched();
}
@@ -437,7 +416,7 @@ noinstr void irqentry_exit(struct pt_regs *regs, irqentry_state_t state)
instrumentation_begin();
/* Tell the tracer that IRET will enable interrupts */
trace_hardirqs_on_prepare();
- lockdep_hardirqs_on_prepare(CALLER_ADDR0);
+ lockdep_hardirqs_on_prepare();
instrumentation_end();
rcu_irq_exit();
lockdep_hardirqs_on(CALLER_ADDR0);
@@ -486,7 +465,7 @@ void noinstr irqentry_nmi_exit(struct pt_regs *regs, irqentry_state_t irq_state)
ftrace_nmi_exit();
if (irq_state.lockdep) {
trace_hardirqs_on_prepare();
- lockdep_hardirqs_on_prepare(CALLER_ADDR0);
+ lockdep_hardirqs_on_prepare();
}
instrumentation_end();
diff --git a/kernel/entry/kvm.c b/kernel/entry/kvm.c
index 96d476e06c77..9d09f489b60e 100644
--- a/kernel/entry/kvm.c
+++ b/kernel/entry/kvm.c
@@ -8,8 +8,11 @@ static int xfer_to_guest_mode_work(struct kvm_vcpu *vcpu, unsigned long ti_work)
do {
int ret;
- if (ti_work & _TIF_NOTIFY_SIGNAL)
- tracehook_notify_signal();
+ if (ti_work & (_TIF_SIGPENDING | _TIF_NOTIFY_SIGNAL)) {
+ clear_notify_signal();
+ if (task_work_pending(current))
+ task_work_run();
+ }
if (ti_work & _TIF_SIGPENDING) {
kvm_handle_signal_exit(vcpu);
@@ -20,7 +23,7 @@ static int xfer_to_guest_mode_work(struct kvm_vcpu *vcpu, unsigned long ti_work)
schedule();
if (ti_work & _TIF_NOTIFY_RESUME)
- tracehook_notify_resume(NULL);
+ resume_user_mode_work(NULL);
ret = arch_xfer_to_guest_mode_handle_work(vcpu, ti_work);
if (ret)
diff --git a/kernel/events/callchain.c b/kernel/events/callchain.c
index 58cbe357fb2b..1273be84392c 100644
--- a/kernel/events/callchain.c
+++ b/kernel/events/callchain.c
@@ -209,17 +209,13 @@ get_perf_callchain(struct pt_regs *regs, u32 init_nr, bool kernel, bool user,
}
if (regs) {
- mm_segment_t fs;
-
if (crosstask)
goto exit_put;
if (add_mark)
perf_callchain_store_context(&ctx, PERF_CONTEXT_USER);
- fs = force_uaccess_begin();
perf_callchain_user(&ctx, regs);
- force_uaccess_end(fs);
}
}
diff --git a/kernel/events/core.c b/kernel/events/core.c
index e1461f99881f..950b25c3f210 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -574,8 +574,7 @@ static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx,
enum event_type_t event_type);
static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx,
- enum event_type_t event_type,
- struct task_struct *task);
+ enum event_type_t event_type);
static void update_context_time(struct perf_event_context *ctx);
static u64 perf_event_time(struct perf_event *event);
@@ -781,7 +780,6 @@ static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx,
static inline void update_cgrp_time_from_event(struct perf_event *event)
{
struct perf_cgroup_info *info;
- struct perf_cgroup *cgrp;
/*
* ensure we access cgroup data only when needed and
@@ -790,21 +788,19 @@ static inline void update_cgrp_time_from_event(struct perf_event *event)
if (!is_cgroup_event(event))
return;
- cgrp = perf_cgroup_from_task(current, event->ctx);
+ info = this_cpu_ptr(event->cgrp->info);
/*
* Do not update time when cgroup is not active
*/
- if (cgroup_is_descendant(cgrp->css.cgroup, event->cgrp->css.cgroup)) {
- info = this_cpu_ptr(event->cgrp->info);
+ if (info->active)
__update_cgrp_time(info, perf_clock(), true);
- }
}
static inline void
-perf_cgroup_set_timestamp(struct task_struct *task,
- struct perf_event_context *ctx)
+perf_cgroup_set_timestamp(struct perf_cpu_context *cpuctx)
{
- struct perf_cgroup *cgrp;
+ struct perf_event_context *ctx = &cpuctx->ctx;
+ struct perf_cgroup *cgrp = cpuctx->cgrp;
struct perf_cgroup_info *info;
struct cgroup_subsys_state *css;
@@ -813,10 +809,10 @@ perf_cgroup_set_timestamp(struct task_struct *task,
* ensure we do not access cgroup data
* unless we have the cgroup pinned (css_get)
*/
- if (!task || !ctx->nr_cgroups)
+ if (!cgrp)
return;
- cgrp = perf_cgroup_from_task(task, ctx);
+ WARN_ON_ONCE(!ctx->nr_cgroups);
for (css = &cgrp->css; css; css = css->parent) {
cgrp = container_of(css, struct perf_cgroup, css);
@@ -828,17 +824,12 @@ perf_cgroup_set_timestamp(struct task_struct *task,
static DEFINE_PER_CPU(struct list_head, cgrp_cpuctx_list);
-#define PERF_CGROUP_SWOUT 0x1 /* cgroup switch out every event */
-#define PERF_CGROUP_SWIN 0x2 /* cgroup switch in events based on task */
-
/*
* reschedule events based on the cgroup constraint of task.
- *
- * mode SWOUT : schedule out everything
- * mode SWIN : schedule in based on cgroup for next
*/
-static void perf_cgroup_switch(struct task_struct *task, int mode)
+static void perf_cgroup_switch(struct task_struct *task)
{
+ struct perf_cgroup *cgrp;
struct perf_cpu_context *cpuctx, *tmp;
struct list_head *list;
unsigned long flags;
@@ -849,35 +840,31 @@ static void perf_cgroup_switch(struct task_struct *task, int mode)
*/
local_irq_save(flags);
+ cgrp = perf_cgroup_from_task(task, NULL);
+
list = this_cpu_ptr(&cgrp_cpuctx_list);
list_for_each_entry_safe(cpuctx, tmp, list, cgrp_cpuctx_entry) {
WARN_ON_ONCE(cpuctx->ctx.nr_cgroups == 0);
+ if (READ_ONCE(cpuctx->cgrp) == cgrp)
+ continue;
perf_ctx_lock(cpuctx, cpuctx->task_ctx);
perf_pmu_disable(cpuctx->ctx.pmu);
- if (mode & PERF_CGROUP_SWOUT) {
- cpu_ctx_sched_out(cpuctx, EVENT_ALL);
- /*
- * must not be done before ctxswout due
- * to event_filter_match() in event_sched_out()
- */
- cpuctx->cgrp = NULL;
- }
+ cpu_ctx_sched_out(cpuctx, EVENT_ALL);
+ /*
+ * must not be done before ctxswout due
+ * to update_cgrp_time_from_cpuctx() in
+ * ctx_sched_out()
+ */
+ cpuctx->cgrp = cgrp;
+ /*
+ * set cgrp before ctxsw in to allow
+ * perf_cgroup_set_timestamp() in ctx_sched_in()
+ * to not have to pass task around
+ */
+ cpu_ctx_sched_in(cpuctx, EVENT_ALL);
- if (mode & PERF_CGROUP_SWIN) {
- WARN_ON_ONCE(cpuctx->cgrp);
- /*
- * set cgrp before ctxsw in to allow
- * event_filter_match() to not have to pass
- * task around
- * we pass the cpuctx->ctx to perf_cgroup_from_task()
- * because cgorup events are only per-cpu
- */
- cpuctx->cgrp = perf_cgroup_from_task(task,
- &cpuctx->ctx);
- cpu_ctx_sched_in(cpuctx, EVENT_ALL, task);
- }
perf_pmu_enable(cpuctx->ctx.pmu);
perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
}
@@ -885,58 +872,6 @@ static void perf_cgroup_switch(struct task_struct *task, int mode)
local_irq_restore(flags);
}
-static inline void perf_cgroup_sched_out(struct task_struct *task,
- struct task_struct *next)
-{
- struct perf_cgroup *cgrp1;
- struct perf_cgroup *cgrp2 = NULL;
-
- rcu_read_lock();
- /*
- * we come here when we know perf_cgroup_events > 0
- * we do not need to pass the ctx here because we know
- * we are holding the rcu lock
- */
- cgrp1 = perf_cgroup_from_task(task, NULL);
- cgrp2 = perf_cgroup_from_task(next, NULL);
-
- /*
- * only schedule out current cgroup events if we know
- * that we are switching to a different cgroup. Otherwise,
- * do no touch the cgroup events.
- */
- if (cgrp1 != cgrp2)
- perf_cgroup_switch(task, PERF_CGROUP_SWOUT);
-
- rcu_read_unlock();
-}
-
-static inline void perf_cgroup_sched_in(struct task_struct *prev,
- struct task_struct *task)
-{
- struct perf_cgroup *cgrp1;
- struct perf_cgroup *cgrp2 = NULL;
-
- rcu_read_lock();
- /*
- * we come here when we know perf_cgroup_events > 0
- * we do not need to pass the ctx here because we know
- * we are holding the rcu lock
- */
- cgrp1 = perf_cgroup_from_task(task, NULL);
- cgrp2 = perf_cgroup_from_task(prev, NULL);
-
- /*
- * only need to schedule in cgroup events if we are changing
- * cgroup during ctxsw. Cgroup events were not scheduled
- * out of ctxsw out if that was not the case.
- */
- if (cgrp1 != cgrp2)
- perf_cgroup_switch(task, PERF_CGROUP_SWIN);
-
- rcu_read_unlock();
-}
-
static int perf_cgroup_ensure_storage(struct perf_event *event,
struct cgroup_subsys_state *css)
{
@@ -1032,22 +967,10 @@ perf_cgroup_event_enable(struct perf_event *event, struct perf_event_context *ct
*/
cpuctx = container_of(ctx, struct perf_cpu_context, ctx);
- /*
- * Since setting cpuctx->cgrp is conditional on the current @cgrp
- * matching the event's cgroup, we must do this for every new event,
- * because if the first would mismatch, the second would not try again
- * and we would leave cpuctx->cgrp unset.
- */
- if (ctx->is_active && !cpuctx->cgrp) {
- struct perf_cgroup *cgrp = perf_cgroup_from_task(current, ctx);
-
- if (cgroup_is_descendant(cgrp->css.cgroup, event->cgrp->css.cgroup))
- cpuctx->cgrp = cgrp;
- }
-
if (ctx->nr_cgroups++)
return;
+ cpuctx->cgrp = perf_cgroup_from_task(current, ctx);
list_add(&cpuctx->cgrp_cpuctx_entry,
per_cpu_ptr(&cgrp_cpuctx_list, event->cpu));
}
@@ -1069,9 +992,7 @@ perf_cgroup_event_disable(struct perf_event *event, struct perf_event_context *c
if (--ctx->nr_cgroups)
return;
- if (ctx->is_active && cpuctx->cgrp)
- cpuctx->cgrp = NULL;
-
+ cpuctx->cgrp = NULL;
list_del(&cpuctx->cgrp_cpuctx_entry);
}
@@ -1100,16 +1021,6 @@ static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx,
{
}
-static inline void perf_cgroup_sched_out(struct task_struct *task,
- struct task_struct *next)
-{
-}
-
-static inline void perf_cgroup_sched_in(struct task_struct *prev,
- struct task_struct *task)
-{
-}
-
static inline int perf_cgroup_connect(pid_t pid, struct perf_event *event,
struct perf_event_attr *attr,
struct perf_event *group_leader)
@@ -1118,13 +1029,7 @@ static inline int perf_cgroup_connect(pid_t pid, struct perf_event *event,
}
static inline void
-perf_cgroup_set_timestamp(struct task_struct *task,
- struct perf_event_context *ctx)
-{
-}
-
-static inline void
-perf_cgroup_switch(struct task_struct *task, struct task_struct *next)
+perf_cgroup_set_timestamp(struct perf_cpu_context *cpuctx)
{
}
@@ -1147,6 +1052,10 @@ static inline void
perf_cgroup_event_disable(struct perf_event *event, struct perf_event_context *ctx)
{
}
+
+static void perf_cgroup_switch(struct task_struct *task)
+{
+}
#endif
/*
@@ -2713,8 +2622,7 @@ static void ctx_sched_out(struct perf_event_context *ctx,
static void
ctx_sched_in(struct perf_event_context *ctx,
struct perf_cpu_context *cpuctx,
- enum event_type_t event_type,
- struct task_struct *task);
+ enum event_type_t event_type);
static void task_ctx_sched_out(struct perf_cpu_context *cpuctx,
struct perf_event_context *ctx,
@@ -2730,15 +2638,14 @@ static void task_ctx_sched_out(struct perf_cpu_context *cpuctx,
}
static void perf_event_sched_in(struct perf_cpu_context *cpuctx,
- struct perf_event_context *ctx,
- struct task_struct *task)
+ struct perf_event_context *ctx)
{
- cpu_ctx_sched_in(cpuctx, EVENT_PINNED, task);
+ cpu_ctx_sched_in(cpuctx, EVENT_PINNED);
if (ctx)
- ctx_sched_in(ctx, cpuctx, EVENT_PINNED, task);
- cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE, task);
+ ctx_sched_in(ctx, cpuctx, EVENT_PINNED);
+ cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE);
if (ctx)
- ctx_sched_in(ctx, cpuctx, EVENT_FLEXIBLE, task);
+ ctx_sched_in(ctx, cpuctx, EVENT_FLEXIBLE);
}
/*
@@ -2788,7 +2695,7 @@ static void ctx_resched(struct perf_cpu_context *cpuctx,
else if (ctx_event_type & EVENT_PINNED)
cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
- perf_event_sched_in(cpuctx, task_ctx, current);
+ perf_event_sched_in(cpuctx, task_ctx);
perf_pmu_enable(cpuctx->ctx.pmu);
}
@@ -3011,7 +2918,7 @@ static void __perf_event_enable(struct perf_event *event,
return;
if (!event_filter_match(event)) {
- ctx_sched_in(ctx, cpuctx, EVENT_TIME, current);
+ ctx_sched_in(ctx, cpuctx, EVENT_TIME);
return;
}
@@ -3020,7 +2927,7 @@ static void __perf_event_enable(struct perf_event *event,
* then don't put it on unless the group is on.
*/
if (leader != event && leader->state != PERF_EVENT_STATE_ACTIVE) {
- ctx_sched_in(ctx, cpuctx, EVENT_TIME, current);
+ ctx_sched_in(ctx, cpuctx, EVENT_TIME);
return;
}
@@ -3668,7 +3575,7 @@ void __perf_event_task_sched_out(struct task_struct *task,
* cgroup event are system-wide mode only
*/
if (atomic_read(this_cpu_ptr(&perf_cgroup_events)))
- perf_cgroup_sched_out(task, next);
+ perf_cgroup_switch(next);
}
/*
@@ -3865,8 +3772,7 @@ ctx_flexible_sched_in(struct perf_event_context *ctx,
static void
ctx_sched_in(struct perf_event_context *ctx,
struct perf_cpu_context *cpuctx,
- enum event_type_t event_type,
- struct task_struct *task)
+ enum event_type_t event_type)
{
int is_active = ctx->is_active;
@@ -3878,7 +3784,7 @@ ctx_sched_in(struct perf_event_context *ctx,
if (is_active ^ EVENT_TIME) {
/* start ctx time */
__update_context_time(ctx, false);
- perf_cgroup_set_timestamp(task, ctx);
+ perf_cgroup_set_timestamp(cpuctx);
/*
* CPU-release for the below ->is_active store,
* see __load_acquire() in perf_event_time_now()
@@ -3909,12 +3815,11 @@ ctx_sched_in(struct perf_event_context *ctx,
}
static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx,
- enum event_type_t event_type,
- struct task_struct *task)
+ enum event_type_t event_type)
{
struct perf_event_context *ctx = &cpuctx->ctx;
- ctx_sched_in(ctx, cpuctx, event_type, task);
+ ctx_sched_in(ctx, cpuctx, event_type);
}
static void perf_event_context_sched_in(struct perf_event_context *ctx,
@@ -3956,7 +3861,7 @@ static void perf_event_context_sched_in(struct perf_event_context *ctx,
*/
if (!RB_EMPTY_ROOT(&ctx->pinned_groups.tree))
cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
- perf_event_sched_in(cpuctx, ctx, task);
+ perf_event_sched_in(cpuctx, ctx);
if (cpuctx->sched_cb_usage && pmu->sched_task)
pmu->sched_task(cpuctx->task_ctx, true);
@@ -3984,16 +3889,6 @@ void __perf_event_task_sched_in(struct task_struct *prev,
struct perf_event_context *ctx;
int ctxn;
- /*
- * If cgroup events exist on this CPU, then we need to check if we have
- * to switch in PMU state; cgroup event are system-wide mode only.
- *
- * Since cgroup events are CPU events, we must schedule these in before
- * we schedule in the task events.
- */
- if (atomic_read(this_cpu_ptr(&perf_cgroup_events)))
- perf_cgroup_sched_in(prev, task);
-
for_each_task_context_nr(ctxn) {
ctx = task->perf_event_ctxp[ctxn];
if (likely(!ctx))
@@ -4267,7 +4162,7 @@ static bool perf_rotate_context(struct perf_cpu_context *cpuctx)
if (cpu_event)
rotate_ctx(&cpuctx->ctx, cpu_event);
- perf_event_sched_in(cpuctx, task_ctx, current);
+ perf_event_sched_in(cpuctx, task_ctx);
perf_pmu_enable(cpuctx->ctx.pmu);
perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
@@ -4339,7 +4234,7 @@ static void perf_event_enable_on_exec(int ctxn)
clone_ctx = unclone_ctx(ctx);
ctx_resched(cpuctx, ctx, event_type);
} else {
- ctx_sched_in(ctx, cpuctx, EVENT_TIME, current);
+ ctx_sched_in(ctx, cpuctx, EVENT_TIME);
}
perf_ctx_unlock(cpuctx, ctx);
@@ -6352,7 +6247,7 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
again:
mutex_lock(&event->mmap_mutex);
if (event->rb) {
- if (event->rb->nr_pages != nr_pages) {
+ if (data_page_nr(event->rb) != nr_pages) {
ret = -EINVAL;
goto unlock;
}
@@ -6533,8 +6428,8 @@ static void perf_sigtrap(struct perf_event *event)
if (current->flags & PF_EXITING)
return;
- force_sig_perf((void __user *)event->pending_addr,
- event->attr.type, event->attr.sig_data);
+ send_sig_perf((void __user *)event->pending_addr,
+ event->attr.type, event->attr.sig_data);
}
static void perf_pending_event_disable(struct perf_event *event)
@@ -6746,7 +6641,6 @@ perf_output_sample_ustack(struct perf_output_handle *handle, u64 dump_size,
unsigned long sp;
unsigned int rem;
u64 dyn_size;
- mm_segment_t fs;
/*
* We dump:
@@ -6764,9 +6658,7 @@ perf_output_sample_ustack(struct perf_output_handle *handle, u64 dump_size,
/* Data. */
sp = perf_user_stack_pointer(regs);
- fs = force_uaccess_begin();
rem = __output_copy_user(handle, (void *) sp, dump_size);
- force_uaccess_end(fs);
dyn_size = dump_size - rem;
perf_output_skip(handle, rem);
@@ -11638,6 +11530,9 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
event->state = PERF_EVENT_STATE_INACTIVE;
+ if (parent_event)
+ event->event_caps = parent_event->event_caps;
+
if (event->attr.sigtrap)
atomic_set(&event->event_limit, 1);
@@ -12322,6 +12217,9 @@ SYSCALL_DEFINE5(perf_event_open,
* Do not allow to attach to a group in a different task
* or CPU context. If we're moving SW events, we'll fix
* this up later, so allow that.
+ *
+ * Racy, not holding group_leader->ctx->mutex, see comment with
+ * perf_event_ctx_lock().
*/
if (!move_group && group_leader->ctx != ctx)
goto err_context;
@@ -12387,6 +12285,7 @@ SYSCALL_DEFINE5(perf_event_open,
} else {
perf_event_ctx_unlock(group_leader, gctx);
move_group = 0;
+ goto not_move_group;
}
}
@@ -12403,7 +12302,17 @@ SYSCALL_DEFINE5(perf_event_open,
}
} else {
mutex_lock(&ctx->mutex);
+
+ /*
+ * Now that we hold ctx->lock, (re)validate group_leader->ctx == ctx,
+ * see the group_leader && !move_group test earlier.
+ */
+ if (group_leader && group_leader->ctx != ctx) {
+ err = -EINVAL;
+ goto err_locked;
+ }
}
+not_move_group:
if (ctx->task == TASK_TOMBSTONE) {
err = -ESRCH;
@@ -13565,7 +13474,7 @@ static int __perf_cgroup_move(void *info)
{
struct task_struct *task = info;
rcu_read_lock();
- perf_cgroup_switch(task, PERF_CGROUP_SWOUT | PERF_CGROUP_SWIN);
+ perf_cgroup_switch(task);
rcu_read_unlock();
return 0;
}
diff --git a/kernel/events/internal.h b/kernel/events/internal.h
index 082832738c8f..5150d5f84c03 100644
--- a/kernel/events/internal.h
+++ b/kernel/events/internal.h
@@ -116,6 +116,11 @@ static inline int page_order(struct perf_buffer *rb)
}
#endif
+static inline int data_page_nr(struct perf_buffer *rb)
+{
+ return rb->nr_pages << page_order(rb);
+}
+
static inline unsigned long perf_data_size(struct perf_buffer *rb)
{
return rb->nr_pages << (PAGE_SHIFT + page_order(rb));
diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c
index 52868716ec35..fb35b926024c 100644
--- a/kernel/events/ring_buffer.c
+++ b/kernel/events/ring_buffer.c
@@ -859,11 +859,6 @@ void rb_free(struct perf_buffer *rb)
}
#else
-static int data_page_nr(struct perf_buffer *rb)
-{
- return rb->nr_pages << page_order(rb);
-}
-
static struct page *
__perf_mmap_to_page(struct perf_buffer *rb, unsigned long pgoff)
{
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index 6418083901d4..a9bc3c98f76a 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -787,10 +787,10 @@ static int __copy_insn(struct address_space *mapping, struct file *filp,
struct page *page;
/*
* Ensure that the page that has the original instruction is populated
- * and in page-cache. If ->readpage == NULL it must be shmem_mapping(),
+ * and in page-cache. If ->read_folio == NULL it must be shmem_mapping(),
* see uprobe_register().
*/
- if (mapping->a_ops->readpage)
+ if (mapping->a_ops->read_folio)
page = read_mapping_page(mapping, offset >> PAGE_SHIFT, filp);
else
page = shmem_read_mapping_page(mapping, offset >> PAGE_SHIFT);
@@ -1143,7 +1143,8 @@ static int __uprobe_register(struct inode *inode, loff_t offset,
return -EINVAL;
/* copy_insn() uses read_mapping_page() or shmem_read_mapping_page() */
- if (!inode->i_mapping->a_ops->readpage && !shmem_mapping(inode->i_mapping))
+ if (!inode->i_mapping->a_ops->read_folio &&
+ !shmem_mapping(inode->i_mapping))
return -EIO;
/* Racy, just to catch the obvious mistakes */
if (offset > i_size_read(inode))
diff --git a/kernel/exit.c b/kernel/exit.c
index 192b90a9ce16..f072959fcab7 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -49,7 +49,8 @@
#include <linux/audit.h> /* for audit_free() */
#include <linux/resource.h>
#include <linux/task_io_accounting_ops.h>
-#include <linux/tracehook.h>
+#include <linux/blkdev.h>
+#include <linux/task_work.h>
#include <linux/fs_struct.h>
#include <linux/init_task.h>
#include <linux/perf_event.h>
@@ -64,6 +65,7 @@
#include <linux/compat.h>
#include <linux/io_uring.h>
#include <linux/kprobes.h>
+#include <linux/rethook.h>
#include <linux/uaccess.h>
#include <asm/unistd.h>
@@ -169,6 +171,7 @@ static void delayed_put_task_struct(struct rcu_head *rhp)
struct task_struct *tsk = container_of(rhp, struct task_struct, rcu);
kprobe_flush_task(tsk);
+ rethook_flush_task(tsk);
perf_event_delayed_put(tsk);
trace_sched_process_free(tsk);
put_task_struct(tsk);
@@ -737,20 +740,6 @@ void __noreturn do_exit(long code)
WARN_ON(tsk->plug);
- /*
- * If do_dead is called because this processes oopsed, it's possible
- * that get_fs() was left as KERNEL_DS, so reset it to USER_DS before
- * continuing. Amongst other possible reasons, this is to prevent
- * mm_release()->clear_child_tid() from writing to a user-controlled
- * kernel address.
- *
- * On uptodate architectures force_uaccess_begin is a noop. On
- * architectures that still have set_fs/get_fs in addition to handling
- * oopses handles kernel threads that run as set_fs(KERNEL_DS) by
- * default.
- */
- force_uaccess_begin();
-
kcov_task_exit(tsk);
coredump_task_exit(tsk);
@@ -907,7 +896,7 @@ SYSCALL_DEFINE1(exit, int, error_code)
* Take down every thread in the group. This is called by fatal signals
* as well as by sys_exit_group (below).
*/
-void
+void __noreturn
do_group_exit(int exit_code)
{
struct signal_struct *sig = current->signal;
diff --git a/kernel/extable.c b/kernel/extable.c
index b6f330f0fe74..bda5e9761541 100644
--- a/kernel/extable.c
+++ b/kernel/extable.c
@@ -3,6 +3,7 @@
Copyright (C) 2001 Rusty Russell, 2002 Rusty Russell IBM.
*/
+#include <linux/elf.h>
#include <linux/ftrace.h>
#include <linux/memory.h>
#include <linux/extable.h>
@@ -132,12 +133,33 @@ out:
}
/*
- * On some architectures (PPC64, IA64) function pointers
+ * On some architectures (PPC64, IA64, PARISC) function pointers
* are actually only tokens to some data that then holds the
* real function address. As a result, to find if a function
* pointer is part of the kernel text, we need to do some
* special dereferencing first.
*/
+#ifdef CONFIG_HAVE_FUNCTION_DESCRIPTORS
+void *dereference_function_descriptor(void *ptr)
+{
+ func_desc_t *desc = ptr;
+ void *p;
+
+ if (!get_kernel_nofault(p, (void *)&desc->addr))
+ ptr = p;
+ return ptr;
+}
+EXPORT_SYMBOL_GPL(dereference_function_descriptor);
+
+void *dereference_kernel_function_descriptor(void *ptr)
+{
+ if (ptr < (void *)__start_opd || ptr >= (void *)__end_opd)
+ return ptr;
+
+ return dereference_function_descriptor(ptr);
+}
+#endif
+
int func_ptr_is_kernel_text(void *ptr)
{
unsigned long addr;
diff --git a/kernel/fork.c b/kernel/fork.c
index d74c30b6733b..254ab63c1106 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -286,11 +286,13 @@ static int alloc_thread_stack_node(struct task_struct *tsk, int node)
if (!s)
continue;
- /* Mark stack accessible for KASAN. */
+ /* Reset stack metadata. */
kasan_unpoison_range(s->addr, THREAD_SIZE);
+ stack = kasan_reset_tag(s->addr);
+
/* Clear stale pointers from reused stack. */
- memset(s->addr, 0, THREAD_SIZE);
+ memset(stack, 0, THREAD_SIZE);
if (memcg_charge_kernel_stack(s)) {
vfree(s->addr);
@@ -298,7 +300,7 @@ static int alloc_thread_stack_node(struct task_struct *tsk, int node)
}
tsk->stack_vm_area = s;
- tsk->stack = s->addr;
+ tsk->stack = stack;
return 0;
}
@@ -326,6 +328,7 @@ static int alloc_thread_stack_node(struct task_struct *tsk, int node)
* so cache the vm_struct.
*/
tsk->stack_vm_area = vm;
+ stack = kasan_reset_tag(stack);
tsk->stack = stack;
return 0;
}
@@ -789,6 +792,7 @@ void __mmdrop(struct mm_struct *mm)
mmu_notifier_subscriptions_destroy(mm);
check_mm(mm);
put_user_ns(mm->user_ns);
+ mm_pasid_drop(mm);
free_mm(mm);
}
EXPORT_SYMBOL_GPL(__mmdrop);
@@ -1042,6 +1046,11 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
#ifdef CONFIG_MEMCG
tsk->active_memcg = NULL;
#endif
+
+#ifdef CONFIG_CPU_SUP_INTEL
+ tsk->reported_split_lock = 0;
+#endif
+
return tsk;
free_stack:
@@ -1187,7 +1196,6 @@ static inline void __mmput(struct mm_struct *mm)
}
if (mm->binfmt)
module_put(mm->binfmt->module);
- mm_pasid_drop(mm);
mmdrop(mm);
}
@@ -2322,6 +2330,9 @@ static __latent_entropy struct task_struct *copy_process(
#ifdef CONFIG_KRETPROBES
p->kretprobe_instances.first = NULL;
#endif
+#ifdef CONFIG_RETHOOK
+ p->rethooks.first = NULL;
+#endif
/*
* Ensure that the cgroup subsystem policies allow the new process to be
diff --git a/kernel/futex/pi.c b/kernel/futex/pi.c
index 183b28c32c83..ce2889f12375 100644
--- a/kernel/futex/pi.c
+++ b/kernel/futex/pi.c
@@ -1005,7 +1005,7 @@ retry_private:
rt_mutex_init_waiter(&rt_waiter);
/*
- * On PREEMPT_RT_FULL, when hb->lock becomes an rt_mutex, we must not
+ * On PREEMPT_RT, when hb->lock becomes an rt_mutex, we must not
* hold it while doing rt_mutex_start_proxy(), because then it will
* include hb->lock in the blocking chain, even through we'll not in
* fact hold it while blocking. This will lead it to report -EDEADLK
diff --git a/kernel/irq/affinity.c b/kernel/irq/affinity.c
index f7ff8919dc9b..d9a5c1d65a79 100644
--- a/kernel/irq/affinity.c
+++ b/kernel/irq/affinity.c
@@ -258,7 +258,7 @@ static int __irq_build_affinity_masks(unsigned int startvec,
nodemask_t nodemsk = NODE_MASK_NONE;
struct node_vectors *node_vectors;
- if (!cpumask_weight(cpu_mask))
+ if (cpumask_empty(cpu_mask))
return 0;
nodes = get_nodes_in_cpumask(node_to_cpumask, cpu_mask, &nodemsk);
@@ -269,8 +269,9 @@ static int __irq_build_affinity_masks(unsigned int startvec,
*/
if (numvecs <= nodes) {
for_each_node_mask(n, nodemsk) {
- cpumask_or(&masks[curvec].mask, &masks[curvec].mask,
- node_to_cpumask[n]);
+ /* Ensure that only CPUs which are in both masks are set */
+ cpumask_and(nmsk, cpu_mask, node_to_cpumask[n]);
+ cpumask_or(&masks[curvec].mask, &masks[curvec].mask, nmsk);
if (++curvec == last_affv)
curvec = firstvec;
}
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index 54af0deb239b..e6b8e564b37f 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -1573,17 +1573,12 @@ static struct device *irq_get_parent_device(struct irq_data *data)
int irq_chip_pm_get(struct irq_data *data)
{
struct device *dev = irq_get_parent_device(data);
- int retval;
+ int retval = 0;
- if (IS_ENABLED(CONFIG_PM) && dev) {
- retval = pm_runtime_get_sync(dev);
- if (retval < 0) {
- pm_runtime_put_noidle(dev);
- return retval;
- }
- }
+ if (IS_ENABLED(CONFIG_PM) && dev)
+ retval = pm_runtime_resume_and_get(dev);
- return 0;
+ return retval;
}
/**
diff --git a/kernel/irq/debugfs.c b/kernel/irq/debugfs.c
index 2b43f5f5033d..bc8e40cf2b65 100644
--- a/kernel/irq/debugfs.c
+++ b/kernel/irq/debugfs.c
@@ -58,6 +58,7 @@ static const struct irq_bit_descr irqchip_flags[] = {
BIT_MASK_DESCR(IRQCHIP_SUPPORTS_LEVEL_MSI),
BIT_MASK_DESCR(IRQCHIP_SUPPORTS_NMI),
BIT_MASK_DESCR(IRQCHIP_ENABLE_WAKEUP_ON_SUSPEND),
+ BIT_MASK_DESCR(IRQCHIP_IMMUTABLE),
};
static void
diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h
index 99cbdf55a8bd..f09c60393e55 100644
--- a/kernel/irq/internals.h
+++ b/kernel/irq/internals.h
@@ -29,12 +29,14 @@ extern struct irqaction chained_action;
* IRQTF_WARNED - warning "IRQ_WAKE_THREAD w/o thread_fn" has been printed
* IRQTF_AFFINITY - irq thread is requested to adjust affinity
* IRQTF_FORCED_THREAD - irq action is force threaded
+ * IRQTF_READY - signals that irq thread is ready
*/
enum {
IRQTF_RUNTHREAD,
IRQTF_WARNED,
IRQTF_AFFINITY,
IRQTF_FORCED_THREAD,
+ IRQTF_READY,
};
/*
diff --git a/kernel/irq/irq_sim.c b/kernel/irq/irq_sim.c
index 0cd02efa3a74..dd76323ea3fd 100644
--- a/kernel/irq/irq_sim.c
+++ b/kernel/irq/irq_sim.c
@@ -181,7 +181,7 @@ struct irq_domain *irq_domain_create_sim(struct fwnode_handle *fwnode,
goto err_free_bitmap;
work_ctx->irq_count = num_irqs;
- init_irq_work(&work_ctx->work, irq_sim_handle_irq);
+ work_ctx->work = IRQ_WORK_INIT_HARD(irq_sim_handle_irq);
return work_ctx->domain;
diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c
index 939d21cd55c3..d323b180b0f3 100644
--- a/kernel/irq/irqdesc.c
+++ b/kernel/irq/irqdesc.c
@@ -407,6 +407,7 @@ static struct irq_desc *alloc_desc(int irq, int node, unsigned int flags,
lockdep_set_class(&desc->lock, &irq_desc_lock_class);
mutex_init(&desc->request_mutex);
init_rcu_head(&desc->rcu);
+ init_waitqueue_head(&desc->wait_for_threads);
desc_set_defaults(irq, desc, node, affinity, owner);
irqd_set(&desc->irq_data, flags);
@@ -575,6 +576,7 @@ int __init early_irq_init(void)
raw_spin_lock_init(&desc[i].lock);
lockdep_set_class(&desc[i].lock, &irq_desc_lock_class);
mutex_init(&desc[i].request_mutex);
+ init_waitqueue_head(&desc[i].wait_for_threads);
desc_set_defaults(i, &desc[i], node, NULL, NULL);
}
return arch_early_irq_init();
@@ -699,7 +701,6 @@ EXPORT_SYMBOL_GPL(generic_handle_irq_safe);
*/
int generic_handle_domain_irq(struct irq_domain *domain, unsigned int hwirq)
{
- WARN_ON_ONCE(!in_hardirq());
return handle_irq_desc(irq_resolve_mapping(domain, hwirq));
}
EXPORT_SYMBOL_GPL(generic_handle_domain_irq);
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index c03f71d5ec10..8c396319d5ac 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -222,11 +222,16 @@ int irq_do_set_affinity(struct irq_data *data, const struct cpumask *mask,
{
struct irq_desc *desc = irq_data_to_desc(data);
struct irq_chip *chip = irq_data_get_irq_chip(data);
+ const struct cpumask *prog_mask;
int ret;
+ static DEFINE_RAW_SPINLOCK(tmp_mask_lock);
+ static struct cpumask tmp_mask;
+
if (!chip || !chip->irq_set_affinity)
return -EINVAL;
+ raw_spin_lock(&tmp_mask_lock);
/*
* If this is a managed interrupt and housekeeping is enabled on
* it check whether the requested affinity mask intersects with
@@ -248,24 +253,34 @@ int irq_do_set_affinity(struct irq_data *data, const struct cpumask *mask,
*/
if (irqd_affinity_is_managed(data) &&
housekeeping_enabled(HK_TYPE_MANAGED_IRQ)) {
- const struct cpumask *hk_mask, *prog_mask;
-
- static DEFINE_RAW_SPINLOCK(tmp_mask_lock);
- static struct cpumask tmp_mask;
+ const struct cpumask *hk_mask;
hk_mask = housekeeping_cpumask(HK_TYPE_MANAGED_IRQ);
- raw_spin_lock(&tmp_mask_lock);
cpumask_and(&tmp_mask, mask, hk_mask);
if (!cpumask_intersects(&tmp_mask, cpu_online_mask))
prog_mask = mask;
else
prog_mask = &tmp_mask;
- ret = chip->irq_set_affinity(data, prog_mask, force);
- raw_spin_unlock(&tmp_mask_lock);
} else {
- ret = chip->irq_set_affinity(data, mask, force);
+ prog_mask = mask;
}
+
+ /*
+ * Make sure we only provide online CPUs to the irqchip,
+ * unless we are being asked to force the affinity (in which
+ * case we do as we are told).
+ */
+ cpumask_and(&tmp_mask, prog_mask, cpu_online_mask);
+ if (!force && !cpumask_empty(&tmp_mask))
+ ret = chip->irq_set_affinity(data, &tmp_mask, force);
+ else if (force)
+ ret = chip->irq_set_affinity(data, mask, force);
+ else
+ ret = -EINVAL;
+
+ raw_spin_unlock(&tmp_mask_lock);
+
switch (ret) {
case IRQ_SET_MASK_OK:
case IRQ_SET_MASK_OK_DONE:
@@ -1249,6 +1264,31 @@ static void irq_wake_secondary(struct irq_desc *desc, struct irqaction *action)
}
/*
+ * Internal function to notify that a interrupt thread is ready.
+ */
+static void irq_thread_set_ready(struct irq_desc *desc,
+ struct irqaction *action)
+{
+ set_bit(IRQTF_READY, &action->thread_flags);
+ wake_up(&desc->wait_for_threads);
+}
+
+/*
+ * Internal function to wake up a interrupt thread and wait until it is
+ * ready.
+ */
+static void wake_up_and_wait_for_irq_thread_ready(struct irq_desc *desc,
+ struct irqaction *action)
+{
+ if (!action || !action->thread)
+ return;
+
+ wake_up_process(action->thread);
+ wait_event(desc->wait_for_threads,
+ test_bit(IRQTF_READY, &action->thread_flags));
+}
+
+/*
* Interrupt handler thread
*/
static int irq_thread(void *data)
@@ -1259,6 +1299,8 @@ static int irq_thread(void *data)
irqreturn_t (*handler_fn)(struct irq_desc *desc,
struct irqaction *action);
+ irq_thread_set_ready(desc, action);
+
sched_set_fifo(current);
if (force_irqthreads() && test_bit(IRQTF_FORCED_THREAD,
@@ -1683,8 +1725,6 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
}
if (!shared) {
- init_waitqueue_head(&desc->wait_for_threads);
-
/* Setup the type (level, edge polarity) if configured: */
if (new->flags & IRQF_TRIGGER_MASK) {
ret = __irq_set_trigger(desc,
@@ -1780,14 +1820,8 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
irq_setup_timings(desc, new);
- /*
- * Strictly no need to wake it up, but hung_task complains
- * when no hard interrupt wakes the thread up.
- */
- if (new->thread)
- wake_up_process(new->thread);
- if (new->secondary)
- wake_up_process(new->secondary->thread);
+ wake_up_and_wait_for_irq_thread_ready(desc, new);
+ wake_up_and_wait_for_irq_thread_ready(desc, new->secondary);
register_irq_proc(irq, desc);
new->dir = NULL;
diff --git a/kernel/irq/matrix.c b/kernel/irq/matrix.c
index bbfb26489aa1..1698e77645ac 100644
--- a/kernel/irq/matrix.c
+++ b/kernel/irq/matrix.c
@@ -286,7 +286,7 @@ void irq_matrix_remove_managed(struct irq_matrix *m, const struct cpumask *msk)
int irq_matrix_alloc_managed(struct irq_matrix *m, const struct cpumask *msk,
unsigned int *mapped_cpu)
{
- unsigned int bit, cpu, end = m->alloc_end;
+ unsigned int bit, cpu, end;
struct cpumap *cm;
if (cpumask_empty(msk))
diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c
index 2bdfce5edafd..a9ee535293eb 100644
--- a/kernel/irq/msi.c
+++ b/kernel/irq/msi.c
@@ -818,6 +818,21 @@ static int msi_init_virq(struct irq_domain *domain, int virq, unsigned int vflag
irqd_clr_can_reserve(irqd);
if (vflags & VIRQ_NOMASK_QUIRK)
irqd_set_msi_nomask_quirk(irqd);
+
+ /*
+ * If the interrupt is managed but no CPU is available to
+ * service it, shut it down until better times. Note that
+ * we only do this on the !RESERVE path as x86 (the only
+ * architecture using this flag) deals with this in a
+ * different way by using a catch-all vector.
+ */
+ if ((vflags & VIRQ_ACTIVATE) &&
+ irqd_affinity_is_managed(irqd) &&
+ !cpumask_intersects(irq_data_get_affinity_mask(irqd),
+ cpu_online_mask)) {
+ irqd_set_managed_shutdown(irqd);
+ return 0;
+ }
}
if (!(vflags & VIRQ_ACTIVATE))
diff --git a/kernel/irq_work.c b/kernel/irq_work.c
index f7df715ec28e..7afa40fe5cc4 100644
--- a/kernel/irq_work.c
+++ b/kernel/irq_work.c
@@ -137,7 +137,7 @@ bool irq_work_queue_on(struct irq_work *work, int cpu)
if (!irq_work_claim(work))
return false;
- kasan_record_aux_stack(work);
+ kasan_record_aux_stack_noalloc(work);
preempt_disable();
if (cpu != smp_processor_id()) {
diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c
index 951c93216fc4..79f2eb617a62 100644
--- a/kernel/kallsyms.c
+++ b/kernel/kallsyms.c
@@ -212,6 +212,10 @@ unsigned long kallsyms_lookup_name(const char *name)
unsigned long i;
unsigned int off;
+ /* Skip the search for empty string. */
+ if (!*name)
+ return 0;
+
for (i = 0, off = 0; i < kallsyms_num_syms; i++) {
off = kallsyms_expand_symbol(off, namebuf, ARRAY_SIZE(namebuf));
diff --git a/kernel/kcov.c b/kernel/kcov.c
index 36ca640c4f8e..b3732b210593 100644
--- a/kernel/kcov.c
+++ b/kernel/kcov.c
@@ -459,37 +459,31 @@ void kcov_task_exit(struct task_struct *t)
static int kcov_mmap(struct file *filep, struct vm_area_struct *vma)
{
int res = 0;
- void *area;
struct kcov *kcov = vma->vm_file->private_data;
unsigned long size, off;
struct page *page;
unsigned long flags;
- area = vmalloc_user(vma->vm_end - vma->vm_start);
- if (!area)
- return -ENOMEM;
-
spin_lock_irqsave(&kcov->lock, flags);
size = kcov->size * sizeof(unsigned long);
- if (kcov->mode != KCOV_MODE_INIT || vma->vm_pgoff != 0 ||
+ if (kcov->area == NULL || vma->vm_pgoff != 0 ||
vma->vm_end - vma->vm_start != size) {
res = -EINVAL;
goto exit;
}
- if (!kcov->area) {
- kcov->area = area;
- vma->vm_flags |= VM_DONTEXPAND;
- spin_unlock_irqrestore(&kcov->lock, flags);
- for (off = 0; off < size; off += PAGE_SIZE) {
- page = vmalloc_to_page(kcov->area + off);
- if (vm_insert_page(vma, vma->vm_start + off, page))
- WARN_ONCE(1, "vm_insert_page() failed");
+ spin_unlock_irqrestore(&kcov->lock, flags);
+ vma->vm_flags |= VM_DONTEXPAND;
+ for (off = 0; off < size; off += PAGE_SIZE) {
+ page = vmalloc_to_page(kcov->area + off);
+ res = vm_insert_page(vma, vma->vm_start + off, page);
+ if (res) {
+ pr_warn_once("kcov: vm_insert_page() failed\n");
+ return res;
}
- return 0;
}
+ return 0;
exit:
spin_unlock_irqrestore(&kcov->lock, flags);
- vfree(area);
return res;
}
@@ -564,31 +558,12 @@ static int kcov_ioctl_locked(struct kcov *kcov, unsigned int cmd,
unsigned long arg)
{
struct task_struct *t;
- unsigned long size, unused;
+ unsigned long flags, unused;
int mode, i;
struct kcov_remote_arg *remote_arg;
struct kcov_remote *remote;
- unsigned long flags;
switch (cmd) {
- case KCOV_INIT_TRACE:
- /*
- * Enable kcov in trace mode and setup buffer size.
- * Must happen before anything else.
- */
- if (kcov->mode != KCOV_MODE_DISABLED)
- return -EBUSY;
- /*
- * Size must be at least 2 to hold current position and one PC.
- * Later we allocate size * sizeof(unsigned long) memory,
- * that must not overflow.
- */
- size = arg;
- if (size < 2 || size > INT_MAX / sizeof(unsigned long))
- return -EINVAL;
- kcov->size = size;
- kcov->mode = KCOV_MODE_INIT;
- return 0;
case KCOV_ENABLE:
/*
* Enable coverage for the current task.
@@ -692,9 +667,37 @@ static long kcov_ioctl(struct file *filep, unsigned int cmd, unsigned long arg)
struct kcov_remote_arg *remote_arg = NULL;
unsigned int remote_num_handles;
unsigned long remote_arg_size;
- unsigned long flags;
+ unsigned long size, flags;
+ void *area;
- if (cmd == KCOV_REMOTE_ENABLE) {
+ kcov = filep->private_data;
+ switch (cmd) {
+ case KCOV_INIT_TRACE:
+ /*
+ * Enable kcov in trace mode and setup buffer size.
+ * Must happen before anything else.
+ *
+ * First check the size argument - it must be at least 2
+ * to hold the current position and one PC.
+ */
+ size = arg;
+ if (size < 2 || size > INT_MAX / sizeof(unsigned long))
+ return -EINVAL;
+ area = vmalloc_user(size * sizeof(unsigned long));
+ if (area == NULL)
+ return -ENOMEM;
+ spin_lock_irqsave(&kcov->lock, flags);
+ if (kcov->mode != KCOV_MODE_DISABLED) {
+ spin_unlock_irqrestore(&kcov->lock, flags);
+ vfree(area);
+ return -EBUSY;
+ }
+ kcov->area = area;
+ kcov->size = size;
+ kcov->mode = KCOV_MODE_INIT;
+ spin_unlock_irqrestore(&kcov->lock, flags);
+ return 0;
+ case KCOV_REMOTE_ENABLE:
if (get_user(remote_num_handles, (unsigned __user *)(arg +
offsetof(struct kcov_remote_arg, num_handles))))
return -EFAULT;
@@ -710,16 +713,18 @@ static long kcov_ioctl(struct file *filep, unsigned int cmd, unsigned long arg)
return -EINVAL;
}
arg = (unsigned long)remote_arg;
+ fallthrough;
+ default:
+ /*
+ * All other commands can be normally executed under a spin lock, so we
+ * obtain and release it here in order to simplify kcov_ioctl_locked().
+ */
+ spin_lock_irqsave(&kcov->lock, flags);
+ res = kcov_ioctl_locked(kcov, cmd, arg);
+ spin_unlock_irqrestore(&kcov->lock, flags);
+ kfree(remote_arg);
+ return res;
}
-
- kcov = filep->private_data;
- spin_lock_irqsave(&kcov->lock, flags);
- res = kcov_ioctl_locked(kcov, cmd, arg);
- spin_unlock_irqrestore(&kcov->lock, flags);
-
- kfree(remote_arg);
-
- return res;
}
static const struct file_operations kcov_fops = {
diff --git a/kernel/kcsan/kcsan_test.c b/kernel/kcsan/kcsan_test.c
index a36fca063a73..767dfacd6ed3 100644
--- a/kernel/kcsan/kcsan_test.c
+++ b/kernel/kcsan/kcsan_test.c
@@ -1380,13 +1380,14 @@ static const void *nthreads_gen_params(const void *prev, char *desc)
else
nthreads *= 2;
- if (!IS_ENABLED(CONFIG_PREEMPT) || !IS_ENABLED(CONFIG_KCSAN_INTERRUPT_WATCHER)) {
+ if (!preempt_model_preemptible() ||
+ !IS_ENABLED(CONFIG_KCSAN_INTERRUPT_WATCHER)) {
/*
* Without any preemption, keep 2 CPUs free for other tasks, one
* of which is the main test case function checking for
* completion or failure.
*/
- const long min_unused_cpus = IS_ENABLED(CONFIG_PREEMPT_NONE) ? 2 : 0;
+ const long min_unused_cpus = preempt_model_none() ? 2 : 0;
const long min_required_cpus = 2 + min_unused_cpus;
if (num_online_cpus() < min_required_cpus) {
diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c
index 68480f731192..be4b54c2c615 100644
--- a/kernel/kexec_core.c
+++ b/kernel/kexec_core.c
@@ -1078,7 +1078,7 @@ void crash_save_cpu(struct pt_regs *regs, int cpu)
return;
memset(&prstatus, 0, sizeof(prstatus));
prstatus.common.pr_pid = current->pid;
- elf_core_copy_kernel_regs(&prstatus.pr_reg, regs);
+ elf_core_copy_regs(&prstatus.pr_reg, regs);
buf = append_elf_note(buf, KEXEC_CORE_NOTE_NAME, NT_PRSTATUS,
&prstatus, sizeof(prstatus));
final_note(buf);
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index 94cab8c9ce56..dd58c0be9ce2 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -1237,6 +1237,27 @@ void kprobes_inc_nmissed_count(struct kprobe *p)
}
NOKPROBE_SYMBOL(kprobes_inc_nmissed_count);
+static struct kprobe kprobe_busy = {
+ .addr = (void *) get_kprobe,
+};
+
+void kprobe_busy_begin(void)
+{
+ struct kprobe_ctlblk *kcb;
+
+ preempt_disable();
+ __this_cpu_write(current_kprobe, &kprobe_busy);
+ kcb = get_kprobe_ctlblk();
+ kcb->kprobe_status = KPROBE_HIT_ACTIVE;
+}
+
+void kprobe_busy_end(void)
+{
+ __this_cpu_write(current_kprobe, NULL);
+ preempt_enable();
+}
+
+#if !defined(CONFIG_KRETPROBE_ON_RETHOOK)
static void free_rp_inst_rcu(struct rcu_head *head)
{
struct kretprobe_instance *ri = container_of(head, struct kretprobe_instance, rcu);
@@ -1258,26 +1279,6 @@ static void recycle_rp_inst(struct kretprobe_instance *ri)
}
NOKPROBE_SYMBOL(recycle_rp_inst);
-static struct kprobe kprobe_busy = {
- .addr = (void *) get_kprobe,
-};
-
-void kprobe_busy_begin(void)
-{
- struct kprobe_ctlblk *kcb;
-
- preempt_disable();
- __this_cpu_write(current_kprobe, &kprobe_busy);
- kcb = get_kprobe_ctlblk();
- kcb->kprobe_status = KPROBE_HIT_ACTIVE;
-}
-
-void kprobe_busy_end(void)
-{
- __this_cpu_write(current_kprobe, NULL);
- preempt_enable();
-}
-
/*
* This function is called from delayed_put_task_struct() when a task is
* dead and cleaned up to recycle any kretprobe instances associated with
@@ -1327,6 +1328,7 @@ static inline void free_rp_inst(struct kretprobe *rp)
rp->rph = NULL;
}
}
+#endif /* !CONFIG_KRETPROBE_ON_RETHOOK */
/* Add the new probe to 'ap->list'. */
static int add_new_kprobe(struct kprobe *ap, struct kprobe *p)
@@ -1489,24 +1491,68 @@ bool within_kprobe_blacklist(unsigned long addr)
}
/*
+ * arch_adjust_kprobe_addr - adjust the address
+ * @addr: symbol base address
+ * @offset: offset within the symbol
+ * @on_func_entry: was this @addr+@offset on the function entry
+ *
+ * Typically returns @addr + @offset, except for special cases where the
+ * function might be prefixed by a CFI landing pad, in that case any offset
+ * inside the landing pad is mapped to the first 'real' instruction of the
+ * symbol.
+ *
+ * Specifically, for things like IBT/BTI, skip the resp. ENDBR/BTI.C
+ * instruction at +0.
+ */
+kprobe_opcode_t *__weak arch_adjust_kprobe_addr(unsigned long addr,
+ unsigned long offset,
+ bool *on_func_entry)
+{
+ *on_func_entry = !offset;
+ return (kprobe_opcode_t *)(addr + offset);
+}
+
+/*
* If 'symbol_name' is specified, look it up and add the 'offset'
* to it. This way, we can specify a relative address to a symbol.
* This returns encoded errors if it fails to look up symbol or invalid
* combination of parameters.
*/
-static kprobe_opcode_t *_kprobe_addr(kprobe_opcode_t *addr,
- const char *symbol_name, unsigned int offset)
+static kprobe_opcode_t *
+_kprobe_addr(kprobe_opcode_t *addr, const char *symbol_name,
+ unsigned long offset, bool *on_func_entry)
{
if ((symbol_name && addr) || (!symbol_name && !addr))
goto invalid;
if (symbol_name) {
+ /*
+ * Input: @sym + @offset
+ * Output: @addr + @offset
+ *
+ * NOTE: kprobe_lookup_name() does *NOT* fold the offset
+ * argument into it's output!
+ */
addr = kprobe_lookup_name(symbol_name, offset);
if (!addr)
return ERR_PTR(-ENOENT);
}
- addr = (kprobe_opcode_t *)(((char *)addr) + offset);
+ /*
+ * So here we have @addr + @offset, displace it into a new
+ * @addr' + @offset' where @addr' is the symbol start address.
+ */
+ addr = (void *)addr + offset;
+ if (!kallsyms_lookup_size_offset((unsigned long)addr, NULL, &offset))
+ return ERR_PTR(-ENOENT);
+ addr = (void *)addr - offset;
+
+ /*
+ * Then ask the architecture to re-combine them, taking care of
+ * magical function entry details while telling us if this was indeed
+ * at the start of the function.
+ */
+ addr = arch_adjust_kprobe_addr((unsigned long)addr, offset, on_func_entry);
if (addr)
return addr;
@@ -1516,7 +1562,8 @@ invalid:
static kprobe_opcode_t *kprobe_addr(struct kprobe *p)
{
- return _kprobe_addr(p->addr, p->symbol_name, p->offset);
+ bool on_func_entry;
+ return _kprobe_addr(p->addr, p->symbol_name, p->offset, &on_func_entry);
}
/*
@@ -1562,14 +1609,10 @@ static inline int warn_kprobe_rereg(struct kprobe *p)
static int check_ftrace_location(struct kprobe *p)
{
- unsigned long ftrace_addr;
+ unsigned long addr = (unsigned long)p->addr;
- ftrace_addr = ftrace_location((unsigned long)p->addr);
- if (ftrace_addr) {
+ if (ftrace_location(addr) == addr) {
#ifdef CONFIG_KPROBES_ON_FTRACE
- /* Given address is not on the instruction boundary */
- if ((unsigned long)p->addr != ftrace_addr)
- return -EILSEQ;
p->flags |= KPROBE_FLAG_FTRACE;
#else /* !CONFIG_KPROBES_ON_FTRACE */
return -EINVAL;
@@ -1884,6 +1927,7 @@ static struct notifier_block kprobe_exceptions_nb = {
#ifdef CONFIG_KRETPROBES
+#if !defined(CONFIG_KRETPROBE_ON_RETHOOK)
/* This assumes the 'tsk' is the current task or the is not running. */
static kprobe_opcode_t *__kretprobe_find_ret_addr(struct task_struct *tsk,
struct llist_node **cur)
@@ -2046,11 +2090,57 @@ static int pre_handler_kretprobe(struct kprobe *p, struct pt_regs *regs)
return 0;
}
NOKPROBE_SYMBOL(pre_handler_kretprobe);
+#else /* CONFIG_KRETPROBE_ON_RETHOOK */
+/*
+ * This kprobe pre_handler is registered with every kretprobe. When probe
+ * hits it will set up the return probe.
+ */
+static int pre_handler_kretprobe(struct kprobe *p, struct pt_regs *regs)
+{
+ struct kretprobe *rp = container_of(p, struct kretprobe, kp);
+ struct kretprobe_instance *ri;
+ struct rethook_node *rhn;
+
+ rhn = rethook_try_get(rp->rh);
+ if (!rhn) {
+ rp->nmissed++;
+ return 0;
+ }
+
+ ri = container_of(rhn, struct kretprobe_instance, node);
+
+ if (rp->entry_handler && rp->entry_handler(ri, regs))
+ rethook_recycle(rhn);
+ else
+ rethook_hook(rhn, regs, kprobe_ftrace(p));
+
+ return 0;
+}
+NOKPROBE_SYMBOL(pre_handler_kretprobe);
-bool __weak arch_kprobe_on_func_entry(unsigned long offset)
+static void kretprobe_rethook_handler(struct rethook_node *rh, void *data,
+ struct pt_regs *regs)
{
- return !offset;
+ struct kretprobe *rp = (struct kretprobe *)data;
+ struct kretprobe_instance *ri;
+ struct kprobe_ctlblk *kcb;
+
+ /* The data must NOT be null. This means rethook data structure is broken. */
+ if (WARN_ON_ONCE(!data) || !rp->handler)
+ return;
+
+ __this_cpu_write(current_kprobe, &rp->kp);
+ kcb = get_kprobe_ctlblk();
+ kcb->kprobe_status = KPROBE_HIT_ACTIVE;
+
+ ri = container_of(rh, struct kretprobe_instance, node);
+ rp->handler(ri, regs);
+
+ __this_cpu_write(current_kprobe, NULL);
}
+NOKPROBE_SYMBOL(kretprobe_rethook_handler);
+
+#endif /* !CONFIG_KRETPROBE_ON_RETHOOK */
/**
* kprobe_on_func_entry() -- check whether given address is function entry
@@ -2067,15 +2157,13 @@ bool __weak arch_kprobe_on_func_entry(unsigned long offset)
*/
int kprobe_on_func_entry(kprobe_opcode_t *addr, const char *sym, unsigned long offset)
{
- kprobe_opcode_t *kp_addr = _kprobe_addr(addr, sym, offset);
+ bool on_func_entry;
+ kprobe_opcode_t *kp_addr = _kprobe_addr(addr, sym, offset, &on_func_entry);
if (IS_ERR(kp_addr))
return PTR_ERR(kp_addr);
- if (!kallsyms_lookup_size_offset((unsigned long)kp_addr, NULL, &offset))
- return -ENOENT;
-
- if (!arch_kprobe_on_func_entry(offset))
+ if (!on_func_entry)
return -EINVAL;
return 0;
@@ -2121,6 +2209,29 @@ int register_kretprobe(struct kretprobe *rp)
rp->maxactive = num_possible_cpus();
#endif
}
+#ifdef CONFIG_KRETPROBE_ON_RETHOOK
+ rp->rh = rethook_alloc((void *)rp, kretprobe_rethook_handler);
+ if (!rp->rh)
+ return -ENOMEM;
+
+ for (i = 0; i < rp->maxactive; i++) {
+ inst = kzalloc(sizeof(struct kretprobe_instance) +
+ rp->data_size, GFP_KERNEL);
+ if (inst == NULL) {
+ rethook_free(rp->rh);
+ rp->rh = NULL;
+ return -ENOMEM;
+ }
+ rethook_add_node(rp->rh, &inst->node);
+ }
+ rp->nmissed = 0;
+ /* Establish function entry probe point */
+ ret = register_kprobe(&rp->kp);
+ if (ret != 0) {
+ rethook_free(rp->rh);
+ rp->rh = NULL;
+ }
+#else /* !CONFIG_KRETPROBE_ON_RETHOOK */
rp->freelist.head = NULL;
rp->rph = kzalloc(sizeof(struct kretprobe_holder), GFP_KERNEL);
if (!rp->rph)
@@ -2145,6 +2256,7 @@ int register_kretprobe(struct kretprobe *rp)
ret = register_kprobe(&rp->kp);
if (ret != 0)
free_rp_inst(rp);
+#endif
return ret;
}
EXPORT_SYMBOL_GPL(register_kretprobe);
@@ -2183,7 +2295,11 @@ void unregister_kretprobes(struct kretprobe **rps, int num)
for (i = 0; i < num; i++) {
if (__unregister_kprobe_top(&rps[i]->kp) < 0)
rps[i]->kp.addr = NULL;
+#ifdef CONFIG_KRETPROBE_ON_RETHOOK
+ rethook_free(rps[i]->rh);
+#else
rps[i]->rph->rp = NULL;
+#endif
}
mutex_unlock(&kprobe_mutex);
@@ -2191,7 +2307,9 @@ void unregister_kretprobes(struct kretprobe **rps, int num)
for (i = 0; i < num; i++) {
if (rps[i]->kp.addr) {
__unregister_kprobe_bottom(&rps[i]->kp);
+#ifndef CONFIG_KRETPROBE_ON_RETHOOK
free_rp_inst(rps[i]);
+#endif
}
}
}
diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c
index 35859da8bd4f..b1292a57c2a5 100644
--- a/kernel/ksysfs.c
+++ b/kernel/ksysfs.c
@@ -24,8 +24,7 @@
static struct kobj_attribute _name##_attr = __ATTR_RO(_name)
#define KERNEL_ATTR_RW(_name) \
-static struct kobj_attribute _name##_attr = \
- __ATTR(_name, 0644, _name##_show, _name##_store)
+static struct kobj_attribute _name##_attr = __ATTR_RW(_name)
/* current uevent sequence number */
static ssize_t uevent_seqnum_show(struct kobject *kobj,
diff --git a/kernel/kthread.c b/kernel/kthread.c
index d100d5a15b38..544fd4097406 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -55,7 +55,6 @@ struct kthread {
int result;
int (*threadfn)(void *);
void *data;
- mm_segment_t oldfs;
struct completion parked;
struct completion exited;
#ifdef CONFIG_BLK_CGROUP
@@ -1441,8 +1440,6 @@ void kthread_use_mm(struct mm_struct *mm)
mmdrop(active_mm);
else
smp_mb();
-
- to_kthread(tsk)->oldfs = force_uaccess_begin();
}
EXPORT_SYMBOL_GPL(kthread_use_mm);
@@ -1457,8 +1454,6 @@ void kthread_unuse_mm(struct mm_struct *mm)
WARN_ON_ONCE(!(tsk->flags & PF_KTHREAD));
WARN_ON_ONCE(!tsk->mm);
- force_uaccess_end(to_kthread(tsk)->oldfs);
-
task_lock(tsk);
/*
* When a kthread stops operating on an address space, the loop
@@ -1527,5 +1522,4 @@ struct cgroup_subsys_state *kthread_blkcg(void)
}
return NULL;
}
-EXPORT_SYMBOL(kthread_blkcg);
#endif
diff --git a/kernel/livepatch/core.c b/kernel/livepatch/core.c
index 585494ec464f..bc475e62279d 100644
--- a/kernel/livepatch/core.c
+++ b/kernel/livepatch/core.c
@@ -190,7 +190,7 @@ static int klp_find_object_symbol(const char *objname, const char *name,
return -EINVAL;
}
-static int klp_resolve_symbols(Elf64_Shdr *sechdrs, const char *strtab,
+static int klp_resolve_symbols(Elf_Shdr *sechdrs, const char *strtab,
unsigned int symndx, Elf_Shdr *relasec,
const char *sec_objname)
{
@@ -218,7 +218,7 @@ static int klp_resolve_symbols(Elf64_Shdr *sechdrs, const char *strtab,
relas = (Elf_Rela *) relasec->sh_addr;
/* For each rela in this klp relocation section */
for (i = 0; i < relasec->sh_size / sizeof(Elf_Rela); i++) {
- sym = (Elf64_Sym *)sechdrs[symndx].sh_addr + ELF_R_SYM(relas[i].r_info);
+ sym = (Elf_Sym *)sechdrs[symndx].sh_addr + ELF_R_SYM(relas[i].r_info);
if (sym->st_shndx != SHN_LIVEPATCH) {
pr_err("symbol %s is not marked as a livepatch symbol\n",
strtab + sym->st_name);
diff --git a/kernel/livepatch/patch.c b/kernel/livepatch/patch.c
index fe316c021d73..c172bf92b576 100644
--- a/kernel/livepatch/patch.c
+++ b/kernel/livepatch/patch.c
@@ -124,19 +124,6 @@ unlock:
ftrace_test_recursion_unlock(bit);
}
-/*
- * Convert a function address into the appropriate ftrace location.
- *
- * Usually this is just the address of the function, but on some architectures
- * it's more complicated so allow them to provide a custom behaviour.
- */
-#ifndef klp_get_ftrace_location
-static unsigned long klp_get_ftrace_location(unsigned long faddr)
-{
- return faddr;
-}
-#endif
-
static void klp_unpatch_func(struct klp_func *func)
{
struct klp_ops *ops;
@@ -153,8 +140,7 @@ static void klp_unpatch_func(struct klp_func *func)
if (list_is_singular(&ops->func_stack)) {
unsigned long ftrace_loc;
- ftrace_loc =
- klp_get_ftrace_location((unsigned long)func->old_func);
+ ftrace_loc = ftrace_location((unsigned long)func->old_func);
if (WARN_ON(!ftrace_loc))
return;
@@ -186,8 +172,7 @@ static int klp_patch_func(struct klp_func *func)
if (!ops) {
unsigned long ftrace_loc;
- ftrace_loc =
- klp_get_ftrace_location((unsigned long)func->old_func);
+ ftrace_loc = ftrace_location((unsigned long)func->old_func);
if (!ftrace_loc) {
pr_err("failed to find location for function '%s'\n",
func->old_name);
diff --git a/kernel/livepatch/transition.c b/kernel/livepatch/transition.c
index 5683ac0d2566..5d03a2ad1066 100644
--- a/kernel/livepatch/transition.c
+++ b/kernel/livepatch/transition.c
@@ -9,7 +9,6 @@
#include <linux/cpu.h>
#include <linux/stacktrace.h>
-#include <linux/tracehook.h>
#include "core.h"
#include "patch.h"
#include "transition.h"
@@ -641,6 +640,13 @@ void klp_force_transition(void)
for_each_possible_cpu(cpu)
klp_update_patch_state(idle_task(cpu));
- klp_for_each_patch(patch)
- patch->forced = true;
+ /* Set forced flag for patches being removed. */
+ if (klp_target_state == KLP_UNPATCHED)
+ klp_transition_patch->forced = true;
+ else if (klp_transition_patch->replace) {
+ klp_for_each_patch(patch) {
+ if (patch != klp_transition_patch)
+ patch->forced = true;
+ }
+ }
}
diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c
index c06cab6546ed..a6e671b8608d 100644
--- a/kernel/locking/lockdep.c
+++ b/kernel/locking/lockdep.c
@@ -60,7 +60,6 @@
#include "lockdep_internals.h"
-#define CREATE_TRACE_POINTS
#include <trace/events/lock.h>
#ifdef CONFIG_PROVE_LOCKING
@@ -1380,7 +1379,7 @@ static struct lock_list *alloc_list_entry(void)
*/
static int add_lock_to_list(struct lock_class *this,
struct lock_class *links_to, struct list_head *head,
- unsigned long ip, u16 distance, u8 dep,
+ u16 distance, u8 dep,
const struct lock_trace *trace)
{
struct lock_list *entry;
@@ -3133,19 +3132,15 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev,
* to the previous lock's dependency list:
*/
ret = add_lock_to_list(hlock_class(next), hlock_class(prev),
- &hlock_class(prev)->locks_after,
- next->acquire_ip, distance,
- calc_dep(prev, next),
- *trace);
+ &hlock_class(prev)->locks_after, distance,
+ calc_dep(prev, next), *trace);
if (!ret)
return 0;
ret = add_lock_to_list(hlock_class(prev), hlock_class(next),
- &hlock_class(next)->locks_before,
- next->acquire_ip, distance,
- calc_depb(prev, next),
- *trace);
+ &hlock_class(next)->locks_before, distance,
+ calc_depb(prev, next), *trace);
if (!ret)
return 0;
@@ -4236,14 +4231,13 @@ static void __trace_hardirqs_on_caller(void)
/**
* lockdep_hardirqs_on_prepare - Prepare for enabling interrupts
- * @ip: Caller address
*
* Invoked before a possible transition to RCU idle from exit to user or
* guest mode. This ensures that all RCU operations are done before RCU
* stops watching. After the RCU transition lockdep_hardirqs_on() has to be
* invoked to set the final state.
*/
-void lockdep_hardirqs_on_prepare(unsigned long ip)
+void lockdep_hardirqs_on_prepare(void)
{
if (unlikely(!debug_locks))
return;
@@ -4840,8 +4834,7 @@ EXPORT_SYMBOL_GPL(__lockdep_no_validate__);
static void
print_lock_nested_lock_not_held(struct task_struct *curr,
- struct held_lock *hlock,
- unsigned long ip)
+ struct held_lock *hlock)
{
if (!debug_locks_off())
return;
@@ -5017,7 +5010,7 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
chain_key = iterate_chain_key(chain_key, hlock_id(hlock));
if (nest_lock && !__lock_is_held(nest_lock, -1)) {
- print_lock_nested_lock_not_held(curr, hlock, ip);
+ print_lock_nested_lock_not_held(curr, hlock);
return 0;
}
diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c
index 5e3585950ec8..d973fe6041bf 100644
--- a/kernel/locking/mutex.c
+++ b/kernel/locking/mutex.c
@@ -30,6 +30,9 @@
#include <linux/debug_locks.h>
#include <linux/osq_lock.h>
+#define CREATE_TRACE_POINTS
+#include <trace/events/lock.h>
+
#ifndef CONFIG_PREEMPT_RT
#include "mutex.h"
@@ -599,12 +602,14 @@ __mutex_lock_common(struct mutex *lock, unsigned int state, unsigned int subclas
preempt_disable();
mutex_acquire_nest(&lock->dep_map, subclass, 0, nest_lock, ip);
+ trace_contention_begin(lock, LCB_F_MUTEX | LCB_F_SPIN);
if (__mutex_trylock(lock) ||
mutex_optimistic_spin(lock, ww_ctx, NULL)) {
/* got the lock, yay! */
lock_acquired(&lock->dep_map, ip);
if (ww_ctx)
ww_mutex_set_context_fastpath(ww, ww_ctx);
+ trace_contention_end(lock, 0);
preempt_enable();
return 0;
}
@@ -641,6 +646,7 @@ __mutex_lock_common(struct mutex *lock, unsigned int state, unsigned int subclas
}
set_current_state(state);
+ trace_contention_begin(lock, LCB_F_MUTEX);
for (;;) {
bool first;
@@ -680,10 +686,16 @@ __mutex_lock_common(struct mutex *lock, unsigned int state, unsigned int subclas
* state back to RUNNING and fall through the next schedule(),
* or we must see its unlock and acquire.
*/
- if (__mutex_trylock_or_handoff(lock, first) ||
- (first && mutex_optimistic_spin(lock, ww_ctx, &waiter)))
+ if (__mutex_trylock_or_handoff(lock, first))
break;
+ if (first) {
+ trace_contention_begin(lock, LCB_F_MUTEX | LCB_F_SPIN);
+ if (mutex_optimistic_spin(lock, ww_ctx, &waiter))
+ break;
+ trace_contention_begin(lock, LCB_F_MUTEX);
+ }
+
raw_spin_lock(&lock->wait_lock);
}
raw_spin_lock(&lock->wait_lock);
@@ -707,6 +719,7 @@ acquired:
skip_wait:
/* got the lock - cleanup and rejoice! */
lock_acquired(&lock->dep_map, ip);
+ trace_contention_end(lock, 0);
if (ww_ctx)
ww_mutex_lock_acquired(ww, ww_ctx);
@@ -719,6 +732,7 @@ err:
__set_current_state(TASK_RUNNING);
__mutex_remove_waiter(lock, &waiter);
err_early_kill:
+ trace_contention_end(lock, ret);
raw_spin_unlock(&lock->wait_lock);
debug_mutex_free_waiter(&waiter);
mutex_release(&lock->dep_map, ip);
diff --git a/kernel/locking/percpu-rwsem.c b/kernel/locking/percpu-rwsem.c
index c9fdae94e098..5fe4c5495ba3 100644
--- a/kernel/locking/percpu-rwsem.c
+++ b/kernel/locking/percpu-rwsem.c
@@ -9,6 +9,7 @@
#include <linux/sched/task.h>
#include <linux/sched/debug.h>
#include <linux/errno.h>
+#include <trace/events/lock.h>
int __percpu_init_rwsem(struct percpu_rw_semaphore *sem,
const char *name, struct lock_class_key *key)
@@ -171,9 +172,11 @@ bool __sched __percpu_down_read(struct percpu_rw_semaphore *sem, bool try)
if (try)
return false;
+ trace_contention_begin(sem, LCB_F_PERCPU | LCB_F_READ);
preempt_enable();
percpu_rwsem_wait(sem, /* .reader = */ true);
preempt_disable();
+ trace_contention_end(sem, 0);
return true;
}
@@ -216,6 +219,7 @@ void __sched percpu_down_write(struct percpu_rw_semaphore *sem)
{
might_sleep();
rwsem_acquire(&sem->dep_map, 0, 0, _RET_IP_);
+ trace_contention_begin(sem, LCB_F_PERCPU | LCB_F_WRITE);
/* Notify readers to take the slow path. */
rcu_sync_enter(&sem->rss);
@@ -237,6 +241,7 @@ void __sched percpu_down_write(struct percpu_rw_semaphore *sem)
/* Wait for all active readers to complete. */
rcuwait_wait_event(&sem->writer, readers_active_check(sem), TASK_UNINTERRUPTIBLE);
+ trace_contention_end(sem, 0);
}
EXPORT_SYMBOL_GPL(percpu_down_write);
diff --git a/kernel/locking/qrwlock.c b/kernel/locking/qrwlock.c
index ec36b73f4733..2e1600906c9f 100644
--- a/kernel/locking/qrwlock.c
+++ b/kernel/locking/qrwlock.c
@@ -12,10 +12,11 @@
#include <linux/percpu.h>
#include <linux/hardirq.h>
#include <linux/spinlock.h>
+#include <trace/events/lock.h>
/**
- * queued_read_lock_slowpath - acquire read lock of a queue rwlock
- * @lock: Pointer to queue rwlock structure
+ * queued_read_lock_slowpath - acquire read lock of a queued rwlock
+ * @lock: Pointer to queued rwlock structure
*/
void queued_read_lock_slowpath(struct qrwlock *lock)
{
@@ -34,6 +35,8 @@ void queued_read_lock_slowpath(struct qrwlock *lock)
}
atomic_sub(_QR_BIAS, &lock->cnts);
+ trace_contention_begin(lock, LCB_F_SPIN | LCB_F_READ);
+
/*
* Put the reader into the wait queue
*/
@@ -51,17 +54,21 @@ void queued_read_lock_slowpath(struct qrwlock *lock)
* Signal the next one in queue to become queue head
*/
arch_spin_unlock(&lock->wait_lock);
+
+ trace_contention_end(lock, 0);
}
EXPORT_SYMBOL(queued_read_lock_slowpath);
/**
- * queued_write_lock_slowpath - acquire write lock of a queue rwlock
- * @lock : Pointer to queue rwlock structure
+ * queued_write_lock_slowpath - acquire write lock of a queued rwlock
+ * @lock : Pointer to queued rwlock structure
*/
void queued_write_lock_slowpath(struct qrwlock *lock)
{
int cnts;
+ trace_contention_begin(lock, LCB_F_SPIN | LCB_F_WRITE);
+
/* Put the writer into the wait queue */
arch_spin_lock(&lock->wait_lock);
@@ -79,5 +86,7 @@ void queued_write_lock_slowpath(struct qrwlock *lock)
} while (!atomic_try_cmpxchg_acquire(&lock->cnts, &cnts, _QW_LOCKED));
unlock:
arch_spin_unlock(&lock->wait_lock);
+
+ trace_contention_end(lock, 0);
}
EXPORT_SYMBOL(queued_write_lock_slowpath);
diff --git a/kernel/locking/qspinlock.c b/kernel/locking/qspinlock.c
index cbff6ba53d56..65a9a10caa6f 100644
--- a/kernel/locking/qspinlock.c
+++ b/kernel/locking/qspinlock.c
@@ -22,6 +22,7 @@
#include <linux/prefetch.h>
#include <asm/byteorder.h>
#include <asm/qspinlock.h>
+#include <trace/events/lock.h>
/*
* Include queued spinlock statistics code
@@ -401,6 +402,8 @@ pv_queue:
idx = node->count++;
tail = encode_tail(smp_processor_id(), idx);
+ trace_contention_begin(lock, LCB_F_SPIN);
+
/*
* 4 nodes are allocated based on the assumption that there will
* not be nested NMIs taking spinlocks. That may not be true in
@@ -554,6 +557,8 @@ locked:
pv_kick_node(lock, next);
release:
+ trace_contention_end(lock, 0);
+
/*
* release the node
*/
diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c
index 8555c4efe97c..7779ee8abc2a 100644
--- a/kernel/locking/rtmutex.c
+++ b/kernel/locking/rtmutex.c
@@ -24,6 +24,8 @@
#include <linux/sched/wake_q.h>
#include <linux/ww_mutex.h>
+#include <trace/events/lock.h>
+
#include "rtmutex_common.h"
#ifndef WW_RT
@@ -1579,6 +1581,8 @@ static int __sched __rt_mutex_slowlock(struct rt_mutex_base *lock,
set_current_state(state);
+ trace_contention_begin(lock, LCB_F_RT);
+
ret = task_blocks_on_rt_mutex(lock, waiter, current, ww_ctx, chwalk);
if (likely(!ret))
ret = rt_mutex_slowlock_block(lock, ww_ctx, state, NULL, waiter);
@@ -1601,6 +1605,9 @@ static int __sched __rt_mutex_slowlock(struct rt_mutex_base *lock,
* unconditionally. We might have to fix that up.
*/
fixup_rt_mutex_waiters(lock);
+
+ trace_contention_end(lock, ret);
+
return ret;
}
@@ -1683,6 +1690,8 @@ static void __sched rtlock_slowlock_locked(struct rt_mutex_base *lock)
/* Save current state and set state to TASK_RTLOCK_WAIT */
current_save_and_set_rtlock_wait_state();
+ trace_contention_begin(lock, LCB_F_RT);
+
task_blocks_on_rt_mutex(lock, &waiter, current, NULL, RT_MUTEX_MIN_CHAINWALK);
for (;;) {
@@ -1712,6 +1721,8 @@ static void __sched rtlock_slowlock_locked(struct rt_mutex_base *lock)
*/
fixup_rt_mutex_waiters(lock);
debug_rt_mutex_free_waiter(&waiter);
+
+ trace_contention_end(lock, 0);
}
static __always_inline void __sched rtlock_slowlock(struct rt_mutex_base *lock)
diff --git a/kernel/locking/rwbase_rt.c b/kernel/locking/rwbase_rt.c
index 6fd3162e4098..c201aadb9301 100644
--- a/kernel/locking/rwbase_rt.c
+++ b/kernel/locking/rwbase_rt.c
@@ -112,6 +112,8 @@ static int __sched __rwbase_read_lock(struct rwbase_rt *rwb,
* Reader2 to call up_read(), which might be unbound.
*/
+ trace_contention_begin(rwb, LCB_F_RT | LCB_F_READ);
+
/*
* For rwlocks this returns 0 unconditionally, so the below
* !ret conditionals are optimized out.
@@ -130,6 +132,8 @@ static int __sched __rwbase_read_lock(struct rwbase_rt *rwb,
raw_spin_unlock_irq(&rtm->wait_lock);
if (!ret)
rwbase_rtmutex_unlock(rtm);
+
+ trace_contention_end(rwb, ret);
return ret;
}
@@ -247,11 +251,13 @@ static int __sched rwbase_write_lock(struct rwbase_rt *rwb,
goto out_unlock;
rwbase_set_and_save_current_state(state);
+ trace_contention_begin(rwb, LCB_F_RT | LCB_F_WRITE);
for (;;) {
/* Optimized out for rwlocks */
if (rwbase_signal_pending_state(state, current)) {
rwbase_restore_current_state();
__rwbase_write_unlock(rwb, 0, flags);
+ trace_contention_end(rwb, -EINTR);
return -EINTR;
}
@@ -265,6 +271,7 @@ static int __sched rwbase_write_lock(struct rwbase_rt *rwb,
set_current_state(state);
}
rwbase_restore_current_state();
+ trace_contention_end(rwb, 0);
out_unlock:
raw_spin_unlock_irqrestore(&rtm->wait_lock, flags);
diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c
index acde5d6f1254..9d1db4a54d34 100644
--- a/kernel/locking/rwsem.c
+++ b/kernel/locking/rwsem.c
@@ -27,6 +27,7 @@
#include <linux/export.h>
#include <linux/rwsem.h>
#include <linux/atomic.h>
+#include <trace/events/lock.h>
#ifndef CONFIG_PREEMPT_RT
#include "lock_events.h"
@@ -375,16 +376,19 @@ rwsem_add_waiter(struct rw_semaphore *sem, struct rwsem_waiter *waiter)
*
* Both rwsem_mark_wake() and rwsem_try_write_lock() contain a full 'copy' of
* this function. Modify with care.
+ *
+ * Return: true if wait_list isn't empty and false otherwise
*/
-static inline void
+static inline bool
rwsem_del_waiter(struct rw_semaphore *sem, struct rwsem_waiter *waiter)
{
lockdep_assert_held(&sem->wait_lock);
list_del(&waiter->list);
if (likely(!list_empty(&sem->wait_list)))
- return;
+ return true;
atomic_long_andnot(RWSEM_FLAG_HANDOFF | RWSEM_FLAG_WAITERS, &sem->count);
+ return false;
}
/*
@@ -559,6 +563,33 @@ static void rwsem_mark_wake(struct rw_semaphore *sem,
}
/*
+ * Remove a waiter and try to wake up other waiters in the wait queue
+ * This function is called from the out_nolock path of both the reader and
+ * writer slowpaths with wait_lock held. It releases the wait_lock and
+ * optionally wake up waiters before it returns.
+ */
+static inline void
+rwsem_del_wake_waiter(struct rw_semaphore *sem, struct rwsem_waiter *waiter,
+ struct wake_q_head *wake_q)
+ __releases(&sem->wait_lock)
+{
+ bool first = rwsem_first_waiter(sem) == waiter;
+
+ wake_q_init(wake_q);
+
+ /*
+ * If the wait_list isn't empty and the waiter to be deleted is
+ * the first waiter, we wake up the remaining waiters as they may
+ * be eligible to acquire or spin on the lock.
+ */
+ if (rwsem_del_waiter(sem, waiter) && first)
+ rwsem_mark_wake(sem, RWSEM_WAKE_ANY, wake_q);
+ raw_spin_unlock_irq(&sem->wait_lock);
+ if (!wake_q_empty(wake_q))
+ wake_up_q(wake_q);
+}
+
+/*
* This function must be called with the sem->wait_lock held to prevent
* race conditions between checking the rwsem wait list and setting the
* sem->count accordingly.
@@ -901,7 +932,7 @@ done:
*/
static inline void clear_nonspinnable(struct rw_semaphore *sem)
{
- if (rwsem_test_oflags(sem, RWSEM_NONSPINNABLE))
+ if (unlikely(rwsem_test_oflags(sem, RWSEM_NONSPINNABLE)))
atomic_long_andnot(RWSEM_NONSPINNABLE, &sem->owner);
}
@@ -926,6 +957,31 @@ rwsem_spin_on_owner(struct rw_semaphore *sem)
#endif
/*
+ * Prepare to wake up waiter(s) in the wait queue by putting them into the
+ * given wake_q if the rwsem lock owner isn't a writer. If rwsem is likely
+ * reader-owned, wake up read lock waiters in queue front or wake up any
+ * front waiter otherwise.
+
+ * This is being called from both reader and writer slow paths.
+ */
+static inline void rwsem_cond_wake_waiter(struct rw_semaphore *sem, long count,
+ struct wake_q_head *wake_q)
+{
+ enum rwsem_wake_type wake_type;
+
+ if (count & RWSEM_WRITER_MASK)
+ return;
+
+ if (count & RWSEM_READER_MASK) {
+ wake_type = RWSEM_WAKE_READERS;
+ } else {
+ wake_type = RWSEM_WAKE_ANY;
+ clear_nonspinnable(sem);
+ }
+ rwsem_mark_wake(sem, wake_type, wake_q);
+}
+
+/*
* Wait for the read lock to be granted
*/
static struct rw_semaphore __sched *
@@ -935,7 +991,6 @@ rwsem_down_read_slowpath(struct rw_semaphore *sem, long count, unsigned int stat
long rcnt = (count >> RWSEM_READER_SHIFT);
struct rwsem_waiter waiter;
DEFINE_WAKE_Q(wake_q);
- bool wake = false;
/*
* To prevent a constant stream of readers from starving a sleeping
@@ -977,12 +1032,11 @@ queue:
if (list_empty(&sem->wait_list)) {
/*
* In case the wait queue is empty and the lock isn't owned
- * by a writer or has the handoff bit set, this reader can
- * exit the slowpath and return immediately as its
- * RWSEM_READER_BIAS has already been set in the count.
+ * by a writer, this reader can exit the slowpath and return
+ * immediately as its RWSEM_READER_BIAS has already been set
+ * in the count.
*/
- if (!(atomic_long_read(&sem->count) &
- (RWSEM_WRITER_MASK | RWSEM_FLAG_HANDOFF))) {
+ if (!(atomic_long_read(&sem->count) & RWSEM_WRITER_MASK)) {
/* Provide lock ACQUIRE */
smp_acquire__after_ctrl_dep();
raw_spin_unlock_irq(&sem->wait_lock);
@@ -997,22 +1051,13 @@ queue:
/* we're now waiting on the lock, but no longer actively locking */
count = atomic_long_add_return(adjustment, &sem->count);
- /*
- * If there are no active locks, wake the front queued process(es).
- *
- * If there are no writers and we are first in the queue,
- * wake our own waiter to join the existing active readers !
- */
- if (!(count & RWSEM_LOCK_MASK)) {
- clear_nonspinnable(sem);
- wake = true;
- }
- if (wake || (!(count & RWSEM_WRITER_MASK) &&
- (adjustment & RWSEM_FLAG_WAITERS)))
- rwsem_mark_wake(sem, RWSEM_WAKE_ANY, &wake_q);
-
+ rwsem_cond_wake_waiter(sem, count, &wake_q);
raw_spin_unlock_irq(&sem->wait_lock);
- wake_up_q(&wake_q);
+
+ if (!wake_q_empty(&wake_q))
+ wake_up_q(&wake_q);
+
+ trace_contention_begin(sem, LCB_F_READ);
/* wait to be given the lock */
for (;;) {
@@ -1035,13 +1080,14 @@ queue:
__set_current_state(TASK_RUNNING);
lockevent_inc(rwsem_rlock);
+ trace_contention_end(sem, 0);
return sem;
out_nolock:
- rwsem_del_waiter(sem, &waiter);
- raw_spin_unlock_irq(&sem->wait_lock);
+ rwsem_del_wake_waiter(sem, &waiter, &wake_q);
__set_current_state(TASK_RUNNING);
lockevent_inc(rwsem_rlock_fail);
+ trace_contention_end(sem, -EINTR);
return ERR_PTR(-EINTR);
}
@@ -1051,7 +1097,6 @@ out_nolock:
static struct rw_semaphore __sched *
rwsem_down_write_slowpath(struct rw_semaphore *sem, int state)
{
- long count;
struct rwsem_waiter waiter;
DEFINE_WAKE_Q(wake_q);
@@ -1075,23 +1120,8 @@ rwsem_down_write_slowpath(struct rw_semaphore *sem, int state)
/* we're now waiting on the lock */
if (rwsem_first_waiter(sem) != &waiter) {
- count = atomic_long_read(&sem->count);
-
- /*
- * If there were already threads queued before us and:
- * 1) there are no active locks, wake the front
- * queued process(es) as the handoff bit might be set.
- * 2) there are no active writers and some readers, the lock
- * must be read owned; so we try to wake any read lock
- * waiters that were queued ahead of us.
- */
- if (count & RWSEM_WRITER_MASK)
- goto wait;
-
- rwsem_mark_wake(sem, (count & RWSEM_READER_MASK)
- ? RWSEM_WAKE_READERS
- : RWSEM_WAKE_ANY, &wake_q);
-
+ rwsem_cond_wake_waiter(sem, atomic_long_read(&sem->count),
+ &wake_q);
if (!wake_q_empty(&wake_q)) {
/*
* We want to minimize wait_lock hold time especially
@@ -1099,16 +1129,16 @@ rwsem_down_write_slowpath(struct rw_semaphore *sem, int state)
*/
raw_spin_unlock_irq(&sem->wait_lock);
wake_up_q(&wake_q);
- wake_q_init(&wake_q); /* Used again, reinit */
raw_spin_lock_irq(&sem->wait_lock);
}
} else {
atomic_long_or(RWSEM_FLAG_WAITERS, &sem->count);
}
-wait:
/* wait until we successfully acquire the lock */
set_current_state(state);
+ trace_contention_begin(sem, LCB_F_WRITE);
+
for (;;) {
if (rwsem_try_write_lock(sem, &waiter)) {
/* rwsem_try_write_lock() implies ACQUIRE on success */
@@ -1148,17 +1178,15 @@ trylock_again:
__set_current_state(TASK_RUNNING);
raw_spin_unlock_irq(&sem->wait_lock);
lockevent_inc(rwsem_wlock);
+ trace_contention_end(sem, 0);
return sem;
out_nolock:
__set_current_state(TASK_RUNNING);
raw_spin_lock_irq(&sem->wait_lock);
- rwsem_del_waiter(sem, &waiter);
- if (!list_empty(&sem->wait_list))
- rwsem_mark_wake(sem, RWSEM_WAKE_ANY, &wake_q);
- raw_spin_unlock_irq(&sem->wait_lock);
- wake_up_q(&wake_q);
+ rwsem_del_wake_waiter(sem, &waiter, &wake_q);
lockevent_inc(rwsem_wlock_fail);
+ trace_contention_end(sem, -EINTR);
return ERR_PTR(-EINTR);
}
diff --git a/kernel/locking/semaphore.c b/kernel/locking/semaphore.c
index 9ee381e4d2a4..f2654d2fe43a 100644
--- a/kernel/locking/semaphore.c
+++ b/kernel/locking/semaphore.c
@@ -32,6 +32,7 @@
#include <linux/semaphore.h>
#include <linux/spinlock.h>
#include <linux/ftrace.h>
+#include <trace/events/lock.h>
static noinline void __down(struct semaphore *sem);
static noinline int __down_interruptible(struct semaphore *sem);
@@ -205,7 +206,7 @@ struct semaphore_waiter {
* constant, and thus optimised away by the compiler. Likewise the
* 'timeout' parameter for the cases without timeouts.
*/
-static inline int __sched __down_common(struct semaphore *sem, long state,
+static inline int __sched ___down_common(struct semaphore *sem, long state,
long timeout)
{
struct semaphore_waiter waiter;
@@ -236,6 +237,18 @@ static inline int __sched __down_common(struct semaphore *sem, long state,
return -EINTR;
}
+static inline int __sched __down_common(struct semaphore *sem, long state,
+ long timeout)
+{
+ int ret;
+
+ trace_contention_begin(sem, 0);
+ ret = ___down_common(sem, state, timeout);
+ trace_contention_end(sem, ret);
+
+ return ret;
+}
+
static noinline void __sched __down(struct semaphore *sem)
{
__down_common(sem, TASK_UNINTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
diff --git a/kernel/panic.c b/kernel/panic.c
index 7d422597403f..6737b2332275 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -48,7 +48,7 @@ unsigned int __read_mostly sysctl_oops_all_cpu_backtrace;
int panic_on_oops = CONFIG_PANIC_ON_OOPS_VALUE;
static unsigned long tainted_mask =
- IS_ENABLED(CONFIG_GCC_PLUGIN_RANDSTRUCT) ? (1 << TAINT_RANDSTRUCT) : 0;
+ IS_ENABLED(CONFIG_RANDSTRUCT) ? (1 << TAINT_RANDSTRUCT) : 0;
static int pause_on_oops;
static int pause_on_oops_flag;
static DEFINE_SPINLOCK(pause_on_oops_lock);
@@ -66,6 +66,7 @@ EXPORT_SYMBOL_GPL(panic_timeout);
#define PANIC_PRINT_LOCK_INFO 0x00000008
#define PANIC_PRINT_FTRACE_INFO 0x00000010
#define PANIC_PRINT_ALL_PRINTK_MSG 0x00000020
+#define PANIC_PRINT_ALL_CPU_BT 0x00000040
unsigned long panic_print;
ATOMIC_NOTIFIER_HEAD(panic_notifier_list);
@@ -147,10 +148,16 @@ void nmi_panic(struct pt_regs *regs, const char *msg)
}
EXPORT_SYMBOL(nmi_panic);
-static void panic_print_sys_info(void)
+static void panic_print_sys_info(bool console_flush)
{
- if (panic_print & PANIC_PRINT_ALL_PRINTK_MSG)
- console_flush_on_panic(CONSOLE_REPLAY_ALL);
+ if (console_flush) {
+ if (panic_print & PANIC_PRINT_ALL_PRINTK_MSG)
+ console_flush_on_panic(CONSOLE_REPLAY_ALL);
+ return;
+ }
+
+ if (panic_print & PANIC_PRINT_ALL_CPU_BT)
+ trigger_all_cpu_backtrace();
if (panic_print & PANIC_PRINT_TASK_INFO)
show_state();
@@ -185,6 +192,16 @@ void panic(const char *fmt, ...)
int old_cpu, this_cpu;
bool _crash_kexec_post_notifiers = crash_kexec_post_notifiers;
+ if (panic_on_warn) {
+ /*
+ * This thread may hit another WARN() in the panic path.
+ * Resetting this prevents additional WARN() from panicking the
+ * system on this thread. Other threads are blocked by the
+ * panic_mutex in panic().
+ */
+ panic_on_warn = 0;
+ }
+
/*
* Disable local interrupts. This will prevent panic_smp_self_stop
* from deadlocking the first cpu that invokes the panic, since
@@ -272,6 +289,8 @@ void panic(const char *fmt, ...)
*/
atomic_notifier_call_chain(&panic_notifier_list, 0, buf);
+ panic_print_sys_info(false);
+
kmsg_dump(KMSG_DUMP_PANIC);
/*
@@ -302,7 +321,7 @@ void panic(const char *fmt, ...)
debug_locks_off();
console_flush_on_panic(CONSOLE_FLUSH_PENDING);
- panic_print_sys_info();
+ panic_print_sys_info(true);
if (!panic_blink)
panic_blink = no_blink;
@@ -578,16 +597,8 @@ void __warn(const char *file, int line, void *caller, unsigned taint,
if (regs)
show_regs(regs);
- if (panic_on_warn) {
- /*
- * This thread may hit another WARN() in the panic path.
- * Resetting this prevents additional WARN() from panicking the
- * system on this thread. Other threads are blocked by the
- * panic_mutex in panic().
- */
- panic_on_warn = 0;
+ if (panic_on_warn)
panic("panic_on_warn set ...\n");
- }
if (!regs)
dump_stack();
diff --git a/kernel/power/Makefile b/kernel/power/Makefile
index 5899260a8bef..874ad834dc8d 100644
--- a/kernel/power/Makefile
+++ b/kernel/power/Makefile
@@ -1,6 +1,10 @@
# SPDX-License-Identifier: GPL-2.0
-ccflags-$(CONFIG_PM_DEBUG) := -DDEBUG
+ifeq ($(CONFIG_DYNAMIC_DEBUG), y)
+CFLAGS_swap.o := -DDEBUG
+CFLAGS_snapshot.o := -DDEBUG
+CFLAGS_energy_model.o := -DDEBUG
+endif
KASAN_SANITIZE_snapshot.o := n
diff --git a/kernel/power/energy_model.c b/kernel/power/energy_model.c
index 0153b0ca7b23..6c373f2960e7 100644
--- a/kernel/power/energy_model.c
+++ b/kernel/power/energy_model.c
@@ -54,28 +54,15 @@ static int em_debug_cpus_show(struct seq_file *s, void *unused)
}
DEFINE_SHOW_ATTRIBUTE(em_debug_cpus);
-static int em_debug_units_show(struct seq_file *s, void *unused)
+static int em_debug_flags_show(struct seq_file *s, void *unused)
{
struct em_perf_domain *pd = s->private;
- char *units = (pd->flags & EM_PERF_DOMAIN_MILLIWATTS) ?
- "milliWatts" : "bogoWatts";
- seq_printf(s, "%s\n", units);
+ seq_printf(s, "%#lx\n", pd->flags);
return 0;
}
-DEFINE_SHOW_ATTRIBUTE(em_debug_units);
-
-static int em_debug_skip_inefficiencies_show(struct seq_file *s, void *unused)
-{
- struct em_perf_domain *pd = s->private;
- int enabled = (pd->flags & EM_PERF_DOMAIN_SKIP_INEFFICIENCIES) ? 1 : 0;
-
- seq_printf(s, "%d\n", enabled);
-
- return 0;
-}
-DEFINE_SHOW_ATTRIBUTE(em_debug_skip_inefficiencies);
+DEFINE_SHOW_ATTRIBUTE(em_debug_flags);
static void em_debug_create_pd(struct device *dev)
{
@@ -89,9 +76,8 @@ static void em_debug_create_pd(struct device *dev)
debugfs_create_file("cpus", 0444, d, dev->em_pd->cpus,
&em_debug_cpus_fops);
- debugfs_create_file("units", 0444, d, dev->em_pd, &em_debug_units_fops);
- debugfs_create_file("skip-inefficiencies", 0444, d, dev->em_pd,
- &em_debug_skip_inefficiencies_fops);
+ debugfs_create_file("flags", 0444, d, dev->em_pd,
+ &em_debug_flags_fops);
/* Create a sub-directory for each performance state */
for (i = 0; i < dev->em_pd->nr_perf_states; i++)
@@ -121,7 +107,8 @@ static void em_debug_remove_pd(struct device *dev) {}
#endif
static int em_create_perf_table(struct device *dev, struct em_perf_domain *pd,
- int nr_states, struct em_data_callback *cb)
+ int nr_states, struct em_data_callback *cb,
+ unsigned long flags)
{
unsigned long power, freq, prev_freq = 0, prev_cost = ULONG_MAX;
struct em_perf_state *table;
@@ -139,7 +126,7 @@ static int em_create_perf_table(struct device *dev, struct em_perf_domain *pd,
* lowest performance state of 'dev' above 'freq' and updates
* 'power' and 'freq' accordingly.
*/
- ret = cb->active_power(&power, &freq, dev);
+ ret = cb->active_power(dev, &power, &freq);
if (ret) {
dev_err(dev, "EM: invalid perf. state: %d\n",
ret);
@@ -173,10 +160,22 @@ static int em_create_perf_table(struct device *dev, struct em_perf_domain *pd,
/* Compute the cost of each performance state. */
fmax = (u64) table[nr_states - 1].frequency;
for (i = nr_states - 1; i >= 0; i--) {
- unsigned long power_res = em_scale_power(table[i].power);
+ unsigned long power_res, cost;
+
+ if (flags & EM_PERF_DOMAIN_ARTIFICIAL) {
+ ret = cb->get_cost(dev, table[i].frequency, &cost);
+ if (ret || !cost || cost > EM_MAX_POWER) {
+ dev_err(dev, "EM: invalid cost %lu %d\n",
+ cost, ret);
+ goto free_ps_table;
+ }
+ } else {
+ power_res = em_scale_power(table[i].power);
+ cost = div64_u64(fmax * power_res, table[i].frequency);
+ }
+
+ table[i].cost = cost;
- table[i].cost = div64_u64(fmax * power_res,
- table[i].frequency);
if (table[i].cost >= prev_cost) {
table[i].flags = EM_PERF_STATE_INEFFICIENT;
dev_dbg(dev, "EM: OPP:%lu is inefficient\n",
@@ -197,7 +196,8 @@ free_ps_table:
}
static int em_create_pd(struct device *dev, int nr_states,
- struct em_data_callback *cb, cpumask_t *cpus)
+ struct em_data_callback *cb, cpumask_t *cpus,
+ unsigned long flags)
{
struct em_perf_domain *pd;
struct device *cpu_dev;
@@ -215,7 +215,7 @@ static int em_create_pd(struct device *dev, int nr_states,
return -ENOMEM;
}
- ret = em_create_perf_table(dev, pd, nr_states, cb);
+ ret = em_create_perf_table(dev, pd, nr_states, cb, flags);
if (ret) {
kfree(pd);
return ret;
@@ -259,6 +259,8 @@ static void em_cpufreq_update_efficiencies(struct device *dev)
found++;
}
+ cpufreq_cpu_put(policy);
+
if (!found)
return;
@@ -332,6 +334,7 @@ int em_dev_register_perf_domain(struct device *dev, unsigned int nr_states,
bool milliwatts)
{
unsigned long cap, prev_cap = 0;
+ unsigned long flags = 0;
int cpu, ret;
if (!dev || !nr_states || !cb)
@@ -378,12 +381,16 @@ int em_dev_register_perf_domain(struct device *dev, unsigned int nr_states,
}
}
- ret = em_create_pd(dev, nr_states, cb, cpus);
+ if (milliwatts)
+ flags |= EM_PERF_DOMAIN_MILLIWATTS;
+ else if (cb->get_cost)
+ flags |= EM_PERF_DOMAIN_ARTIFICIAL;
+
+ ret = em_create_pd(dev, nr_states, cb, cpus, flags);
if (ret)
goto unlock;
- if (milliwatts)
- dev->em_pd->flags |= EM_PERF_DOMAIN_MILLIWATTS;
+ dev->em_pd->flags |= flags;
em_cpufreq_update_efficiencies(dev);
diff --git a/kernel/power/main.c b/kernel/power/main.c
index 7e646079fbeb..5242bf2ee469 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -545,35 +545,6 @@ static int __init pm_debug_messages_setup(char *str)
}
__setup("pm_debug_messages", pm_debug_messages_setup);
-/**
- * __pm_pr_dbg - Print a suspend debug message to the kernel log.
- * @defer: Whether or not to use printk_deferred() to print the message.
- * @fmt: Message format.
- *
- * The message will be emitted if enabled through the pm_debug_messages
- * sysfs attribute.
- */
-void __pm_pr_dbg(bool defer, const char *fmt, ...)
-{
- struct va_format vaf;
- va_list args;
-
- if (!pm_debug_messages_on)
- return;
-
- va_start(args, fmt);
-
- vaf.fmt = fmt;
- vaf.va = &args;
-
- if (defer)
- printk_deferred(KERN_DEBUG "PM: %pV", &vaf);
- else
- printk(KERN_DEBUG "PM: %pV", &vaf);
-
- va_end(args);
-}
-
#else /* !CONFIG_PM_SLEEP_DEBUG */
static inline void pm_print_times_init(void) {}
#endif /* CONFIG_PM_SLEEP_DEBUG */
diff --git a/kernel/power/process.c b/kernel/power/process.c
index 11b570fcf049..3068601e585a 100644
--- a/kernel/power/process.c
+++ b/kernel/power/process.c
@@ -6,9 +6,6 @@
* Originally from swsusp.
*/
-
-#undef DEBUG
-
#include <linux/interrupt.h>
#include <linux/oom.h>
#include <linux/suspend.h>
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index 330d49937692..2a406753af90 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -326,7 +326,7 @@ static void *chain_alloc(struct chain_allocator *ca, unsigned int size)
return ret;
}
-/**
+/*
* Data types related to memory bitmaps.
*
* Memory bitmap is a structure consisting of many linked lists of
@@ -427,6 +427,10 @@ struct memory_bitmap {
/**
* alloc_rtree_node - Allocate a new node and add it to the radix tree.
+ * @gfp_mask: GFP mask for the allocation.
+ * @safe_needed: Get pages not used before hibernation (restore only)
+ * @ca: Pointer to a linked list of pages ("a chain") to allocate from
+ * @list: Radix Tree node to add.
*
* This function is used to allocate inner nodes as well as the
* leave nodes of the radix tree. It also adds the node to the
@@ -902,7 +906,7 @@ static bool rtree_next_node(struct memory_bitmap *bm)
}
/**
- * memory_bm_rtree_next_pfn - Find the next set bit in a memory bitmap.
+ * memory_bm_next_pfn - Find the next set bit in a memory bitmap.
* @bm: Memory bitmap.
*
* Starting from the last returned position this function searches for the next
@@ -1937,7 +1941,7 @@ static inline int get_highmem_buffer(int safe_needed)
}
/**
- * alloc_highmem_image_pages - Allocate some highmem pages for the image.
+ * alloc_highmem_pages - Allocate some highmem pages for the image.
*
* Try to allocate as many pages as needed, but if the number of free highmem
* pages is less than that, allocate them all.
@@ -2224,7 +2228,7 @@ static int check_header(struct swsusp_info *info)
}
/**
- * load header - Check the image header and copy the data from it.
+ * load_header - Check the image header and copy the data from it.
*/
static int load_header(struct swsusp_info *info)
{
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index eea265082e97..ccc4b465775b 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -371,6 +371,26 @@ bool ptrace_may_access(struct task_struct *task, unsigned int mode)
return !err;
}
+static int check_ptrace_options(unsigned long data)
+{
+ if (data & ~(unsigned long)PTRACE_O_MASK)
+ return -EINVAL;
+
+ if (unlikely(data & PTRACE_O_SUSPEND_SECCOMP)) {
+ if (!IS_ENABLED(CONFIG_CHECKPOINT_RESTORE) ||
+ !IS_ENABLED(CONFIG_SECCOMP))
+ return -EINVAL;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ if (seccomp_mode(&current->seccomp) != SECCOMP_MODE_DISABLED ||
+ current->ptrace & PT_SUSPEND_SECCOMP)
+ return -EPERM;
+ }
+ return 0;
+}
+
static int ptrace_attach(struct task_struct *task, long request,
unsigned long addr,
unsigned long flags)
@@ -382,8 +402,16 @@ static int ptrace_attach(struct task_struct *task, long request,
if (seize) {
if (addr != 0)
goto out;
+ /*
+ * This duplicates the check in check_ptrace_options() because
+ * ptrace_attach() and ptrace_setoptions() have historically
+ * used different error codes for unknown ptrace options.
+ */
if (flags & ~(unsigned long)PTRACE_O_MASK)
goto out;
+ retval = check_ptrace_options(flags);
+ if (retval)
+ return retval;
flags = PT_PTRACED | PT_SEIZED | (flags << PT_OPT_FLAG_SHIFT);
} else {
flags = PT_PTRACED;
@@ -654,22 +682,11 @@ int ptrace_writedata(struct task_struct *tsk, char __user *src, unsigned long ds
static int ptrace_setoptions(struct task_struct *child, unsigned long data)
{
unsigned flags;
+ int ret;
- if (data & ~(unsigned long)PTRACE_O_MASK)
- return -EINVAL;
-
- if (unlikely(data & PTRACE_O_SUSPEND_SECCOMP)) {
- if (!IS_ENABLED(CONFIG_CHECKPOINT_RESTORE) ||
- !IS_ENABLED(CONFIG_SECCOMP))
- return -EINVAL;
-
- if (!capable(CAP_SYS_ADMIN))
- return -EPERM;
-
- if (seccomp_mode(&current->seccomp) != SECCOMP_MODE_DISABLED ||
- current->ptrace & PT_SUSPEND_SECCOMP)
- return -EPERM;
- }
+ ret = check_ptrace_options(data);
+ if (ret)
+ return ret;
/* Avoid intermediate state when all opts are cleared */
flags = child->ptrace;
diff --git a/kernel/rcu/Kconfig b/kernel/rcu/Kconfig
index bf8e341e75b4..1c630e573548 100644
--- a/kernel/rcu/Kconfig
+++ b/kernel/rcu/Kconfig
@@ -77,31 +77,56 @@ config TASKS_RCU_GENERIC
This option enables generic infrastructure code supporting
task-based RCU implementations. Not for manual selection.
+config FORCE_TASKS_RCU
+ bool "Force selection of TASKS_RCU"
+ depends on RCU_EXPERT
+ select TASKS_RCU
+ default n
+ help
+ This option force-enables a task-based RCU implementation
+ that uses only voluntary context switch (not preemption!),
+ idle, and user-mode execution as quiescent states. Not for
+ manual selection in most cases.
+
config TASKS_RCU
- def_bool PREEMPTION
+ bool
+ default n
+ select IRQ_WORK
+
+config FORCE_TASKS_RUDE_RCU
+ bool "Force selection of Tasks Rude RCU"
+ depends on RCU_EXPERT
+ select TASKS_RUDE_RCU
+ default n
help
- This option enables a task-based RCU implementation that uses
- only voluntary context switch (not preemption!), idle, and
- user-mode execution as quiescent states. Not for manual selection.
+ This option force-enables a task-based RCU implementation
+ that uses only context switch (including preemption) and
+ user-mode execution as quiescent states. It forces IPIs and
+ context switches on all online CPUs, including idle ones,
+ so use with caution. Not for manual selection in most cases.
config TASKS_RUDE_RCU
- def_bool 0
+ bool
+ default n
+ select IRQ_WORK
+
+config FORCE_TASKS_TRACE_RCU
+ bool "Force selection of Tasks Trace RCU"
+ depends on RCU_EXPERT
+ select TASKS_TRACE_RCU
+ default n
help
This option enables a task-based RCU implementation that uses
- only context switch (including preemption) and user-mode
- execution as quiescent states. It forces IPIs and context
- switches on all online CPUs, including idle ones, so use
- with caution.
+ explicit rcu_read_lock_trace() read-side markers, and allows
+ these readers to appear in the idle loop as well as on the
+ CPU hotplug code paths. It can force IPIs on online CPUs,
+ including idle ones, so use with caution. Not for manual
+ selection in most cases.
config TASKS_TRACE_RCU
- def_bool 0
+ bool
+ default n
select IRQ_WORK
- help
- This option enables a task-based RCU implementation that uses
- explicit rcu_read_lock_trace() read-side markers, and allows
- these readers to appear in the idle loop as well as on the CPU
- hotplug code paths. It can force IPIs on online CPUs, including
- idle ones, so use with caution.
config RCU_STALL_COMMON
def_bool TREE_RCU
@@ -195,6 +220,20 @@ config RCU_BOOST_DELAY
Accept the default if unsure.
+config RCU_EXP_KTHREAD
+ bool "Perform RCU expedited work in a real-time kthread"
+ depends on RCU_BOOST && RCU_EXPERT
+ default !PREEMPT_RT && NR_CPUS <= 32
+ help
+ Use this option to further reduce the latencies of expedited
+ grace periods at the expense of being more disruptive.
+
+ This option is disabled by default on PREEMPT_RT=y kernels which
+ disable expedited grace periods after boot by unconditionally
+ setting rcupdate.rcu_normal_after_boot=1.
+
+ Accept the default if unsure.
+
config RCU_NOCB_CPU
bool "Offload RCU callback processing from boot-selected CPUs"
depends on TREE_RCU
@@ -225,7 +264,7 @@ config RCU_NOCB_CPU
config TASKS_TRACE_RCU_READ_MB
bool "Tasks Trace RCU readers use memory barriers in user and idle"
- depends on RCU_EXPERT
+ depends on RCU_EXPERT && TASKS_TRACE_RCU
default PREEMPT_RT || NR_CPUS < 8
help
Use this option to further reduce the number of IPIs sent
diff --git a/kernel/rcu/Kconfig.debug b/kernel/rcu/Kconfig.debug
index 4fd64999300f..9b64e55d4f61 100644
--- a/kernel/rcu/Kconfig.debug
+++ b/kernel/rcu/Kconfig.debug
@@ -28,9 +28,6 @@ config RCU_SCALE_TEST
depends on DEBUG_KERNEL
select TORTURE_TEST
select SRCU
- select TASKS_RCU
- select TASKS_RUDE_RCU
- select TASKS_TRACE_RCU
default n
help
This option provides a kernel module that runs performance
@@ -47,9 +44,6 @@ config RCU_TORTURE_TEST
depends on DEBUG_KERNEL
select TORTURE_TEST
select SRCU
- select TASKS_RCU
- select TASKS_RUDE_RCU
- select TASKS_TRACE_RCU
default n
help
This option provides a kernel module that runs torture tests
@@ -66,9 +60,6 @@ config RCU_REF_SCALE_TEST
depends on DEBUG_KERNEL
select TORTURE_TEST
select SRCU
- select TASKS_RCU
- select TASKS_RUDE_RCU
- select TASKS_TRACE_RCU
default n
help
This option provides a kernel module that runs performance tests
@@ -91,6 +82,20 @@ config RCU_CPU_STALL_TIMEOUT
RCU grace period persists, additional CPU stall warnings are
printed at more widely spaced intervals.
+config RCU_EXP_CPU_STALL_TIMEOUT
+ int "Expedited RCU CPU stall timeout in milliseconds"
+ depends on RCU_STALL_COMMON
+ range 0 21000
+ default 20 if ANDROID
+ default 0 if !ANDROID
+ help
+ If a given expedited RCU grace period extends more than the
+ specified number of milliseconds, a CPU stall warning is printed.
+ If the RCU grace period persists, additional CPU stall warnings
+ are printed at more widely spaced intervals. A value of zero
+ says to use the RCU_CPU_STALL_TIMEOUT value converted from
+ seconds to milliseconds.
+
config RCU_TRACE
bool "Enable tracing for RCU"
depends on DEBUG_KERNEL
diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h
index 24b5f2c2de87..152492d52715 100644
--- a/kernel/rcu/rcu.h
+++ b/kernel/rcu/rcu.h
@@ -210,7 +210,9 @@ static inline bool rcu_stall_is_suppressed_at_boot(void)
extern int rcu_cpu_stall_ftrace_dump;
extern int rcu_cpu_stall_suppress;
extern int rcu_cpu_stall_timeout;
+extern int rcu_exp_cpu_stall_timeout;
int rcu_jiffies_till_stall_check(void);
+int rcu_exp_jiffies_till_stall_check(void);
static inline bool rcu_stall_is_suppressed(void)
{
@@ -523,6 +525,8 @@ static inline bool rcu_check_boost_fail(unsigned long gp_state, int *cpup) { ret
static inline void show_rcu_gp_kthreads(void) { }
static inline int rcu_get_gp_kthreads_prio(void) { return 0; }
static inline void rcu_fwd_progress_check(unsigned long j) { }
+static inline void rcu_gp_slow_register(atomic_t *rgssp) { }
+static inline void rcu_gp_slow_unregister(atomic_t *rgssp) { }
#else /* #ifdef CONFIG_TINY_RCU */
bool rcu_dynticks_zero_in_eqs(int cpu, int *vp);
unsigned long rcu_get_gp_seq(void);
@@ -534,14 +538,19 @@ int rcu_get_gp_kthreads_prio(void);
void rcu_fwd_progress_check(unsigned long j);
void rcu_force_quiescent_state(void);
extern struct workqueue_struct *rcu_gp_wq;
+#ifdef CONFIG_RCU_EXP_KTHREAD
+extern struct kthread_worker *rcu_exp_gp_kworker;
+extern struct kthread_worker *rcu_exp_par_gp_kworker;
+#else /* !CONFIG_RCU_EXP_KTHREAD */
extern struct workqueue_struct *rcu_par_gp_wq;
+#endif /* CONFIG_RCU_EXP_KTHREAD */
+void rcu_gp_slow_register(atomic_t *rgssp);
+void rcu_gp_slow_unregister(atomic_t *rgssp);
#endif /* #else #ifdef CONFIG_TINY_RCU */
#ifdef CONFIG_RCU_NOCB_CPU
-bool rcu_is_nocb_cpu(int cpu);
void rcu_bind_current_to_nocb(void);
#else
-static inline bool rcu_is_nocb_cpu(int cpu) { return false; }
static inline void rcu_bind_current_to_nocb(void) { }
#endif
diff --git a/kernel/rcu/rcu_segcblist.c b/kernel/rcu/rcu_segcblist.c
index 81145c3ece25..c54ea2b6a36b 100644
--- a/kernel/rcu/rcu_segcblist.c
+++ b/kernel/rcu/rcu_segcblist.c
@@ -505,10 +505,10 @@ void rcu_segcblist_advance(struct rcu_segcblist *rsclp, unsigned long seq)
WRITE_ONCE(rsclp->tails[j], rsclp->tails[RCU_DONE_TAIL]);
/*
- * Callbacks moved, so clean up the misordered ->tails[] pointers
- * that now point into the middle of the list of ready-to-invoke
- * callbacks. The overall effect is to copy down the later pointers
- * into the gap that was created by the now-ready segments.
+ * Callbacks moved, so there might be an empty RCU_WAIT_TAIL
+ * and a non-empty RCU_NEXT_READY_TAIL. If so, copy the
+ * RCU_NEXT_READY_TAIL segment to fill the RCU_WAIT_TAIL gap
+ * created by the now-ready-to-invoke segments.
*/
for (j = RCU_WAIT_TAIL; i < RCU_NEXT_TAIL; i++, j++) {
if (rsclp->tails[j] == rsclp->tails[RCU_NEXT_TAIL])
diff --git a/kernel/rcu/rcuscale.c b/kernel/rcu/rcuscale.c
index 5e4f1f83d38e..277a5bfb37d4 100644
--- a/kernel/rcu/rcuscale.c
+++ b/kernel/rcu/rcuscale.c
@@ -268,6 +268,8 @@ static struct rcu_scale_ops srcud_ops = {
.name = "srcud"
};
+#ifdef CONFIG_TASKS_RCU
+
/*
* Definitions for RCU-tasks scalability testing.
*/
@@ -295,6 +297,16 @@ static struct rcu_scale_ops tasks_ops = {
.name = "tasks"
};
+#define TASKS_OPS &tasks_ops,
+
+#else // #ifdef CONFIG_TASKS_RCU
+
+#define TASKS_OPS
+
+#endif // #else // #ifdef CONFIG_TASKS_RCU
+
+#ifdef CONFIG_TASKS_TRACE_RCU
+
/*
* Definitions for RCU-tasks-trace scalability testing.
*/
@@ -324,6 +336,14 @@ static struct rcu_scale_ops tasks_tracing_ops = {
.name = "tasks-tracing"
};
+#define TASKS_TRACING_OPS &tasks_tracing_ops,
+
+#else // #ifdef CONFIG_TASKS_TRACE_RCU
+
+#define TASKS_TRACING_OPS
+
+#endif // #else // #ifdef CONFIG_TASKS_TRACE_RCU
+
static unsigned long rcuscale_seq_diff(unsigned long new, unsigned long old)
{
if (!cur_ops->gp_diff)
@@ -797,7 +817,7 @@ rcu_scale_init(void)
long i;
int firsterr = 0;
static struct rcu_scale_ops *scale_ops[] = {
- &rcu_ops, &srcu_ops, &srcud_ops, &tasks_ops, &tasks_tracing_ops
+ &rcu_ops, &srcu_ops, &srcud_ops, TASKS_OPS TASKS_TRACING_OPS
};
if (!torture_init_begin(scale_type, verbose))
diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c
index 55d049c39608..7120165a9342 100644
--- a/kernel/rcu/rcutorture.c
+++ b/kernel/rcu/rcutorture.c
@@ -738,6 +738,50 @@ static struct rcu_torture_ops busted_srcud_ops = {
};
/*
+ * Definitions for trivial CONFIG_PREEMPT=n-only torture testing.
+ * This implementation does not necessarily work well with CPU hotplug.
+ */
+
+static void synchronize_rcu_trivial(void)
+{
+ int cpu;
+
+ for_each_online_cpu(cpu) {
+ rcutorture_sched_setaffinity(current->pid, cpumask_of(cpu));
+ WARN_ON_ONCE(raw_smp_processor_id() != cpu);
+ }
+}
+
+static int rcu_torture_read_lock_trivial(void) __acquires(RCU)
+{
+ preempt_disable();
+ return 0;
+}
+
+static void rcu_torture_read_unlock_trivial(int idx) __releases(RCU)
+{
+ preempt_enable();
+}
+
+static struct rcu_torture_ops trivial_ops = {
+ .ttype = RCU_TRIVIAL_FLAVOR,
+ .init = rcu_sync_torture_init,
+ .readlock = rcu_torture_read_lock_trivial,
+ .read_delay = rcu_read_delay, /* just reuse rcu's version. */
+ .readunlock = rcu_torture_read_unlock_trivial,
+ .readlock_held = torture_readlock_not_held,
+ .get_gp_seq = rcu_no_completed,
+ .sync = synchronize_rcu_trivial,
+ .exp_sync = synchronize_rcu_trivial,
+ .fqs = NULL,
+ .stats = NULL,
+ .irq_capable = 1,
+ .name = "trivial"
+};
+
+#ifdef CONFIG_TASKS_RCU
+
+/*
* Definitions for RCU-tasks torture testing.
*/
@@ -780,47 +824,16 @@ static struct rcu_torture_ops tasks_ops = {
.name = "tasks"
};
-/*
- * Definitions for trivial CONFIG_PREEMPT=n-only torture testing.
- * This implementation does not necessarily work well with CPU hotplug.
- */
+#define TASKS_OPS &tasks_ops,
-static void synchronize_rcu_trivial(void)
-{
- int cpu;
+#else // #ifdef CONFIG_TASKS_RCU
- for_each_online_cpu(cpu) {
- rcutorture_sched_setaffinity(current->pid, cpumask_of(cpu));
- WARN_ON_ONCE(raw_smp_processor_id() != cpu);
- }
-}
+#define TASKS_OPS
-static int rcu_torture_read_lock_trivial(void) __acquires(RCU)
-{
- preempt_disable();
- return 0;
-}
+#endif // #else #ifdef CONFIG_TASKS_RCU
-static void rcu_torture_read_unlock_trivial(int idx) __releases(RCU)
-{
- preempt_enable();
-}
-static struct rcu_torture_ops trivial_ops = {
- .ttype = RCU_TRIVIAL_FLAVOR,
- .init = rcu_sync_torture_init,
- .readlock = rcu_torture_read_lock_trivial,
- .read_delay = rcu_read_delay, /* just reuse rcu's version. */
- .readunlock = rcu_torture_read_unlock_trivial,
- .readlock_held = torture_readlock_not_held,
- .get_gp_seq = rcu_no_completed,
- .sync = synchronize_rcu_trivial,
- .exp_sync = synchronize_rcu_trivial,
- .fqs = NULL,
- .stats = NULL,
- .irq_capable = 1,
- .name = "trivial"
-};
+#ifdef CONFIG_TASKS_RUDE_RCU
/*
* Definitions for rude RCU-tasks torture testing.
@@ -851,6 +864,17 @@ static struct rcu_torture_ops tasks_rude_ops = {
.name = "tasks-rude"
};
+#define TASKS_RUDE_OPS &tasks_rude_ops,
+
+#else // #ifdef CONFIG_TASKS_RUDE_RCU
+
+#define TASKS_RUDE_OPS
+
+#endif // #else #ifdef CONFIG_TASKS_RUDE_RCU
+
+
+#ifdef CONFIG_TASKS_TRACE_RCU
+
/*
* Definitions for tracing RCU-tasks torture testing.
*/
@@ -893,6 +917,15 @@ static struct rcu_torture_ops tasks_tracing_ops = {
.name = "tasks-tracing"
};
+#define TASKS_TRACING_OPS &tasks_tracing_ops,
+
+#else // #ifdef CONFIG_TASKS_TRACE_RCU
+
+#define TASKS_TRACING_OPS
+
+#endif // #else #ifdef CONFIG_TASKS_TRACE_RCU
+
+
static unsigned long rcutorture_seq_diff(unsigned long new, unsigned long old)
{
if (!cur_ops->gp_diff)
@@ -1178,7 +1211,7 @@ rcu_torture_writer(void *arg)
" GP expediting controlled from boot/sysfs for %s.\n",
torture_type, cur_ops->name);
if (WARN_ONCE(nsynctypes == 0,
- "rcu_torture_writer: No update-side primitives.\n")) {
+ "%s: No update-side primitives.\n", __func__)) {
/*
* No updates primitives, so don't try updating.
* The resulting test won't be testing much, hence the
@@ -1186,6 +1219,7 @@ rcu_torture_writer(void *arg)
*/
rcu_torture_writer_state = RTWS_STOPPING;
torture_kthread_stopping("rcu_torture_writer");
+ return 0;
}
do {
@@ -1322,6 +1356,17 @@ rcu_torture_fakewriter(void *arg)
VERBOSE_TOROUT_STRING("rcu_torture_fakewriter task started");
set_user_nice(current, MAX_NICE);
+ if (WARN_ONCE(nsynctypes == 0,
+ "%s: No update-side primitives.\n", __func__)) {
+ /*
+ * No updates primitives, so don't try updating.
+ * The resulting test won't be testing much, hence the
+ * above WARN_ONCE().
+ */
+ torture_kthread_stopping("rcu_torture_fakewriter");
+ return 0;
+ }
+
do {
torture_hrtimeout_jiffies(torture_random(&rand) % 10, &rand);
if (cur_ops->cb_barrier != NULL &&
@@ -2916,10 +2961,12 @@ rcu_torture_cleanup(void)
pr_info("%s: Invoking %pS().\n", __func__, cur_ops->cb_barrier);
cur_ops->cb_barrier();
}
+ rcu_gp_slow_unregister(NULL);
return;
}
if (!cur_ops) {
torture_cleanup_end();
+ rcu_gp_slow_unregister(NULL);
return;
}
@@ -3016,6 +3063,7 @@ rcu_torture_cleanup(void)
else
rcu_torture_print_module_parms(cur_ops, "End of test: SUCCESS");
torture_cleanup_end();
+ rcu_gp_slow_unregister(&rcu_fwd_cb_nodelay);
}
#ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD
@@ -3096,9 +3144,9 @@ rcu_torture_init(void)
int flags = 0;
unsigned long gp_seq = 0;
static struct rcu_torture_ops *torture_ops[] = {
- &rcu_ops, &rcu_busted_ops, &srcu_ops, &srcud_ops,
- &busted_srcud_ops, &tasks_ops, &tasks_rude_ops,
- &tasks_tracing_ops, &trivial_ops,
+ &rcu_ops, &rcu_busted_ops, &srcu_ops, &srcud_ops, &busted_srcud_ops,
+ TASKS_OPS TASKS_RUDE_OPS TASKS_TRACING_OPS
+ &trivial_ops,
};
if (!torture_init_begin(torture_type, verbose))
@@ -3320,6 +3368,7 @@ rcu_torture_init(void)
if (object_debug)
rcu_test_debug_objects();
torture_init_end();
+ rcu_gp_slow_register(&rcu_fwd_cb_nodelay);
return 0;
unwind:
diff --git a/kernel/rcu/refscale.c b/kernel/rcu/refscale.c
index 5489ff7f478e..909644abee67 100644
--- a/kernel/rcu/refscale.c
+++ b/kernel/rcu/refscale.c
@@ -207,6 +207,8 @@ static struct ref_scale_ops srcu_ops = {
.name = "srcu"
};
+#ifdef CONFIG_TASKS_RCU
+
// Definitions for RCU Tasks ref scale testing: Empty read markers.
// These definitions also work for RCU Rude readers.
static void rcu_tasks_ref_scale_read_section(const int nloops)
@@ -232,6 +234,16 @@ static struct ref_scale_ops rcu_tasks_ops = {
.name = "rcu-tasks"
};
+#define RCU_TASKS_OPS &rcu_tasks_ops,
+
+#else // #ifdef CONFIG_TASKS_RCU
+
+#define RCU_TASKS_OPS
+
+#endif // #else // #ifdef CONFIG_TASKS_RCU
+
+#ifdef CONFIG_TASKS_TRACE_RCU
+
// Definitions for RCU Tasks Trace ref scale testing.
static void rcu_trace_ref_scale_read_section(const int nloops)
{
@@ -261,6 +273,14 @@ static struct ref_scale_ops rcu_trace_ops = {
.name = "rcu-trace"
};
+#define RCU_TRACE_OPS &rcu_trace_ops,
+
+#else // #ifdef CONFIG_TASKS_TRACE_RCU
+
+#define RCU_TRACE_OPS
+
+#endif // #else // #ifdef CONFIG_TASKS_TRACE_RCU
+
// Definitions for reference count
static atomic_t refcnt;
@@ -790,7 +810,7 @@ ref_scale_init(void)
long i;
int firsterr = 0;
static struct ref_scale_ops *scale_ops[] = {
- &rcu_ops, &srcu_ops, &rcu_trace_ops, &rcu_tasks_ops, &refcnt_ops, &rwlock_ops,
+ &rcu_ops, &srcu_ops, RCU_TRACE_OPS RCU_TASKS_OPS &refcnt_ops, &rwlock_ops,
&rwsem_ops, &lock_ops, &lock_irq_ops, &acqrel_ops, &clock_ops,
};
diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c
index 6833d8887181..50ba70f019de 100644
--- a/kernel/rcu/srcutree.c
+++ b/kernel/rcu/srcutree.c
@@ -24,6 +24,7 @@
#include <linux/smp.h>
#include <linux/delay.h>
#include <linux/module.h>
+#include <linux/slab.h>
#include <linux/srcu.h>
#include "rcu.h"
@@ -38,6 +39,35 @@ module_param(exp_holdoff, ulong, 0444);
static ulong counter_wrap_check = (ULONG_MAX >> 2);
module_param(counter_wrap_check, ulong, 0444);
+/*
+ * Control conversion to SRCU_SIZE_BIG:
+ * 0: Don't convert at all.
+ * 1: Convert at init_srcu_struct() time.
+ * 2: Convert when rcutorture invokes srcu_torture_stats_print().
+ * 3: Decide at boot time based on system shape (default).
+ * 0x1x: Convert when excessive contention encountered.
+ */
+#define SRCU_SIZING_NONE 0
+#define SRCU_SIZING_INIT 1
+#define SRCU_SIZING_TORTURE 2
+#define SRCU_SIZING_AUTO 3
+#define SRCU_SIZING_CONTEND 0x10
+#define SRCU_SIZING_IS(x) ((convert_to_big & ~SRCU_SIZING_CONTEND) == x)
+#define SRCU_SIZING_IS_NONE() (SRCU_SIZING_IS(SRCU_SIZING_NONE))
+#define SRCU_SIZING_IS_INIT() (SRCU_SIZING_IS(SRCU_SIZING_INIT))
+#define SRCU_SIZING_IS_TORTURE() (SRCU_SIZING_IS(SRCU_SIZING_TORTURE))
+#define SRCU_SIZING_IS_CONTEND() (convert_to_big & SRCU_SIZING_CONTEND)
+static int convert_to_big = SRCU_SIZING_AUTO;
+module_param(convert_to_big, int, 0444);
+
+/* Number of CPUs to trigger init_srcu_struct()-time transition to big. */
+static int big_cpu_lim __read_mostly = 128;
+module_param(big_cpu_lim, int, 0444);
+
+/* Contention events per jiffy to initiate transition to big. */
+static int small_contention_lim __read_mostly = 100;
+module_param(small_contention_lim, int, 0444);
+
/* Early-boot callback-management, so early that no lock is required! */
static LIST_HEAD(srcu_boot_list);
static bool __read_mostly srcu_init_done;
@@ -48,39 +78,90 @@ static void process_srcu(struct work_struct *work);
static void srcu_delay_timer(struct timer_list *t);
/* Wrappers for lock acquisition and release, see raw_spin_lock_rcu_node(). */
-#define spin_lock_rcu_node(p) \
-do { \
- spin_lock(&ACCESS_PRIVATE(p, lock)); \
- smp_mb__after_unlock_lock(); \
+#define spin_lock_rcu_node(p) \
+do { \
+ spin_lock(&ACCESS_PRIVATE(p, lock)); \
+ smp_mb__after_unlock_lock(); \
} while (0)
#define spin_unlock_rcu_node(p) spin_unlock(&ACCESS_PRIVATE(p, lock))
-#define spin_lock_irq_rcu_node(p) \
-do { \
- spin_lock_irq(&ACCESS_PRIVATE(p, lock)); \
- smp_mb__after_unlock_lock(); \
+#define spin_lock_irq_rcu_node(p) \
+do { \
+ spin_lock_irq(&ACCESS_PRIVATE(p, lock)); \
+ smp_mb__after_unlock_lock(); \
} while (0)
-#define spin_unlock_irq_rcu_node(p) \
+#define spin_unlock_irq_rcu_node(p) \
spin_unlock_irq(&ACCESS_PRIVATE(p, lock))
-#define spin_lock_irqsave_rcu_node(p, flags) \
-do { \
- spin_lock_irqsave(&ACCESS_PRIVATE(p, lock), flags); \
- smp_mb__after_unlock_lock(); \
+#define spin_lock_irqsave_rcu_node(p, flags) \
+do { \
+ spin_lock_irqsave(&ACCESS_PRIVATE(p, lock), flags); \
+ smp_mb__after_unlock_lock(); \
} while (0)
-#define spin_unlock_irqrestore_rcu_node(p, flags) \
- spin_unlock_irqrestore(&ACCESS_PRIVATE(p, lock), flags) \
+#define spin_trylock_irqsave_rcu_node(p, flags) \
+({ \
+ bool ___locked = spin_trylock_irqsave(&ACCESS_PRIVATE(p, lock), flags); \
+ \
+ if (___locked) \
+ smp_mb__after_unlock_lock(); \
+ ___locked; \
+})
+
+#define spin_unlock_irqrestore_rcu_node(p, flags) \
+ spin_unlock_irqrestore(&ACCESS_PRIVATE(p, lock), flags) \
/*
- * Initialize SRCU combining tree. Note that statically allocated
+ * Initialize SRCU per-CPU data. Note that statically allocated
* srcu_struct structures might already have srcu_read_lock() and
* srcu_read_unlock() running against them. So if the is_static parameter
* is set, don't initialize ->srcu_lock_count[] and ->srcu_unlock_count[].
*/
-static void init_srcu_struct_nodes(struct srcu_struct *ssp)
+static void init_srcu_struct_data(struct srcu_struct *ssp)
+{
+ int cpu;
+ struct srcu_data *sdp;
+
+ /*
+ * Initialize the per-CPU srcu_data array, which feeds into the
+ * leaves of the srcu_node tree.
+ */
+ WARN_ON_ONCE(ARRAY_SIZE(sdp->srcu_lock_count) !=
+ ARRAY_SIZE(sdp->srcu_unlock_count));
+ for_each_possible_cpu(cpu) {
+ sdp = per_cpu_ptr(ssp->sda, cpu);
+ spin_lock_init(&ACCESS_PRIVATE(sdp, lock));
+ rcu_segcblist_init(&sdp->srcu_cblist);
+ sdp->srcu_cblist_invoking = false;
+ sdp->srcu_gp_seq_needed = ssp->srcu_gp_seq;
+ sdp->srcu_gp_seq_needed_exp = ssp->srcu_gp_seq;
+ sdp->mynode = NULL;
+ sdp->cpu = cpu;
+ INIT_WORK(&sdp->work, srcu_invoke_callbacks);
+ timer_setup(&sdp->delay_work, srcu_delay_timer, 0);
+ sdp->ssp = ssp;
+ }
+}
+
+/* Invalid seq state, used during snp node initialization */
+#define SRCU_SNP_INIT_SEQ 0x2
+
+/*
+ * Check whether sequence number corresponding to snp node,
+ * is invalid.
+ */
+static inline bool srcu_invl_snp_seq(unsigned long s)
+{
+ return rcu_seq_state(s) == SRCU_SNP_INIT_SEQ;
+}
+
+/*
+ * Allocated and initialize SRCU combining tree. Returns @true if
+ * allocation succeeded and @false otherwise.
+ */
+static bool init_srcu_struct_nodes(struct srcu_struct *ssp, gfp_t gfp_flags)
{
int cpu;
int i;
@@ -92,6 +173,9 @@ static void init_srcu_struct_nodes(struct srcu_struct *ssp)
/* Initialize geometry if it has not already been initialized. */
rcu_init_geometry();
+ ssp->node = kcalloc(rcu_num_nodes, sizeof(*ssp->node), gfp_flags);
+ if (!ssp->node)
+ return false;
/* Work out the overall tree geometry. */
ssp->level[0] = &ssp->node[0];
@@ -105,10 +189,10 @@ static void init_srcu_struct_nodes(struct srcu_struct *ssp)
WARN_ON_ONCE(ARRAY_SIZE(snp->srcu_have_cbs) !=
ARRAY_SIZE(snp->srcu_data_have_cbs));
for (i = 0; i < ARRAY_SIZE(snp->srcu_have_cbs); i++) {
- snp->srcu_have_cbs[i] = 0;
+ snp->srcu_have_cbs[i] = SRCU_SNP_INIT_SEQ;
snp->srcu_data_have_cbs[i] = 0;
}
- snp->srcu_gp_seq_needed_exp = 0;
+ snp->srcu_gp_seq_needed_exp = SRCU_SNP_INIT_SEQ;
snp->grplo = -1;
snp->grphi = -1;
if (snp == &ssp->node[0]) {
@@ -129,39 +213,31 @@ static void init_srcu_struct_nodes(struct srcu_struct *ssp)
* Initialize the per-CPU srcu_data array, which feeds into the
* leaves of the srcu_node tree.
*/
- WARN_ON_ONCE(ARRAY_SIZE(sdp->srcu_lock_count) !=
- ARRAY_SIZE(sdp->srcu_unlock_count));
level = rcu_num_lvls - 1;
snp_first = ssp->level[level];
for_each_possible_cpu(cpu) {
sdp = per_cpu_ptr(ssp->sda, cpu);
- spin_lock_init(&ACCESS_PRIVATE(sdp, lock));
- rcu_segcblist_init(&sdp->srcu_cblist);
- sdp->srcu_cblist_invoking = false;
- sdp->srcu_gp_seq_needed = ssp->srcu_gp_seq;
- sdp->srcu_gp_seq_needed_exp = ssp->srcu_gp_seq;
sdp->mynode = &snp_first[cpu / levelspread[level]];
for (snp = sdp->mynode; snp != NULL; snp = snp->srcu_parent) {
if (snp->grplo < 0)
snp->grplo = cpu;
snp->grphi = cpu;
}
- sdp->cpu = cpu;
- INIT_WORK(&sdp->work, srcu_invoke_callbacks);
- timer_setup(&sdp->delay_work, srcu_delay_timer, 0);
- sdp->ssp = ssp;
sdp->grpmask = 1 << (cpu - sdp->mynode->grplo);
}
+ smp_store_release(&ssp->srcu_size_state, SRCU_SIZE_WAIT_BARRIER);
+ return true;
}
/*
* Initialize non-compile-time initialized fields, including the
- * associated srcu_node and srcu_data structures. The is_static
- * parameter is passed through to init_srcu_struct_nodes(), and
- * also tells us that ->sda has already been wired up to srcu_data.
+ * associated srcu_node and srcu_data structures. The is_static parameter
+ * tells us that ->sda has already been wired up to srcu_data.
*/
static int init_srcu_struct_fields(struct srcu_struct *ssp, bool is_static)
{
+ ssp->srcu_size_state = SRCU_SIZE_SMALL;
+ ssp->node = NULL;
mutex_init(&ssp->srcu_cb_mutex);
mutex_init(&ssp->srcu_gp_mutex);
ssp->srcu_idx = 0;
@@ -170,13 +246,25 @@ static int init_srcu_struct_fields(struct srcu_struct *ssp, bool is_static)
mutex_init(&ssp->srcu_barrier_mutex);
atomic_set(&ssp->srcu_barrier_cpu_cnt, 0);
INIT_DELAYED_WORK(&ssp->work, process_srcu);
+ ssp->sda_is_static = is_static;
if (!is_static)
ssp->sda = alloc_percpu(struct srcu_data);
if (!ssp->sda)
return -ENOMEM;
- init_srcu_struct_nodes(ssp);
+ init_srcu_struct_data(ssp);
ssp->srcu_gp_seq_needed_exp = 0;
ssp->srcu_last_gp_end = ktime_get_mono_fast_ns();
+ if (READ_ONCE(ssp->srcu_size_state) == SRCU_SIZE_SMALL && SRCU_SIZING_IS_INIT()) {
+ if (!init_srcu_struct_nodes(ssp, GFP_ATOMIC)) {
+ if (!ssp->sda_is_static) {
+ free_percpu(ssp->sda);
+ ssp->sda = NULL;
+ return -ENOMEM;
+ }
+ } else {
+ WRITE_ONCE(ssp->srcu_size_state, SRCU_SIZE_BIG);
+ }
+ }
smp_store_release(&ssp->srcu_gp_seq_needed, 0); /* Init done. */
return 0;
}
@@ -214,6 +302,86 @@ EXPORT_SYMBOL_GPL(init_srcu_struct);
#endif /* #else #ifdef CONFIG_DEBUG_LOCK_ALLOC */
/*
+ * Initiate a transition to SRCU_SIZE_BIG with lock held.
+ */
+static void __srcu_transition_to_big(struct srcu_struct *ssp)
+{
+ lockdep_assert_held(&ACCESS_PRIVATE(ssp, lock));
+ smp_store_release(&ssp->srcu_size_state, SRCU_SIZE_ALLOC);
+}
+
+/*
+ * Initiate an idempotent transition to SRCU_SIZE_BIG.
+ */
+static void srcu_transition_to_big(struct srcu_struct *ssp)
+{
+ unsigned long flags;
+
+ /* Double-checked locking on ->srcu_size-state. */
+ if (smp_load_acquire(&ssp->srcu_size_state) != SRCU_SIZE_SMALL)
+ return;
+ spin_lock_irqsave_rcu_node(ssp, flags);
+ if (smp_load_acquire(&ssp->srcu_size_state) != SRCU_SIZE_SMALL) {
+ spin_unlock_irqrestore_rcu_node(ssp, flags);
+ return;
+ }
+ __srcu_transition_to_big(ssp);
+ spin_unlock_irqrestore_rcu_node(ssp, flags);
+}
+
+/*
+ * Check to see if the just-encountered contention event justifies
+ * a transition to SRCU_SIZE_BIG.
+ */
+static void spin_lock_irqsave_check_contention(struct srcu_struct *ssp)
+{
+ unsigned long j;
+
+ if (!SRCU_SIZING_IS_CONTEND() || ssp->srcu_size_state)
+ return;
+ j = jiffies;
+ if (ssp->srcu_size_jiffies != j) {
+ ssp->srcu_size_jiffies = j;
+ ssp->srcu_n_lock_retries = 0;
+ }
+ if (++ssp->srcu_n_lock_retries <= small_contention_lim)
+ return;
+ __srcu_transition_to_big(ssp);
+}
+
+/*
+ * Acquire the specified srcu_data structure's ->lock, but check for
+ * excessive contention, which results in initiation of a transition
+ * to SRCU_SIZE_BIG. But only if the srcutree.convert_to_big module
+ * parameter permits this.
+ */
+static void spin_lock_irqsave_sdp_contention(struct srcu_data *sdp, unsigned long *flags)
+{
+ struct srcu_struct *ssp = sdp->ssp;
+
+ if (spin_trylock_irqsave_rcu_node(sdp, *flags))
+ return;
+ spin_lock_irqsave_rcu_node(ssp, *flags);
+ spin_lock_irqsave_check_contention(ssp);
+ spin_unlock_irqrestore_rcu_node(ssp, *flags);
+ spin_lock_irqsave_rcu_node(sdp, *flags);
+}
+
+/*
+ * Acquire the specified srcu_struct structure's ->lock, but check for
+ * excessive contention, which results in initiation of a transition
+ * to SRCU_SIZE_BIG. But only if the srcutree.convert_to_big module
+ * parameter permits this.
+ */
+static void spin_lock_irqsave_ssp_contention(struct srcu_struct *ssp, unsigned long *flags)
+{
+ if (spin_trylock_irqsave_rcu_node(ssp, *flags))
+ return;
+ spin_lock_irqsave_rcu_node(ssp, *flags);
+ spin_lock_irqsave_check_contention(ssp);
+}
+
+/*
* First-use initialization of statically allocated srcu_struct
* structure. Wiring up the combining tree is more than can be
* done with compile-time initialization, so this check is added
@@ -343,7 +511,10 @@ static bool srcu_readers_active(struct srcu_struct *ssp)
return sum;
}
-#define SRCU_INTERVAL 1
+#define SRCU_INTERVAL 1 // Base delay if no expedited GPs pending.
+#define SRCU_MAX_INTERVAL 10 // Maximum incremental delay from slow readers.
+#define SRCU_MAX_NODELAY_PHASE 1 // Maximum per-GP-phase consecutive no-delay instances.
+#define SRCU_MAX_NODELAY 100 // Maximum consecutive no-delay instances.
/*
* Return grace-period delay, zero if there are expedited grace
@@ -351,10 +522,18 @@ static bool srcu_readers_active(struct srcu_struct *ssp)
*/
static unsigned long srcu_get_delay(struct srcu_struct *ssp)
{
- if (ULONG_CMP_LT(READ_ONCE(ssp->srcu_gp_seq),
- READ_ONCE(ssp->srcu_gp_seq_needed_exp)))
- return 0;
- return SRCU_INTERVAL;
+ unsigned long jbase = SRCU_INTERVAL;
+
+ if (ULONG_CMP_LT(READ_ONCE(ssp->srcu_gp_seq), READ_ONCE(ssp->srcu_gp_seq_needed_exp)))
+ jbase = 0;
+ if (rcu_seq_state(READ_ONCE(ssp->srcu_gp_seq)))
+ jbase += jiffies - READ_ONCE(ssp->srcu_gp_start);
+ if (!jbase) {
+ WRITE_ONCE(ssp->srcu_n_exp_nodelay, READ_ONCE(ssp->srcu_n_exp_nodelay) + 1);
+ if (READ_ONCE(ssp->srcu_n_exp_nodelay) > SRCU_MAX_NODELAY_PHASE)
+ jbase = 1;
+ }
+ return jbase > SRCU_MAX_INTERVAL ? SRCU_MAX_INTERVAL : jbase;
}
/**
@@ -382,13 +561,20 @@ void cleanup_srcu_struct(struct srcu_struct *ssp)
return; /* Forgot srcu_barrier(), so just leak it! */
}
if (WARN_ON(rcu_seq_state(READ_ONCE(ssp->srcu_gp_seq)) != SRCU_STATE_IDLE) ||
+ WARN_ON(rcu_seq_current(&ssp->srcu_gp_seq) != ssp->srcu_gp_seq_needed) ||
WARN_ON(srcu_readers_active(ssp))) {
- pr_info("%s: Active srcu_struct %p state: %d\n",
- __func__, ssp, rcu_seq_state(READ_ONCE(ssp->srcu_gp_seq)));
+ pr_info("%s: Active srcu_struct %p read state: %d gp state: %lu/%lu\n",
+ __func__, ssp, rcu_seq_state(READ_ONCE(ssp->srcu_gp_seq)),
+ rcu_seq_current(&ssp->srcu_gp_seq), ssp->srcu_gp_seq_needed);
return; /* Caller forgot to stop doing call_srcu()? */
}
- free_percpu(ssp->sda);
- ssp->sda = NULL;
+ if (!ssp->sda_is_static) {
+ free_percpu(ssp->sda);
+ ssp->sda = NULL;
+ }
+ kfree(ssp->node);
+ ssp->node = NULL;
+ ssp->srcu_size_state = SRCU_SIZE_SMALL;
}
EXPORT_SYMBOL_GPL(cleanup_srcu_struct);
@@ -434,9 +620,13 @@ EXPORT_SYMBOL_GPL(__srcu_read_unlock);
*/
static void srcu_gp_start(struct srcu_struct *ssp)
{
- struct srcu_data *sdp = this_cpu_ptr(ssp->sda);
+ struct srcu_data *sdp;
int state;
+ if (smp_load_acquire(&ssp->srcu_size_state) < SRCU_SIZE_WAIT_BARRIER)
+ sdp = per_cpu_ptr(ssp->sda, 0);
+ else
+ sdp = this_cpu_ptr(ssp->sda);
lockdep_assert_held(&ACCESS_PRIVATE(ssp, lock));
WARN_ON_ONCE(ULONG_CMP_GE(ssp->srcu_gp_seq, ssp->srcu_gp_seq_needed));
spin_lock_rcu_node(sdp); /* Interrupts already disabled. */
@@ -445,6 +635,8 @@ static void srcu_gp_start(struct srcu_struct *ssp)
(void)rcu_segcblist_accelerate(&sdp->srcu_cblist,
rcu_seq_snap(&ssp->srcu_gp_seq));
spin_unlock_rcu_node(sdp); /* Interrupts remain disabled. */
+ WRITE_ONCE(ssp->srcu_gp_start, jiffies);
+ WRITE_ONCE(ssp->srcu_n_exp_nodelay, 0);
smp_mb(); /* Order prior store to ->srcu_gp_seq_needed vs. GP start. */
rcu_seq_start(&ssp->srcu_gp_seq);
state = rcu_seq_state(ssp->srcu_gp_seq);
@@ -517,7 +709,9 @@ static void srcu_gp_end(struct srcu_struct *ssp)
int idx;
unsigned long mask;
struct srcu_data *sdp;
+ unsigned long sgsne;
struct srcu_node *snp;
+ int ss_state;
/* Prevent more than one additional grace period. */
mutex_lock(&ssp->srcu_cb_mutex);
@@ -526,7 +720,7 @@ static void srcu_gp_end(struct srcu_struct *ssp)
spin_lock_irq_rcu_node(ssp);
idx = rcu_seq_state(ssp->srcu_gp_seq);
WARN_ON_ONCE(idx != SRCU_STATE_SCAN2);
- cbdelay = srcu_get_delay(ssp);
+ cbdelay = !!srcu_get_delay(ssp);
WRITE_ONCE(ssp->srcu_last_gp_end, ktime_get_mono_fast_ns());
rcu_seq_end(&ssp->srcu_gp_seq);
gpseq = rcu_seq_current(&ssp->srcu_gp_seq);
@@ -537,38 +731,45 @@ static void srcu_gp_end(struct srcu_struct *ssp)
/* A new grace period can start at this point. But only one. */
/* Initiate callback invocation as needed. */
- idx = rcu_seq_ctr(gpseq) % ARRAY_SIZE(snp->srcu_have_cbs);
- srcu_for_each_node_breadth_first(ssp, snp) {
- spin_lock_irq_rcu_node(snp);
- cbs = false;
- last_lvl = snp >= ssp->level[rcu_num_lvls - 1];
- if (last_lvl)
- cbs = snp->srcu_have_cbs[idx] == gpseq;
- snp->srcu_have_cbs[idx] = gpseq;
- rcu_seq_set_state(&snp->srcu_have_cbs[idx], 1);
- if (ULONG_CMP_LT(snp->srcu_gp_seq_needed_exp, gpseq))
- WRITE_ONCE(snp->srcu_gp_seq_needed_exp, gpseq);
- mask = snp->srcu_data_have_cbs[idx];
- snp->srcu_data_have_cbs[idx] = 0;
- spin_unlock_irq_rcu_node(snp);
- if (cbs)
- srcu_schedule_cbs_snp(ssp, snp, mask, cbdelay);
-
- /* Occasionally prevent srcu_data counter wrap. */
- if (!(gpseq & counter_wrap_check) && last_lvl)
- for (cpu = snp->grplo; cpu <= snp->grphi; cpu++) {
- sdp = per_cpu_ptr(ssp->sda, cpu);
- spin_lock_irqsave_rcu_node(sdp, flags);
- if (ULONG_CMP_GE(gpseq,
- sdp->srcu_gp_seq_needed + 100))
- sdp->srcu_gp_seq_needed = gpseq;
- if (ULONG_CMP_GE(gpseq,
- sdp->srcu_gp_seq_needed_exp + 100))
- sdp->srcu_gp_seq_needed_exp = gpseq;
- spin_unlock_irqrestore_rcu_node(sdp, flags);
- }
+ ss_state = smp_load_acquire(&ssp->srcu_size_state);
+ if (ss_state < SRCU_SIZE_WAIT_BARRIER) {
+ srcu_schedule_cbs_sdp(per_cpu_ptr(ssp->sda, 0), cbdelay);
+ } else {
+ idx = rcu_seq_ctr(gpseq) % ARRAY_SIZE(snp->srcu_have_cbs);
+ srcu_for_each_node_breadth_first(ssp, snp) {
+ spin_lock_irq_rcu_node(snp);
+ cbs = false;
+ last_lvl = snp >= ssp->level[rcu_num_lvls - 1];
+ if (last_lvl)
+ cbs = ss_state < SRCU_SIZE_BIG || snp->srcu_have_cbs[idx] == gpseq;
+ snp->srcu_have_cbs[idx] = gpseq;
+ rcu_seq_set_state(&snp->srcu_have_cbs[idx], 1);
+ sgsne = snp->srcu_gp_seq_needed_exp;
+ if (srcu_invl_snp_seq(sgsne) || ULONG_CMP_LT(sgsne, gpseq))
+ WRITE_ONCE(snp->srcu_gp_seq_needed_exp, gpseq);
+ if (ss_state < SRCU_SIZE_BIG)
+ mask = ~0;
+ else
+ mask = snp->srcu_data_have_cbs[idx];
+ snp->srcu_data_have_cbs[idx] = 0;
+ spin_unlock_irq_rcu_node(snp);
+ if (cbs)
+ srcu_schedule_cbs_snp(ssp, snp, mask, cbdelay);
+ }
}
+ /* Occasionally prevent srcu_data counter wrap. */
+ if (!(gpseq & counter_wrap_check))
+ for_each_possible_cpu(cpu) {
+ sdp = per_cpu_ptr(ssp->sda, cpu);
+ spin_lock_irqsave_rcu_node(sdp, flags);
+ if (ULONG_CMP_GE(gpseq, sdp->srcu_gp_seq_needed + 100))
+ sdp->srcu_gp_seq_needed = gpseq;
+ if (ULONG_CMP_GE(gpseq, sdp->srcu_gp_seq_needed_exp + 100))
+ sdp->srcu_gp_seq_needed_exp = gpseq;
+ spin_unlock_irqrestore_rcu_node(sdp, flags);
+ }
+
/* Callback initiation done, allow grace periods after next. */
mutex_unlock(&ssp->srcu_cb_mutex);
@@ -583,6 +784,14 @@ static void srcu_gp_end(struct srcu_struct *ssp)
} else {
spin_unlock_irq_rcu_node(ssp);
}
+
+ /* Transition to big if needed. */
+ if (ss_state != SRCU_SIZE_SMALL && ss_state != SRCU_SIZE_BIG) {
+ if (ss_state == SRCU_SIZE_ALLOC)
+ init_srcu_struct_nodes(ssp, GFP_KERNEL);
+ else
+ smp_store_release(&ssp->srcu_size_state, ss_state + 1);
+ }
}
/*
@@ -596,20 +805,24 @@ static void srcu_funnel_exp_start(struct srcu_struct *ssp, struct srcu_node *snp
unsigned long s)
{
unsigned long flags;
+ unsigned long sgsne;
- for (; snp != NULL; snp = snp->srcu_parent) {
- if (rcu_seq_done(&ssp->srcu_gp_seq, s) ||
- ULONG_CMP_GE(READ_ONCE(snp->srcu_gp_seq_needed_exp), s))
- return;
- spin_lock_irqsave_rcu_node(snp, flags);
- if (ULONG_CMP_GE(snp->srcu_gp_seq_needed_exp, s)) {
+ if (snp)
+ for (; snp != NULL; snp = snp->srcu_parent) {
+ sgsne = READ_ONCE(snp->srcu_gp_seq_needed_exp);
+ if (rcu_seq_done(&ssp->srcu_gp_seq, s) ||
+ (!srcu_invl_snp_seq(sgsne) && ULONG_CMP_GE(sgsne, s)))
+ return;
+ spin_lock_irqsave_rcu_node(snp, flags);
+ sgsne = snp->srcu_gp_seq_needed_exp;
+ if (!srcu_invl_snp_seq(sgsne) && ULONG_CMP_GE(sgsne, s)) {
+ spin_unlock_irqrestore_rcu_node(snp, flags);
+ return;
+ }
+ WRITE_ONCE(snp->srcu_gp_seq_needed_exp, s);
spin_unlock_irqrestore_rcu_node(snp, flags);
- return;
}
- WRITE_ONCE(snp->srcu_gp_seq_needed_exp, s);
- spin_unlock_irqrestore_rcu_node(snp, flags);
- }
- spin_lock_irqsave_rcu_node(ssp, flags);
+ spin_lock_irqsave_ssp_contention(ssp, &flags);
if (ULONG_CMP_LT(ssp->srcu_gp_seq_needed_exp, s))
WRITE_ONCE(ssp->srcu_gp_seq_needed_exp, s);
spin_unlock_irqrestore_rcu_node(ssp, flags);
@@ -630,39 +843,47 @@ static void srcu_funnel_gp_start(struct srcu_struct *ssp, struct srcu_data *sdp,
{
unsigned long flags;
int idx = rcu_seq_ctr(s) % ARRAY_SIZE(sdp->mynode->srcu_have_cbs);
- struct srcu_node *snp = sdp->mynode;
+ unsigned long sgsne;
+ struct srcu_node *snp;
+ struct srcu_node *snp_leaf;
unsigned long snp_seq;
- /* Each pass through the loop does one level of the srcu_node tree. */
- for (; snp != NULL; snp = snp->srcu_parent) {
- if (rcu_seq_done(&ssp->srcu_gp_seq, s) && snp != sdp->mynode)
- return; /* GP already done and CBs recorded. */
- spin_lock_irqsave_rcu_node(snp, flags);
- if (ULONG_CMP_GE(snp->srcu_have_cbs[idx], s)) {
+ /* Ensure that snp node tree is fully initialized before traversing it */
+ if (smp_load_acquire(&ssp->srcu_size_state) < SRCU_SIZE_WAIT_BARRIER)
+ snp_leaf = NULL;
+ else
+ snp_leaf = sdp->mynode;
+
+ if (snp_leaf)
+ /* Each pass through the loop does one level of the srcu_node tree. */
+ for (snp = snp_leaf; snp != NULL; snp = snp->srcu_parent) {
+ if (rcu_seq_done(&ssp->srcu_gp_seq, s) && snp != snp_leaf)
+ return; /* GP already done and CBs recorded. */
+ spin_lock_irqsave_rcu_node(snp, flags);
snp_seq = snp->srcu_have_cbs[idx];
- if (snp == sdp->mynode && snp_seq == s)
- snp->srcu_data_have_cbs[idx] |= sdp->grpmask;
- spin_unlock_irqrestore_rcu_node(snp, flags);
- if (snp == sdp->mynode && snp_seq != s) {
- srcu_schedule_cbs_sdp(sdp, do_norm
- ? SRCU_INTERVAL
- : 0);
+ if (!srcu_invl_snp_seq(snp_seq) && ULONG_CMP_GE(snp_seq, s)) {
+ if (snp == snp_leaf && snp_seq == s)
+ snp->srcu_data_have_cbs[idx] |= sdp->grpmask;
+ spin_unlock_irqrestore_rcu_node(snp, flags);
+ if (snp == snp_leaf && snp_seq != s) {
+ srcu_schedule_cbs_sdp(sdp, do_norm ? SRCU_INTERVAL : 0);
+ return;
+ }
+ if (!do_norm)
+ srcu_funnel_exp_start(ssp, snp, s);
return;
}
- if (!do_norm)
- srcu_funnel_exp_start(ssp, snp, s);
- return;
+ snp->srcu_have_cbs[idx] = s;
+ if (snp == snp_leaf)
+ snp->srcu_data_have_cbs[idx] |= sdp->grpmask;
+ sgsne = snp->srcu_gp_seq_needed_exp;
+ if (!do_norm && (srcu_invl_snp_seq(sgsne) || ULONG_CMP_LT(sgsne, s)))
+ WRITE_ONCE(snp->srcu_gp_seq_needed_exp, s);
+ spin_unlock_irqrestore_rcu_node(snp, flags);
}
- snp->srcu_have_cbs[idx] = s;
- if (snp == sdp->mynode)
- snp->srcu_data_have_cbs[idx] |= sdp->grpmask;
- if (!do_norm && ULONG_CMP_LT(snp->srcu_gp_seq_needed_exp, s))
- WRITE_ONCE(snp->srcu_gp_seq_needed_exp, s);
- spin_unlock_irqrestore_rcu_node(snp, flags);
- }
/* Top of tree, must ensure the grace period will be started. */
- spin_lock_irqsave_rcu_node(ssp, flags);
+ spin_lock_irqsave_ssp_contention(ssp, &flags);
if (ULONG_CMP_LT(ssp->srcu_gp_seq_needed, s)) {
/*
* Record need for grace period s. Pair with load
@@ -678,9 +899,15 @@ static void srcu_funnel_gp_start(struct srcu_struct *ssp, struct srcu_data *sdp,
rcu_seq_state(ssp->srcu_gp_seq) == SRCU_STATE_IDLE) {
WARN_ON_ONCE(ULONG_CMP_GE(ssp->srcu_gp_seq, ssp->srcu_gp_seq_needed));
srcu_gp_start(ssp);
+
+ // And how can that list_add() in the "else" clause
+ // possibly be safe for concurrent execution? Well,
+ // it isn't. And it does not have to be. After all, it
+ // can only be executed during early boot when there is only
+ // the one boot CPU running with interrupts still disabled.
if (likely(srcu_init_done))
queue_delayed_work(rcu_gp_wq, &ssp->work,
- srcu_get_delay(ssp));
+ !!srcu_get_delay(ssp));
else if (list_empty(&ssp->work.work.entry))
list_add(&ssp->work.work.entry, &srcu_boot_list);
}
@@ -814,11 +1041,17 @@ static unsigned long srcu_gp_start_if_needed(struct srcu_struct *ssp,
bool needgp = false;
unsigned long s;
struct srcu_data *sdp;
+ struct srcu_node *sdp_mynode;
+ int ss_state;
check_init_srcu_struct(ssp);
idx = srcu_read_lock(ssp);
- sdp = raw_cpu_ptr(ssp->sda);
- spin_lock_irqsave_rcu_node(sdp, flags);
+ ss_state = smp_load_acquire(&ssp->srcu_size_state);
+ if (ss_state < SRCU_SIZE_WAIT_CALL)
+ sdp = per_cpu_ptr(ssp->sda, 0);
+ else
+ sdp = raw_cpu_ptr(ssp->sda);
+ spin_lock_irqsave_sdp_contention(sdp, &flags);
if (rhp)
rcu_segcblist_enqueue(&sdp->srcu_cblist, rhp);
rcu_segcblist_advance(&sdp->srcu_cblist,
@@ -834,10 +1067,17 @@ static unsigned long srcu_gp_start_if_needed(struct srcu_struct *ssp,
needexp = true;
}
spin_unlock_irqrestore_rcu_node(sdp, flags);
+
+ /* Ensure that snp node tree is fully initialized before traversing it */
+ if (ss_state < SRCU_SIZE_WAIT_BARRIER)
+ sdp_mynode = NULL;
+ else
+ sdp_mynode = sdp->mynode;
+
if (needgp)
srcu_funnel_gp_start(ssp, sdp, s, do_norm);
else if (needexp)
- srcu_funnel_exp_start(ssp, sdp->mynode, s);
+ srcu_funnel_exp_start(ssp, sdp_mynode, s);
srcu_read_unlock(ssp, idx);
return s;
}
@@ -1097,6 +1337,28 @@ static void srcu_barrier_cb(struct rcu_head *rhp)
complete(&ssp->srcu_barrier_completion);
}
+/*
+ * Enqueue an srcu_barrier() callback on the specified srcu_data
+ * structure's ->cblist. but only if that ->cblist already has at least one
+ * callback enqueued. Note that if a CPU already has callbacks enqueue,
+ * it must have already registered the need for a future grace period,
+ * so all we need do is enqueue a callback that will use the same grace
+ * period as the last callback already in the queue.
+ */
+static void srcu_barrier_one_cpu(struct srcu_struct *ssp, struct srcu_data *sdp)
+{
+ spin_lock_irq_rcu_node(sdp);
+ atomic_inc(&ssp->srcu_barrier_cpu_cnt);
+ sdp->srcu_barrier_head.func = srcu_barrier_cb;
+ debug_rcu_head_queue(&sdp->srcu_barrier_head);
+ if (!rcu_segcblist_entrain(&sdp->srcu_cblist,
+ &sdp->srcu_barrier_head)) {
+ debug_rcu_head_unqueue(&sdp->srcu_barrier_head);
+ atomic_dec(&ssp->srcu_barrier_cpu_cnt);
+ }
+ spin_unlock_irq_rcu_node(sdp);
+}
+
/**
* srcu_barrier - Wait until all in-flight call_srcu() callbacks complete.
* @ssp: srcu_struct on which to wait for in-flight callbacks.
@@ -1104,7 +1366,7 @@ static void srcu_barrier_cb(struct rcu_head *rhp)
void srcu_barrier(struct srcu_struct *ssp)
{
int cpu;
- struct srcu_data *sdp;
+ int idx;
unsigned long s = rcu_seq_snap(&ssp->srcu_barrier_seq);
check_init_srcu_struct(ssp);
@@ -1120,27 +1382,13 @@ void srcu_barrier(struct srcu_struct *ssp)
/* Initial count prevents reaching zero until all CBs are posted. */
atomic_set(&ssp->srcu_barrier_cpu_cnt, 1);
- /*
- * Each pass through this loop enqueues a callback, but only
- * on CPUs already having callbacks enqueued. Note that if
- * a CPU already has callbacks enqueue, it must have already
- * registered the need for a future grace period, so all we
- * need do is enqueue a callback that will use the same
- * grace period as the last callback already in the queue.
- */
- for_each_possible_cpu(cpu) {
- sdp = per_cpu_ptr(ssp->sda, cpu);
- spin_lock_irq_rcu_node(sdp);
- atomic_inc(&ssp->srcu_barrier_cpu_cnt);
- sdp->srcu_barrier_head.func = srcu_barrier_cb;
- debug_rcu_head_queue(&sdp->srcu_barrier_head);
- if (!rcu_segcblist_entrain(&sdp->srcu_cblist,
- &sdp->srcu_barrier_head)) {
- debug_rcu_head_unqueue(&sdp->srcu_barrier_head);
- atomic_dec(&ssp->srcu_barrier_cpu_cnt);
- }
- spin_unlock_irq_rcu_node(sdp);
- }
+ idx = srcu_read_lock(ssp);
+ if (smp_load_acquire(&ssp->srcu_size_state) < SRCU_SIZE_WAIT_BARRIER)
+ srcu_barrier_one_cpu(ssp, per_cpu_ptr(ssp->sda, 0));
+ else
+ for_each_possible_cpu(cpu)
+ srcu_barrier_one_cpu(ssp, per_cpu_ptr(ssp->sda, cpu));
+ srcu_read_unlock(ssp, idx);
/* Remove the initial count, at which point reaching zero can happen. */
if (atomic_dec_and_test(&ssp->srcu_barrier_cpu_cnt))
@@ -1214,6 +1462,7 @@ static void srcu_advance_state(struct srcu_struct *ssp)
srcu_flip(ssp);
spin_lock_irq_rcu_node(ssp);
rcu_seq_set_state(&ssp->srcu_gp_seq, SRCU_STATE_SCAN2);
+ ssp->srcu_n_exp_nodelay = 0;
spin_unlock_irq_rcu_node(ssp);
}
@@ -1228,6 +1477,7 @@ static void srcu_advance_state(struct srcu_struct *ssp)
mutex_unlock(&ssp->srcu_gp_mutex);
return; /* readers present, retry later. */
}
+ ssp->srcu_n_exp_nodelay = 0;
srcu_gp_end(ssp); /* Releases ->srcu_gp_mutex. */
}
}
@@ -1318,12 +1568,28 @@ static void srcu_reschedule(struct srcu_struct *ssp, unsigned long delay)
*/
static void process_srcu(struct work_struct *work)
{
+ unsigned long curdelay;
+ unsigned long j;
struct srcu_struct *ssp;
ssp = container_of(work, struct srcu_struct, work.work);
srcu_advance_state(ssp);
- srcu_reschedule(ssp, srcu_get_delay(ssp));
+ curdelay = srcu_get_delay(ssp);
+ if (curdelay) {
+ WRITE_ONCE(ssp->reschedule_count, 0);
+ } else {
+ j = jiffies;
+ if (READ_ONCE(ssp->reschedule_jiffies) == j) {
+ WRITE_ONCE(ssp->reschedule_count, READ_ONCE(ssp->reschedule_count) + 1);
+ if (READ_ONCE(ssp->reschedule_count) > SRCU_MAX_NODELAY)
+ curdelay = 1;
+ } else {
+ WRITE_ONCE(ssp->reschedule_count, 1);
+ WRITE_ONCE(ssp->reschedule_jiffies, j);
+ }
+ }
+ srcu_reschedule(ssp, curdelay);
}
void srcutorture_get_gp_data(enum rcutorture_type test_type,
@@ -1337,43 +1603,69 @@ void srcutorture_get_gp_data(enum rcutorture_type test_type,
}
EXPORT_SYMBOL_GPL(srcutorture_get_gp_data);
+static const char * const srcu_size_state_name[] = {
+ "SRCU_SIZE_SMALL",
+ "SRCU_SIZE_ALLOC",
+ "SRCU_SIZE_WAIT_BARRIER",
+ "SRCU_SIZE_WAIT_CALL",
+ "SRCU_SIZE_WAIT_CBS1",
+ "SRCU_SIZE_WAIT_CBS2",
+ "SRCU_SIZE_WAIT_CBS3",
+ "SRCU_SIZE_WAIT_CBS4",
+ "SRCU_SIZE_BIG",
+ "SRCU_SIZE_???",
+};
+
void srcu_torture_stats_print(struct srcu_struct *ssp, char *tt, char *tf)
{
int cpu;
int idx;
unsigned long s0 = 0, s1 = 0;
+ int ss_state = READ_ONCE(ssp->srcu_size_state);
+ int ss_state_idx = ss_state;
idx = ssp->srcu_idx & 0x1;
- pr_alert("%s%s Tree SRCU g%ld per-CPU(idx=%d):",
- tt, tf, rcu_seq_current(&ssp->srcu_gp_seq), idx);
- for_each_possible_cpu(cpu) {
- unsigned long l0, l1;
- unsigned long u0, u1;
- long c0, c1;
- struct srcu_data *sdp;
-
- sdp = per_cpu_ptr(ssp->sda, cpu);
- u0 = data_race(sdp->srcu_unlock_count[!idx]);
- u1 = data_race(sdp->srcu_unlock_count[idx]);
-
- /*
- * Make sure that a lock is always counted if the corresponding
- * unlock is counted.
- */
- smp_rmb();
-
- l0 = data_race(sdp->srcu_lock_count[!idx]);
- l1 = data_race(sdp->srcu_lock_count[idx]);
-
- c0 = l0 - u0;
- c1 = l1 - u1;
- pr_cont(" %d(%ld,%ld %c)",
- cpu, c0, c1,
- "C."[rcu_segcblist_empty(&sdp->srcu_cblist)]);
- s0 += c0;
- s1 += c1;
+ if (ss_state < 0 || ss_state >= ARRAY_SIZE(srcu_size_state_name))
+ ss_state_idx = ARRAY_SIZE(srcu_size_state_name) - 1;
+ pr_alert("%s%s Tree SRCU g%ld state %d (%s)",
+ tt, tf, rcu_seq_current(&ssp->srcu_gp_seq), ss_state,
+ srcu_size_state_name[ss_state_idx]);
+ if (!ssp->sda) {
+ // Called after cleanup_srcu_struct(), perhaps.
+ pr_cont(" No per-CPU srcu_data structures (->sda == NULL).\n");
+ } else {
+ pr_cont(" per-CPU(idx=%d):", idx);
+ for_each_possible_cpu(cpu) {
+ unsigned long l0, l1;
+ unsigned long u0, u1;
+ long c0, c1;
+ struct srcu_data *sdp;
+
+ sdp = per_cpu_ptr(ssp->sda, cpu);
+ u0 = data_race(sdp->srcu_unlock_count[!idx]);
+ u1 = data_race(sdp->srcu_unlock_count[idx]);
+
+ /*
+ * Make sure that a lock is always counted if the corresponding
+ * unlock is counted.
+ */
+ smp_rmb();
+
+ l0 = data_race(sdp->srcu_lock_count[!idx]);
+ l1 = data_race(sdp->srcu_lock_count[idx]);
+
+ c0 = l0 - u0;
+ c1 = l1 - u1;
+ pr_cont(" %d(%ld,%ld %c)",
+ cpu, c0, c1,
+ "C."[rcu_segcblist_empty(&sdp->srcu_cblist)]);
+ s0 += c0;
+ s1 += c1;
+ }
+ pr_cont(" T(%ld,%ld)\n", s0, s1);
}
- pr_cont(" T(%ld,%ld)\n", s0, s1);
+ if (SRCU_SIZING_IS_TORTURE())
+ srcu_transition_to_big(ssp);
}
EXPORT_SYMBOL_GPL(srcu_torture_stats_print);
@@ -1390,6 +1682,17 @@ void __init srcu_init(void)
{
struct srcu_struct *ssp;
+ /* Decide on srcu_struct-size strategy. */
+ if (SRCU_SIZING_IS(SRCU_SIZING_AUTO)) {
+ if (nr_cpu_ids >= big_cpu_lim) {
+ convert_to_big = SRCU_SIZING_INIT; // Don't bother waiting for contention.
+ pr_info("%s: Setting srcu_struct sizes to big.\n", __func__);
+ } else {
+ convert_to_big = SRCU_SIZING_NONE | SRCU_SIZING_CONTEND;
+ pr_info("%s: Setting srcu_struct sizes based on contention.\n", __func__);
+ }
+ }
+
/*
* Once that is set, call_srcu() can follow the normal path and
* queue delayed work. This must follow RCU workqueues creation
@@ -1400,6 +1703,8 @@ void __init srcu_init(void)
ssp = list_first_entry(&srcu_boot_list, struct srcu_struct,
work.work.entry);
list_del_init(&ssp->work.work.entry);
+ if (SRCU_SIZING_IS(SRCU_SIZING_INIT) && ssp->srcu_size_state == SRCU_SIZE_SMALL)
+ ssp->srcu_size_state = SRCU_SIZE_ALLOC;
queue_work(rcu_gp_wq, &ssp->work.work);
}
}
diff --git a/kernel/rcu/sync.c b/kernel/rcu/sync.c
index 33d896d85902..5cefc702158f 100644
--- a/kernel/rcu/sync.c
+++ b/kernel/rcu/sync.c
@@ -111,7 +111,7 @@ static void rcu_sync_func(struct rcu_head *rhp)
* a slowpath during the update. After this function returns, all
* subsequent calls to rcu_sync_is_idle() will return false, which
* tells readers to stay off their fastpaths. A later call to
- * rcu_sync_exit() re-enables reader slowpaths.
+ * rcu_sync_exit() re-enables reader fastpaths.
*
* When called in isolation, rcu_sync_enter() must wait for a grace
* period, however, closely spaced calls to rcu_sync_enter() can
diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h
index 99cf3a13954c..3925e32159b5 100644
--- a/kernel/rcu/tasks.h
+++ b/kernel/rcu/tasks.h
@@ -46,7 +46,7 @@ struct rcu_tasks_percpu {
/**
* struct rcu_tasks - Definition for a Tasks-RCU-like mechanism.
- * @cbs_wq: Wait queue allowing new callback to get kthread's attention.
+ * @cbs_wait: RCU wait allowing a new callback to get kthread's attention.
* @cbs_gbl_lock: Lock protecting callback list.
* @kthread_ptr: This flavor's grace-period/callback-invocation kthread.
* @gp_func: This flavor's grace-period-wait function.
@@ -77,7 +77,7 @@ struct rcu_tasks_percpu {
* @kname: This flavor's kthread name.
*/
struct rcu_tasks {
- struct wait_queue_head cbs_wq;
+ struct rcuwait cbs_wait;
raw_spinlock_t cbs_gbl_lock;
int gp_state;
int gp_sleep;
@@ -113,11 +113,11 @@ static void call_rcu_tasks_iw_wakeup(struct irq_work *iwp);
#define DEFINE_RCU_TASKS(rt_name, gp, call, n) \
static DEFINE_PER_CPU(struct rcu_tasks_percpu, rt_name ## __percpu) = { \
.lock = __RAW_SPIN_LOCK_UNLOCKED(rt_name ## __percpu.cbs_pcpu_lock), \
- .rtp_irq_work = IRQ_WORK_INIT(call_rcu_tasks_iw_wakeup), \
+ .rtp_irq_work = IRQ_WORK_INIT_HARD(call_rcu_tasks_iw_wakeup), \
}; \
static struct rcu_tasks rt_name = \
{ \
- .cbs_wq = __WAIT_QUEUE_HEAD_INITIALIZER(rt_name.cbs_wq), \
+ .cbs_wait = __RCUWAIT_INITIALIZER(rt_name.wait), \
.cbs_gbl_lock = __RAW_SPIN_LOCK_UNLOCKED(rt_name.cbs_gbl_lock), \
.gp_func = gp, \
.call_func = call, \
@@ -143,6 +143,11 @@ module_param(rcu_task_ipi_delay, int, 0644);
#define RCU_TASK_STALL_TIMEOUT (HZ * 60 * 10)
static int rcu_task_stall_timeout __read_mostly = RCU_TASK_STALL_TIMEOUT;
module_param(rcu_task_stall_timeout, int, 0644);
+#define RCU_TASK_STALL_INFO (HZ * 10)
+static int rcu_task_stall_info __read_mostly = RCU_TASK_STALL_INFO;
+module_param(rcu_task_stall_info, int, 0644);
+static int rcu_task_stall_info_mult __read_mostly = 3;
+module_param(rcu_task_stall_info_mult, int, 0444);
static int rcu_task_enqueue_lim __read_mostly = -1;
module_param(rcu_task_enqueue_lim, int, 0444);
@@ -261,14 +266,16 @@ static void call_rcu_tasks_iw_wakeup(struct irq_work *iwp)
struct rcu_tasks_percpu *rtpcp = container_of(iwp, struct rcu_tasks_percpu, rtp_irq_work);
rtp = rtpcp->rtpp;
- wake_up(&rtp->cbs_wq);
+ rcuwait_wake_up(&rtp->cbs_wait);
}
// Enqueue a callback for the specified flavor of Tasks RCU.
static void call_rcu_tasks_generic(struct rcu_head *rhp, rcu_callback_t func,
struct rcu_tasks *rtp)
{
+ int chosen_cpu;
unsigned long flags;
+ int ideal_cpu;
unsigned long j;
bool needadjust = false;
bool needwake;
@@ -278,8 +285,9 @@ static void call_rcu_tasks_generic(struct rcu_head *rhp, rcu_callback_t func,
rhp->func = func;
local_irq_save(flags);
rcu_read_lock();
- rtpcp = per_cpu_ptr(rtp->rtpcpu,
- smp_processor_id() >> READ_ONCE(rtp->percpu_enqueue_shift));
+ ideal_cpu = smp_processor_id() >> READ_ONCE(rtp->percpu_enqueue_shift);
+ chosen_cpu = cpumask_next(ideal_cpu - 1, cpu_possible_mask);
+ rtpcp = per_cpu_ptr(rtp->rtpcpu, chosen_cpu);
if (!raw_spin_trylock_rcu_node(rtpcp)) { // irqs already disabled.
raw_spin_lock_rcu_node(rtpcp); // irqs already disabled.
j = jiffies;
@@ -460,7 +468,7 @@ static void rcu_tasks_invoke_cbs(struct rcu_tasks *rtp, struct rcu_tasks_percpu
}
}
- if (rcu_segcblist_empty(&rtpcp->cblist))
+ if (rcu_segcblist_empty(&rtpcp->cblist) || !cpu_possible(cpu))
return;
raw_spin_lock_irqsave_rcu_node(rtpcp, flags);
rcu_segcblist_advance(&rtpcp->cblist, rcu_seq_current(&rtp->tasks_gp_seq));
@@ -509,7 +517,9 @@ static int __noreturn rcu_tasks_kthread(void *arg)
set_tasks_gp_state(rtp, RTGS_WAIT_CBS);
/* If there were none, wait a bit and start over. */
- wait_event_idle(rtp->cbs_wq, (needgpcb = rcu_tasks_need_gpcb(rtp)));
+ rcuwait_wait_event(&rtp->cbs_wait,
+ (needgpcb = rcu_tasks_need_gpcb(rtp)),
+ TASK_IDLE);
if (needgpcb & 0x2) {
// Wait for one grace period.
@@ -548,8 +558,15 @@ static void __init rcu_spawn_tasks_kthread_generic(struct rcu_tasks *rtp)
static void __init rcu_tasks_bootup_oddness(void)
{
#if defined(CONFIG_TASKS_RCU) || defined(CONFIG_TASKS_TRACE_RCU)
+ int rtsimc;
+
if (rcu_task_stall_timeout != RCU_TASK_STALL_TIMEOUT)
pr_info("\tTasks-RCU CPU stall warnings timeout set to %d (rcu_task_stall_timeout).\n", rcu_task_stall_timeout);
+ rtsimc = clamp(rcu_task_stall_info_mult, 1, 10);
+ if (rtsimc != rcu_task_stall_info_mult) {
+ pr_info("\tTasks-RCU CPU stall info multiplier clamped to %d (rcu_task_stall_info_mult).\n", rtsimc);
+ rcu_task_stall_info_mult = rtsimc;
+ }
#endif /* #ifdef CONFIG_TASKS_RCU */
#ifdef CONFIG_TASKS_RCU
pr_info("\tTrampoline variant of Tasks RCU enabled.\n");
@@ -568,7 +585,17 @@ static void __init rcu_tasks_bootup_oddness(void)
/* Dump out rcutorture-relevant state common to all RCU-tasks flavors. */
static void show_rcu_tasks_generic_gp_kthread(struct rcu_tasks *rtp, char *s)
{
- struct rcu_tasks_percpu *rtpcp = per_cpu_ptr(rtp->rtpcpu, 0); // for_each...
+ int cpu;
+ bool havecbs = false;
+
+ for_each_possible_cpu(cpu) {
+ struct rcu_tasks_percpu *rtpcp = per_cpu_ptr(rtp->rtpcpu, cpu);
+
+ if (!data_race(rcu_segcblist_empty(&rtpcp->cblist))) {
+ havecbs = true;
+ break;
+ }
+ }
pr_info("%s: %s(%d) since %lu g:%lu i:%lu/%lu %c%c %s\n",
rtp->kname,
tasks_gp_state_getname(rtp), data_race(rtp->gp_state),
@@ -576,7 +603,7 @@ static void show_rcu_tasks_generic_gp_kthread(struct rcu_tasks *rtp, char *s)
data_race(rcu_seq_current(&rtp->tasks_gp_seq)),
data_race(rtp->n_ipis_fails), data_race(rtp->n_ipis),
".k"[!!data_race(rtp->kthread_ptr)],
- ".C"[!data_race(rcu_segcblist_empty(&rtpcp->cblist))],
+ ".C"[havecbs],
s);
}
#endif // #ifndef CONFIG_TINY_RCU
@@ -592,10 +619,15 @@ static void exit_tasks_rcu_finish_trace(struct task_struct *t);
/* Wait for one RCU-tasks grace period. */
static void rcu_tasks_wait_gp(struct rcu_tasks *rtp)
{
- struct task_struct *g, *t;
- unsigned long lastreport;
- LIST_HEAD(holdouts);
+ struct task_struct *g;
int fract;
+ LIST_HEAD(holdouts);
+ unsigned long j;
+ unsigned long lastinfo;
+ unsigned long lastreport;
+ bool reported = false;
+ int rtsi;
+ struct task_struct *t;
set_tasks_gp_state(rtp, RTGS_PRE_WAIT_GP);
rtp->pregp_func();
@@ -621,30 +653,50 @@ static void rcu_tasks_wait_gp(struct rcu_tasks *rtp)
* is empty, we are done.
*/
lastreport = jiffies;
+ lastinfo = lastreport;
+ rtsi = READ_ONCE(rcu_task_stall_info);
// Start off with initial wait and slowly back off to 1 HZ wait.
fract = rtp->init_fract;
while (!list_empty(&holdouts)) {
+ ktime_t exp;
bool firstreport;
bool needreport;
int rtst;
- /* Slowly back off waiting for holdouts */
+ // Slowly back off waiting for holdouts
set_tasks_gp_state(rtp, RTGS_WAIT_SCAN_HOLDOUTS);
- schedule_timeout_idle(fract);
+ if (!IS_ENABLED(CONFIG_PREEMPT_RT)) {
+ schedule_timeout_idle(fract);
+ } else {
+ exp = jiffies_to_nsecs(fract);
+ __set_current_state(TASK_IDLE);
+ schedule_hrtimeout_range(&exp, jiffies_to_nsecs(HZ / 2), HRTIMER_MODE_REL_HARD);
+ }
if (fract < HZ)
fract++;
rtst = READ_ONCE(rcu_task_stall_timeout);
needreport = rtst > 0 && time_after(jiffies, lastreport + rtst);
- if (needreport)
+ if (needreport) {
lastreport = jiffies;
+ reported = true;
+ }
firstreport = true;
WARN_ON(signal_pending(current));
set_tasks_gp_state(rtp, RTGS_SCAN_HOLDOUTS);
rtp->holdouts_func(&holdouts, needreport, &firstreport);
+
+ // Print pre-stall informational messages if needed.
+ j = jiffies;
+ if (rtsi > 0 && !reported && time_after(j, lastinfo + rtsi)) {
+ lastinfo = j;
+ rtsi = rtsi * rcu_task_stall_info_mult;
+ pr_info("%s: %s grace period %lu is %lu jiffies old.\n",
+ __func__, rtp->kname, rtp->tasks_gp_seq, j - rtp->gp_start);
+ }
}
set_tasks_gp_state(rtp, RTGS_POST_GP);
@@ -950,6 +1002,9 @@ static void rcu_tasks_be_rude(struct work_struct *work)
// Wait for one rude RCU-tasks grace period.
static void rcu_tasks_rude_wait_gp(struct rcu_tasks *rtp)
{
+ if (num_online_cpus() <= 1)
+ return; // Fastpath for only one CPU.
+
rtp->n_ipis += cpumask_weight(cpu_online_mask);
schedule_on_each_cpu(rcu_tasks_be_rude);
}
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index a4b8189455d5..c25ba442044a 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -1679,6 +1679,8 @@ static bool __note_gp_changes(struct rcu_node *rnp, struct rcu_data *rdp)
rdp->gp_seq = rnp->gp_seq; /* Remember new grace-period state. */
if (ULONG_CMP_LT(rdp->gp_seq_needed, rnp->gp_seq_needed) || rdp->gpwrap)
WRITE_ONCE(rdp->gp_seq_needed, rnp->gp_seq_needed);
+ if (IS_ENABLED(CONFIG_PROVE_RCU) && READ_ONCE(rdp->gpwrap))
+ WRITE_ONCE(rdp->last_sched_clock, jiffies);
WRITE_ONCE(rdp->gpwrap, false);
rcu_gpnum_ovf(rnp, rdp);
return ret;
@@ -1705,11 +1707,37 @@ static void note_gp_changes(struct rcu_data *rdp)
rcu_gp_kthread_wake();
}
+static atomic_t *rcu_gp_slow_suppress;
+
+/* Register a counter to suppress debugging grace-period delays. */
+void rcu_gp_slow_register(atomic_t *rgssp)
+{
+ WARN_ON_ONCE(rcu_gp_slow_suppress);
+
+ WRITE_ONCE(rcu_gp_slow_suppress, rgssp);
+}
+EXPORT_SYMBOL_GPL(rcu_gp_slow_register);
+
+/* Unregister a counter, with NULL for not caring which. */
+void rcu_gp_slow_unregister(atomic_t *rgssp)
+{
+ WARN_ON_ONCE(rgssp && rgssp != rcu_gp_slow_suppress);
+
+ WRITE_ONCE(rcu_gp_slow_suppress, NULL);
+}
+EXPORT_SYMBOL_GPL(rcu_gp_slow_unregister);
+
+static bool rcu_gp_slow_is_suppressed(void)
+{
+ atomic_t *rgssp = READ_ONCE(rcu_gp_slow_suppress);
+
+ return rgssp && atomic_read(rgssp);
+}
+
static void rcu_gp_slow(int delay)
{
- if (delay > 0 &&
- !(rcu_seq_ctr(rcu_state.gp_seq) %
- (rcu_num_nodes * PER_RCU_NODE_PERIOD * delay)))
+ if (!rcu_gp_slow_is_suppressed() && delay > 0 &&
+ !(rcu_seq_ctr(rcu_state.gp_seq) % (rcu_num_nodes * PER_RCU_NODE_PERIOD * delay)))
schedule_timeout_idle(delay);
}
@@ -2096,14 +2124,29 @@ static noinline void rcu_gp_cleanup(void)
/* Advance CBs to reduce false positives below. */
offloaded = rcu_rdp_is_offloaded(rdp);
if ((offloaded || !rcu_accelerate_cbs(rnp, rdp)) && needgp) {
+
+ // We get here if a grace period was needed (“needgp”)
+ // and the above call to rcu_accelerate_cbs() did not set
+ // the RCU_GP_FLAG_INIT bit in ->gp_state (which records
+ // the need for another grace period).  The purpose
+ // of the “offloaded” check is to avoid invoking
+ // rcu_accelerate_cbs() on an offloaded CPU because we do not
+ // hold the ->nocb_lock needed to safely access an offloaded
+ // ->cblist.  We do not want to acquire that lock because
+ // it can be heavily contended during callback floods.
+
WRITE_ONCE(rcu_state.gp_flags, RCU_GP_FLAG_INIT);
WRITE_ONCE(rcu_state.gp_req_activity, jiffies);
- trace_rcu_grace_period(rcu_state.name,
- rcu_state.gp_seq,
- TPS("newreq"));
+ trace_rcu_grace_period(rcu_state.name, rcu_state.gp_seq, TPS("newreq"));
} else {
- WRITE_ONCE(rcu_state.gp_flags,
- rcu_state.gp_flags & RCU_GP_FLAG_INIT);
+
+ // We get here either if there is no need for an
+ // additional grace period or if rcu_accelerate_cbs() has
+ // already set the RCU_GP_FLAG_INIT bit in ->gp_flags. 
+ // So all we need to do is to clear all of the other
+ // ->gp_flags bits.
+
+ WRITE_ONCE(rcu_state.gp_flags, rcu_state.gp_flags & RCU_GP_FLAG_INIT);
}
raw_spin_unlock_irq_rcu_node(rnp);
@@ -2609,6 +2652,13 @@ static void rcu_do_batch(struct rcu_data *rdp)
*/
void rcu_sched_clock_irq(int user)
{
+ unsigned long j;
+
+ if (IS_ENABLED(CONFIG_PROVE_RCU)) {
+ j = jiffies;
+ WARN_ON_ONCE(time_before(j, __this_cpu_read(rcu_data.last_sched_clock)));
+ __this_cpu_write(rcu_data.last_sched_clock, j);
+ }
trace_rcu_utilization(TPS("Start scheduler-tick"));
lockdep_assert_irqs_disabled();
raw_cpu_inc(rcu_data.ticks_this_gp);
@@ -2624,6 +2674,8 @@ void rcu_sched_clock_irq(int user)
rcu_flavor_sched_clock_irq(user);
if (rcu_pending(user))
invoke_rcu_core();
+ if (user)
+ rcu_tasks_classic_qs(current, false);
lockdep_assert_irqs_disabled();
trace_rcu_utilization(TPS("End scheduler-tick"));
@@ -3717,7 +3769,9 @@ static int rcu_blocking_is_gp(void)
{
int ret;
- if (IS_ENABLED(CONFIG_PREEMPTION))
+ // Invoking preempt_model_*() too early gets a splat.
+ if (rcu_scheduler_active == RCU_SCHEDULER_INACTIVE ||
+ preempt_model_full() || preempt_model_rt())
return rcu_scheduler_active == RCU_SCHEDULER_INACTIVE;
might_sleep(); /* Check for RCU read-side critical section. */
preempt_disable();
@@ -4179,6 +4233,7 @@ rcu_boot_init_percpu_data(int cpu)
rdp->rcu_ofl_gp_flags = RCU_GP_CLEANED;
rdp->rcu_onl_gp_seq = rcu_state.gp_seq;
rdp->rcu_onl_gp_flags = RCU_GP_CLEANED;
+ rdp->last_sched_clock = jiffies;
rdp->cpu = cpu;
rcu_boot_init_nocb_percpu_data(rdp);
}
@@ -4471,6 +4526,51 @@ static int rcu_pm_notify(struct notifier_block *self,
return NOTIFY_OK;
}
+#ifdef CONFIG_RCU_EXP_KTHREAD
+struct kthread_worker *rcu_exp_gp_kworker;
+struct kthread_worker *rcu_exp_par_gp_kworker;
+
+static void __init rcu_start_exp_gp_kworkers(void)
+{
+ const char *par_gp_kworker_name = "rcu_exp_par_gp_kthread_worker";
+ const char *gp_kworker_name = "rcu_exp_gp_kthread_worker";
+ struct sched_param param = { .sched_priority = kthread_prio };
+
+ rcu_exp_gp_kworker = kthread_create_worker(0, gp_kworker_name);
+ if (IS_ERR_OR_NULL(rcu_exp_gp_kworker)) {
+ pr_err("Failed to create %s!\n", gp_kworker_name);
+ return;
+ }
+
+ rcu_exp_par_gp_kworker = kthread_create_worker(0, par_gp_kworker_name);
+ if (IS_ERR_OR_NULL(rcu_exp_par_gp_kworker)) {
+ pr_err("Failed to create %s!\n", par_gp_kworker_name);
+ kthread_destroy_worker(rcu_exp_gp_kworker);
+ return;
+ }
+
+ sched_setscheduler_nocheck(rcu_exp_gp_kworker->task, SCHED_FIFO, &param);
+ sched_setscheduler_nocheck(rcu_exp_par_gp_kworker->task, SCHED_FIFO,
+ &param);
+}
+
+static inline void rcu_alloc_par_gp_wq(void)
+{
+}
+#else /* !CONFIG_RCU_EXP_KTHREAD */
+struct workqueue_struct *rcu_par_gp_wq;
+
+static void __init rcu_start_exp_gp_kworkers(void)
+{
+}
+
+static inline void rcu_alloc_par_gp_wq(void)
+{
+ rcu_par_gp_wq = alloc_workqueue("rcu_par_gp", WQ_MEM_RECLAIM, 0);
+ WARN_ON(!rcu_par_gp_wq);
+}
+#endif /* CONFIG_RCU_EXP_KTHREAD */
+
/*
* Spawn the kthreads that handle RCU's grace periods.
*/
@@ -4480,6 +4580,7 @@ static int __init rcu_spawn_gp_kthread(void)
struct rcu_node *rnp;
struct sched_param sp;
struct task_struct *t;
+ struct rcu_data *rdp = this_cpu_ptr(&rcu_data);
rcu_scheduler_fully_active = 1;
t = kthread_create(rcu_gp_kthread, NULL, "%s", rcu_state.name);
@@ -4497,9 +4598,17 @@ static int __init rcu_spawn_gp_kthread(void)
smp_store_release(&rcu_state.gp_kthread, t); /* ^^^ */
raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
wake_up_process(t);
- rcu_spawn_nocb_kthreads();
- rcu_spawn_boost_kthreads();
+ /* This is a pre-SMP initcall, we expect a single CPU */
+ WARN_ON(num_online_cpus() > 1);
+ /*
+ * Those kthreads couldn't be created on rcu_init() -> rcutree_prepare_cpu()
+ * due to rcu_scheduler_fully_active.
+ */
+ rcu_spawn_cpu_nocb_kthread(smp_processor_id());
+ rcu_spawn_one_boost_kthread(rdp->mynode);
rcu_spawn_core_kthreads();
+ /* Create kthread worker for expedited GPs */
+ rcu_start_exp_gp_kworkers();
return 0;
}
early_initcall(rcu_spawn_gp_kthread);
@@ -4745,7 +4854,6 @@ static void __init rcu_dump_rcu_node_tree(void)
}
struct workqueue_struct *rcu_gp_wq;
-struct workqueue_struct *rcu_par_gp_wq;
static void __init kfree_rcu_batch_init(void)
{
@@ -4782,7 +4890,7 @@ static void __init kfree_rcu_batch_init(void)
void __init rcu_init(void)
{
- int cpu;
+ int cpu = smp_processor_id();
rcu_early_boot_tests();
@@ -4802,17 +4910,15 @@ void __init rcu_init(void)
* or the scheduler are operational.
*/
pm_notifier(rcu_pm_notify, 0);
- for_each_online_cpu(cpu) {
- rcutree_prepare_cpu(cpu);
- rcu_cpu_starting(cpu);
- rcutree_online_cpu(cpu);
- }
+ WARN_ON(num_online_cpus() > 1); // Only one CPU this early in boot.
+ rcutree_prepare_cpu(cpu);
+ rcu_cpu_starting(cpu);
+ rcutree_online_cpu(cpu);
/* Create workqueue for Tree SRCU and for expedited GPs. */
rcu_gp_wq = alloc_workqueue("rcu_gp", WQ_MEM_RECLAIM, 0);
WARN_ON(!rcu_gp_wq);
- rcu_par_gp_wq = alloc_workqueue("rcu_par_gp", WQ_MEM_RECLAIM, 0);
- WARN_ON(!rcu_par_gp_wq);
+ rcu_alloc_par_gp_wq();
/* Fill in default value for rcutree.qovld boot parameter. */
/* -After- the rcu_node ->lock fields are initialized! */
diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h
index 926673ebe355..2ccf5845957d 100644
--- a/kernel/rcu/tree.h
+++ b/kernel/rcu/tree.h
@@ -10,6 +10,7 @@
*/
#include <linux/cache.h>
+#include <linux/kthread.h>
#include <linux/spinlock.h>
#include <linux/rtmutex.h>
#include <linux/threads.h>
@@ -23,7 +24,11 @@
/* Communicate arguments to a workqueue handler. */
struct rcu_exp_work {
unsigned long rew_s;
+#ifdef CONFIG_RCU_EXP_KTHREAD
+ struct kthread_work rew_work;
+#else
struct work_struct rew_work;
+#endif /* CONFIG_RCU_EXP_KTHREAD */
};
/* RCU's kthread states for tracing. */
@@ -254,6 +259,7 @@ struct rcu_data {
unsigned long rcu_onl_gp_seq; /* ->gp_seq at last online. */
short rcu_onl_gp_flags; /* ->gp_flags at last online. */
unsigned long last_fqs_resched; /* Time of last rcu_resched(). */
+ unsigned long last_sched_clock; /* Jiffies of last rcu_sched_clock_irq(). */
int cpu;
};
@@ -364,6 +370,7 @@ struct rcu_state {
arch_spinlock_t ofl_lock ____cacheline_internodealigned_in_smp;
/* Synchronize offline with */
/* GP pre-initialization. */
+ int nocb_is_setup; /* nocb is setup from boot */
};
/* Values for rcu_state structure's gp_flags field. */
@@ -421,7 +428,6 @@ static void rcu_preempt_boost_start_gp(struct rcu_node *rnp);
static bool rcu_is_callbacks_kthread(void);
static void rcu_cpu_kthread_setup(unsigned int cpu);
static void rcu_spawn_one_boost_kthread(struct rcu_node *rnp);
-static void __init rcu_spawn_boost_kthreads(void);
static bool rcu_preempt_has_tasks(struct rcu_node *rnp);
static bool rcu_preempt_need_deferred_qs(struct task_struct *t);
static void rcu_preempt_deferred_qs(struct task_struct *t);
@@ -439,7 +445,6 @@ static int rcu_nocb_need_deferred_wakeup(struct rcu_data *rdp, int level);
static bool do_nocb_deferred_wakeup(struct rcu_data *rdp);
static void rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp);
static void rcu_spawn_cpu_nocb_kthread(int cpu);
-static void __init rcu_spawn_nocb_kthreads(void);
static void show_rcu_nocb_state(struct rcu_data *rdp);
static void rcu_nocb_lock(struct rcu_data *rdp);
static void rcu_nocb_unlock(struct rcu_data *rdp);
diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h
index 60197ea24ceb..0f70f62039a9 100644
--- a/kernel/rcu/tree_exp.h
+++ b/kernel/rcu/tree_exp.h
@@ -334,15 +334,13 @@ fastpath:
* Select the CPUs within the specified rcu_node that the upcoming
* expedited grace period needs to wait for.
*/
-static void sync_rcu_exp_select_node_cpus(struct work_struct *wp)
+static void __sync_rcu_exp_select_node_cpus(struct rcu_exp_work *rewp)
{
int cpu;
unsigned long flags;
unsigned long mask_ofl_test;
unsigned long mask_ofl_ipi;
int ret;
- struct rcu_exp_work *rewp =
- container_of(wp, struct rcu_exp_work, rew_work);
struct rcu_node *rnp = container_of(rewp, struct rcu_node, rew);
raw_spin_lock_irqsave_rcu_node(rnp, flags);
@@ -417,13 +415,119 @@ retry_ipi:
rcu_report_exp_cpu_mult(rnp, mask_ofl_test, false);
}
+static void rcu_exp_sel_wait_wake(unsigned long s);
+
+#ifdef CONFIG_RCU_EXP_KTHREAD
+static void sync_rcu_exp_select_node_cpus(struct kthread_work *wp)
+{
+ struct rcu_exp_work *rewp =
+ container_of(wp, struct rcu_exp_work, rew_work);
+
+ __sync_rcu_exp_select_node_cpus(rewp);
+}
+
+static inline bool rcu_gp_par_worker_started(void)
+{
+ return !!READ_ONCE(rcu_exp_par_gp_kworker);
+}
+
+static inline void sync_rcu_exp_select_cpus_queue_work(struct rcu_node *rnp)
+{
+ kthread_init_work(&rnp->rew.rew_work, sync_rcu_exp_select_node_cpus);
+ /*
+ * Use rcu_exp_par_gp_kworker, because flushing a work item from
+ * another work item on the same kthread worker can result in
+ * deadlock.
+ */
+ kthread_queue_work(rcu_exp_par_gp_kworker, &rnp->rew.rew_work);
+}
+
+static inline void sync_rcu_exp_select_cpus_flush_work(struct rcu_node *rnp)
+{
+ kthread_flush_work(&rnp->rew.rew_work);
+}
+
+/*
+ * Work-queue handler to drive an expedited grace period forward.
+ */
+static void wait_rcu_exp_gp(struct kthread_work *wp)
+{
+ struct rcu_exp_work *rewp;
+
+ rewp = container_of(wp, struct rcu_exp_work, rew_work);
+ rcu_exp_sel_wait_wake(rewp->rew_s);
+}
+
+static inline void synchronize_rcu_expedited_queue_work(struct rcu_exp_work *rew)
+{
+ kthread_init_work(&rew->rew_work, wait_rcu_exp_gp);
+ kthread_queue_work(rcu_exp_gp_kworker, &rew->rew_work);
+}
+
+static inline void synchronize_rcu_expedited_destroy_work(struct rcu_exp_work *rew)
+{
+}
+#else /* !CONFIG_RCU_EXP_KTHREAD */
+static void sync_rcu_exp_select_node_cpus(struct work_struct *wp)
+{
+ struct rcu_exp_work *rewp =
+ container_of(wp, struct rcu_exp_work, rew_work);
+
+ __sync_rcu_exp_select_node_cpus(rewp);
+}
+
+static inline bool rcu_gp_par_worker_started(void)
+{
+ return !!READ_ONCE(rcu_par_gp_wq);
+}
+
+static inline void sync_rcu_exp_select_cpus_queue_work(struct rcu_node *rnp)
+{
+ int cpu = find_next_bit(&rnp->ffmask, BITS_PER_LONG, -1);
+
+ INIT_WORK(&rnp->rew.rew_work, sync_rcu_exp_select_node_cpus);
+ /* If all offline, queue the work on an unbound CPU. */
+ if (unlikely(cpu > rnp->grphi - rnp->grplo))
+ cpu = WORK_CPU_UNBOUND;
+ else
+ cpu += rnp->grplo;
+ queue_work_on(cpu, rcu_par_gp_wq, &rnp->rew.rew_work);
+}
+
+static inline void sync_rcu_exp_select_cpus_flush_work(struct rcu_node *rnp)
+{
+ flush_work(&rnp->rew.rew_work);
+}
+
+/*
+ * Work-queue handler to drive an expedited grace period forward.
+ */
+static void wait_rcu_exp_gp(struct work_struct *wp)
+{
+ struct rcu_exp_work *rewp;
+
+ rewp = container_of(wp, struct rcu_exp_work, rew_work);
+ rcu_exp_sel_wait_wake(rewp->rew_s);
+}
+
+static inline void synchronize_rcu_expedited_queue_work(struct rcu_exp_work *rew)
+{
+ INIT_WORK_ONSTACK(&rew->rew_work, wait_rcu_exp_gp);
+ queue_work(rcu_gp_wq, &rew->rew_work);
+}
+
+static inline void synchronize_rcu_expedited_destroy_work(struct rcu_exp_work *rew)
+{
+ destroy_work_on_stack(&rew->rew_work);
+}
+#endif /* CONFIG_RCU_EXP_KTHREAD */
+
/*
* Select the nodes that the upcoming expedited grace period needs
* to wait for.
*/
static void sync_rcu_exp_select_cpus(void)
{
- int cpu;
struct rcu_node *rnp;
trace_rcu_exp_grace_period(rcu_state.name, rcu_exp_gp_seq_endval(), TPS("reset"));
@@ -435,28 +539,21 @@ static void sync_rcu_exp_select_cpus(void)
rnp->exp_need_flush = false;
if (!READ_ONCE(rnp->expmask))
continue; /* Avoid early boot non-existent wq. */
- if (!READ_ONCE(rcu_par_gp_wq) ||
+ if (!rcu_gp_par_worker_started() ||
rcu_scheduler_active != RCU_SCHEDULER_RUNNING ||
rcu_is_last_leaf_node(rnp)) {
- /* No workqueues yet or last leaf, do direct call. */
+ /* No worker started yet or last leaf, do direct call. */
sync_rcu_exp_select_node_cpus(&rnp->rew.rew_work);
continue;
}
- INIT_WORK(&rnp->rew.rew_work, sync_rcu_exp_select_node_cpus);
- cpu = find_next_bit(&rnp->ffmask, BITS_PER_LONG, -1);
- /* If all offline, queue the work on an unbound CPU. */
- if (unlikely(cpu > rnp->grphi - rnp->grplo))
- cpu = WORK_CPU_UNBOUND;
- else
- cpu += rnp->grplo;
- queue_work_on(cpu, rcu_par_gp_wq, &rnp->rew.rew_work);
+ sync_rcu_exp_select_cpus_queue_work(rnp);
rnp->exp_need_flush = true;
}
- /* Wait for workqueue jobs (if any) to complete. */
+ /* Wait for jobs (if any) to complete. */
rcu_for_each_leaf_node(rnp)
if (rnp->exp_need_flush)
- flush_work(&rnp->rew.rew_work);
+ sync_rcu_exp_select_cpus_flush_work(rnp);
}
/*
@@ -496,7 +593,7 @@ static void synchronize_rcu_expedited_wait(void)
struct rcu_node *rnp_root = rcu_get_root();
trace_rcu_exp_grace_period(rcu_state.name, rcu_exp_gp_seq_endval(), TPS("startwait"));
- jiffies_stall = rcu_jiffies_till_stall_check();
+ jiffies_stall = rcu_exp_jiffies_till_stall_check();
jiffies_start = jiffies;
if (tick_nohz_full_enabled() && rcu_inkernel_boot_has_ended()) {
if (synchronize_rcu_expedited_wait_once(1))
@@ -571,7 +668,7 @@ static void synchronize_rcu_expedited_wait(void)
dump_cpu_task(cpu);
}
}
- jiffies_stall = 3 * rcu_jiffies_till_stall_check() + 3;
+ jiffies_stall = 3 * rcu_exp_jiffies_till_stall_check() + 3;
}
}
@@ -622,17 +719,6 @@ static void rcu_exp_sel_wait_wake(unsigned long s)
rcu_exp_wait_wake(s);
}
-/*
- * Work-queue handler to drive an expedited grace period forward.
- */
-static void wait_rcu_exp_gp(struct work_struct *wp)
-{
- struct rcu_exp_work *rewp;
-
- rewp = container_of(wp, struct rcu_exp_work, rew_work);
- rcu_exp_sel_wait_wake(rewp->rew_s);
-}
-
#ifdef CONFIG_PREEMPT_RCU
/*
@@ -848,20 +934,19 @@ void synchronize_rcu_expedited(void)
} else {
/* Marshall arguments & schedule the expedited grace period. */
rew.rew_s = s;
- INIT_WORK_ONSTACK(&rew.rew_work, wait_rcu_exp_gp);
- queue_work(rcu_gp_wq, &rew.rew_work);
+ synchronize_rcu_expedited_queue_work(&rew);
}
/* Wait for expedited grace period to complete. */
rnp = rcu_get_root();
wait_event(rnp->exp_wq[rcu_seq_ctr(s) & 0x3],
sync_exp_work_done(s));
- smp_mb(); /* Workqueue actions happen before return. */
+ smp_mb(); /* Work actions happen before return. */
/* Let the next expedited grace period start. */
mutex_unlock(&rcu_state.exp_mutex);
if (likely(!boottime))
- destroy_work_on_stack(&rew.rew_work);
+ synchronize_rcu_expedited_destroy_work(&rew);
}
EXPORT_SYMBOL_GPL(synchronize_rcu_expedited);
diff --git a/kernel/rcu/tree_nocb.h b/kernel/rcu/tree_nocb.h
index 636d0546a4e9..46694e13398a 100644
--- a/kernel/rcu/tree_nocb.h
+++ b/kernel/rcu/tree_nocb.h
@@ -60,9 +60,6 @@ static inline bool rcu_current_is_nocb_kthread(struct rcu_data *rdp)
* Parse the boot-time rcu_nocb_mask CPU list from the kernel parameters.
* If the list is invalid, a warning is emitted and all CPUs are offloaded.
*/
-
-static bool rcu_nocb_is_setup;
-
static int __init rcu_nocb_setup(char *str)
{
alloc_bootmem_cpumask_var(&rcu_nocb_mask);
@@ -72,7 +69,7 @@ static int __init rcu_nocb_setup(char *str)
cpumask_setall(rcu_nocb_mask);
}
}
- rcu_nocb_is_setup = true;
+ rcu_state.nocb_is_setup = true;
return 1;
}
__setup("rcu_nocbs", rcu_nocb_setup);
@@ -215,14 +212,6 @@ static void rcu_init_one_nocb(struct rcu_node *rnp)
init_swait_queue_head(&rnp->nocb_gp_wq[1]);
}
-/* Is the specified CPU a no-CBs CPU? */
-bool rcu_is_nocb_cpu(int cpu)
-{
- if (cpumask_available(rcu_nocb_mask))
- return cpumask_test_cpu(cpu, rcu_nocb_mask);
- return false;
-}
-
static bool __wake_nocb_gp(struct rcu_data *rdp_gp,
struct rcu_data *rdp,
bool force, unsigned long flags)
@@ -1180,10 +1169,10 @@ void __init rcu_init_nohz(void)
return;
}
}
- rcu_nocb_is_setup = true;
+ rcu_state.nocb_is_setup = true;
}
- if (!rcu_nocb_is_setup)
+ if (!rcu_state.nocb_is_setup)
return;
#if defined(CONFIG_NO_HZ_FULL)
@@ -1241,7 +1230,7 @@ static void rcu_spawn_cpu_nocb_kthread(int cpu)
struct task_struct *t;
struct sched_param sp;
- if (!rcu_scheduler_fully_active || !rcu_nocb_is_setup)
+ if (!rcu_scheduler_fully_active || !rcu_state.nocb_is_setup)
return;
/* If there already is an rcuo kthread, then nothing to do. */
@@ -1277,22 +1266,6 @@ static void rcu_spawn_cpu_nocb_kthread(int cpu)
WRITE_ONCE(rdp->nocb_gp_kthread, rdp_gp->nocb_gp_kthread);
}
-/*
- * Once the scheduler is running, spawn rcuo kthreads for all online
- * no-CBs CPUs. This assumes that the early_initcall()s happen before
- * non-boot CPUs come online -- if this changes, we will need to add
- * some mutual exclusion.
- */
-static void __init rcu_spawn_nocb_kthreads(void)
-{
- int cpu;
-
- if (rcu_nocb_is_setup) {
- for_each_online_cpu(cpu)
- rcu_spawn_cpu_nocb_kthread(cpu);
- }
-}
-
/* How many CB CPU IDs per GP kthread? Default of -1 for sqrt(nr_cpu_ids). */
static int rcu_nocb_gp_stride = -1;
module_param(rcu_nocb_gp_stride, int, 0444);
@@ -1549,10 +1522,6 @@ static void rcu_spawn_cpu_nocb_kthread(int cpu)
{
}
-static void __init rcu_spawn_nocb_kthreads(void)
-{
-}
-
static void show_rcu_nocb_state(struct rcu_data *rdp)
{
}
diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index 8360d86db1c0..c8ba0fe17267 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -486,6 +486,7 @@ rcu_preempt_deferred_qs_irqrestore(struct task_struct *t, unsigned long flags)
t->rcu_read_unlock_special.s = 0;
if (special.b.need_qs) {
if (IS_ENABLED(CONFIG_RCU_STRICT_GRACE_PERIOD)) {
+ rdp->cpu_no_qs.b.norm = false;
rcu_report_qs_rdp(rdp);
udelay(rcu_unlock_delay);
} else {
@@ -660,7 +661,13 @@ static void rcu_read_unlock_special(struct task_struct *t)
expboost && !rdp->defer_qs_iw_pending && cpu_online(rdp->cpu)) {
// Get scheduler to re-evaluate and call hooks.
// If !IRQ_WORK, FQS scan will eventually IPI.
- init_irq_work(&rdp->defer_qs_iw, rcu_preempt_deferred_qs_handler);
+ if (IS_ENABLED(CONFIG_RCU_STRICT_GRACE_PERIOD) &&
+ IS_ENABLED(CONFIG_PREEMPT_RT))
+ rdp->defer_qs_iw = IRQ_WORK_INIT_HARD(
+ rcu_preempt_deferred_qs_handler);
+ else
+ init_irq_work(&rdp->defer_qs_iw,
+ rcu_preempt_deferred_qs_handler);
rdp->defer_qs_iw_pending = true;
irq_work_queue_on(&rdp->defer_qs_iw, rdp->cpu);
}
@@ -1124,7 +1131,8 @@ static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags)
__releases(rnp->lock)
{
raw_lockdep_assert_held_rcu_node(rnp);
- if (!rcu_preempt_blocked_readers_cgp(rnp) && rnp->exp_tasks == NULL) {
+ if (!rnp->boost_kthread_task ||
+ (!rcu_preempt_blocked_readers_cgp(rnp) && !rnp->exp_tasks)) {
raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
return;
}
@@ -1226,18 +1234,6 @@ static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu)
free_cpumask_var(cm);
}
-/*
- * Spawn boost kthreads -- called as soon as the scheduler is running.
- */
-static void __init rcu_spawn_boost_kthreads(void)
-{
- struct rcu_node *rnp;
-
- rcu_for_each_leaf_node(rnp)
- if (rcu_rnp_online_cpus(rnp))
- rcu_spawn_one_boost_kthread(rnp);
-}
-
#else /* #ifdef CONFIG_RCU_BOOST */
static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags)
@@ -1263,10 +1259,6 @@ static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu)
{
}
-static void __init rcu_spawn_boost_kthreads(void)
-{
-}
-
#endif /* #else #ifdef CONFIG_RCU_BOOST */
/*
diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h
index d612707c2ed0..4995c078cff9 100644
--- a/kernel/rcu/tree_stall.h
+++ b/kernel/rcu/tree_stall.h
@@ -25,6 +25,34 @@ int sysctl_max_rcu_stall_to_panic __read_mostly;
#define RCU_STALL_MIGHT_DIV 8
#define RCU_STALL_MIGHT_MIN (2 * HZ)
+int rcu_exp_jiffies_till_stall_check(void)
+{
+ int cpu_stall_timeout = READ_ONCE(rcu_exp_cpu_stall_timeout);
+ int exp_stall_delay_delta = 0;
+ int till_stall_check;
+
+ // Zero says to use rcu_cpu_stall_timeout, but in milliseconds.
+ if (!cpu_stall_timeout)
+ cpu_stall_timeout = jiffies_to_msecs(rcu_jiffies_till_stall_check());
+
+ // Limit check must be consistent with the Kconfig limits for
+ // CONFIG_RCU_EXP_CPU_STALL_TIMEOUT, so check the allowed range.
+ // The minimum clamped value is "2UL", because at least one full
+ // tick has to be guaranteed.
+ till_stall_check = clamp(msecs_to_jiffies(cpu_stall_timeout), 2UL, 21UL * HZ);
+
+ if (cpu_stall_timeout && jiffies_to_msecs(till_stall_check) != cpu_stall_timeout)
+ WRITE_ONCE(rcu_exp_cpu_stall_timeout, jiffies_to_msecs(till_stall_check));
+
+#ifdef CONFIG_PROVE_RCU
+ /* Add extra ~25% out of till_stall_check. */
+ exp_stall_delay_delta = ((till_stall_check * 25) / 100) + 1;
+#endif
+
+ return till_stall_check + exp_stall_delay_delta;
+}
+EXPORT_SYMBOL_GPL(rcu_exp_jiffies_till_stall_check);
+
/* Limit-check stall timeouts specified at boottime and runtime. */
int rcu_jiffies_till_stall_check(void)
{
@@ -565,9 +593,9 @@ static void print_other_cpu_stall(unsigned long gp_seq, unsigned long gps)
for_each_possible_cpu(cpu)
totqlen += rcu_get_n_cbs_cpu(cpu);
- pr_cont("\t(detected by %d, t=%ld jiffies, g=%ld, q=%lu)\n",
+ pr_cont("\t(detected by %d, t=%ld jiffies, g=%ld, q=%lu ncpus=%d)\n",
smp_processor_id(), (long)(jiffies - gps),
- (long)rcu_seq_current(&rcu_state.gp_seq), totqlen);
+ (long)rcu_seq_current(&rcu_state.gp_seq), totqlen, rcu_state.n_online_cpus);
if (ndetected) {
rcu_dump_cpu_stacks();
@@ -627,9 +655,9 @@ static void print_cpu_stall(unsigned long gps)
raw_spin_unlock_irqrestore_rcu_node(rdp->mynode, flags);
for_each_possible_cpu(cpu)
totqlen += rcu_get_n_cbs_cpu(cpu);
- pr_cont("\t(t=%lu jiffies g=%ld q=%lu)\n",
+ pr_cont("\t(t=%lu jiffies g=%ld q=%lu ncpus=%d)\n",
jiffies - gps,
- (long)rcu_seq_current(&rcu_state.gp_seq), totqlen);
+ (long)rcu_seq_current(&rcu_state.gp_seq), totqlen, rcu_state.n_online_cpus);
rcu_check_gp_kthread_expired_fqs_timer();
rcu_check_gp_kthread_starvation();
diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c
index 180ff9c41fa8..fc7fef575606 100644
--- a/kernel/rcu/update.c
+++ b/kernel/rcu/update.c
@@ -506,6 +506,8 @@ EXPORT_SYMBOL_GPL(rcu_cpu_stall_suppress);
module_param(rcu_cpu_stall_suppress, int, 0644);
int rcu_cpu_stall_timeout __read_mostly = CONFIG_RCU_CPU_STALL_TIMEOUT;
module_param(rcu_cpu_stall_timeout, int, 0644);
+int rcu_exp_cpu_stall_timeout __read_mostly = CONFIG_RCU_EXP_CPU_STALL_TIMEOUT;
+module_param(rcu_exp_cpu_stall_timeout, int, 0644);
#endif /* #ifdef CONFIG_RCU_STALL_COMMON */
// Suppress boot-time RCU CPU stall warnings and rcutorture writer stall
diff --git a/kernel/resource.c b/kernel/resource.c
index 9c08d6e9eef2..34eaee179689 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -56,14 +56,6 @@ struct resource_constraint {
static DEFINE_RWLOCK(resource_lock);
-/*
- * For memory hotplug, there is no way to free resource entries allocated
- * by boot mem after the system is up. So for reusing the resource entry
- * we need to remember the resource.
- */
-static struct resource *bootmem_resource_free;
-static DEFINE_SPINLOCK(bootmem_resource_lock);
-
static struct resource *next_resource(struct resource *p)
{
if (p->child)
@@ -160,36 +152,19 @@ __initcall(ioresources_init);
static void free_resource(struct resource *res)
{
- if (!res)
- return;
-
- if (!PageSlab(virt_to_head_page(res))) {
- spin_lock(&bootmem_resource_lock);
- res->sibling = bootmem_resource_free;
- bootmem_resource_free = res;
- spin_unlock(&bootmem_resource_lock);
- } else {
+ /**
+ * If the resource was allocated using memblock early during boot
+ * we'll leak it here: we can only return full pages back to the
+ * buddy and trying to be smart and reusing them eventually in
+ * alloc_resource() overcomplicates resource handling.
+ */
+ if (res && PageSlab(virt_to_head_page(res)))
kfree(res);
- }
}
static struct resource *alloc_resource(gfp_t flags)
{
- struct resource *res = NULL;
-
- spin_lock(&bootmem_resource_lock);
- if (bootmem_resource_free) {
- res = bootmem_resource_free;
- bootmem_resource_free = res->sibling;
- }
- spin_unlock(&bootmem_resource_lock);
-
- if (res)
- memset(res, 0, sizeof(struct resource));
- else
- res = kzalloc(sizeof(struct resource), flags);
-
- return res;
+ return kzalloc(sizeof(struct resource), flags);
}
/* Return the conflict entry if you can't request it */
diff --git a/kernel/scftorture.c b/kernel/scftorture.c
index dcb0410950e4..5d113aa59e77 100644
--- a/kernel/scftorture.c
+++ b/kernel/scftorture.c
@@ -267,9 +267,10 @@ static void scf_handler(void *scfc_in)
}
this_cpu_inc(scf_invoked_count);
if (longwait <= 0) {
- if (!(r & 0xffc0))
+ if (!(r & 0xffc0)) {
udelay(r & 0x3f);
- goto out;
+ goto out;
+ }
}
if (r & 0xfff)
goto out;
diff --git a/kernel/sched/build_policy.c b/kernel/sched/build_policy.c
index e0104b45029a..d9dc9ab3773f 100644
--- a/kernel/sched/build_policy.c
+++ b/kernel/sched/build_policy.c
@@ -15,6 +15,7 @@
/* Headers: */
#include <linux/sched/clock.h>
#include <linux/sched/cputime.h>
+#include <linux/sched/hotplug.h>
#include <linux/sched/posix-timers.h>
#include <linux/sched/rt.h>
@@ -31,6 +32,7 @@
#include <uapi/linux/sched/types.h>
#include "sched.h"
+#include "smp.h"
#include "autogroup.h"
#include "stats.h"
diff --git a/kernel/sched/build_utility.c b/kernel/sched/build_utility.c
index eec0849b2aae..99bdd96f454f 100644
--- a/kernel/sched/build_utility.c
+++ b/kernel/sched/build_utility.c
@@ -14,6 +14,7 @@
#include <linux/sched/debug.h>
#include <linux/sched/isolation.h>
#include <linux/sched/loadavg.h>
+#include <linux/sched/nohz.h>
#include <linux/sched/mm.h>
#include <linux/sched/rseq_api.h>
#include <linux/sched/task_stack.h>
diff --git a/kernel/sched/clock.c b/kernel/sched/clock.c
index d9272d9061a3..e374c0c923da 100644
--- a/kernel/sched/clock.c
+++ b/kernel/sched/clock.c
@@ -287,7 +287,7 @@ again:
clock = wrap_max(clock, min_clock);
clock = wrap_min(clock, max_clock);
- if (cmpxchg64(&scd->clock, old_clock, clock) != old_clock)
+ if (!try_cmpxchg64(&scd->clock, &old_clock, clock))
goto again;
return clock;
@@ -349,7 +349,7 @@ again:
val = remote_clock;
}
- if (cmpxchg64(ptr, old_val, val) != old_val)
+ if (!try_cmpxchg64(ptr, &old_val, val))
goto again;
return val;
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index d575b4914925..a247f8d9d417 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -26,7 +26,10 @@
#include <linux/topology.h>
#include <linux/sched/clock.h>
#include <linux/sched/cond_resched.h>
+#include <linux/sched/cputime.h>
#include <linux/sched/debug.h>
+#include <linux/sched/hotplug.h>
+#include <linux/sched/init.h>
#include <linux/sched/isolation.h>
#include <linux/sched/loadavg.h>
#include <linux/sched/mm.h>
@@ -610,10 +613,10 @@ void double_rq_lock(struct rq *rq1, struct rq *rq2)
swap(rq1, rq2);
raw_spin_rq_lock(rq1);
- if (__rq_lockp(rq1) == __rq_lockp(rq2))
- return;
+ if (__rq_lockp(rq1) != __rq_lockp(rq2))
+ raw_spin_rq_lock_nested(rq2, SINGLE_DEPTH_NESTING);
- raw_spin_rq_lock_nested(rq2, SINGLE_DEPTH_NESTING);
+ double_rq_clock_clear_update(rq1, rq2);
}
#endif
@@ -2190,7 +2193,7 @@ void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
{
if (p->sched_class == rq->curr->sched_class)
rq->curr->sched_class->check_preempt_curr(rq, p, flags);
- else if (p->sched_class > rq->curr->sched_class)
+ else if (sched_class_above(p->sched_class, rq->curr->sched_class))
resched_curr(rq);
/*
@@ -2408,7 +2411,7 @@ static int migration_cpu_stop(void *data)
* __migrate_task() such that we will not miss enforcing cpus_ptr
* during wakeups, see set_cpus_allowed_ptr()'s TASK_WAKING test.
*/
- flush_smp_call_function_from_idle();
+ flush_smp_call_function_queue();
raw_spin_lock(&p->pi_lock);
rq_lock(rq, &rf);
@@ -5689,7 +5692,7 @@ __pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
* higher scheduling class, because otherwise those lose the
* opportunity to pull in more work from other CPUs.
*/
- if (likely(prev->sched_class <= &fair_sched_class &&
+ if (likely(!sched_class_above(prev->sched_class, &fair_sched_class) &&
rq->nr_running == rq->cfs.h_nr_running)) {
p = pick_next_task_fair(rq, prev, rf);
@@ -5752,6 +5755,8 @@ static inline struct task_struct *pick_task(struct rq *rq)
extern void task_vruntime_update(struct rq *rq, struct task_struct *p, bool in_fi);
+static void queue_core_balance(struct rq *rq);
+
static struct task_struct *
pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
{
@@ -5801,7 +5806,7 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
}
rq->core_pick = NULL;
- return next;
+ goto out;
}
put_prev_task_balance(rq, prev, rf);
@@ -5851,7 +5856,7 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
*/
WARN_ON_ONCE(fi_before);
task_vruntime_update(rq, next, false);
- goto done;
+ goto out_set_next;
}
}
@@ -5970,8 +5975,12 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
resched_curr(rq_i);
}
-done:
+out_set_next:
set_next_task(rq, next);
+out:
+ if (rq->core->core_forceidle_count && next == rq->idle)
+ queue_core_balance(rq);
+
return next;
}
@@ -6000,7 +6009,7 @@ static bool try_steal_cookie(int this, int that)
if (p == src->core_pick || p == src->curr)
goto next;
- if (!cpumask_test_cpu(this, &p->cpus_mask))
+ if (!is_cpu_allowed(p, this))
goto next;
if (p->core_occupation > dst->idle->core_occupation)
@@ -6066,7 +6075,7 @@ static void sched_core_balance(struct rq *rq)
static DEFINE_PER_CPU(struct callback_head, core_balance_head);
-void queue_core_balance(struct rq *rq)
+static void queue_core_balance(struct rq *rq)
{
if (!sched_core_enabled(rq))
return;
@@ -6376,7 +6385,7 @@ static void __sched notrace __schedule(unsigned int sched_mode)
migrate_disable_switch(rq, prev);
psi_sched_switch(prev, next, !task_on_rq_queued(prev));
- trace_sched_switch(sched_mode & SM_MASK_PREEMPT, prev_state, prev, next);
+ trace_sched_switch(sched_mode & SM_MASK_PREEMPT, prev, next, prev_state);
/* Also unlocks the rq: */
rq = context_switch(rq, prev, next, &rf);
@@ -8409,6 +8418,18 @@ static void __init preempt_dynamic_init(void)
}
}
+#define PREEMPT_MODEL_ACCESSOR(mode) \
+ bool preempt_model_##mode(void) \
+ { \
+ WARN_ON_ONCE(preempt_dynamic_mode == preempt_dynamic_undefined); \
+ return preempt_dynamic_mode == preempt_dynamic_##mode; \
+ } \
+ EXPORT_SYMBOL_GPL(preempt_model_##mode)
+
+PREEMPT_MODEL_ACCESSOR(none);
+PREEMPT_MODEL_ACCESSOR(voluntary);
+PREEMPT_MODEL_ACCESSOR(full);
+
#else /* !CONFIG_PREEMPT_DYNAMIC */
static inline void preempt_dynamic_init(void) { }
@@ -9451,11 +9472,11 @@ void __init sched_init(void)
int i;
/* Make sure the linker didn't screw up */
- BUG_ON(&idle_sched_class + 1 != &fair_sched_class ||
- &fair_sched_class + 1 != &rt_sched_class ||
- &rt_sched_class + 1 != &dl_sched_class);
+ BUG_ON(&idle_sched_class != &fair_sched_class + 1 ||
+ &fair_sched_class != &rt_sched_class + 1 ||
+ &rt_sched_class != &dl_sched_class + 1);
#ifdef CONFIG_SMP
- BUG_ON(&dl_sched_class + 1 != &stop_sched_class);
+ BUG_ON(&dl_sched_class != &stop_sched_class + 1);
#endif
wait_bit_init();
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index fb4255ae0b2c..936817ae142f 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -1220,8 +1220,6 @@ int dl_runtime_exceeded(struct sched_dl_entity *dl_se)
return (dl_se->runtime <= 0);
}
-extern bool sched_rt_bandwidth_account(struct rt_rq *rt_rq);
-
/*
* This function implements the GRUB accounting rule:
* according to the GRUB reclaiming algorithm, the runtime is
@@ -1832,6 +1830,7 @@ out:
static void migrate_task_rq_dl(struct task_struct *p, int new_cpu __maybe_unused)
{
+ struct rq_flags rf;
struct rq *rq;
if (READ_ONCE(p->__state) != TASK_WAKING)
@@ -1843,7 +1842,7 @@ static void migrate_task_rq_dl(struct task_struct *p, int new_cpu __maybe_unused
* from try_to_wake_up(). Hence, p->pi_lock is locked, but
* rq->lock is not... So, lock it
*/
- raw_spin_rq_lock(rq);
+ rq_lock(rq, &rf);
if (p->dl.dl_non_contending) {
update_rq_clock(rq);
sub_running_bw(&p->dl, &rq->dl);
@@ -1859,7 +1858,7 @@ static void migrate_task_rq_dl(struct task_struct *p, int new_cpu __maybe_unused
put_task_struct(p);
}
sub_rq_bw(&p->dl, &rq->dl);
- raw_spin_rq_unlock(rq);
+ rq_unlock(rq, &rf);
}
static void check_preempt_equal_dl(struct rq *rq, struct task_struct *p)
@@ -2319,13 +2318,7 @@ retry:
deactivate_task(rq, next_task, 0);
set_task_cpu(next_task, later_rq->cpu);
-
- /*
- * Update the later_rq clock here, because the clock is used
- * by the cpufreq_update_util() inside __add_running_bw().
- */
- update_rq_clock(later_rq);
- activate_task(later_rq, next_task, ENQUEUE_NOCLOCK);
+ activate_task(later_rq, next_task, 0);
ret = 1;
resched_curr(later_rq);
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index ee0664c9d291..906b2c7c48d1 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -36,6 +36,7 @@
#include <linux/sched/cond_resched.h>
#include <linux/sched/cputime.h>
#include <linux/sched/isolation.h>
+#include <linux/sched/nohz.h>
#include <linux/cpuidle.h>
#include <linux/interrupt.h>
@@ -44,6 +45,7 @@
#include <linux/profile.h>
#include <linux/psi.h>
#include <linux/ratelimit.h>
+#include <linux/task_work.h>
#include <asm/switch_to.h>
@@ -312,19 +314,6 @@ const struct sched_class fair_sched_class;
#define for_each_sched_entity(se) \
for (; se; se = se->parent)
-static inline void cfs_rq_tg_path(struct cfs_rq *cfs_rq, char *path, int len)
-{
- if (!path)
- return;
-
- if (cfs_rq && task_group_is_autogroup(cfs_rq->tg))
- autogroup_path(cfs_rq->tg, path, len);
- else if (cfs_rq && cfs_rq->tg->css.cgroup)
- cgroup_path(cfs_rq->tg->css.cgroup, path, len);
- else
- strlcpy(path, "(null)", len);
-}
-
static inline bool list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
{
struct rq *rq = rq_of(cfs_rq);
@@ -492,12 +481,6 @@ static int se_is_idle(struct sched_entity *se)
#define for_each_sched_entity(se) \
for (; se; se = NULL)
-static inline void cfs_rq_tg_path(struct cfs_rq *cfs_rq, char *path, int len)
-{
- if (path)
- strlcpy(path, "(null)", len);
-}
-
static inline bool list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
{
return true;
@@ -3828,11 +3811,11 @@ static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s
se->avg.runnable_sum = se->avg.runnable_avg * divider;
- se->avg.load_sum = divider;
- if (se_weight(se)) {
- se->avg.load_sum =
- div_u64(se->avg.load_avg * se->avg.load_sum, se_weight(se));
- }
+ se->avg.load_sum = se->avg.load_avg * divider;
+ if (se_weight(se) < se->avg.load_sum)
+ se->avg.load_sum = div_u64(se->avg.load_sum, se_weight(se));
+ else
+ se->avg.load_sum = 1;
enqueue_load_avg(cfs_rq, se);
cfs_rq->avg.util_avg += se->avg.util_avg;
@@ -4845,11 +4828,11 @@ static int tg_unthrottle_up(struct task_group *tg, void *data)
cfs_rq->throttle_count--;
if (!cfs_rq->throttle_count) {
- cfs_rq->throttled_clock_task_time += rq_clock_task(rq) -
- cfs_rq->throttled_clock_task;
+ cfs_rq->throttled_clock_pelt_time += rq_clock_pelt(rq) -
+ cfs_rq->throttled_clock_pelt;
/* Add cfs_rq with load or one or more already running entities to the list */
- if (!cfs_rq_is_decayed(cfs_rq) || cfs_rq->nr_running)
+ if (!cfs_rq_is_decayed(cfs_rq))
list_add_leaf_cfs_rq(cfs_rq);
}
@@ -4863,7 +4846,7 @@ static int tg_throttle_down(struct task_group *tg, void *data)
/* group is entering throttled state, stop time */
if (!cfs_rq->throttle_count) {
- cfs_rq->throttled_clock_task = rq_clock_task(rq);
+ cfs_rq->throttled_clock_pelt = rq_clock_pelt(rq);
list_del_leaf_cfs_rq(cfs_rq);
}
cfs_rq->throttle_count++;
@@ -5307,7 +5290,7 @@ static void sync_throttle(struct task_group *tg, int cpu)
pcfs_rq = tg->parent->cfs_rq[cpu];
cfs_rq->throttle_count = pcfs_rq->throttle_count;
- cfs_rq->throttled_clock_task = rq_clock_task(cpu_rq(cpu));
+ cfs_rq->throttled_clock_pelt = rq_clock_pelt(cpu_rq(cpu));
}
/* conditionally throttle active cfs_rq's from put_prev_entity() */
@@ -6543,108 +6526,19 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
}
/*
- * cpu_util_without: compute cpu utilization without any contributions from *p
- * @cpu: the CPU which utilization is requested
- * @p: the task which utilization should be discounted
- *
- * The utilization of a CPU is defined by the utilization of tasks currently
- * enqueued on that CPU as well as tasks which are currently sleeping after an
- * execution on that CPU.
- *
- * This method returns the utilization of the specified CPU by discounting the
- * utilization of the specified task, whenever the task is currently
- * contributing to the CPU utilization.
- */
-static unsigned long cpu_util_without(int cpu, struct task_struct *p)
-{
- struct cfs_rq *cfs_rq;
- unsigned int util;
-
- /* Task has no contribution or is new */
- if (cpu != task_cpu(p) || !READ_ONCE(p->se.avg.last_update_time))
- return cpu_util_cfs(cpu);
-
- cfs_rq = &cpu_rq(cpu)->cfs;
- util = READ_ONCE(cfs_rq->avg.util_avg);
-
- /* Discount task's util from CPU's util */
- lsub_positive(&util, task_util(p));
-
- /*
- * Covered cases:
- *
- * a) if *p is the only task sleeping on this CPU, then:
- * cpu_util (== task_util) > util_est (== 0)
- * and thus we return:
- * cpu_util_without = (cpu_util - task_util) = 0
- *
- * b) if other tasks are SLEEPING on this CPU, which is now exiting
- * IDLE, then:
- * cpu_util >= task_util
- * cpu_util > util_est (== 0)
- * and thus we discount *p's blocked utilization to return:
- * cpu_util_without = (cpu_util - task_util) >= 0
- *
- * c) if other tasks are RUNNABLE on that CPU and
- * util_est > cpu_util
- * then we use util_est since it returns a more restrictive
- * estimation of the spare capacity on that CPU, by just
- * considering the expected utilization of tasks already
- * runnable on that CPU.
- *
- * Cases a) and b) are covered by the above code, while case c) is
- * covered by the following code when estimated utilization is
- * enabled.
- */
- if (sched_feat(UTIL_EST)) {
- unsigned int estimated =
- READ_ONCE(cfs_rq->avg.util_est.enqueued);
-
- /*
- * Despite the following checks we still have a small window
- * for a possible race, when an execl's select_task_rq_fair()
- * races with LB's detach_task():
- *
- * detach_task()
- * p->on_rq = TASK_ON_RQ_MIGRATING;
- * ---------------------------------- A
- * deactivate_task() \
- * dequeue_task() + RaceTime
- * util_est_dequeue() /
- * ---------------------------------- B
- *
- * The additional check on "current == p" it's required to
- * properly fix the execl regression and it helps in further
- * reducing the chances for the above race.
- */
- if (unlikely(task_on_rq_queued(p) || current == p))
- lsub_positive(&estimated, _task_util_est(p));
-
- util = max(util, estimated);
- }
-
- /*
- * Utilization (estimated) can exceed the CPU capacity, thus let's
- * clamp to the maximum CPU capacity to ensure consistency with
- * cpu_util.
- */
- return min_t(unsigned long, util, capacity_orig_of(cpu));
-}
-
-/*
- * Predicts what cpu_util(@cpu) would return if @p was migrated (and enqueued)
- * to @dst_cpu.
+ * Predicts what cpu_util(@cpu) would return if @p was removed from @cpu
+ * (@dst_cpu = -1) or migrated to @dst_cpu.
*/
static unsigned long cpu_util_next(int cpu, struct task_struct *p, int dst_cpu)
{
struct cfs_rq *cfs_rq = &cpu_rq(cpu)->cfs;
- unsigned long util_est, util = READ_ONCE(cfs_rq->avg.util_avg);
+ unsigned long util = READ_ONCE(cfs_rq->avg.util_avg);
/*
- * If @p migrates from @cpu to another, remove its contribution. Or,
- * if @p migrates from another CPU to @cpu, add its contribution. In
- * the other cases, @cpu is not impacted by the migration, so the
- * util_avg should already be correct.
+ * If @dst_cpu is -1 or @p migrates from @cpu to @dst_cpu remove its
+ * contribution. If @p migrates from another CPU to @cpu add its
+ * contribution. In all the other cases @cpu is not impacted by the
+ * migration so its util_avg is already correct.
*/
if (task_cpu(p) == cpu && dst_cpu != cpu)
lsub_positive(&util, task_util(p));
@@ -6652,16 +6546,40 @@ static unsigned long cpu_util_next(int cpu, struct task_struct *p, int dst_cpu)
util += task_util(p);
if (sched_feat(UTIL_EST)) {
+ unsigned long util_est;
+
util_est = READ_ONCE(cfs_rq->avg.util_est.enqueued);
/*
- * During wake-up, the task isn't enqueued yet and doesn't
- * appear in the cfs_rq->avg.util_est.enqueued of any rq,
- * so just add it (if needed) to "simulate" what will be
- * cpu_util after the task has been enqueued.
+ * During wake-up @p isn't enqueued yet and doesn't contribute
+ * to any cpu_rq(cpu)->cfs.avg.util_est.enqueued.
+ * If @dst_cpu == @cpu add it to "simulate" cpu_util after @p
+ * has been enqueued.
+ *
+ * During exec (@dst_cpu = -1) @p is enqueued and does
+ * contribute to cpu_rq(cpu)->cfs.util_est.enqueued.
+ * Remove it to "simulate" cpu_util without @p's contribution.
+ *
+ * Despite the task_on_rq_queued(@p) check there is still a
+ * small window for a possible race when an exec
+ * select_task_rq_fair() races with LB's detach_task().
+ *
+ * detach_task()
+ * deactivate_task()
+ * p->on_rq = TASK_ON_RQ_MIGRATING;
+ * -------------------------------- A
+ * dequeue_task() \
+ * dequeue_task_fair() + Race Time
+ * util_est_dequeue() /
+ * -------------------------------- B
+ *
+ * The additional check "current == p" is required to further
+ * reduce the race window.
*/
if (dst_cpu == cpu)
util_est += _task_util_est(p);
+ else if (unlikely(task_on_rq_queued(p) || current == p))
+ lsub_positive(&util_est, _task_util_est(p));
util = max(util, util_est);
}
@@ -6670,6 +6588,28 @@ static unsigned long cpu_util_next(int cpu, struct task_struct *p, int dst_cpu)
}
/*
+ * cpu_util_without: compute cpu utilization without any contributions from *p
+ * @cpu: the CPU which utilization is requested
+ * @p: the task which utilization should be discounted
+ *
+ * The utilization of a CPU is defined by the utilization of tasks currently
+ * enqueued on that CPU as well as tasks which are currently sleeping after an
+ * execution on that CPU.
+ *
+ * This method returns the utilization of the specified CPU by discounting the
+ * utilization of the specified task, whenever the task is currently
+ * contributing to the CPU utilization.
+ */
+static unsigned long cpu_util_without(int cpu, struct task_struct *p)
+{
+ /* Task has no contribution or is new */
+ if (cpu != task_cpu(p) || !READ_ONCE(p->se.avg.last_update_time))
+ return cpu_util_cfs(cpu);
+
+ return cpu_util_next(cpu, p, -1);
+}
+
+/*
* compute_energy(): Estimates the energy that @pd would consume if @p was
* migrated to @dst_cpu. compute_energy() predicts what will be the utilization
* landscape of @pd's CPUs after the task migration, and uses the Energy Model
@@ -9459,8 +9399,6 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
local->avg_load = (local->group_load * SCHED_CAPACITY_SCALE) /
local->group_capacity;
- sds->avg_load = (sds->total_load * SCHED_CAPACITY_SCALE) /
- sds->total_capacity;
/*
* If the local group is more loaded than the selected
* busiest group don't try to pull any tasks.
@@ -9469,6 +9407,9 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
env->imbalance = 0;
return;
}
+
+ sds->avg_load = (sds->total_load * SCHED_CAPACITY_SCALE) /
+ sds->total_capacity;
}
/*
@@ -9494,7 +9435,7 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
* busiest \ local has_spare fully_busy misfit asym imbalanced overloaded
* has_spare nr_idle balanced N/A N/A balanced balanced
* fully_busy nr_idle nr_idle N/A N/A balanced balanced
- * misfit_task force N/A N/A N/A force force
+ * misfit_task force N/A N/A N/A N/A N/A
* asym_packing force force N/A N/A force force
* imbalanced force force N/A N/A force force
* overloaded force force N/A N/A force avg_load
@@ -11880,101 +11821,3 @@ __init void init_sched_fair_class(void)
#endif /* SMP */
}
-
-/*
- * Helper functions to facilitate extracting info from tracepoints.
- */
-
-const struct sched_avg *sched_trace_cfs_rq_avg(struct cfs_rq *cfs_rq)
-{
-#ifdef CONFIG_SMP
- return cfs_rq ? &cfs_rq->avg : NULL;
-#else
- return NULL;
-#endif
-}
-EXPORT_SYMBOL_GPL(sched_trace_cfs_rq_avg);
-
-char *sched_trace_cfs_rq_path(struct cfs_rq *cfs_rq, char *str, int len)
-{
- if (!cfs_rq) {
- if (str)
- strlcpy(str, "(null)", len);
- else
- return NULL;
- }
-
- cfs_rq_tg_path(cfs_rq, str, len);
- return str;
-}
-EXPORT_SYMBOL_GPL(sched_trace_cfs_rq_path);
-
-int sched_trace_cfs_rq_cpu(struct cfs_rq *cfs_rq)
-{
- return cfs_rq ? cpu_of(rq_of(cfs_rq)) : -1;
-}
-EXPORT_SYMBOL_GPL(sched_trace_cfs_rq_cpu);
-
-const struct sched_avg *sched_trace_rq_avg_rt(struct rq *rq)
-{
-#ifdef CONFIG_SMP
- return rq ? &rq->avg_rt : NULL;
-#else
- return NULL;
-#endif
-}
-EXPORT_SYMBOL_GPL(sched_trace_rq_avg_rt);
-
-const struct sched_avg *sched_trace_rq_avg_dl(struct rq *rq)
-{
-#ifdef CONFIG_SMP
- return rq ? &rq->avg_dl : NULL;
-#else
- return NULL;
-#endif
-}
-EXPORT_SYMBOL_GPL(sched_trace_rq_avg_dl);
-
-const struct sched_avg *sched_trace_rq_avg_irq(struct rq *rq)
-{
-#if defined(CONFIG_SMP) && defined(CONFIG_HAVE_SCHED_AVG_IRQ)
- return rq ? &rq->avg_irq : NULL;
-#else
- return NULL;
-#endif
-}
-EXPORT_SYMBOL_GPL(sched_trace_rq_avg_irq);
-
-int sched_trace_rq_cpu(struct rq *rq)
-{
- return rq ? cpu_of(rq) : -1;
-}
-EXPORT_SYMBOL_GPL(sched_trace_rq_cpu);
-
-int sched_trace_rq_cpu_capacity(struct rq *rq)
-{
- return rq ?
-#ifdef CONFIG_SMP
- rq->cpu_capacity
-#else
- SCHED_CAPACITY_SCALE
-#endif
- : -1;
-}
-EXPORT_SYMBOL_GPL(sched_trace_rq_cpu_capacity);
-
-const struct cpumask *sched_trace_rd_span(struct root_domain *rd)
-{
-#ifdef CONFIG_SMP
- return rd ? rd->span : NULL;
-#else
- return NULL;
-#endif
-}
-EXPORT_SYMBOL_GPL(sched_trace_rd_span);
-
-int sched_trace_rq_nr_running(struct rq *rq)
-{
- return rq ? rq->nr_running : -1;
-}
-EXPORT_SYMBOL_GPL(sched_trace_rq_nr_running);
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
index 8f8b5020e76a..328cccbee444 100644
--- a/kernel/sched/idle.c
+++ b/kernel/sched/idle.c
@@ -102,7 +102,7 @@ void __cpuidle default_idle_call(void)
* last -- this is very similar to the entry code.
*/
trace_hardirqs_on_prepare();
- lockdep_hardirqs_on_prepare(_THIS_IP_);
+ lockdep_hardirqs_on_prepare();
rcu_idle_enter();
lockdep_hardirqs_on(_THIS_IP_);
@@ -327,7 +327,7 @@ static void do_idle(void)
* RCU relies on this call to be done outside of an RCU read-side
* critical section.
*/
- flush_smp_call_function_from_idle();
+ flush_smp_call_function_queue();
schedule_idle();
if (unlikely(klp_patch_pending(current)))
@@ -434,7 +434,6 @@ static void set_next_task_idle(struct rq *rq, struct task_struct *next, bool fir
{
update_idle_core(rq);
schedstat_inc(rq->sched_goidle);
- queue_core_balance(rq);
}
#ifdef CONFIG_SMP
diff --git a/kernel/sched/pelt.h b/kernel/sched/pelt.h
index c336f5f481bc..4ff2ed4f8fa1 100644
--- a/kernel/sched/pelt.h
+++ b/kernel/sched/pelt.h
@@ -145,9 +145,9 @@ static inline u64 rq_clock_pelt(struct rq *rq)
static inline u64 cfs_rq_clock_pelt(struct cfs_rq *cfs_rq)
{
if (unlikely(cfs_rq->throttle_count))
- return cfs_rq->throttled_clock_task - cfs_rq->throttled_clock_task_time;
+ return cfs_rq->throttled_clock_pelt - cfs_rq->throttled_clock_pelt_time;
- return rq_clock_pelt(rq_of(cfs_rq)) - cfs_rq->throttled_clock_task_time;
+ return rq_clock_pelt(rq_of(cfs_rq)) - cfs_rq->throttled_clock_pelt_time;
}
#else
static inline u64 cfs_rq_clock_pelt(struct cfs_rq *cfs_rq)
diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c
index a4fa3aadfcba..a337f3e35997 100644
--- a/kernel/sched/psi.c
+++ b/kernel/sched/psi.c
@@ -1060,14 +1060,17 @@ int psi_show(struct seq_file *m, struct psi_group *group, enum psi_res res)
mutex_unlock(&group->avgs_lock);
for (full = 0; full < 2; full++) {
- unsigned long avg[3];
- u64 total;
+ unsigned long avg[3] = { 0, };
+ u64 total = 0;
int w;
- for (w = 0; w < 3; w++)
- avg[w] = group->avg[res * 2 + full][w];
- total = div_u64(group->total[PSI_AVGS][res * 2 + full],
- NSEC_PER_USEC);
+ /* CPU FULL is undefined at the system level */
+ if (!(group == &psi_system && res == PSI_CPU && full)) {
+ for (w = 0; w < 3; w++)
+ avg[w] = group->avg[res * 2 + full][w];
+ total = div_u64(group->total[PSI_AVGS][res * 2 + full],
+ NSEC_PER_USEC);
+ }
seq_printf(m, "%s avg10=%lu.%02lu avg60=%lu.%02lu avg300=%lu.%02lu total=%llu\n",
full ? "full" : "some",
@@ -1117,7 +1120,8 @@ struct psi_trigger *psi_trigger_create(struct psi_group *group,
t->state = state;
t->threshold = threshold_us * NSEC_PER_USEC;
t->win.size = window_us * NSEC_PER_USEC;
- window_reset(&t->win, 0, 0, 0);
+ window_reset(&t->win, sched_clock(),
+ group->total[PSI_POLL][t->state], 0);
t->event = 0;
t->last_event_time = 0;
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index a32c46889af8..7891c0f0e1ff 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -871,6 +871,7 @@ static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun)
int enqueue = 0;
struct rt_rq *rt_rq = sched_rt_period_rt_rq(rt_b, i);
struct rq *rq = rq_of_rt_rq(rt_rq);
+ struct rq_flags rf;
int skip;
/*
@@ -885,7 +886,7 @@ static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun)
if (skip)
continue;
- raw_spin_rq_lock(rq);
+ rq_lock(rq, &rf);
update_rq_clock(rq);
if (rt_rq->rt_time) {
@@ -923,7 +924,7 @@ static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun)
if (enqueue)
sched_rt_rq_enqueue(rt_rq);
- raw_spin_rq_unlock(rq);
+ rq_unlock(rq, &rf);
}
if (!throttled && (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF))
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 58263f90c559..2ce18584dca3 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -603,8 +603,8 @@ struct cfs_rq {
s64 runtime_remaining;
u64 throttled_clock;
- u64 throttled_clock_task;
- u64 throttled_clock_task_time;
+ u64 throttled_clock_pelt;
+ u64 throttled_clock_pelt_time;
int throttled;
int throttle_count;
struct list_head throttled_list;
@@ -1232,8 +1232,6 @@ static inline bool sched_group_cookie_match(struct rq *rq,
return false;
}
-extern void queue_core_balance(struct rq *rq);
-
static inline bool sched_core_enqueued(struct task_struct *p)
{
return !RB_EMPTY_NODE(&p->core_node);
@@ -1267,10 +1265,6 @@ static inline raw_spinlock_t *__rq_lockp(struct rq *rq)
return &rq->__lock;
}
-static inline void queue_core_balance(struct rq *rq)
-{
-}
-
static inline bool sched_cpu_cookie_match(struct rq *rq, struct task_struct *p)
{
return true;
@@ -1833,12 +1827,7 @@ static inline void dirty_sched_domain_sysctl(int cpu)
#endif
extern int sched_update_scaling(void);
-
-extern void flush_smp_call_function_from_idle(void);
-
-#else /* !CONFIG_SMP: */
-static inline void flush_smp_call_function_from_idle(void) { }
-#endif
+#endif /* CONFIG_SMP */
#include "stats.h"
@@ -2188,6 +2177,8 @@ static inline void set_next_task(struct rq *rq, struct task_struct *next)
*
* include/asm-generic/vmlinux.lds.h
*
+ * *CAREFUL* they are laid out in *REVERSE* order!!!
+ *
* Also enforce alignment on the instance, not the type, to guarantee layout.
*/
#define DEFINE_SCHED_CLASS(name) \
@@ -2196,17 +2187,16 @@ const struct sched_class name##_sched_class \
__section("__" #name "_sched_class")
/* Defined in include/asm-generic/vmlinux.lds.h */
-extern struct sched_class __begin_sched_classes[];
-extern struct sched_class __end_sched_classes[];
-
-#define sched_class_highest (__end_sched_classes - 1)
-#define sched_class_lowest (__begin_sched_classes - 1)
+extern struct sched_class __sched_class_highest[];
+extern struct sched_class __sched_class_lowest[];
#define for_class_range(class, _from, _to) \
- for (class = (_from); class != (_to); class--)
+ for (class = (_from); class < (_to); class++)
#define for_each_class(class) \
- for_class_range(class, sched_class_highest, sched_class_lowest)
+ for_class_range(class, __sched_class_highest, __sched_class_lowest)
+
+#define sched_class_above(_a, _b) ((_a) < (_b))
extern const struct sched_class stop_sched_class;
extern const struct sched_class dl_sched_class;
@@ -2315,6 +2305,7 @@ extern void resched_cpu(int cpu);
extern struct rt_bandwidth def_rt_bandwidth;
extern void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime);
+extern bool sched_rt_bandwidth_account(struct rt_rq *rt_rq);
extern void init_dl_bandwidth(struct dl_bandwidth *dl_b, u64 period, u64 runtime);
extern void init_dl_task_timer(struct sched_dl_entity *dl_se);
@@ -2484,6 +2475,24 @@ unsigned long arch_scale_freq_capacity(int cpu)
}
#endif
+#ifdef CONFIG_SCHED_DEBUG
+/*
+ * In double_lock_balance()/double_rq_lock(), we use raw_spin_rq_lock() to
+ * acquire rq lock instead of rq_lock(). So at the end of these two functions
+ * we need to call double_rq_clock_clear_update() to clear RQCF_UPDATED of
+ * rq->clock_update_flags to avoid the WARN_DOUBLE_CLOCK warning.
+ */
+static inline void double_rq_clock_clear_update(struct rq *rq1, struct rq *rq2)
+{
+ rq1->clock_update_flags &= (RQCF_REQ_SKIP|RQCF_ACT_SKIP);
+ /* rq1 == rq2 for !CONFIG_SMP, so just clear RQCF_UPDATED once. */
+#ifdef CONFIG_SMP
+ rq2->clock_update_flags &= (RQCF_REQ_SKIP|RQCF_ACT_SKIP);
+#endif
+}
+#else
+static inline void double_rq_clock_clear_update(struct rq *rq1, struct rq *rq2) {}
+#endif
#ifdef CONFIG_SMP
@@ -2549,14 +2558,15 @@ static inline int _double_lock_balance(struct rq *this_rq, struct rq *busiest)
__acquires(busiest->lock)
__acquires(this_rq->lock)
{
- if (__rq_lockp(this_rq) == __rq_lockp(busiest))
- return 0;
-
- if (likely(raw_spin_rq_trylock(busiest)))
+ if (__rq_lockp(this_rq) == __rq_lockp(busiest) ||
+ likely(raw_spin_rq_trylock(busiest))) {
+ double_rq_clock_clear_update(this_rq, busiest);
return 0;
+ }
if (rq_order_less(this_rq, busiest)) {
raw_spin_rq_lock_nested(busiest, SINGLE_DEPTH_NESTING);
+ double_rq_clock_clear_update(this_rq, busiest);
return 0;
}
@@ -2650,6 +2660,7 @@ static inline void double_rq_lock(struct rq *rq1, struct rq *rq2)
BUG_ON(rq1 != rq2);
raw_spin_rq_lock(rq1);
__acquire(rq2->lock); /* Fake it out ;) */
+ double_rq_clock_clear_update(rq1, rq2);
}
/*
diff --git a/kernel/sched/smp.h b/kernel/sched/smp.h
index 9620e323162c..2eb23dd0f285 100644
--- a/kernel/sched/smp.h
+++ b/kernel/sched/smp.h
@@ -7,3 +7,9 @@
extern void sched_ttwu_pending(void *arg);
extern void send_call_function_single_ipi(int cpu);
+
+#ifdef CONFIG_SMP
+extern void flush_smp_call_function_queue(void);
+#else
+static inline void flush_smp_call_function_queue(void) { }
+#endif
diff --git a/kernel/scs.c b/kernel/scs.c
index 579841be8864..b7e1b096d906 100644
--- a/kernel/scs.c
+++ b/kernel/scs.c
@@ -32,15 +32,19 @@ static void *__scs_alloc(int node)
for (i = 0; i < NR_CACHED_SCS; i++) {
s = this_cpu_xchg(scs_cache[i], NULL);
if (s) {
- kasan_unpoison_vmalloc(s, SCS_SIZE);
+ s = kasan_unpoison_vmalloc(s, SCS_SIZE,
+ KASAN_VMALLOC_PROT_NORMAL);
memset(s, 0, SCS_SIZE);
- return s;
+ goto out;
}
}
- return __vmalloc_node_range(SCS_SIZE, 1, VMALLOC_START, VMALLOC_END,
+ s = __vmalloc_node_range(SCS_SIZE, 1, VMALLOC_START, VMALLOC_END,
GFP_SCS, PAGE_KERNEL, 0, node,
__builtin_return_address(0));
+
+out:
+ return kasan_reset_tag(s);
}
void *scs_alloc(int node)
@@ -78,7 +82,7 @@ void scs_free(void *s)
if (this_cpu_cmpxchg(scs_cache[i], 0, s) == NULL)
return;
- kasan_unpoison_vmalloc(s, SCS_SIZE);
+ kasan_unpoison_vmalloc(s, SCS_SIZE, KASAN_VMALLOC_PROT_NORMAL);
vfree_atomic(s);
}
diff --git a/kernel/seccomp.c b/kernel/seccomp.c
index db10e73d06e0..e9852d1b4a5e 100644
--- a/kernel/seccomp.c
+++ b/kernel/seccomp.c
@@ -42,7 +42,6 @@
#include <linux/pid.h>
#include <linux/ptrace.h>
#include <linux/capability.h>
-#include <linux/tracehook.h>
#include <linux/uaccess.h>
#include <linux/anon_inodes.h>
#include <linux/lockdep.h>
@@ -201,6 +200,8 @@ static inline void seccomp_cache_prepare(struct seccomp_filter *sfilter)
* the filter can be freed.
* @cache: cache of arch/syscall mappings to actions
* @log: true if all actions except for SECCOMP_RET_ALLOW should be logged
+ * @wait_killable_recv: Put notifying process in killable state once the
+ * notification is received by the userspace listener.
* @prev: points to a previously installed, or inherited, filter
* @prog: the BPF program to evaluate
* @notif: the struct that holds all notification related information
@@ -221,6 +222,7 @@ struct seccomp_filter {
refcount_t refs;
refcount_t users;
bool log;
+ bool wait_killable_recv;
struct action_cache cache;
struct seccomp_filter *prev;
struct bpf_prog *prog;
@@ -894,6 +896,10 @@ static long seccomp_attach_filter(unsigned int flags,
if (flags & SECCOMP_FILTER_FLAG_LOG)
filter->log = true;
+ /* Set wait killable flag, if present. */
+ if (flags & SECCOMP_FILTER_FLAG_WAIT_KILLABLE_RECV)
+ filter->wait_killable_recv = true;
+
/*
* If there is an existing filter, make it the prev and don't drop its
* task reference.
@@ -1081,6 +1087,12 @@ static void seccomp_handle_addfd(struct seccomp_kaddfd *addfd, struct seccomp_kn
complete(&addfd->completion);
}
+static bool should_sleep_killable(struct seccomp_filter *match,
+ struct seccomp_knotif *n)
+{
+ return match->wait_killable_recv && n->state == SECCOMP_NOTIFY_SENT;
+}
+
static int seccomp_do_user_notification(int this_syscall,
struct seccomp_filter *match,
const struct seccomp_data *sd)
@@ -1101,7 +1113,7 @@ static int seccomp_do_user_notification(int this_syscall,
n.data = sd;
n.id = seccomp_next_notify_id(match);
init_completion(&n.ready);
- list_add(&n.list, &match->notif->notifications);
+ list_add_tail(&n.list, &match->notif->notifications);
INIT_LIST_HEAD(&n.addfd);
up(&match->notif->request);
@@ -1111,11 +1123,25 @@ static int seccomp_do_user_notification(int this_syscall,
* This is where we wait for a reply from userspace.
*/
do {
+ bool wait_killable = should_sleep_killable(match, &n);
+
mutex_unlock(&match->notify_lock);
- err = wait_for_completion_interruptible(&n.ready);
+ if (wait_killable)
+ err = wait_for_completion_killable(&n.ready);
+ else
+ err = wait_for_completion_interruptible(&n.ready);
mutex_lock(&match->notify_lock);
- if (err != 0)
+
+ if (err != 0) {
+ /*
+ * Check to see if the notifcation got picked up and
+ * whether we should switch to wait killable.
+ */
+ if (!wait_killable && should_sleep_killable(match, &n))
+ continue;
+
goto interrupted;
+ }
addfd = list_first_entry_or_null(&n.addfd,
struct seccomp_kaddfd, list);
@@ -1485,6 +1511,9 @@ out:
mutex_lock(&filter->notify_lock);
knotif = find_notification(filter, unotif.id);
if (knotif) {
+ /* Reset the process to make sure it's not stuck */
+ if (should_sleep_killable(filter, knotif))
+ complete(&knotif->ready);
knotif->state = SECCOMP_NOTIFY_INIT;
up(&filter->notif->request);
}
@@ -1830,6 +1859,14 @@ static long seccomp_set_mode_filter(unsigned int flags,
((flags & SECCOMP_FILTER_FLAG_TSYNC_ESRCH) == 0))
return -EINVAL;
+ /*
+ * The SECCOMP_FILTER_FLAG_WAIT_KILLABLE_SENT flag doesn't make sense
+ * without the SECCOMP_FILTER_FLAG_NEW_LISTENER flag.
+ */
+ if ((flags & SECCOMP_FILTER_FLAG_WAIT_KILLABLE_RECV) &&
+ ((flags & SECCOMP_FILTER_FLAG_NEW_LISTENER) == 0))
+ return -EINVAL;
+
/* Prepare the new filter before holding any locks. */
prepared = seccomp_prepare_user_filter(filter);
if (IS_ERR(prepared))
diff --git a/kernel/signal.c b/kernel/signal.c
index e93de6daa188..e43bc2a692f5 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -32,7 +32,7 @@
#include <linux/signal.h>
#include <linux/signalfd.h>
#include <linux/ratelimit.h>
-#include <linux/tracehook.h>
+#include <linux/task_work.h>
#include <linux/capability.h>
#include <linux/freezer.h>
#include <linux/pid_namespace.h>
@@ -1308,43 +1308,6 @@ enum sig_handler {
};
/*
- * On some archictectures, PREEMPT_RT has to delay sending a signal from a
- * trap since it cannot enable preemption, and the signal code's
- * spin_locks turn into mutexes. Instead, it must set TIF_NOTIFY_RESUME
- * which will send the signal on exit of the trap.
- */
-#ifdef CONFIG_RT_DELAYED_SIGNALS
-static inline bool force_sig_delayed(struct kernel_siginfo *info,
- struct task_struct *t)
-{
- if (!in_atomic())
- return false;
-
- if (WARN_ON_ONCE(t->forced_info.si_signo))
- return true;
-
- if (is_si_special(info)) {
- WARN_ON_ONCE(info != SEND_SIG_PRIV);
- t->forced_info.si_signo = info->si_signo;
- t->forced_info.si_errno = 0;
- t->forced_info.si_code = SI_KERNEL;
- t->forced_info.si_pid = 0;
- t->forced_info.si_uid = 0;
- } else {
- t->forced_info = *info;
- }
- set_tsk_thread_flag(t, TIF_NOTIFY_RESUME);
- return true;
-}
-#else
-static inline bool force_sig_delayed(struct kernel_siginfo *info,
- struct task_struct *t)
-{
- return false;
-}
-#endif
-
-/*
* Force a signal that the process can't ignore: if necessary
* we unblock the signal and change any SIG_IGN to SIG_DFL.
*
@@ -1364,9 +1327,6 @@ force_sig_info_to_task(struct kernel_siginfo *info, struct task_struct *t,
struct k_sigaction *action;
int sig = info->si_signo;
- if (force_sig_delayed(info, t))
- return 0;
-
spin_lock_irqsave(&t->sighand->siglock, flags);
action = &t->sighand->action[sig-1];
ignored = action->sa.sa_handler == SIG_IGN;
@@ -1845,7 +1805,7 @@ int force_sig_pkuerr(void __user *addr, u32 pkey)
}
#endif
-int force_sig_perf(void __user *addr, u32 type, u64 sig_data)
+int send_sig_perf(void __user *addr, u32 type, u64 sig_data)
{
struct kernel_siginfo info;
@@ -1857,7 +1817,18 @@ int force_sig_perf(void __user *addr, u32 type, u64 sig_data)
info.si_perf_data = sig_data;
info.si_perf_type = type;
- return force_sig_info(&info);
+ /*
+ * Signals generated by perf events should not terminate the whole
+ * process if SIGTRAP is blocked, however, delivering the signal
+ * asynchronously is better than not delivering at all. But tell user
+ * space if the signal was asynchronous, so it can clearly be
+ * distinguished from normal synchronous ones.
+ */
+ info.si_perf_flags = sigismember(&current->blocked, info.si_signo) ?
+ TRAP_PERF_FLAG_ASYNC :
+ 0;
+
+ return send_sig_info(info.si_signo, &info, current);
}
/**
@@ -2229,14 +2200,17 @@ static void do_notify_parent_cldstop(struct task_struct *tsk,
* That makes it a way to test a stopped process for
* being ptrace-stopped vs being job-control-stopped.
*
- * If we actually decide not to stop at all because the tracer
- * is gone, we keep current->exit_code unless clear_code.
+ * Returns the signal the ptracer requested the code resume
+ * with. If the code did not stop because the tracer is gone,
+ * the stop signal remains unchanged unless clear_code.
*/
-static void ptrace_stop(int exit_code, int why, int clear_code, kernel_siginfo_t *info)
+static int ptrace_stop(int exit_code, int why, int clear_code,
+ unsigned long message, kernel_siginfo_t *info)
__releases(&current->sighand->siglock)
__acquires(&current->sighand->siglock)
{
bool gstop_done = false;
+ bool read_code = true;
if (arch_ptrace_stop_needed()) {
/*
@@ -2278,6 +2252,7 @@ static void ptrace_stop(int exit_code, int why, int clear_code, kernel_siginfo_t
*/
smp_wmb();
+ current->ptrace_message = message;
current->last_siginfo = info;
current->exit_code = exit_code;
@@ -2344,8 +2319,9 @@ static void ptrace_stop(int exit_code, int why, int clear_code, kernel_siginfo_t
/* tasklist protects us from ptrace_freeze_traced() */
__set_current_state(TASK_RUNNING);
+ read_code = false;
if (clear_code)
- current->exit_code = 0;
+ exit_code = 0;
read_unlock(&tasklist_lock);
}
@@ -2355,7 +2331,11 @@ static void ptrace_stop(int exit_code, int why, int clear_code, kernel_siginfo_t
* any signal-sending on another CPU that wants to examine it.
*/
spin_lock_irq(&current->sighand->siglock);
+ if (read_code)
+ exit_code = current->exit_code;
current->last_siginfo = NULL;
+ current->ptrace_message = 0;
+ current->exit_code = 0;
/* LISTENING can be set only during STOP traps, clear it */
current->jobctl &= ~JOBCTL_LISTENING;
@@ -2366,9 +2346,10 @@ static void ptrace_stop(int exit_code, int why, int clear_code, kernel_siginfo_t
* This sets TIF_SIGPENDING, but never clears it.
*/
recalc_sigpending_tsk(current);
+ return exit_code;
}
-static void ptrace_do_notify(int signr, int exit_code, int why)
+static int ptrace_do_notify(int signr, int exit_code, int why, unsigned long message)
{
kernel_siginfo_t info;
@@ -2379,18 +2360,21 @@ static void ptrace_do_notify(int signr, int exit_code, int why)
info.si_uid = from_kuid_munged(current_user_ns(), current_uid());
/* Let the debugger run. */
- ptrace_stop(exit_code, why, 1, &info);
+ return ptrace_stop(exit_code, why, 1, message, &info);
}
-void ptrace_notify(int exit_code)
+int ptrace_notify(int exit_code, unsigned long message)
{
+ int signr;
+
BUG_ON((exit_code & (0x7f | ~0xffff)) != SIGTRAP);
- if (unlikely(current->task_works))
+ if (unlikely(task_work_pending(current)))
task_work_run();
spin_lock_irq(&current->sighand->siglock);
- ptrace_do_notify(SIGTRAP, exit_code, CLD_TRAPPED);
+ signr = ptrace_do_notify(SIGTRAP, exit_code, CLD_TRAPPED, message);
spin_unlock_irq(&current->sighand->siglock);
+ return signr;
}
/**
@@ -2545,11 +2529,10 @@ static void do_jobctl_trap(void)
signr = SIGTRAP;
WARN_ON_ONCE(!signr);
ptrace_do_notify(signr, signr | (PTRACE_EVENT_STOP << 8),
- CLD_STOPPED);
+ CLD_STOPPED, 0);
} else {
WARN_ON_ONCE(!signr);
- ptrace_stop(signr, CLD_STOPPED, 0, NULL);
- current->exit_code = 0;
+ ptrace_stop(signr, CLD_STOPPED, 0, 0, NULL);
}
}
@@ -2602,15 +2585,12 @@ static int ptrace_signal(int signr, kernel_siginfo_t *info, enum pid_type type)
* comment in dequeue_signal().
*/
current->jobctl |= JOBCTL_STOP_DEQUEUED;
- ptrace_stop(signr, CLD_TRAPPED, 0, info);
+ signr = ptrace_stop(signr, CLD_TRAPPED, 0, 0, info);
/* We're back. Did the debugger cancel the sig? */
- signr = current->exit_code;
if (signr == 0)
return signr;
- current->exit_code = 0;
-
/*
* Update the siginfo structure if the signal has
* changed. If the debugger wanted something
@@ -2667,20 +2647,12 @@ bool get_signal(struct ksignal *ksig)
struct signal_struct *signal = current->signal;
int signr;
- if (unlikely(current->task_works))
+ clear_notify_signal();
+ if (unlikely(task_work_pending(current)))
task_work_run();
- /*
- * For non-generic architectures, check for TIF_NOTIFY_SIGNAL so
- * that the arch handlers don't all have to do it. If we get here
- * without TIF_SIGPENDING, just exit after running signal work.
- */
- if (!IS_ENABLED(CONFIG_GENERIC_ENTRY)) {
- if (test_thread_flag(TIF_NOTIFY_SIGNAL))
- tracehook_notify_signal();
- if (!task_sigpending(current))
- return false;
- }
+ if (!task_sigpending(current))
+ return false;
if (unlikely(uprobe_deny_signal()))
return false;
@@ -2939,7 +2911,8 @@ static void signal_delivered(struct ksignal *ksig, int stepping)
set_current_blocked(&blocked);
if (current->sas_ss_flags & SS_AUTODISARM)
sas_ss_reset(current);
- tracehook_signal_handler(stepping);
+ if (stepping)
+ ptrace_notify(SIGTRAP, 0);
}
void signal_setup_done(int failed, struct ksignal *ksig, int stepping)
@@ -3470,6 +3443,7 @@ void copy_siginfo_to_external32(struct compat_siginfo *to,
to->si_addr = ptr_to_compat(from->si_addr);
to->si_perf_data = from->si_perf_data;
to->si_perf_type = from->si_perf_type;
+ to->si_perf_flags = from->si_perf_flags;
break;
case SIL_CHLD:
to->si_pid = from->si_pid;
@@ -3547,6 +3521,7 @@ static int post_copy_siginfo_from_user32(kernel_siginfo_t *to,
to->si_addr = compat_ptr(from->si_addr);
to->si_perf_data = from->si_perf_data;
to->si_perf_type = from->si_perf_type;
+ to->si_perf_flags = from->si_perf_flags;
break;
case SIL_CHLD:
to->si_pid = from->si_pid;
@@ -4760,6 +4735,7 @@ static inline void siginfo_buildtime_checks(void)
CHECK_OFFSET(si_pkey);
CHECK_OFFSET(si_perf_data);
CHECK_OFFSET(si_perf_type);
+ CHECK_OFFSET(si_perf_flags);
/* sigpoll */
CHECK_OFFSET(si_band);
diff --git a/kernel/smp.c b/kernel/smp.c
index 01a7c1706a58..dd215f439426 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -96,7 +96,7 @@ static DEFINE_PER_CPU_ALIGNED(struct call_function_data, cfd_data);
static DEFINE_PER_CPU_SHARED_ALIGNED(struct llist_head, call_single_queue);
-static void flush_smp_call_function_queue(bool warn_cpu_offline);
+static void __flush_smp_call_function_queue(bool warn_cpu_offline);
int smpcfd_prepare_cpu(unsigned int cpu)
{
@@ -141,7 +141,7 @@ int smpcfd_dying_cpu(unsigned int cpu)
* ensure that the outgoing CPU doesn't go offline with work
* still pending.
*/
- flush_smp_call_function_queue(false);
+ __flush_smp_call_function_queue(false);
irq_work_run();
return 0;
}
@@ -183,7 +183,9 @@ static DEFINE_PER_CPU(smp_call_func_t, cur_csd_func);
static DEFINE_PER_CPU(void *, cur_csd_info);
static DEFINE_PER_CPU(struct cfd_seq_local, cfd_seq_local);
-#define CSD_LOCK_TIMEOUT (5ULL * NSEC_PER_SEC)
+static ulong csd_lock_timeout = 5000; /* CSD lock timeout in milliseconds. */
+module_param(csd_lock_timeout, ulong, 0444);
+
static atomic_t csd_bug_count = ATOMIC_INIT(0);
static u64 cfd_seq;
@@ -329,6 +331,7 @@ static bool csd_lock_wait_toolong(struct __call_single_data *csd, u64 ts0, u64 *
u64 ts2, ts_delta;
call_single_data_t *cpu_cur_csd;
unsigned int flags = READ_ONCE(csd->node.u_flags);
+ unsigned long long csd_lock_timeout_ns = csd_lock_timeout * NSEC_PER_MSEC;
if (!(flags & CSD_FLAG_LOCK)) {
if (!unlikely(*bug_id))
@@ -341,7 +344,7 @@ static bool csd_lock_wait_toolong(struct __call_single_data *csd, u64 ts0, u64 *
ts2 = sched_clock();
ts_delta = ts2 - *ts1;
- if (likely(ts_delta <= CSD_LOCK_TIMEOUT))
+ if (likely(ts_delta <= csd_lock_timeout_ns || csd_lock_timeout_ns == 0))
return false;
firsttime = !*bug_id;
@@ -541,11 +544,11 @@ void generic_smp_call_function_single_interrupt(void)
{
cfd_seq_store(this_cpu_ptr(&cfd_seq_local)->gotipi, CFD_SEQ_NOCPU,
smp_processor_id(), CFD_SEQ_GOTIPI);
- flush_smp_call_function_queue(true);
+ __flush_smp_call_function_queue(true);
}
/**
- * flush_smp_call_function_queue - Flush pending smp-call-function callbacks
+ * __flush_smp_call_function_queue - Flush pending smp-call-function callbacks
*
* @warn_cpu_offline: If set to 'true', warn if callbacks were queued on an
* offline CPU. Skip this check if set to 'false'.
@@ -558,7 +561,7 @@ void generic_smp_call_function_single_interrupt(void)
* Loop through the call_single_queue and run all the queued callbacks.
* Must be called with interrupts disabled.
*/
-static void flush_smp_call_function_queue(bool warn_cpu_offline)
+static void __flush_smp_call_function_queue(bool warn_cpu_offline)
{
call_single_data_t *csd, *csd_next;
struct llist_node *entry, *prev;
@@ -579,7 +582,7 @@ static void flush_smp_call_function_queue(bool warn_cpu_offline)
/* There shouldn't be any pending callbacks on an offline CPU. */
if (unlikely(warn_cpu_offline && !cpu_online(smp_processor_id()) &&
- !warned && !llist_empty(head))) {
+ !warned && entry != NULL)) {
warned = true;
WARN(1, "IPI on offline CPU %d\n", smp_processor_id());
@@ -681,8 +684,22 @@ static void flush_smp_call_function_queue(bool warn_cpu_offline)
smp_processor_id(), CFD_SEQ_HDLEND);
}
-void flush_smp_call_function_from_idle(void)
+
+/**
+ * flush_smp_call_function_queue - Flush pending smp-call-function callbacks
+ * from task context (idle, migration thread)
+ *
+ * When TIF_POLLING_NRFLAG is supported and a CPU is in idle and has it
+ * set, then remote CPUs can avoid sending IPIs and wake the idle CPU by
+ * setting TIF_NEED_RESCHED. The idle task on the woken up CPU has to
+ * handle queued SMP function calls before scheduling.
+ *
+ * The migration thread has to ensure that an eventually pending wakeup has
+ * been handled before it migrates a task.
+ */
+void flush_smp_call_function_queue(void)
{
+ unsigned int was_pending;
unsigned long flags;
if (llist_empty(this_cpu_ptr(&call_single_queue)))
@@ -691,9 +708,11 @@ void flush_smp_call_function_from_idle(void)
cfd_seq_store(this_cpu_ptr(&cfd_seq_local)->idle, CFD_SEQ_NOCPU,
smp_processor_id(), CFD_SEQ_IDLE);
local_irq_save(flags);
- flush_smp_call_function_queue(true);
+ /* Get the already pending soft interrupts for RT enabled kernels */
+ was_pending = local_softirq_pending();
+ __flush_smp_call_function_queue(true);
if (local_softirq_pending())
- do_softirq();
+ do_softirq_post_smp_call_flush(was_pending);
local_irq_restore(flags);
}
diff --git a/kernel/smpboot.c b/kernel/smpboot.c
index f6bc0bc8a2aa..b9f54544e749 100644
--- a/kernel/smpboot.c
+++ b/kernel/smpboot.c
@@ -392,6 +392,13 @@ int cpu_check_up_prepare(int cpu)
*/
return -EAGAIN;
+ case CPU_UP_PREPARE:
+ /*
+ * Timeout while waiting for the CPU to show up. Allow to try
+ * again later.
+ */
+ return 0;
+
default:
/* Should not happen. Famous last words. */
diff --git a/kernel/softirq.c b/kernel/softirq.c
index fac801815554..9f0aef8aa9ff 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -294,6 +294,19 @@ static inline void invoke_softirq(void)
wakeup_softirqd();
}
+/*
+ * flush_smp_call_function_queue() can raise a soft interrupt in a function
+ * call. On RT kernels this is undesired and the only known functionality
+ * in the block layer which does this is disabled on RT. If soft interrupts
+ * get raised which haven't been raised before the flush, warn so it can be
+ * investigated.
+ */
+void do_softirq_post_smp_call_flush(unsigned int was_pending)
+{
+ if (WARN_ON_ONCE(was_pending != local_softirq_pending()))
+ invoke_softirq();
+}
+
#else /* CONFIG_PREEMPT_RT */
/*
diff --git a/kernel/stackleak.c b/kernel/stackleak.c
index ddb5a7f48d69..c2c33d2202e9 100644
--- a/kernel/stackleak.c
+++ b/kernel/stackleak.c
@@ -70,59 +70,81 @@ late_initcall(stackleak_sysctls_init);
#define skip_erasing() false
#endif /* CONFIG_STACKLEAK_RUNTIME_DISABLE */
-asmlinkage void noinstr stackleak_erase(void)
+static __always_inline void __stackleak_erase(bool on_task_stack)
{
- /* It would be nice not to have 'kstack_ptr' and 'boundary' on stack */
- unsigned long kstack_ptr = current->lowest_stack;
- unsigned long boundary = (unsigned long)end_of_stack(current);
- unsigned int poison_count = 0;
- const unsigned int depth = STACKLEAK_SEARCH_DEPTH / sizeof(unsigned long);
-
- if (skip_erasing())
- return;
-
- /* Check that 'lowest_stack' value is sane */
- if (unlikely(kstack_ptr - boundary >= THREAD_SIZE))
- kstack_ptr = boundary;
+ const unsigned long task_stack_low = stackleak_task_low_bound(current);
+ const unsigned long task_stack_high = stackleak_task_high_bound(current);
+ unsigned long erase_low, erase_high;
- /* Search for the poison value in the kernel stack */
- while (kstack_ptr > boundary && poison_count <= depth) {
- if (*(unsigned long *)kstack_ptr == STACKLEAK_POISON)
- poison_count++;
- else
- poison_count = 0;
-
- kstack_ptr -= sizeof(unsigned long);
- }
-
- /*
- * One 'long int' at the bottom of the thread stack is reserved and
- * should not be poisoned (see CONFIG_SCHED_STACK_END_CHECK=y).
- */
- if (kstack_ptr == boundary)
- kstack_ptr += sizeof(unsigned long);
+ erase_low = stackleak_find_top_of_poison(task_stack_low,
+ current->lowest_stack);
#ifdef CONFIG_STACKLEAK_METRICS
- current->prev_lowest_stack = kstack_ptr;
+ current->prev_lowest_stack = erase_low;
#endif
/*
- * Now write the poison value to the kernel stack. Start from
- * 'kstack_ptr' and move up till the new 'boundary'. We assume that
- * the stack pointer doesn't change when we write poison.
+ * Write poison to the task's stack between 'erase_low' and
+ * 'erase_high'.
+ *
+ * If we're running on a different stack (e.g. an entry trampoline
+ * stack) we can erase everything below the pt_regs at the top of the
+ * task stack.
+ *
+ * If we're running on the task stack itself, we must not clobber any
+ * stack used by this function and its caller. We assume that this
+ * function has a fixed-size stack frame, and the current stack pointer
+ * doesn't change while we write poison.
*/
- if (on_thread_stack())
- boundary = current_stack_pointer;
+ if (on_task_stack)
+ erase_high = current_stack_pointer;
else
- boundary = current_top_of_stack();
+ erase_high = task_stack_high;
- while (kstack_ptr < boundary) {
- *(unsigned long *)kstack_ptr = STACKLEAK_POISON;
- kstack_ptr += sizeof(unsigned long);
+ while (erase_low < erase_high) {
+ *(unsigned long *)erase_low = STACKLEAK_POISON;
+ erase_low += sizeof(unsigned long);
}
/* Reset the 'lowest_stack' value for the next syscall */
- current->lowest_stack = current_top_of_stack() - THREAD_SIZE/64;
+ current->lowest_stack = task_stack_high;
+}
+
+/*
+ * Erase and poison the portion of the task stack used since the last erase.
+ * Can be called from the task stack or an entry stack when the task stack is
+ * no longer in use.
+ */
+asmlinkage void noinstr stackleak_erase(void)
+{
+ if (skip_erasing())
+ return;
+
+ __stackleak_erase(on_thread_stack());
+}
+
+/*
+ * Erase and poison the portion of the task stack used since the last erase.
+ * Can only be called from the task stack.
+ */
+asmlinkage void noinstr stackleak_erase_on_task_stack(void)
+{
+ if (skip_erasing())
+ return;
+
+ __stackleak_erase(true);
+}
+
+/*
+ * Erase and poison the portion of the task stack used since the last erase.
+ * Can only be called from a stack other than the task stack.
+ */
+asmlinkage void noinstr stackleak_erase_off_task_stack(void)
+{
+ if (skip_erasing())
+ return;
+
+ __stackleak_erase(false);
}
void __used __no_caller_saved_registers noinstr stackleak_track_stack(void)
@@ -139,8 +161,7 @@ void __used __no_caller_saved_registers noinstr stackleak_track_stack(void)
/* 'lowest_stack' should be aligned on the register width boundary */
sp = ALIGN(sp, sizeof(unsigned long));
if (sp < current->lowest_stack &&
- sp >= (unsigned long)task_stack_page(current) +
- sizeof(unsigned long)) {
+ sp >= stackleak_task_low_bound(current)) {
current->lowest_stack = sp;
}
}
diff --git a/kernel/stacktrace.c b/kernel/stacktrace.c
index 9c625257023d..9ed5ce989415 100644
--- a/kernel/stacktrace.c
+++ b/kernel/stacktrace.c
@@ -226,15 +226,12 @@ unsigned int stack_trace_save_user(unsigned long *store, unsigned int size)
.store = store,
.size = size,
};
- mm_segment_t fs;
/* Trace user stack if not a kernel thread */
if (current->flags & PF_KTHREAD)
return 0;
- fs = force_uaccess_begin();
arch_stack_walk_user(consume_entry, &c, task_pt_regs(current));
- force_uaccess_end(fs);
return c.len;
}
diff --git a/kernel/static_call.c b/kernel/static_call.c
index 43ba0b1e0edb..e9c3e69f3837 100644
--- a/kernel/static_call.c
+++ b/kernel/static_call.c
@@ -1,548 +1,8 @@
// SPDX-License-Identifier: GPL-2.0
-#include <linux/init.h>
#include <linux/static_call.h>
-#include <linux/bug.h>
-#include <linux/smp.h>
-#include <linux/sort.h>
-#include <linux/slab.h>
-#include <linux/module.h>
-#include <linux/cpu.h>
-#include <linux/processor.h>
-#include <asm/sections.h>
-
-extern struct static_call_site __start_static_call_sites[],
- __stop_static_call_sites[];
-extern struct static_call_tramp_key __start_static_call_tramp_key[],
- __stop_static_call_tramp_key[];
-
-static bool static_call_initialized;
-
-/* mutex to protect key modules/sites */
-static DEFINE_MUTEX(static_call_mutex);
-
-static void static_call_lock(void)
-{
- mutex_lock(&static_call_mutex);
-}
-
-static void static_call_unlock(void)
-{
- mutex_unlock(&static_call_mutex);
-}
-
-static inline void *static_call_addr(struct static_call_site *site)
-{
- return (void *)((long)site->addr + (long)&site->addr);
-}
-
-static inline unsigned long __static_call_key(const struct static_call_site *site)
-{
- return (long)site->key + (long)&site->key;
-}
-
-static inline struct static_call_key *static_call_key(const struct static_call_site *site)
-{
- return (void *)(__static_call_key(site) & ~STATIC_CALL_SITE_FLAGS);
-}
-
-/* These assume the key is word-aligned. */
-static inline bool static_call_is_init(struct static_call_site *site)
-{
- return __static_call_key(site) & STATIC_CALL_SITE_INIT;
-}
-
-static inline bool static_call_is_tail(struct static_call_site *site)
-{
- return __static_call_key(site) & STATIC_CALL_SITE_TAIL;
-}
-
-static inline void static_call_set_init(struct static_call_site *site)
-{
- site->key = (__static_call_key(site) | STATIC_CALL_SITE_INIT) -
- (long)&site->key;
-}
-
-static int static_call_site_cmp(const void *_a, const void *_b)
-{
- const struct static_call_site *a = _a;
- const struct static_call_site *b = _b;
- const struct static_call_key *key_a = static_call_key(a);
- const struct static_call_key *key_b = static_call_key(b);
-
- if (key_a < key_b)
- return -1;
-
- if (key_a > key_b)
- return 1;
-
- return 0;
-}
-
-static void static_call_site_swap(void *_a, void *_b, int size)
-{
- long delta = (unsigned long)_a - (unsigned long)_b;
- struct static_call_site *a = _a;
- struct static_call_site *b = _b;
- struct static_call_site tmp = *a;
-
- a->addr = b->addr - delta;
- a->key = b->key - delta;
-
- b->addr = tmp.addr + delta;
- b->key = tmp.key + delta;
-}
-
-static inline void static_call_sort_entries(struct static_call_site *start,
- struct static_call_site *stop)
-{
- sort(start, stop - start, sizeof(struct static_call_site),
- static_call_site_cmp, static_call_site_swap);
-}
-
-static inline bool static_call_key_has_mods(struct static_call_key *key)
-{
- return !(key->type & 1);
-}
-
-static inline struct static_call_mod *static_call_key_next(struct static_call_key *key)
-{
- if (!static_call_key_has_mods(key))
- return NULL;
-
- return key->mods;
-}
-
-static inline struct static_call_site *static_call_key_sites(struct static_call_key *key)
-{
- if (static_call_key_has_mods(key))
- return NULL;
-
- return (struct static_call_site *)(key->type & ~1);
-}
-
-void __static_call_update(struct static_call_key *key, void *tramp, void *func)
-{
- struct static_call_site *site, *stop;
- struct static_call_mod *site_mod, first;
-
- cpus_read_lock();
- static_call_lock();
-
- if (key->func == func)
- goto done;
-
- key->func = func;
-
- arch_static_call_transform(NULL, tramp, func, false);
-
- /*
- * If uninitialized, we'll not update the callsites, but they still
- * point to the trampoline and we just patched that.
- */
- if (WARN_ON_ONCE(!static_call_initialized))
- goto done;
-
- first = (struct static_call_mod){
- .next = static_call_key_next(key),
- .mod = NULL,
- .sites = static_call_key_sites(key),
- };
-
- for (site_mod = &first; site_mod; site_mod = site_mod->next) {
- bool init = system_state < SYSTEM_RUNNING;
- struct module *mod = site_mod->mod;
-
- if (!site_mod->sites) {
- /*
- * This can happen if the static call key is defined in
- * a module which doesn't use it.
- *
- * It also happens in the has_mods case, where the
- * 'first' entry has no sites associated with it.
- */
- continue;
- }
-
- stop = __stop_static_call_sites;
-
- if (mod) {
-#ifdef CONFIG_MODULES
- stop = mod->static_call_sites +
- mod->num_static_call_sites;
- init = mod->state == MODULE_STATE_COMING;
-#endif
- }
-
- for (site = site_mod->sites;
- site < stop && static_call_key(site) == key; site++) {
- void *site_addr = static_call_addr(site);
-
- if (!init && static_call_is_init(site))
- continue;
-
- if (!kernel_text_address((unsigned long)site_addr)) {
- /*
- * This skips patching built-in __exit, which
- * is part of init_section_contains() but is
- * not part of kernel_text_address().
- *
- * Skipping built-in __exit is fine since it
- * will never be executed.
- */
- WARN_ONCE(!static_call_is_init(site),
- "can't patch static call site at %pS",
- site_addr);
- continue;
- }
-
- arch_static_call_transform(site_addr, NULL, func,
- static_call_is_tail(site));
- }
- }
-
-done:
- static_call_unlock();
- cpus_read_unlock();
-}
-EXPORT_SYMBOL_GPL(__static_call_update);
-
-static int __static_call_init(struct module *mod,
- struct static_call_site *start,
- struct static_call_site *stop)
-{
- struct static_call_site *site;
- struct static_call_key *key, *prev_key = NULL;
- struct static_call_mod *site_mod;
-
- if (start == stop)
- return 0;
-
- static_call_sort_entries(start, stop);
-
- for (site = start; site < stop; site++) {
- void *site_addr = static_call_addr(site);
-
- if ((mod && within_module_init((unsigned long)site_addr, mod)) ||
- (!mod && init_section_contains(site_addr, 1)))
- static_call_set_init(site);
-
- key = static_call_key(site);
- if (key != prev_key) {
- prev_key = key;
-
- /*
- * For vmlinux (!mod) avoid the allocation by storing
- * the sites pointer in the key itself. Also see
- * __static_call_update()'s @first.
- *
- * This allows architectures (eg. x86) to call
- * static_call_init() before memory allocation works.
- */
- if (!mod) {
- key->sites = site;
- key->type |= 1;
- goto do_transform;
- }
-
- site_mod = kzalloc(sizeof(*site_mod), GFP_KERNEL);
- if (!site_mod)
- return -ENOMEM;
-
- /*
- * When the key has a direct sites pointer, extract
- * that into an explicit struct static_call_mod, so we
- * can have a list of modules.
- */
- if (static_call_key_sites(key)) {
- site_mod->mod = NULL;
- site_mod->next = NULL;
- site_mod->sites = static_call_key_sites(key);
-
- key->mods = site_mod;
-
- site_mod = kzalloc(sizeof(*site_mod), GFP_KERNEL);
- if (!site_mod)
- return -ENOMEM;
- }
-
- site_mod->mod = mod;
- site_mod->sites = site;
- site_mod->next = static_call_key_next(key);
- key->mods = site_mod;
- }
-
-do_transform:
- arch_static_call_transform(site_addr, NULL, key->func,
- static_call_is_tail(site));
- }
-
- return 0;
-}
-
-static int addr_conflict(struct static_call_site *site, void *start, void *end)
-{
- unsigned long addr = (unsigned long)static_call_addr(site);
-
- if (addr <= (unsigned long)end &&
- addr + CALL_INSN_SIZE > (unsigned long)start)
- return 1;
-
- return 0;
-}
-
-static int __static_call_text_reserved(struct static_call_site *iter_start,
- struct static_call_site *iter_stop,
- void *start, void *end, bool init)
-{
- struct static_call_site *iter = iter_start;
-
- while (iter < iter_stop) {
- if (init || !static_call_is_init(iter)) {
- if (addr_conflict(iter, start, end))
- return 1;
- }
- iter++;
- }
-
- return 0;
-}
-
-#ifdef CONFIG_MODULES
-
-static int __static_call_mod_text_reserved(void *start, void *end)
-{
- struct module *mod;
- int ret;
-
- preempt_disable();
- mod = __module_text_address((unsigned long)start);
- WARN_ON_ONCE(__module_text_address((unsigned long)end) != mod);
- if (!try_module_get(mod))
- mod = NULL;
- preempt_enable();
-
- if (!mod)
- return 0;
-
- ret = __static_call_text_reserved(mod->static_call_sites,
- mod->static_call_sites + mod->num_static_call_sites,
- start, end, mod->state == MODULE_STATE_COMING);
-
- module_put(mod);
-
- return ret;
-}
-
-static unsigned long tramp_key_lookup(unsigned long addr)
-{
- struct static_call_tramp_key *start = __start_static_call_tramp_key;
- struct static_call_tramp_key *stop = __stop_static_call_tramp_key;
- struct static_call_tramp_key *tramp_key;
-
- for (tramp_key = start; tramp_key != stop; tramp_key++) {
- unsigned long tramp;
-
- tramp = (long)tramp_key->tramp + (long)&tramp_key->tramp;
- if (tramp == addr)
- return (long)tramp_key->key + (long)&tramp_key->key;
- }
-
- return 0;
-}
-
-static int static_call_add_module(struct module *mod)
-{
- struct static_call_site *start = mod->static_call_sites;
- struct static_call_site *stop = start + mod->num_static_call_sites;
- struct static_call_site *site;
-
- for (site = start; site != stop; site++) {
- unsigned long s_key = __static_call_key(site);
- unsigned long addr = s_key & ~STATIC_CALL_SITE_FLAGS;
- unsigned long key;
-
- /*
- * Is the key is exported, 'addr' points to the key, which
- * means modules are allowed to call static_call_update() on
- * it.
- *
- * Otherwise, the key isn't exported, and 'addr' points to the
- * trampoline so we need to lookup the key.
- *
- * We go through this dance to prevent crazy modules from
- * abusing sensitive static calls.
- */
- if (!kernel_text_address(addr))
- continue;
-
- key = tramp_key_lookup(addr);
- if (!key) {
- pr_warn("Failed to fixup __raw_static_call() usage at: %ps\n",
- static_call_addr(site));
- return -EINVAL;
- }
-
- key |= s_key & STATIC_CALL_SITE_FLAGS;
- site->key = key - (long)&site->key;
- }
-
- return __static_call_init(mod, start, stop);
-}
-
-static void static_call_del_module(struct module *mod)
-{
- struct static_call_site *start = mod->static_call_sites;
- struct static_call_site *stop = mod->static_call_sites +
- mod->num_static_call_sites;
- struct static_call_key *key, *prev_key = NULL;
- struct static_call_mod *site_mod, **prev;
- struct static_call_site *site;
-
- for (site = start; site < stop; site++) {
- key = static_call_key(site);
- if (key == prev_key)
- continue;
-
- prev_key = key;
-
- for (prev = &key->mods, site_mod = key->mods;
- site_mod && site_mod->mod != mod;
- prev = &site_mod->next, site_mod = site_mod->next)
- ;
-
- if (!site_mod)
- continue;
-
- *prev = site_mod->next;
- kfree(site_mod);
- }
-}
-
-static int static_call_module_notify(struct notifier_block *nb,
- unsigned long val, void *data)
-{
- struct module *mod = data;
- int ret = 0;
-
- cpus_read_lock();
- static_call_lock();
-
- switch (val) {
- case MODULE_STATE_COMING:
- ret = static_call_add_module(mod);
- if (ret) {
- WARN(1, "Failed to allocate memory for static calls");
- static_call_del_module(mod);
- }
- break;
- case MODULE_STATE_GOING:
- static_call_del_module(mod);
- break;
- }
-
- static_call_unlock();
- cpus_read_unlock();
-
- return notifier_from_errno(ret);
-}
-
-static struct notifier_block static_call_module_nb = {
- .notifier_call = static_call_module_notify,
-};
-
-#else
-
-static inline int __static_call_mod_text_reserved(void *start, void *end)
-{
- return 0;
-}
-
-#endif /* CONFIG_MODULES */
-
-int static_call_text_reserved(void *start, void *end)
-{
- bool init = system_state < SYSTEM_RUNNING;
- int ret = __static_call_text_reserved(__start_static_call_sites,
- __stop_static_call_sites, start, end, init);
-
- if (ret)
- return ret;
-
- return __static_call_mod_text_reserved(start, end);
-}
-
-int __init static_call_init(void)
-{
- int ret;
-
- if (static_call_initialized)
- return 0;
-
- cpus_read_lock();
- static_call_lock();
- ret = __static_call_init(NULL, __start_static_call_sites,
- __stop_static_call_sites);
- static_call_unlock();
- cpus_read_unlock();
-
- if (ret) {
- pr_err("Failed to allocate memory for static_call!\n");
- BUG();
- }
-
- static_call_initialized = true;
-
-#ifdef CONFIG_MODULES
- register_module_notifier(&static_call_module_nb);
-#endif
- return 0;
-}
-early_initcall(static_call_init);
long __static_call_return0(void)
{
return 0;
}
-
-#ifdef CONFIG_STATIC_CALL_SELFTEST
-
-static int func_a(int x)
-{
- return x+1;
-}
-
-static int func_b(int x)
-{
- return x+2;
-}
-
-DEFINE_STATIC_CALL(sc_selftest, func_a);
-
-static struct static_call_data {
- int (*func)(int);
- int val;
- int expect;
-} static_call_data [] __initdata = {
- { NULL, 2, 3 },
- { func_b, 2, 4 },
- { func_a, 2, 3 }
-};
-
-static int __init test_static_call_init(void)
-{
- int i;
-
- for (i = 0; i < ARRAY_SIZE(static_call_data); i++ ) {
- struct static_call_data *scd = &static_call_data[i];
-
- if (scd->func)
- static_call_update(sc_selftest, scd->func);
-
- WARN_ON(static_call(sc_selftest)(scd->val) != scd->expect);
- }
-
- return 0;
-}
-early_initcall(test_static_call_init);
-
-#endif /* CONFIG_STATIC_CALL_SELFTEST */
+EXPORT_SYMBOL_GPL(__static_call_return0);
diff --git a/kernel/static_call_inline.c b/kernel/static_call_inline.c
new file mode 100644
index 000000000000..dc5665b62814
--- /dev/null
+++ b/kernel/static_call_inline.c
@@ -0,0 +1,543 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/init.h>
+#include <linux/static_call.h>
+#include <linux/bug.h>
+#include <linux/smp.h>
+#include <linux/sort.h>
+#include <linux/slab.h>
+#include <linux/module.h>
+#include <linux/cpu.h>
+#include <linux/processor.h>
+#include <asm/sections.h>
+
+extern struct static_call_site __start_static_call_sites[],
+ __stop_static_call_sites[];
+extern struct static_call_tramp_key __start_static_call_tramp_key[],
+ __stop_static_call_tramp_key[];
+
+static bool static_call_initialized;
+
+/* mutex to protect key modules/sites */
+static DEFINE_MUTEX(static_call_mutex);
+
+static void static_call_lock(void)
+{
+ mutex_lock(&static_call_mutex);
+}
+
+static void static_call_unlock(void)
+{
+ mutex_unlock(&static_call_mutex);
+}
+
+static inline void *static_call_addr(struct static_call_site *site)
+{
+ return (void *)((long)site->addr + (long)&site->addr);
+}
+
+static inline unsigned long __static_call_key(const struct static_call_site *site)
+{
+ return (long)site->key + (long)&site->key;
+}
+
+static inline struct static_call_key *static_call_key(const struct static_call_site *site)
+{
+ return (void *)(__static_call_key(site) & ~STATIC_CALL_SITE_FLAGS);
+}
+
+/* These assume the key is word-aligned. */
+static inline bool static_call_is_init(struct static_call_site *site)
+{
+ return __static_call_key(site) & STATIC_CALL_SITE_INIT;
+}
+
+static inline bool static_call_is_tail(struct static_call_site *site)
+{
+ return __static_call_key(site) & STATIC_CALL_SITE_TAIL;
+}
+
+static inline void static_call_set_init(struct static_call_site *site)
+{
+ site->key = (__static_call_key(site) | STATIC_CALL_SITE_INIT) -
+ (long)&site->key;
+}
+
+static int static_call_site_cmp(const void *_a, const void *_b)
+{
+ const struct static_call_site *a = _a;
+ const struct static_call_site *b = _b;
+ const struct static_call_key *key_a = static_call_key(a);
+ const struct static_call_key *key_b = static_call_key(b);
+
+ if (key_a < key_b)
+ return -1;
+
+ if (key_a > key_b)
+ return 1;
+
+ return 0;
+}
+
+static void static_call_site_swap(void *_a, void *_b, int size)
+{
+ long delta = (unsigned long)_a - (unsigned long)_b;
+ struct static_call_site *a = _a;
+ struct static_call_site *b = _b;
+ struct static_call_site tmp = *a;
+
+ a->addr = b->addr - delta;
+ a->key = b->key - delta;
+
+ b->addr = tmp.addr + delta;
+ b->key = tmp.key + delta;
+}
+
+static inline void static_call_sort_entries(struct static_call_site *start,
+ struct static_call_site *stop)
+{
+ sort(start, stop - start, sizeof(struct static_call_site),
+ static_call_site_cmp, static_call_site_swap);
+}
+
+static inline bool static_call_key_has_mods(struct static_call_key *key)
+{
+ return !(key->type & 1);
+}
+
+static inline struct static_call_mod *static_call_key_next(struct static_call_key *key)
+{
+ if (!static_call_key_has_mods(key))
+ return NULL;
+
+ return key->mods;
+}
+
+static inline struct static_call_site *static_call_key_sites(struct static_call_key *key)
+{
+ if (static_call_key_has_mods(key))
+ return NULL;
+
+ return (struct static_call_site *)(key->type & ~1);
+}
+
+void __static_call_update(struct static_call_key *key, void *tramp, void *func)
+{
+ struct static_call_site *site, *stop;
+ struct static_call_mod *site_mod, first;
+
+ cpus_read_lock();
+ static_call_lock();
+
+ if (key->func == func)
+ goto done;
+
+ key->func = func;
+
+ arch_static_call_transform(NULL, tramp, func, false);
+
+ /*
+ * If uninitialized, we'll not update the callsites, but they still
+ * point to the trampoline and we just patched that.
+ */
+ if (WARN_ON_ONCE(!static_call_initialized))
+ goto done;
+
+ first = (struct static_call_mod){
+ .next = static_call_key_next(key),
+ .mod = NULL,
+ .sites = static_call_key_sites(key),
+ };
+
+ for (site_mod = &first; site_mod; site_mod = site_mod->next) {
+ bool init = system_state < SYSTEM_RUNNING;
+ struct module *mod = site_mod->mod;
+
+ if (!site_mod->sites) {
+ /*
+ * This can happen if the static call key is defined in
+ * a module which doesn't use it.
+ *
+ * It also happens in the has_mods case, where the
+ * 'first' entry has no sites associated with it.
+ */
+ continue;
+ }
+
+ stop = __stop_static_call_sites;
+
+ if (mod) {
+#ifdef CONFIG_MODULES
+ stop = mod->static_call_sites +
+ mod->num_static_call_sites;
+ init = mod->state == MODULE_STATE_COMING;
+#endif
+ }
+
+ for (site = site_mod->sites;
+ site < stop && static_call_key(site) == key; site++) {
+ void *site_addr = static_call_addr(site);
+
+ if (!init && static_call_is_init(site))
+ continue;
+
+ if (!kernel_text_address((unsigned long)site_addr)) {
+ /*
+ * This skips patching built-in __exit, which
+ * is part of init_section_contains() but is
+ * not part of kernel_text_address().
+ *
+ * Skipping built-in __exit is fine since it
+ * will never be executed.
+ */
+ WARN_ONCE(!static_call_is_init(site),
+ "can't patch static call site at %pS",
+ site_addr);
+ continue;
+ }
+
+ arch_static_call_transform(site_addr, NULL, func,
+ static_call_is_tail(site));
+ }
+ }
+
+done:
+ static_call_unlock();
+ cpus_read_unlock();
+}
+EXPORT_SYMBOL_GPL(__static_call_update);
+
+static int __static_call_init(struct module *mod,
+ struct static_call_site *start,
+ struct static_call_site *stop)
+{
+ struct static_call_site *site;
+ struct static_call_key *key, *prev_key = NULL;
+ struct static_call_mod *site_mod;
+
+ if (start == stop)
+ return 0;
+
+ static_call_sort_entries(start, stop);
+
+ for (site = start; site < stop; site++) {
+ void *site_addr = static_call_addr(site);
+
+ if ((mod && within_module_init((unsigned long)site_addr, mod)) ||
+ (!mod && init_section_contains(site_addr, 1)))
+ static_call_set_init(site);
+
+ key = static_call_key(site);
+ if (key != prev_key) {
+ prev_key = key;
+
+ /*
+ * For vmlinux (!mod) avoid the allocation by storing
+ * the sites pointer in the key itself. Also see
+ * __static_call_update()'s @first.
+ *
+ * This allows architectures (eg. x86) to call
+ * static_call_init() before memory allocation works.
+ */
+ if (!mod) {
+ key->sites = site;
+ key->type |= 1;
+ goto do_transform;
+ }
+
+ site_mod = kzalloc(sizeof(*site_mod), GFP_KERNEL);
+ if (!site_mod)
+ return -ENOMEM;
+
+ /*
+ * When the key has a direct sites pointer, extract
+ * that into an explicit struct static_call_mod, so we
+ * can have a list of modules.
+ */
+ if (static_call_key_sites(key)) {
+ site_mod->mod = NULL;
+ site_mod->next = NULL;
+ site_mod->sites = static_call_key_sites(key);
+
+ key->mods = site_mod;
+
+ site_mod = kzalloc(sizeof(*site_mod), GFP_KERNEL);
+ if (!site_mod)
+ return -ENOMEM;
+ }
+
+ site_mod->mod = mod;
+ site_mod->sites = site;
+ site_mod->next = static_call_key_next(key);
+ key->mods = site_mod;
+ }
+
+do_transform:
+ arch_static_call_transform(site_addr, NULL, key->func,
+ static_call_is_tail(site));
+ }
+
+ return 0;
+}
+
+static int addr_conflict(struct static_call_site *site, void *start, void *end)
+{
+ unsigned long addr = (unsigned long)static_call_addr(site);
+
+ if (addr <= (unsigned long)end &&
+ addr + CALL_INSN_SIZE > (unsigned long)start)
+ return 1;
+
+ return 0;
+}
+
+static int __static_call_text_reserved(struct static_call_site *iter_start,
+ struct static_call_site *iter_stop,
+ void *start, void *end, bool init)
+{
+ struct static_call_site *iter = iter_start;
+
+ while (iter < iter_stop) {
+ if (init || !static_call_is_init(iter)) {
+ if (addr_conflict(iter, start, end))
+ return 1;
+ }
+ iter++;
+ }
+
+ return 0;
+}
+
+#ifdef CONFIG_MODULES
+
+static int __static_call_mod_text_reserved(void *start, void *end)
+{
+ struct module *mod;
+ int ret;
+
+ preempt_disable();
+ mod = __module_text_address((unsigned long)start);
+ WARN_ON_ONCE(__module_text_address((unsigned long)end) != mod);
+ if (!try_module_get(mod))
+ mod = NULL;
+ preempt_enable();
+
+ if (!mod)
+ return 0;
+
+ ret = __static_call_text_reserved(mod->static_call_sites,
+ mod->static_call_sites + mod->num_static_call_sites,
+ start, end, mod->state == MODULE_STATE_COMING);
+
+ module_put(mod);
+
+ return ret;
+}
+
+static unsigned long tramp_key_lookup(unsigned long addr)
+{
+ struct static_call_tramp_key *start = __start_static_call_tramp_key;
+ struct static_call_tramp_key *stop = __stop_static_call_tramp_key;
+ struct static_call_tramp_key *tramp_key;
+
+ for (tramp_key = start; tramp_key != stop; tramp_key++) {
+ unsigned long tramp;
+
+ tramp = (long)tramp_key->tramp + (long)&tramp_key->tramp;
+ if (tramp == addr)
+ return (long)tramp_key->key + (long)&tramp_key->key;
+ }
+
+ return 0;
+}
+
+static int static_call_add_module(struct module *mod)
+{
+ struct static_call_site *start = mod->static_call_sites;
+ struct static_call_site *stop = start + mod->num_static_call_sites;
+ struct static_call_site *site;
+
+ for (site = start; site != stop; site++) {
+ unsigned long s_key = __static_call_key(site);
+ unsigned long addr = s_key & ~STATIC_CALL_SITE_FLAGS;
+ unsigned long key;
+
+ /*
+ * Is the key is exported, 'addr' points to the key, which
+ * means modules are allowed to call static_call_update() on
+ * it.
+ *
+ * Otherwise, the key isn't exported, and 'addr' points to the
+ * trampoline so we need to lookup the key.
+ *
+ * We go through this dance to prevent crazy modules from
+ * abusing sensitive static calls.
+ */
+ if (!kernel_text_address(addr))
+ continue;
+
+ key = tramp_key_lookup(addr);
+ if (!key) {
+ pr_warn("Failed to fixup __raw_static_call() usage at: %ps\n",
+ static_call_addr(site));
+ return -EINVAL;
+ }
+
+ key |= s_key & STATIC_CALL_SITE_FLAGS;
+ site->key = key - (long)&site->key;
+ }
+
+ return __static_call_init(mod, start, stop);
+}
+
+static void static_call_del_module(struct module *mod)
+{
+ struct static_call_site *start = mod->static_call_sites;
+ struct static_call_site *stop = mod->static_call_sites +
+ mod->num_static_call_sites;
+ struct static_call_key *key, *prev_key = NULL;
+ struct static_call_mod *site_mod, **prev;
+ struct static_call_site *site;
+
+ for (site = start; site < stop; site++) {
+ key = static_call_key(site);
+ if (key == prev_key)
+ continue;
+
+ prev_key = key;
+
+ for (prev = &key->mods, site_mod = key->mods;
+ site_mod && site_mod->mod != mod;
+ prev = &site_mod->next, site_mod = site_mod->next)
+ ;
+
+ if (!site_mod)
+ continue;
+
+ *prev = site_mod->next;
+ kfree(site_mod);
+ }
+}
+
+static int static_call_module_notify(struct notifier_block *nb,
+ unsigned long val, void *data)
+{
+ struct module *mod = data;
+ int ret = 0;
+
+ cpus_read_lock();
+ static_call_lock();
+
+ switch (val) {
+ case MODULE_STATE_COMING:
+ ret = static_call_add_module(mod);
+ if (ret) {
+ WARN(1, "Failed to allocate memory for static calls");
+ static_call_del_module(mod);
+ }
+ break;
+ case MODULE_STATE_GOING:
+ static_call_del_module(mod);
+ break;
+ }
+
+ static_call_unlock();
+ cpus_read_unlock();
+
+ return notifier_from_errno(ret);
+}
+
+static struct notifier_block static_call_module_nb = {
+ .notifier_call = static_call_module_notify,
+};
+
+#else
+
+static inline int __static_call_mod_text_reserved(void *start, void *end)
+{
+ return 0;
+}
+
+#endif /* CONFIG_MODULES */
+
+int static_call_text_reserved(void *start, void *end)
+{
+ bool init = system_state < SYSTEM_RUNNING;
+ int ret = __static_call_text_reserved(__start_static_call_sites,
+ __stop_static_call_sites, start, end, init);
+
+ if (ret)
+ return ret;
+
+ return __static_call_mod_text_reserved(start, end);
+}
+
+int __init static_call_init(void)
+{
+ int ret;
+
+ if (static_call_initialized)
+ return 0;
+
+ cpus_read_lock();
+ static_call_lock();
+ ret = __static_call_init(NULL, __start_static_call_sites,
+ __stop_static_call_sites);
+ static_call_unlock();
+ cpus_read_unlock();
+
+ if (ret) {
+ pr_err("Failed to allocate memory for static_call!\n");
+ BUG();
+ }
+
+ static_call_initialized = true;
+
+#ifdef CONFIG_MODULES
+ register_module_notifier(&static_call_module_nb);
+#endif
+ return 0;
+}
+early_initcall(static_call_init);
+
+#ifdef CONFIG_STATIC_CALL_SELFTEST
+
+static int func_a(int x)
+{
+ return x+1;
+}
+
+static int func_b(int x)
+{
+ return x+2;
+}
+
+DEFINE_STATIC_CALL(sc_selftest, func_a);
+
+static struct static_call_data {
+ int (*func)(int);
+ int val;
+ int expect;
+} static_call_data [] __initdata = {
+ { NULL, 2, 3 },
+ { func_b, 2, 4 },
+ { func_a, 2, 3 }
+};
+
+static int __init test_static_call_init(void)
+{
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(static_call_data); i++ ) {
+ struct static_call_data *scd = &static_call_data[i];
+
+ if (scd->func)
+ static_call_update(sc_selftest, scd->func);
+
+ WARN_ON(static_call(sc_selftest)(scd->val) != scd->expect);
+ }
+
+ return 0;
+}
+early_initcall(test_static_call_init);
+
+#endif /* CONFIG_STATIC_CALL_SELFTEST */
diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c
index cbc30271ea4d..cedb17ba158a 100644
--- a/kernel/stop_machine.c
+++ b/kernel/stop_machine.c
@@ -535,8 +535,6 @@ void stop_machine_park(int cpu)
kthread_park(stopper->thread);
}
-extern void sched_set_stop_task(int cpu, struct task_struct *stop);
-
static void cpu_stop_create(unsigned int cpu)
{
sched_set_stop_task(cpu, per_cpu(cpu_stopper.thread, cpu));
@@ -633,6 +631,27 @@ int stop_machine(cpu_stop_fn_t fn, void *data, const struct cpumask *cpus)
}
EXPORT_SYMBOL_GPL(stop_machine);
+#ifdef CONFIG_SCHED_SMT
+int stop_core_cpuslocked(unsigned int cpu, cpu_stop_fn_t fn, void *data)
+{
+ const struct cpumask *smt_mask = cpu_smt_mask(cpu);
+
+ struct multi_stop_data msdata = {
+ .fn = fn,
+ .data = data,
+ .num_threads = cpumask_weight(smt_mask),
+ .active_cpus = smt_mask,
+ };
+
+ lockdep_assert_cpus_held();
+
+ /* Set the initial state and stop all online cpus. */
+ set_state(&msdata, MULTI_STOP_PREPARE);
+ return stop_cpus(smt_mask, multi_cpu_stop, &msdata);
+}
+EXPORT_SYMBOL_GPL(stop_core_cpuslocked);
+#endif
+
/**
* stop_machine_from_inactive_cpu - stop_machine() from inactive CPU
* @fn: the function to run
diff --git a/kernel/sys.c b/kernel/sys.c
index 5b0e172c4d47..b911fa6d81ab 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -117,6 +117,12 @@
#ifndef SVE_GET_VL
# define SVE_GET_VL() (-EINVAL)
#endif
+#ifndef SME_SET_VL
+# define SME_SET_VL(a) (-EINVAL)
+#endif
+#ifndef SME_GET_VL
+# define SME_GET_VL() (-EINVAL)
+#endif
#ifndef PAC_RESET_KEYS
# define PAC_RESET_KEYS(a, b) (-EINVAL)
#endif
@@ -1424,6 +1430,68 @@ SYSCALL_DEFINE2(setdomainname, char __user *, name, int, len)
return errno;
}
+/* make sure you are allowed to change @tsk limits before calling this */
+static int do_prlimit(struct task_struct *tsk, unsigned int resource,
+ struct rlimit *new_rlim, struct rlimit *old_rlim)
+{
+ struct rlimit *rlim;
+ int retval = 0;
+
+ if (resource >= RLIM_NLIMITS)
+ return -EINVAL;
+ if (new_rlim) {
+ if (new_rlim->rlim_cur > new_rlim->rlim_max)
+ return -EINVAL;
+ if (resource == RLIMIT_NOFILE &&
+ new_rlim->rlim_max > sysctl_nr_open)
+ return -EPERM;
+ }
+
+ /* Holding a refcount on tsk protects tsk->signal from disappearing. */
+ rlim = tsk->signal->rlim + resource;
+ task_lock(tsk->group_leader);
+ if (new_rlim) {
+ /*
+ * Keep the capable check against init_user_ns until cgroups can
+ * contain all limits.
+ */
+ if (new_rlim->rlim_max > rlim->rlim_max &&
+ !capable(CAP_SYS_RESOURCE))
+ retval = -EPERM;
+ if (!retval)
+ retval = security_task_setrlimit(tsk, resource, new_rlim);
+ }
+ if (!retval) {
+ if (old_rlim)
+ *old_rlim = *rlim;
+ if (new_rlim)
+ *rlim = *new_rlim;
+ }
+ task_unlock(tsk->group_leader);
+
+ /*
+ * RLIMIT_CPU handling. Arm the posix CPU timer if the limit is not
+ * infinite. In case of RLIM_INFINITY the posix CPU timer code
+ * ignores the rlimit.
+ */
+ if (!retval && new_rlim && resource == RLIMIT_CPU &&
+ new_rlim->rlim_cur != RLIM_INFINITY &&
+ IS_ENABLED(CONFIG_POSIX_TIMERS)) {
+ /*
+ * update_rlimit_cpu can fail if the task is exiting, but there
+ * may be other tasks in the thread group that are not exiting,
+ * and they need their cpu timers adjusted.
+ *
+ * The group_leader is the last task to be released, so if we
+ * cannot update_rlimit_cpu on it, then the entire process is
+ * exiting and we do not need to update at all.
+ */
+ update_rlimit_cpu(tsk->group_leader, new_rlim->rlim_cur);
+ }
+
+ return retval;
+}
+
SYSCALL_DEFINE2(getrlimit, unsigned int, resource, struct rlimit __user *, rlim)
{
struct rlimit value;
@@ -1567,63 +1635,6 @@ static void rlim64_to_rlim(const struct rlimit64 *rlim64, struct rlimit *rlim)
rlim->rlim_max = (unsigned long)rlim64->rlim_max;
}
-/* make sure you are allowed to change @tsk limits before calling this */
-int do_prlimit(struct task_struct *tsk, unsigned int resource,
- struct rlimit *new_rlim, struct rlimit *old_rlim)
-{
- struct rlimit *rlim;
- int retval = 0;
-
- if (resource >= RLIM_NLIMITS)
- return -EINVAL;
- if (new_rlim) {
- if (new_rlim->rlim_cur > new_rlim->rlim_max)
- return -EINVAL;
- if (resource == RLIMIT_NOFILE &&
- new_rlim->rlim_max > sysctl_nr_open)
- return -EPERM;
- }
-
- /* protect tsk->signal and tsk->sighand from disappearing */
- read_lock(&tasklist_lock);
- if (!tsk->sighand) {
- retval = -ESRCH;
- goto out;
- }
-
- rlim = tsk->signal->rlim + resource;
- task_lock(tsk->group_leader);
- if (new_rlim) {
- /* Keep the capable check against init_user_ns until
- cgroups can contain all limits */
- if (new_rlim->rlim_max > rlim->rlim_max &&
- !capable(CAP_SYS_RESOURCE))
- retval = -EPERM;
- if (!retval)
- retval = security_task_setrlimit(tsk, resource, new_rlim);
- }
- if (!retval) {
- if (old_rlim)
- *old_rlim = *rlim;
- if (new_rlim)
- *rlim = *new_rlim;
- }
- task_unlock(tsk->group_leader);
-
- /*
- * RLIMIT_CPU handling. Arm the posix CPU timer if the limit is not
- * infinite. In case of RLIM_INFINITY the posix CPU timer code
- * ignores the rlimit.
- */
- if (!retval && new_rlim && resource == RLIMIT_CPU &&
- new_rlim->rlim_cur != RLIM_INFINITY &&
- IS_ENABLED(CONFIG_POSIX_TIMERS))
- update_rlimit_cpu(tsk, new_rlim->rlim_cur);
-out:
- read_unlock(&tasklist_lock);
- return retval;
-}
-
/* rcu lock must be held */
static int check_prlimit_permission(struct task_struct *task,
unsigned int flags)
@@ -2536,6 +2547,12 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
case PR_SVE_GET_VL:
error = SVE_GET_VL();
break;
+ case PR_SME_SET_VL:
+ error = SME_SET_VL(arg2);
+ break;
+ case PR_SME_GET_VL:
+ error = SME_GET_VL();
+ break;
case PR_GET_SPECULATION_CTRL:
if (arg3 || arg4 || arg5)
return -EINVAL;
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 830aaf8ca08e..5b7b1a82ae6a 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -2288,17 +2288,6 @@ static struct ctl_table kern_table[] = {
.extra1 = SYSCTL_ZERO,
.extra2 = SYSCTL_ONE,
},
-#if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON)
- {
- .procname = "timer_migration",
- .data = &sysctl_timer_migration,
- .maxlen = sizeof(unsigned int),
- .mode = 0644,
- .proc_handler = timer_migration_handler,
- .extra1 = SYSCTL_ZERO,
- .extra2 = SYSCTL_ONE,
- },
-#endif
#ifdef CONFIG_BPF_SYSCALL
{
.procname = "unprivileged_bpf_disabled",
diff --git a/kernel/task_work.c b/kernel/task_work.c
index 1698fbe6f0e1..dff75bcde151 100644
--- a/kernel/task_work.c
+++ b/kernel/task_work.c
@@ -1,7 +1,7 @@
// SPDX-License-Identifier: GPL-2.0
#include <linux/spinlock.h>
#include <linux/task_work.h>
-#include <linux/tracehook.h>
+#include <linux/resume_user_mode.h>
static struct callback_head work_exited; /* all we need is ->next == NULL */
@@ -12,12 +12,22 @@ static struct callback_head work_exited; /* all we need is ->next == NULL */
* @notify: how to notify the targeted task
*
* Queue @work for task_work_run() below and notify the @task if @notify
- * is @TWA_RESUME or @TWA_SIGNAL. @TWA_SIGNAL works like signals, in that the
- * it will interrupt the targeted task and run the task_work. @TWA_RESUME
- * work is run only when the task exits the kernel and returns to user mode,
- * or before entering guest mode. Fails if the @task is exiting/exited and thus
- * it can't process this @work. Otherwise @work->func() will be called when the
- * @task goes through one of the aforementioned transitions, or exits.
+ * is @TWA_RESUME, @TWA_SIGNAL, or @TWA_SIGNAL_NO_IPI.
+ *
+ * @TWA_SIGNAL works like signals, in that the it will interrupt the targeted
+ * task and run the task_work, regardless of whether the task is currently
+ * running in the kernel or userspace.
+ * @TWA_SIGNAL_NO_IPI works like @TWA_SIGNAL, except it doesn't send a
+ * reschedule IPI to force the targeted task to reschedule and run task_work.
+ * This can be advantageous if there's no strict requirement that the
+ * task_work be run as soon as possible, just whenever the task enters the
+ * kernel anyway.
+ * @TWA_RESUME work is run only when the task exits the kernel and returns to
+ * user mode, or before entering guest mode.
+ *
+ * Fails if the @task is exiting/exited and thus it can't process this @work.
+ * Otherwise @work->func() will be called when the @task goes through one of
+ * the aforementioned transitions, or exits.
*
* If the targeted task is exiting, then an error is returned and the work item
* is not queued. It's up to the caller to arrange for an alternative mechanism
@@ -53,6 +63,9 @@ int task_work_add(struct task_struct *task, struct callback_head *work,
case TWA_SIGNAL:
set_notify_signal(task);
break;
+ case TWA_SIGNAL_NO_IPI:
+ __set_notify_signal(task);
+ break;
default:
WARN_ON_ONCE(1);
break;
@@ -78,7 +91,7 @@ task_work_cancel_match(struct task_struct *task,
struct callback_head *work;
unsigned long flags;
- if (likely(!task->task_works))
+ if (likely(!task_work_pending(task)))
return NULL;
/*
* If cmpxchg() fails we continue without updating pprev.
diff --git a/kernel/taskstats.c b/kernel/taskstats.c
index 2b4898b4752e..bcac5a9043aa 100644
--- a/kernel/taskstats.c
+++ b/kernel/taskstats.c
@@ -113,13 +113,14 @@ static void send_cpu_listeners(struct sk_buff *skb,
struct listener *s, *tmp;
struct sk_buff *skb_next, *skb_cur = skb;
void *reply = genlmsg_data(genlhdr);
- int rc, delcount = 0;
+ int delcount = 0;
genlmsg_end(skb, reply);
- rc = 0;
down_read(&listeners->sem);
list_for_each_entry(s, &listeners->list, list) {
+ int rc;
+
skb_next = NULL;
if (!list_is_last(&s->list, &listeners->list)) {
skb_next = skb_clone(skb_cur, GFP_KERNEL);
diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c
index 003ccf338d20..5d85014d59b5 100644
--- a/kernel/time/clockevents.c
+++ b/kernel/time/clockevents.c
@@ -690,7 +690,7 @@ static ssize_t unbind_device_store(struct device *dev,
{
char name[CS_NAME_LEN];
ssize_t ret = sysfs_get_uname(buf, name, count);
- struct clock_event_device *ce;
+ struct clock_event_device *ce = NULL, *iter;
if (ret < 0)
return ret;
@@ -698,9 +698,10 @@ static ssize_t unbind_device_store(struct device *dev,
ret = -ENODEV;
mutex_lock(&clockevents_mutex);
raw_spin_lock_irq(&clockevents_lock);
- list_for_each_entry(ce, &clockevent_devices, list) {
- if (!strcmp(ce->name, name)) {
- ret = __clockevents_try_unbind(ce, dev->id);
+ list_for_each_entry(iter, &clockevent_devices, list) {
+ if (!strcmp(iter->name, name)) {
+ ret = __clockevents_try_unbind(iter, dev->id);
+ ce = iter;
break;
}
}
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index 95d7ca35bdf2..cee5da1e54c4 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -343,7 +343,7 @@ void clocksource_verify_percpu(struct clocksource *cs)
cpus_read_lock();
preempt_disable();
clocksource_verify_choose_cpus();
- if (cpumask_weight(&cpus_chosen) == 0) {
+ if (cpumask_empty(&cpus_chosen)) {
preempt_enable();
cpus_read_unlock();
pr_warn("Not enough CPUs to check clocksource '%s'.\n", cs->name);
diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c
index 96b4e7810426..0a97193984db 100644
--- a/kernel/time/posix-cpu-timers.c
+++ b/kernel/time/posix-cpu-timers.c
@@ -15,6 +15,7 @@
#include <linux/workqueue.h>
#include <linux/compat.h>
#include <linux/sched/deadline.h>
+#include <linux/task_work.h>
#include "posix-timers.h"
@@ -34,14 +35,20 @@ void posix_cputimers_group_init(struct posix_cputimers *pct, u64 cpu_limit)
* tsk->signal->posix_cputimers.bases[clock].nextevt expiration cache if
* necessary. Needs siglock protection since other code may update the
* expiration cache as well.
+ *
+ * Returns 0 on success, -ESRCH on failure. Can fail if the task is exiting and
+ * we cannot lock_task_sighand. Cannot fail if task is current.
*/
-void update_rlimit_cpu(struct task_struct *task, unsigned long rlim_new)
+int update_rlimit_cpu(struct task_struct *task, unsigned long rlim_new)
{
u64 nsecs = rlim_new * NSEC_PER_SEC;
+ unsigned long irq_fl;
- spin_lock_irq(&task->sighand->siglock);
+ if (!lock_task_sighand(task, &irq_fl))
+ return -ESRCH;
set_process_cpu_timer(task, CPUCLOCK_PROF, &nsecs, NULL);
- spin_unlock_irq(&task->sighand->siglock);
+ unlock_task_sighand(task, &irq_fl);
+ return 0;
}
/*
diff --git a/kernel/time/sched_clock.c b/kernel/time/sched_clock.c
index b1b9b12899f5..8464c5acc913 100644
--- a/kernel/time/sched_clock.c
+++ b/kernel/time/sched_clock.c
@@ -8,6 +8,7 @@
#include <linux/jiffies.h>
#include <linux/ktime.h>
#include <linux/kernel.h>
+#include <linux/math.h>
#include <linux/moduleparam.h>
#include <linux/sched.h>
#include <linux/sched/clock.h>
@@ -199,15 +200,13 @@ sched_clock_register(u64 (*read)(void), int bits, unsigned long rate)
r = rate;
if (r >= 4000000) {
- r /= 1000000;
+ r = DIV_ROUND_CLOSEST(r, 1000000);
r_unit = 'M';
+ } else if (r >= 4000) {
+ r = DIV_ROUND_CLOSEST(r, 1000);
+ r_unit = 'k';
} else {
- if (r >= 1000) {
- r /= 1000;
- r_unit = 'k';
- } else {
- r_unit = ' ';
- }
+ r_unit = ' ';
}
/* Calculate the ns resolution of this counter */
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 2d76c91b85de..58a11f859ac7 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -188,7 +188,7 @@ static void tick_sched_do_timer(struct tick_sched *ts, ktime_t now)
*/
if (unlikely(tick_do_timer_cpu == TICK_DO_TIMER_NONE)) {
#ifdef CONFIG_NO_HZ_FULL
- WARN_ON(tick_nohz_full_running);
+ WARN_ON_ONCE(tick_nohz_full_running);
#endif
tick_do_timer_cpu = cpu;
}
@@ -928,6 +928,8 @@ static void tick_nohz_stop_tick(struct tick_sched *ts, int cpu)
if (unlikely(expires == KTIME_MAX)) {
if (ts->nohz_mode == NOHZ_MODE_HIGHRES)
hrtimer_cancel(&ts->sched_timer);
+ else
+ tick_program_event(KTIME_MAX, 1);
return;
}
@@ -1364,9 +1366,15 @@ static void tick_nohz_handler(struct clock_event_device *dev)
tick_sched_do_timer(ts, now);
tick_sched_handle(ts, regs);
- /* No need to reprogram if we are running tickless */
- if (unlikely(ts->tick_stopped))
+ if (unlikely(ts->tick_stopped)) {
+ /*
+ * The clockevent device is not reprogrammed, so change the
+ * clock event device to ONESHOT_STOPPED to avoid spurious
+ * interrupts on devices which might not be truly one shot.
+ */
+ tick_program_event(KTIME_MAX, 1);
return;
+ }
hrtimer_forward(&ts->sched_timer, now, TICK_NSEC);
tick_program_event(hrtimer_get_expires(&ts->sched_timer), 1);
@@ -1538,7 +1546,7 @@ void tick_cancel_sched_timer(int cpu)
}
#endif
-/**
+/*
* Async notification about clocksource changes
*/
void tick_clock_notify(void)
@@ -1559,7 +1567,7 @@ void tick_oneshot_notify(void)
set_bit(0, &ts->check_clocks);
}
-/**
+/*
* Check, if a change happened, which makes oneshot possible.
*
* Called cyclic from the hrtimer softirq (driven by the timer
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index dcdcb85121e4..8e4b3c32fcf9 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -17,6 +17,7 @@
#include <linux/clocksource.h>
#include <linux/jiffies.h>
#include <linux/time.h>
+#include <linux/timex.h>
#include <linux/tick.h>
#include <linux/stop_machine.h>
#include <linux/pvclock_gtod.h>
@@ -429,6 +430,14 @@ static void update_fast_timekeeper(const struct tk_read_base *tkr,
memcpy(base + 1, base, sizeof(*base));
}
+static __always_inline u64 fast_tk_get_delta_ns(struct tk_read_base *tkr)
+{
+ u64 delta, cycles = tk_clock_read(tkr);
+
+ delta = clocksource_delta(cycles, tkr->cycle_last, tkr->mask);
+ return timekeeping_delta_to_ns(tkr, delta);
+}
+
static __always_inline u64 __ktime_get_fast_ns(struct tk_fast *tkf)
{
struct tk_read_base *tkr;
@@ -439,12 +448,7 @@ static __always_inline u64 __ktime_get_fast_ns(struct tk_fast *tkf)
seq = raw_read_seqcount_latch(&tkf->seq);
tkr = tkf->base + (seq & 0x01);
now = ktime_to_ns(tkr->base);
-
- now += timekeeping_delta_to_ns(tkr,
- clocksource_delta(
- tk_clock_read(tkr),
- tkr->cycle_last,
- tkr->mask));
+ now += fast_tk_get_delta_ns(tkr);
} while (read_seqcount_latch_retry(&tkf->seq, seq));
return now;
@@ -482,7 +486,7 @@ static __always_inline u64 __ktime_get_fast_ns(struct tk_fast *tkf)
* of the following timestamps. Callers need to be aware of that and
* deal with it.
*/
-u64 ktime_get_mono_fast_ns(void)
+u64 notrace ktime_get_mono_fast_ns(void)
{
return __ktime_get_fast_ns(&tk_fast_mono);
}
@@ -494,7 +498,7 @@ EXPORT_SYMBOL_GPL(ktime_get_mono_fast_ns);
* Contrary to ktime_get_mono_fast_ns() this is always correct because the
* conversion factor is not affected by NTP/PTP correction.
*/
-u64 ktime_get_raw_fast_ns(void)
+u64 notrace ktime_get_raw_fast_ns(void)
{
return __ktime_get_fast_ns(&tk_fast_raw);
}
@@ -528,10 +532,27 @@ u64 notrace ktime_get_boot_fast_ns(void)
{
struct timekeeper *tk = &tk_core.timekeeper;
- return (ktime_get_mono_fast_ns() + ktime_to_ns(tk->offs_boot));
+ return (ktime_get_mono_fast_ns() + ktime_to_ns(data_race(tk->offs_boot)));
}
EXPORT_SYMBOL_GPL(ktime_get_boot_fast_ns);
+/**
+ * ktime_get_tai_fast_ns - NMI safe and fast access to tai clock.
+ *
+ * The same limitations as described for ktime_get_boot_fast_ns() apply. The
+ * mono time and the TAI offset are not read atomically which may yield wrong
+ * readouts. However, an update of the TAI offset is an rare event e.g., caused
+ * by settime or adjtimex with an offset. The user of this function has to deal
+ * with the possibility of wrong timestamps in post processing.
+ */
+u64 notrace ktime_get_tai_fast_ns(void)
+{
+ struct timekeeper *tk = &tk_core.timekeeper;
+
+ return (ktime_get_mono_fast_ns() + ktime_to_ns(data_race(tk->offs_tai)));
+}
+EXPORT_SYMBOL_GPL(ktime_get_tai_fast_ns);
+
static __always_inline u64 __ktime_get_real_fast(struct tk_fast *tkf, u64 *mono)
{
struct tk_read_base *tkr;
@@ -543,10 +564,7 @@ static __always_inline u64 __ktime_get_real_fast(struct tk_fast *tkf, u64 *mono)
tkr = tkf->base + (seq & 0x01);
basem = ktime_to_ns(tkr->base);
baser = ktime_to_ns(tkr->base_real);
-
- delta = timekeeping_delta_to_ns(tkr,
- clocksource_delta(tk_clock_read(tkr),
- tkr->cycle_last, tkr->mask));
+ delta = fast_tk_get_delta_ns(tkr);
} while (read_seqcount_latch_retry(&tkf->seq, seq));
if (mono)
@@ -2380,6 +2398,20 @@ static int timekeeping_validate_timex(const struct __kernel_timex *txc)
return 0;
}
+/**
+ * random_get_entropy_fallback - Returns the raw clock source value,
+ * used by random.c for platforms with no valid random_get_entropy().
+ */
+unsigned long random_get_entropy_fallback(void)
+{
+ struct tk_read_base *tkr = &tk_core.timekeeper.tkr_mono;
+ struct clocksource *clock = READ_ONCE(tkr->clock);
+
+ if (unlikely(timekeeping_suspended || !clock))
+ return 0;
+ return clock->read(clock);
+}
+EXPORT_SYMBOL_GPL(random_get_entropy_fallback);
/**
* do_adjtimex() - Accessor function to NTP __do_adjtimex function
diff --git a/kernel/time/timer.c b/kernel/time/timer.c
index 85f1021ad459..717fcb9fb14a 100644
--- a/kernel/time/timer.c
+++ b/kernel/time/timer.c
@@ -44,6 +44,7 @@
#include <linux/slab.h>
#include <linux/compat.h>
#include <linux/random.h>
+#include <linux/sysctl.h>
#include <linux/uaccess.h>
#include <asm/unistd.h>
@@ -223,7 +224,7 @@ static void timer_update_keys(struct work_struct *work);
static DECLARE_WORK(timer_update_work, timer_update_keys);
#ifdef CONFIG_SMP
-unsigned int sysctl_timer_migration = 1;
+static unsigned int sysctl_timer_migration = 1;
DEFINE_STATIC_KEY_FALSE(timers_migration_enabled);
@@ -234,7 +235,42 @@ static void timers_update_migration(void)
else
static_branch_disable(&timers_migration_enabled);
}
-#else
+
+#ifdef CONFIG_SYSCTL
+static int timer_migration_handler(struct ctl_table *table, int write,
+ void *buffer, size_t *lenp, loff_t *ppos)
+{
+ int ret;
+
+ mutex_lock(&timer_keys_mutex);
+ ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
+ if (!ret && write)
+ timers_update_migration();
+ mutex_unlock(&timer_keys_mutex);
+ return ret;
+}
+
+static struct ctl_table timer_sysctl[] = {
+ {
+ .procname = "timer_migration",
+ .data = &sysctl_timer_migration,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = timer_migration_handler,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_ONE,
+ },
+ {}
+};
+
+static int __init timer_sysctl_init(void)
+{
+ register_sysctl("kernel", timer_sysctl);
+ return 0;
+}
+device_initcall(timer_sysctl_init);
+#endif /* CONFIG_SYSCTL */
+#else /* CONFIG_SMP */
static inline void timers_update_migration(void) { }
#endif /* !CONFIG_SMP */
@@ -251,19 +287,6 @@ void timers_update_nohz(void)
schedule_work(&timer_update_work);
}
-int timer_migration_handler(struct ctl_table *table, int write,
- void *buffer, size_t *lenp, loff_t *ppos)
-{
- int ret;
-
- mutex_lock(&timer_keys_mutex);
- ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
- if (!ret && write)
- timers_update_migration();
- mutex_unlock(&timer_keys_mutex);
- return ret;
-}
-
static inline bool is_timers_nohz_active(void)
{
return static_branch_unlikely(&timers_nohz_active);
@@ -502,7 +525,7 @@ static inline unsigned calc_index(unsigned long expires, unsigned lvl,
*
* Round up with level granularity to prevent this.
*/
- expires = (expires + LVL_GRAN(lvl)) >> LVL_SHIFT(lvl);
+ expires = (expires >> LVL_SHIFT(lvl)) + 1;
*bucket_expiry = expires << LVL_SHIFT(lvl);
return LVL_OFFS(lvl) + (expires & LVL_MASK);
}
@@ -615,9 +638,39 @@ static void internal_add_timer(struct timer_base *base, struct timer_list *timer
static const struct debug_obj_descr timer_debug_descr;
+struct timer_hint {
+ void (*function)(struct timer_list *t);
+ long offset;
+};
+
+#define TIMER_HINT(fn, container, timr, hintfn) \
+ { \
+ .function = fn, \
+ .offset = offsetof(container, hintfn) - \
+ offsetof(container, timr) \
+ }
+
+static const struct timer_hint timer_hints[] = {
+ TIMER_HINT(delayed_work_timer_fn,
+ struct delayed_work, timer, work.func),
+ TIMER_HINT(kthread_delayed_work_timer_fn,
+ struct kthread_delayed_work, timer, work.func),
+};
+
static void *timer_debug_hint(void *addr)
{
- return ((struct timer_list *) addr)->function;
+ struct timer_list *timer = addr;
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(timer_hints); i++) {
+ if (timer_hints[i].function == timer->function) {
+ void (**fn)(void) = addr + timer_hints[i].offset;
+
+ return *fn;
+ }
+ }
+
+ return timer->function;
}
static bool timer_is_static_object(void *addr)
@@ -1722,11 +1775,14 @@ static inline void __run_timers(struct timer_base *base)
time_after_eq(jiffies, base->next_expiry)) {
levels = collect_expired_timers(base, heads);
/*
- * The only possible reason for not finding any expired
- * timer at this clk is that all matching timers have been
- * dequeued.
+ * The two possible reasons for not finding any expired
+ * timer at this clk are that all matching timers have been
+ * dequeued or no timer has been queued since
+ * base::next_expiry was set to base::clk +
+ * NEXT_TIMER_MAX_DELTA.
*/
- WARN_ON_ONCE(!levels && !base->next_expiry_recalc);
+ WARN_ON_ONCE(!levels && !base->next_expiry_recalc
+ && base->timers_pending);
base->clk++;
base->next_expiry = __next_timer_interrupt(base);
@@ -1777,8 +1833,6 @@ void update_process_times(int user_tick)
{
struct task_struct *p = current;
- PRANDOM_ADD_NOISE(jiffies, user_tick, p, 0);
-
/* Note: this timer irq context must be accounted for as well. */
account_process_tick(p, user_tick);
run_local_timers();
@@ -1950,6 +2004,7 @@ int timers_prepare_cpu(unsigned int cpu)
base = per_cpu_ptr(&timer_bases[b], cpu);
base->clk = jiffies;
base->next_expiry = base->clk + NEXT_TIMER_MAX_DELTA;
+ base->next_expiry_recalc = false;
base->timers_pending = false;
base->is_idle = false;
}
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index a5eb5e7fd624..debbbb083286 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -10,6 +10,17 @@ config USER_STACKTRACE_SUPPORT
config NOP_TRACER
bool
+config HAVE_RETHOOK
+ bool
+
+config RETHOOK
+ bool
+ depends on HAVE_RETHOOK
+ help
+ Enable generic return hooking feature. This is an internal
+ API, which will be used by other function-entry hooking
+ features like fprobe and kprobes.
+
config HAVE_FUNCTION_TRACER
bool
help
@@ -133,6 +144,7 @@ config TRACING
select BINARY_PRINTF
select EVENT_TRACING
select TRACE_CLOCK
+ select TASKS_RCU if PREEMPTION
config GENERIC_TRACER
bool
@@ -236,6 +248,21 @@ config DYNAMIC_FTRACE_WITH_ARGS
depends on DYNAMIC_FTRACE
depends on HAVE_DYNAMIC_FTRACE_WITH_ARGS
+config FPROBE
+ bool "Kernel Function Probe (fprobe)"
+ depends on FUNCTION_TRACER
+ depends on DYNAMIC_FTRACE_WITH_REGS
+ depends on HAVE_RETHOOK
+ select RETHOOK
+ default n
+ help
+ This option enables kernel function probe (fprobe) based on ftrace.
+ The fprobe is similar to kprobes, but probes only for kernel function
+ entries and exits. This also can probe multiple functions by one
+ fprobe.
+
+ If unsure, say N.
+
config FUNCTION_PROFILER
bool "Kernel function profiler"
depends on FUNCTION_TRACER
@@ -702,6 +729,7 @@ config FTRACE_MCOUNT_USE_OBJTOOL
depends on !FTRACE_MCOUNT_USE_PATCHABLE_FUNCTION_ENTRY
depends on !FTRACE_MCOUNT_USE_CC
depends on FTRACE_MCOUNT_RECORD
+ select OBJTOOL
config FTRACE_MCOUNT_USE_RECORDMCOUNT
def_bool y
@@ -737,6 +765,21 @@ config SYNTH_EVENTS
If in doubt, say N.
+config USER_EVENTS
+ bool "User trace events"
+ select TRACING
+ select DYNAMIC_EVENTS
+ depends on BROKEN || COMPILE_TEST # API needs to be straighten out
+ help
+ User trace events are user-defined trace events that
+ can be used like an existing kernel trace event. User trace
+ events are generated by writing to a tracefs file. User
+ processes can determine if their tracing events should be
+ generated by memory mapping a tracefs file and checking for
+ an associated byte being non-zero.
+
+ If in doubt, say N.
+
config HIST_TRIGGERS
bool "Histogram triggers"
depends on ARCH_HAVE_NMI_SAFE_CMPXCHG
diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
index bedc5caceec7..d77cd8032213 100644
--- a/kernel/trace/Makefile
+++ b/kernel/trace/Makefile
@@ -82,6 +82,7 @@ obj-$(CONFIG_PROBE_EVENTS) += trace_eprobe.o
obj-$(CONFIG_TRACE_EVENT_INJECT) += trace_events_inject.o
obj-$(CONFIG_SYNTH_EVENTS) += trace_events_synth.o
obj-$(CONFIG_HIST_TRIGGERS) += trace_events_hist.o
+obj-$(CONFIG_USER_EVENTS) += trace_events_user.o
obj-$(CONFIG_BPF_EVENTS) += bpf_trace.o
obj-$(CONFIG_KPROBE_EVENTS) += trace_kprobe.o
obj-$(CONFIG_TRACEPOINTS) += error_report-traces.o
@@ -97,6 +98,8 @@ obj-$(CONFIG_PROBE_EVENTS) += trace_probe.o
obj-$(CONFIG_UPROBE_EVENTS) += trace_uprobe.o
obj-$(CONFIG_BOOTTIME_TRACING) += trace_boot.o
obj-$(CONFIG_FTRACE_RECORD_RECURSION) += trace_recursion_record.o
+obj-$(CONFIG_FPROBE) += fprobe.o
+obj-$(CONFIG_RETHOOK) += rethook.o
obj-$(CONFIG_TRACEPOINT_BENCHMARK) += trace_benchmark.o
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index 21dea90eaa93..10a32b0f2deb 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -145,13 +145,14 @@ static void trace_note_time(struct blk_trace *bt)
local_irq_restore(flags);
}
-void __trace_note_message(struct blk_trace *bt, struct blkcg *blkcg,
- const char *fmt, ...)
+void __blk_trace_note_message(struct blk_trace *bt,
+ struct cgroup_subsys_state *css, const char *fmt, ...)
{
int n;
va_list args;
unsigned long flags;
char *buf;
+ u64 cgid = 0;
if (unlikely(bt->trace_state != Blktrace_running &&
!blk_tracer_enabled))
@@ -170,17 +171,16 @@ void __trace_note_message(struct blk_trace *bt, struct blkcg *blkcg,
n = vscnprintf(buf, BLK_TN_MAX_MSG, fmt, args);
va_end(args);
- if (!(blk_tracer_flags.val & TRACE_BLK_OPT_CGROUP))
- blkcg = NULL;
#ifdef CONFIG_BLK_CGROUP
- trace_note(bt, current->pid, BLK_TN_MESSAGE, buf, n,
- blkcg ? cgroup_id(blkcg->css.cgroup) : 1);
-#else
- trace_note(bt, current->pid, BLK_TN_MESSAGE, buf, n, 0);
+ if (css && (blk_tracer_flags.val & TRACE_BLK_OPT_CGROUP))
+ cgid = cgroup_id(css->cgroup);
+ else
+ cgid = 1;
#endif
+ trace_note(bt, current->pid, BLK_TN_MESSAGE, buf, n, cgid);
local_irq_restore(flags);
}
-EXPORT_SYMBOL_GPL(__trace_note_message);
+EXPORT_SYMBOL_GPL(__blk_trace_note_message);
static int act_log_check(struct blk_trace *bt, u32 what, sector_t sector,
pid_t pid)
@@ -411,7 +411,7 @@ static ssize_t blk_msg_write(struct file *filp, const char __user *buffer,
return PTR_ERR(msg);
bt = filp->private_data;
- __trace_note_message(bt, NULL, "%s", msg);
+ __blk_trace_note_message(bt, NULL, "%s", msg);
kfree(msg);
return count;
@@ -783,6 +783,7 @@ void blk_trace_shutdown(struct request_queue *q)
#ifdef CONFIG_BLK_CGROUP
static u64 blk_trace_bio_get_cgid(struct request_queue *q, struct bio *bio)
{
+ struct cgroup_subsys_state *blkcg_css;
struct blk_trace *bt;
/* We don't use the 'bt' value here except as an optimization... */
@@ -790,9 +791,10 @@ static u64 blk_trace_bio_get_cgid(struct request_queue *q, struct bio *bio)
if (!bt || !(blk_tracer_flags.val & TRACE_BLK_OPT_CGROUP))
return 0;
- if (!bio->bi_blkg)
+ blkcg_css = bio_blkcg_css(bio);
+ if (!blkcg_css)
return 0;
- return cgroup_id(bio_blkcg(bio)->css.cgroup);
+ return cgroup_id(blkcg_css->cgroup);
}
#else
static u64 blk_trace_bio_get_cgid(struct request_queue *q, struct bio *bio)
@@ -1902,7 +1904,6 @@ void blk_fill_rwbs(char *rwbs, unsigned int op)
switch (op & REQ_OP_MASK) {
case REQ_OP_WRITE:
- case REQ_OP_WRITE_SAME:
rwbs[i++] = 'W';
break;
case REQ_OP_DISCARD:
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index 21aa30644219..d8553f46caa2 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -17,6 +17,9 @@
#include <linux/error-injection.h>
#include <linux/btf_ids.h>
#include <linux/bpf_lsm.h>
+#include <linux/fprobe.h>
+#include <linux/bsearch.h>
+#include <linux/sort.h>
#include <net/bpf_sk_storage.h>
@@ -77,6 +80,8 @@ u64 bpf_get_stack(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5);
static int bpf_btf_printf_prepare(struct btf_ptr *ptr, u32 btf_ptr_size,
u64 flags, const struct btf **btf,
s32 *btf_id);
+static u64 bpf_kprobe_multi_cookie(struct bpf_run_ctx *ctx);
+static u64 bpf_kprobe_multi_entry_ip(struct bpf_run_ctx *ctx);
/**
* trace_call_bpf - invoke BPF program
@@ -332,8 +337,6 @@ BPF_CALL_3(bpf_probe_write_user, void __user *, unsafe_ptr, const void *, src,
if (unlikely(in_interrupt() ||
current->flags & (PF_KTHREAD | PF_EXITING)))
return -EPERM;
- if (unlikely(uaccess_kernel()))
- return -EPERM;
if (unlikely(!nmi_uaccess_okay()))
return -EPERM;
@@ -835,8 +838,6 @@ static int bpf_send_signal_common(u32 sig, enum pid_type type)
*/
if (unlikely(current->flags & (PF_KTHREAD | PF_EXITING)))
return -EPERM;
- if (unlikely(uaccess_kernel()))
- return -EPERM;
if (unlikely(!nmi_uaccess_okay()))
return -EPERM;
@@ -1036,6 +1037,30 @@ static const struct bpf_func_proto bpf_get_func_ip_proto_kprobe = {
.arg1_type = ARG_PTR_TO_CTX,
};
+BPF_CALL_1(bpf_get_func_ip_kprobe_multi, struct pt_regs *, regs)
+{
+ return bpf_kprobe_multi_entry_ip(current->bpf_ctx);
+}
+
+static const struct bpf_func_proto bpf_get_func_ip_proto_kprobe_multi = {
+ .func = bpf_get_func_ip_kprobe_multi,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+};
+
+BPF_CALL_1(bpf_get_attach_cookie_kprobe_multi, struct pt_regs *, regs)
+{
+ return bpf_kprobe_multi_cookie(current->bpf_ctx);
+}
+
+static const struct bpf_func_proto bpf_get_attach_cookie_proto_kmulti = {
+ .func = bpf_get_attach_cookie_kprobe_multi,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+};
+
BPF_CALL_1(bpf_get_attach_cookie_trace, void *, ctx)
{
struct bpf_trace_run_ctx *run_ctx;
@@ -1235,6 +1260,8 @@ bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
return &bpf_get_task_stack_proto;
case BPF_FUNC_copy_from_user:
return prog->aux->sleepable ? &bpf_copy_from_user_proto : NULL;
+ case BPF_FUNC_copy_from_user_task:
+ return prog->aux->sleepable ? &bpf_copy_from_user_task_proto : NULL;
case BPF_FUNC_snprintf_btf:
return &bpf_snprintf_btf_proto;
case BPF_FUNC_per_cpu_ptr:
@@ -1277,9 +1304,13 @@ kprobe_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
return &bpf_override_return_proto;
#endif
case BPF_FUNC_get_func_ip:
- return &bpf_get_func_ip_proto_kprobe;
+ return prog->expected_attach_type == BPF_TRACE_KPROBE_MULTI ?
+ &bpf_get_func_ip_proto_kprobe_multi :
+ &bpf_get_func_ip_proto_kprobe;
case BPF_FUNC_get_attach_cookie:
- return &bpf_get_attach_cookie_proto_trace;
+ return prog->expected_attach_type == BPF_TRACE_KPROBE_MULTI ?
+ &bpf_get_attach_cookie_proto_kmulti :
+ &bpf_get_attach_cookie_proto_trace;
default:
return bpf_tracing_func_proto(func_id, prog);
}
@@ -1562,6 +1593,7 @@ static const struct bpf_func_proto bpf_perf_event_output_proto_raw_tp = {
extern const struct bpf_func_proto bpf_skb_output_proto;
extern const struct bpf_func_proto bpf_xdp_output_proto;
+extern const struct bpf_func_proto bpf_xdp_get_buff_len_trace_proto;
BPF_CALL_3(bpf_get_stackid_raw_tp, struct bpf_raw_tracepoint_args *, args,
struct bpf_map *, map, u64, flags)
@@ -1661,6 +1693,8 @@ tracing_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
return &bpf_sock_from_file_proto;
case BPF_FUNC_get_socket_cookie:
return &bpf_get_socket_ptr_cookie_proto;
+ case BPF_FUNC_xdp_get_buff_len:
+ return &bpf_xdp_get_buff_len_trace_proto;
#endif
case BPF_FUNC_seq_printf:
return prog->expected_attach_type == BPF_TRACE_ITER ?
@@ -2176,3 +2210,314 @@ static int __init bpf_event_init(void)
fs_initcall(bpf_event_init);
#endif /* CONFIG_MODULES */
+
+#ifdef CONFIG_FPROBE
+struct bpf_kprobe_multi_link {
+ struct bpf_link link;
+ struct fprobe fp;
+ unsigned long *addrs;
+ u64 *cookies;
+ u32 cnt;
+};
+
+struct bpf_kprobe_multi_run_ctx {
+ struct bpf_run_ctx run_ctx;
+ struct bpf_kprobe_multi_link *link;
+ unsigned long entry_ip;
+};
+
+static void bpf_kprobe_multi_link_release(struct bpf_link *link)
+{
+ struct bpf_kprobe_multi_link *kmulti_link;
+
+ kmulti_link = container_of(link, struct bpf_kprobe_multi_link, link);
+ unregister_fprobe(&kmulti_link->fp);
+}
+
+static void bpf_kprobe_multi_link_dealloc(struct bpf_link *link)
+{
+ struct bpf_kprobe_multi_link *kmulti_link;
+
+ kmulti_link = container_of(link, struct bpf_kprobe_multi_link, link);
+ kvfree(kmulti_link->addrs);
+ kvfree(kmulti_link->cookies);
+ kfree(kmulti_link);
+}
+
+static const struct bpf_link_ops bpf_kprobe_multi_link_lops = {
+ .release = bpf_kprobe_multi_link_release,
+ .dealloc = bpf_kprobe_multi_link_dealloc,
+};
+
+static void bpf_kprobe_multi_cookie_swap(void *a, void *b, int size, const void *priv)
+{
+ const struct bpf_kprobe_multi_link *link = priv;
+ unsigned long *addr_a = a, *addr_b = b;
+ u64 *cookie_a, *cookie_b;
+ unsigned long tmp1;
+ u64 tmp2;
+
+ cookie_a = link->cookies + (addr_a - link->addrs);
+ cookie_b = link->cookies + (addr_b - link->addrs);
+
+ /* swap addr_a/addr_b and cookie_a/cookie_b values */
+ tmp1 = *addr_a; *addr_a = *addr_b; *addr_b = tmp1;
+ tmp2 = *cookie_a; *cookie_a = *cookie_b; *cookie_b = tmp2;
+}
+
+static int __bpf_kprobe_multi_cookie_cmp(const void *a, const void *b)
+{
+ const unsigned long *addr_a = a, *addr_b = b;
+
+ if (*addr_a == *addr_b)
+ return 0;
+ return *addr_a < *addr_b ? -1 : 1;
+}
+
+static int bpf_kprobe_multi_cookie_cmp(const void *a, const void *b, const void *priv)
+{
+ return __bpf_kprobe_multi_cookie_cmp(a, b);
+}
+
+static u64 bpf_kprobe_multi_cookie(struct bpf_run_ctx *ctx)
+{
+ struct bpf_kprobe_multi_run_ctx *run_ctx;
+ struct bpf_kprobe_multi_link *link;
+ u64 *cookie, entry_ip;
+ unsigned long *addr;
+
+ if (WARN_ON_ONCE(!ctx))
+ return 0;
+ run_ctx = container_of(current->bpf_ctx, struct bpf_kprobe_multi_run_ctx, run_ctx);
+ link = run_ctx->link;
+ if (!link->cookies)
+ return 0;
+ entry_ip = run_ctx->entry_ip;
+ addr = bsearch(&entry_ip, link->addrs, link->cnt, sizeof(entry_ip),
+ __bpf_kprobe_multi_cookie_cmp);
+ if (!addr)
+ return 0;
+ cookie = link->cookies + (addr - link->addrs);
+ return *cookie;
+}
+
+static u64 bpf_kprobe_multi_entry_ip(struct bpf_run_ctx *ctx)
+{
+ struct bpf_kprobe_multi_run_ctx *run_ctx;
+
+ run_ctx = container_of(current->bpf_ctx, struct bpf_kprobe_multi_run_ctx, run_ctx);
+ return run_ctx->entry_ip;
+}
+
+static int
+kprobe_multi_link_prog_run(struct bpf_kprobe_multi_link *link,
+ unsigned long entry_ip, struct pt_regs *regs)
+{
+ struct bpf_kprobe_multi_run_ctx run_ctx = {
+ .link = link,
+ .entry_ip = entry_ip,
+ };
+ struct bpf_run_ctx *old_run_ctx;
+ int err;
+
+ if (unlikely(__this_cpu_inc_return(bpf_prog_active) != 1)) {
+ err = 0;
+ goto out;
+ }
+
+ migrate_disable();
+ rcu_read_lock();
+ old_run_ctx = bpf_set_run_ctx(&run_ctx.run_ctx);
+ err = bpf_prog_run(link->link.prog, regs);
+ bpf_reset_run_ctx(old_run_ctx);
+ rcu_read_unlock();
+ migrate_enable();
+
+ out:
+ __this_cpu_dec(bpf_prog_active);
+ return err;
+}
+
+static void
+kprobe_multi_link_handler(struct fprobe *fp, unsigned long entry_ip,
+ struct pt_regs *regs)
+{
+ struct bpf_kprobe_multi_link *link;
+
+ link = container_of(fp, struct bpf_kprobe_multi_link, fp);
+ kprobe_multi_link_prog_run(link, entry_ip, regs);
+}
+
+static int
+kprobe_multi_resolve_syms(const void __user *usyms, u32 cnt,
+ unsigned long *addrs)
+{
+ unsigned long addr, size;
+ const char __user **syms;
+ int err = -ENOMEM;
+ unsigned int i;
+ char *func;
+
+ size = cnt * sizeof(*syms);
+ syms = kvzalloc(size, GFP_KERNEL);
+ if (!syms)
+ return -ENOMEM;
+
+ func = kmalloc(KSYM_NAME_LEN, GFP_KERNEL);
+ if (!func)
+ goto error;
+
+ if (copy_from_user(syms, usyms, size)) {
+ err = -EFAULT;
+ goto error;
+ }
+
+ for (i = 0; i < cnt; i++) {
+ err = strncpy_from_user(func, syms[i], KSYM_NAME_LEN);
+ if (err == KSYM_NAME_LEN)
+ err = -E2BIG;
+ if (err < 0)
+ goto error;
+ err = -EINVAL;
+ addr = kallsyms_lookup_name(func);
+ if (!addr)
+ goto error;
+ if (!kallsyms_lookup_size_offset(addr, &size, NULL))
+ goto error;
+ addr = ftrace_location_range(addr, addr + size - 1);
+ if (!addr)
+ goto error;
+ addrs[i] = addr;
+ }
+
+ err = 0;
+error:
+ kvfree(syms);
+ kfree(func);
+ return err;
+}
+
+int bpf_kprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *prog)
+{
+ struct bpf_kprobe_multi_link *link = NULL;
+ struct bpf_link_primer link_primer;
+ void __user *ucookies;
+ unsigned long *addrs;
+ u32 flags, cnt, size;
+ void __user *uaddrs;
+ u64 *cookies = NULL;
+ void __user *usyms;
+ int err;
+
+ /* no support for 32bit archs yet */
+ if (sizeof(u64) != sizeof(void *))
+ return -EOPNOTSUPP;
+
+ if (prog->expected_attach_type != BPF_TRACE_KPROBE_MULTI)
+ return -EINVAL;
+
+ flags = attr->link_create.kprobe_multi.flags;
+ if (flags & ~BPF_F_KPROBE_MULTI_RETURN)
+ return -EINVAL;
+
+ uaddrs = u64_to_user_ptr(attr->link_create.kprobe_multi.addrs);
+ usyms = u64_to_user_ptr(attr->link_create.kprobe_multi.syms);
+ if (!!uaddrs == !!usyms)
+ return -EINVAL;
+
+ cnt = attr->link_create.kprobe_multi.cnt;
+ if (!cnt)
+ return -EINVAL;
+
+ size = cnt * sizeof(*addrs);
+ addrs = kvmalloc(size, GFP_KERNEL);
+ if (!addrs)
+ return -ENOMEM;
+
+ if (uaddrs) {
+ if (copy_from_user(addrs, uaddrs, size)) {
+ err = -EFAULT;
+ goto error;
+ }
+ } else {
+ err = kprobe_multi_resolve_syms(usyms, cnt, addrs);
+ if (err)
+ goto error;
+ }
+
+ ucookies = u64_to_user_ptr(attr->link_create.kprobe_multi.cookies);
+ if (ucookies) {
+ cookies = kvmalloc(size, GFP_KERNEL);
+ if (!cookies) {
+ err = -ENOMEM;
+ goto error;
+ }
+ if (copy_from_user(cookies, ucookies, size)) {
+ err = -EFAULT;
+ goto error;
+ }
+ }
+
+ link = kzalloc(sizeof(*link), GFP_KERNEL);
+ if (!link) {
+ err = -ENOMEM;
+ goto error;
+ }
+
+ bpf_link_init(&link->link, BPF_LINK_TYPE_KPROBE_MULTI,
+ &bpf_kprobe_multi_link_lops, prog);
+
+ err = bpf_link_prime(&link->link, &link_primer);
+ if (err)
+ goto error;
+
+ if (flags & BPF_F_KPROBE_MULTI_RETURN)
+ link->fp.exit_handler = kprobe_multi_link_handler;
+ else
+ link->fp.entry_handler = kprobe_multi_link_handler;
+
+ link->addrs = addrs;
+ link->cookies = cookies;
+ link->cnt = cnt;
+
+ if (cookies) {
+ /*
+ * Sorting addresses will trigger sorting cookies as well
+ * (check bpf_kprobe_multi_cookie_swap). This way we can
+ * find cookie based on the address in bpf_get_attach_cookie
+ * helper.
+ */
+ sort_r(addrs, cnt, sizeof(*addrs),
+ bpf_kprobe_multi_cookie_cmp,
+ bpf_kprobe_multi_cookie_swap,
+ link);
+ }
+
+ err = register_fprobe_ips(&link->fp, addrs, cnt);
+ if (err) {
+ bpf_link_cleanup(&link_primer);
+ return err;
+ }
+
+ return bpf_link_settle(&link_primer);
+
+error:
+ kfree(link);
+ kvfree(addrs);
+ kvfree(cookies);
+ return err;
+}
+#else /* !CONFIG_FPROBE */
+int bpf_kprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *prog)
+{
+ return -EOPNOTSUPP;
+}
+static u64 bpf_kprobe_multi_cookie(struct bpf_run_ctx *ctx)
+{
+ return 0;
+}
+static u64 bpf_kprobe_multi_entry_ip(struct bpf_run_ctx *ctx)
+{
+ return 0;
+}
+#endif
diff --git a/kernel/trace/fgraph.c b/kernel/trace/fgraph.c
index 19028e072cdb..3fd5284f6487 100644
--- a/kernel/trace/fgraph.c
+++ b/kernel/trace/fgraph.c
@@ -7,6 +7,7 @@
*
* Highly modified by Steven Rostedt (VMware).
*/
+#include <linux/jump_label.h>
#include <linux/suspend.h>
#include <linux/ftrace.h>
#include <linux/slab.h>
@@ -23,22 +24,28 @@
#define ASSIGN_OPS_HASH(opsname, val)
#endif
-static bool kill_ftrace_graph;
+DEFINE_STATIC_KEY_FALSE(kill_ftrace_graph);
int ftrace_graph_active;
/* Both enabled by default (can be cleared by function_graph tracer flags */
static bool fgraph_sleep_time = true;
-/**
- * ftrace_graph_is_dead - returns true if ftrace_graph_stop() was called
- *
- * ftrace_graph_stop() is called when a severe error is detected in
- * the function graph tracing. This function is called by the critical
- * paths of function graph to keep those paths from doing any more harm.
+/*
+ * archs can override this function if they must do something
+ * to enable hook for graph tracer.
+ */
+int __weak ftrace_enable_ftrace_graph_caller(void)
+{
+ return 0;
+}
+
+/*
+ * archs can override this function if they must do something
+ * to disable hook for graph tracer.
*/
-bool ftrace_graph_is_dead(void)
+int __weak ftrace_disable_ftrace_graph_caller(void)
{
- return kill_ftrace_graph;
+ return 0;
}
/**
@@ -51,7 +58,7 @@ bool ftrace_graph_is_dead(void)
*/
void ftrace_graph_stop(void)
{
- kill_ftrace_graph = true;
+ static_branch_enable(&kill_ftrace_graph);
}
/* Add a function return address to the trace stack on thread info.*/
@@ -415,9 +422,9 @@ free:
static void
ftrace_graph_probe_sched_switch(void *ignore, bool preempt,
- unsigned int prev_state,
struct task_struct *prev,
- struct task_struct *next)
+ struct task_struct *next,
+ unsigned int prev_state)
{
unsigned long long timestamp;
int index;
diff --git a/kernel/trace/fprobe.c b/kernel/trace/fprobe.c
new file mode 100644
index 000000000000..89d9f994ebb0
--- /dev/null
+++ b/kernel/trace/fprobe.c
@@ -0,0 +1,332 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * fprobe - Simple ftrace probe wrapper for function entry.
+ */
+#define pr_fmt(fmt) "fprobe: " fmt
+
+#include <linux/err.h>
+#include <linux/fprobe.h>
+#include <linux/kallsyms.h>
+#include <linux/kprobes.h>
+#include <linux/rethook.h>
+#include <linux/slab.h>
+#include <linux/sort.h>
+
+#include "trace.h"
+
+struct fprobe_rethook_node {
+ struct rethook_node node;
+ unsigned long entry_ip;
+};
+
+static void fprobe_handler(unsigned long ip, unsigned long parent_ip,
+ struct ftrace_ops *ops, struct ftrace_regs *fregs)
+{
+ struct fprobe_rethook_node *fpr;
+ struct rethook_node *rh;
+ struct fprobe *fp;
+ int bit;
+
+ fp = container_of(ops, struct fprobe, ops);
+ if (fprobe_disabled(fp))
+ return;
+
+ bit = ftrace_test_recursion_trylock(ip, parent_ip);
+ if (bit < 0) {
+ fp->nmissed++;
+ return;
+ }
+
+ if (fp->entry_handler)
+ fp->entry_handler(fp, ip, ftrace_get_regs(fregs));
+
+ if (fp->exit_handler) {
+ rh = rethook_try_get(fp->rethook);
+ if (!rh) {
+ fp->nmissed++;
+ goto out;
+ }
+ fpr = container_of(rh, struct fprobe_rethook_node, node);
+ fpr->entry_ip = ip;
+ rethook_hook(rh, ftrace_get_regs(fregs), true);
+ }
+
+out:
+ ftrace_test_recursion_unlock(bit);
+}
+NOKPROBE_SYMBOL(fprobe_handler);
+
+static void fprobe_kprobe_handler(unsigned long ip, unsigned long parent_ip,
+ struct ftrace_ops *ops, struct ftrace_regs *fregs)
+{
+ struct fprobe *fp = container_of(ops, struct fprobe, ops);
+
+ if (unlikely(kprobe_running())) {
+ fp->nmissed++;
+ return;
+ }
+ kprobe_busy_begin();
+ fprobe_handler(ip, parent_ip, ops, fregs);
+ kprobe_busy_end();
+}
+
+static void fprobe_exit_handler(struct rethook_node *rh, void *data,
+ struct pt_regs *regs)
+{
+ struct fprobe *fp = (struct fprobe *)data;
+ struct fprobe_rethook_node *fpr;
+
+ if (!fp || fprobe_disabled(fp))
+ return;
+
+ fpr = container_of(rh, struct fprobe_rethook_node, node);
+
+ fp->exit_handler(fp, fpr->entry_ip, regs);
+}
+NOKPROBE_SYMBOL(fprobe_exit_handler);
+
+/* Convert ftrace location address from symbols */
+static unsigned long *get_ftrace_locations(const char **syms, int num)
+{
+ unsigned long addr, size;
+ unsigned long *addrs;
+ int i;
+
+ /* Convert symbols to symbol address */
+ addrs = kcalloc(num, sizeof(*addrs), GFP_KERNEL);
+ if (!addrs)
+ return ERR_PTR(-ENOMEM);
+
+ for (i = 0; i < num; i++) {
+ addr = kallsyms_lookup_name(syms[i]);
+ if (!addr) /* Maybe wrong symbol */
+ goto error;
+
+ /* Convert symbol address to ftrace location. */
+ if (!kallsyms_lookup_size_offset(addr, &size, NULL) || !size)
+ goto error;
+
+ addr = ftrace_location_range(addr, addr + size - 1);
+ if (!addr) /* No dynamic ftrace there. */
+ goto error;
+
+ addrs[i] = addr;
+ }
+
+ return addrs;
+
+error:
+ kfree(addrs);
+
+ return ERR_PTR(-ENOENT);
+}
+
+static void fprobe_init(struct fprobe *fp)
+{
+ fp->nmissed = 0;
+ if (fprobe_shared_with_kprobes(fp))
+ fp->ops.func = fprobe_kprobe_handler;
+ else
+ fp->ops.func = fprobe_handler;
+ fp->ops.flags |= FTRACE_OPS_FL_SAVE_REGS;
+}
+
+static int fprobe_init_rethook(struct fprobe *fp, int num)
+{
+ int i, size;
+
+ if (num < 0)
+ return -EINVAL;
+
+ if (!fp->exit_handler) {
+ fp->rethook = NULL;
+ return 0;
+ }
+
+ /* Initialize rethook if needed */
+ size = num * num_possible_cpus() * 2;
+ if (size < 0)
+ return -E2BIG;
+
+ fp->rethook = rethook_alloc((void *)fp, fprobe_exit_handler);
+ for (i = 0; i < size; i++) {
+ struct fprobe_rethook_node *node;
+
+ node = kzalloc(sizeof(*node), GFP_KERNEL);
+ if (!node) {
+ rethook_free(fp->rethook);
+ fp->rethook = NULL;
+ return -ENOMEM;
+ }
+ rethook_add_node(fp->rethook, &node->node);
+ }
+ return 0;
+}
+
+static void fprobe_fail_cleanup(struct fprobe *fp)
+{
+ if (fp->rethook) {
+ /* Don't need to cleanup rethook->handler because this is not used. */
+ rethook_free(fp->rethook);
+ fp->rethook = NULL;
+ }
+ ftrace_free_filter(&fp->ops);
+}
+
+/**
+ * register_fprobe() - Register fprobe to ftrace by pattern.
+ * @fp: A fprobe data structure to be registered.
+ * @filter: A wildcard pattern of probed symbols.
+ * @notfilter: A wildcard pattern of NOT probed symbols.
+ *
+ * Register @fp to ftrace for enabling the probe on the symbols matched to @filter.
+ * If @notfilter is not NULL, the symbols matched the @notfilter are not probed.
+ *
+ * Return 0 if @fp is registered successfully, -errno if not.
+ */
+int register_fprobe(struct fprobe *fp, const char *filter, const char *notfilter)
+{
+ struct ftrace_hash *hash;
+ unsigned char *str;
+ int ret, len;
+
+ if (!fp || !filter)
+ return -EINVAL;
+
+ fprobe_init(fp);
+
+ len = strlen(filter);
+ str = kstrdup(filter, GFP_KERNEL);
+ ret = ftrace_set_filter(&fp->ops, str, len, 0);
+ kfree(str);
+ if (ret)
+ return ret;
+
+ if (notfilter) {
+ len = strlen(notfilter);
+ str = kstrdup(notfilter, GFP_KERNEL);
+ ret = ftrace_set_notrace(&fp->ops, str, len, 0);
+ kfree(str);
+ if (ret)
+ goto out;
+ }
+
+ /* TODO:
+ * correctly calculate the total number of filtered symbols
+ * from both filter and notfilter.
+ */
+ hash = rcu_access_pointer(fp->ops.local_hash.filter_hash);
+ if (WARN_ON_ONCE(!hash))
+ goto out;
+
+ ret = fprobe_init_rethook(fp, (int)hash->count);
+ if (!ret)
+ ret = register_ftrace_function(&fp->ops);
+
+out:
+ if (ret)
+ fprobe_fail_cleanup(fp);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(register_fprobe);
+
+/**
+ * register_fprobe_ips() - Register fprobe to ftrace by address.
+ * @fp: A fprobe data structure to be registered.
+ * @addrs: An array of target ftrace location addresses.
+ * @num: The number of entries of @addrs.
+ *
+ * Register @fp to ftrace for enabling the probe on the address given by @addrs.
+ * The @addrs must be the addresses of ftrace location address, which may be
+ * the symbol address + arch-dependent offset.
+ * If you unsure what this mean, please use other registration functions.
+ *
+ * Return 0 if @fp is registered successfully, -errno if not.
+ */
+int register_fprobe_ips(struct fprobe *fp, unsigned long *addrs, int num)
+{
+ int ret;
+
+ if (!fp || !addrs || num <= 0)
+ return -EINVAL;
+
+ fprobe_init(fp);
+
+ ret = ftrace_set_filter_ips(&fp->ops, addrs, num, 0, 0);
+ if (ret)
+ return ret;
+
+ ret = fprobe_init_rethook(fp, num);
+ if (!ret)
+ ret = register_ftrace_function(&fp->ops);
+
+ if (ret)
+ fprobe_fail_cleanup(fp);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(register_fprobe_ips);
+
+/**
+ * register_fprobe_syms() - Register fprobe to ftrace by symbols.
+ * @fp: A fprobe data structure to be registered.
+ * @syms: An array of target symbols.
+ * @num: The number of entries of @syms.
+ *
+ * Register @fp to the symbols given by @syms array. This will be useful if
+ * you are sure the symbols exist in the kernel.
+ *
+ * Return 0 if @fp is registered successfully, -errno if not.
+ */
+int register_fprobe_syms(struct fprobe *fp, const char **syms, int num)
+{
+ unsigned long *addrs;
+ int ret;
+
+ if (!fp || !syms || num <= 0)
+ return -EINVAL;
+
+ addrs = get_ftrace_locations(syms, num);
+ if (IS_ERR(addrs))
+ return PTR_ERR(addrs);
+
+ ret = register_fprobe_ips(fp, addrs, num);
+
+ kfree(addrs);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(register_fprobe_syms);
+
+/**
+ * unregister_fprobe() - Unregister fprobe from ftrace
+ * @fp: A fprobe data structure to be unregistered.
+ *
+ * Unregister fprobe (and remove ftrace hooks from the function entries).
+ *
+ * Return 0 if @fp is unregistered successfully, -errno if not.
+ */
+int unregister_fprobe(struct fprobe *fp)
+{
+ int ret;
+
+ if (!fp || fp->ops.func != fprobe_handler)
+ return -EINVAL;
+
+ /*
+ * rethook_free() starts disabling the rethook, but the rethook handlers
+ * may be running on other processors at this point. To make sure that all
+ * current running handlers are finished, call unregister_ftrace_function()
+ * after this.
+ */
+ if (fp->rethook)
+ rethook_free(fp->rethook);
+
+ ret = unregister_ftrace_function(&fp->ops);
+ if (ret < 0)
+ return ret;
+
+ ftrace_free_filter(&fp->ops);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(unregister_fprobe);
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 8b568f57cc24..af899b058c8d 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -1568,17 +1568,34 @@ unsigned long ftrace_location_range(unsigned long start, unsigned long end)
}
/**
- * ftrace_location - return true if the ip giving is a traced location
+ * ftrace_location - return the ftrace location
* @ip: the instruction pointer to check
*
- * Returns rec->ip if @ip given is a pointer to a ftrace location.
- * That is, the instruction that is either a NOP or call to
- * the function tracer. It checks the ftrace internal tables to
- * determine if the address belongs or not.
+ * If @ip matches the ftrace location, return @ip.
+ * If @ip matches sym+0, return sym's ftrace location.
+ * Otherwise, return 0.
*/
unsigned long ftrace_location(unsigned long ip)
{
- return ftrace_location_range(ip, ip);
+ struct dyn_ftrace *rec;
+ unsigned long offset;
+ unsigned long size;
+
+ rec = lookup_rec(ip, ip);
+ if (!rec) {
+ if (!kallsyms_lookup_size_offset(ip, &size, &offset))
+ goto out;
+
+ /* map sym+0 to __fentry__ */
+ if (!offset)
+ rec = lookup_rec(ip, ip + size - 1);
+ }
+
+ if (rec)
+ return rec->ip;
+
+out:
+ return 0;
}
/**
@@ -4958,11 +4975,12 @@ ftrace_notrace_write(struct file *file, const char __user *ubuf,
}
static int
-ftrace_match_addr(struct ftrace_hash *hash, unsigned long ip, int remove)
+__ftrace_match_addr(struct ftrace_hash *hash, unsigned long ip, int remove)
{
struct ftrace_func_entry *entry;
- if (!ftrace_location(ip))
+ ip = ftrace_location(ip);
+ if (!ip)
return -EINVAL;
if (remove) {
@@ -4977,8 +4995,29 @@ ftrace_match_addr(struct ftrace_hash *hash, unsigned long ip, int remove)
}
static int
+ftrace_match_addr(struct ftrace_hash *hash, unsigned long *ips,
+ unsigned int cnt, int remove)
+{
+ unsigned int i;
+ int err;
+
+ for (i = 0; i < cnt; i++) {
+ err = __ftrace_match_addr(hash, ips[i], remove);
+ if (err) {
+ /*
+ * This expects the @hash is a temporary hash and if this
+ * fails the caller must free the @hash.
+ */
+ return err;
+ }
+ }
+ return 0;
+}
+
+static int
ftrace_set_hash(struct ftrace_ops *ops, unsigned char *buf, int len,
- unsigned long ip, int remove, int reset, int enable)
+ unsigned long *ips, unsigned int cnt,
+ int remove, int reset, int enable)
{
struct ftrace_hash **orig_hash;
struct ftrace_hash *hash;
@@ -5008,8 +5047,8 @@ ftrace_set_hash(struct ftrace_ops *ops, unsigned char *buf, int len,
ret = -EINVAL;
goto out_regex_unlock;
}
- if (ip) {
- ret = ftrace_match_addr(hash, ip, remove);
+ if (ips) {
+ ret = ftrace_match_addr(hash, ips, cnt, remove);
if (ret < 0)
goto out_regex_unlock;
}
@@ -5026,10 +5065,10 @@ ftrace_set_hash(struct ftrace_ops *ops, unsigned char *buf, int len,
}
static int
-ftrace_set_addr(struct ftrace_ops *ops, unsigned long ip, int remove,
- int reset, int enable)
+ftrace_set_addr(struct ftrace_ops *ops, unsigned long *ips, unsigned int cnt,
+ int remove, int reset, int enable)
{
- return ftrace_set_hash(ops, NULL, 0, ip, remove, reset, enable);
+ return ftrace_set_hash(ops, NULL, 0, ips, cnt, remove, reset, enable);
}
#ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS
@@ -5110,11 +5149,16 @@ int register_ftrace_direct(unsigned long ip, unsigned long addr)
struct ftrace_func_entry *entry;
struct ftrace_hash *free_hash = NULL;
struct dyn_ftrace *rec;
- int ret = -EBUSY;
+ int ret = -ENODEV;
mutex_lock(&direct_mutex);
+ ip = ftrace_location(ip);
+ if (!ip)
+ goto out_unlock;
+
/* See if there's a direct function at @ip already */
+ ret = -EBUSY;
if (ftrace_find_rec_direct(ip))
goto out_unlock;
@@ -5222,6 +5266,10 @@ int unregister_ftrace_direct(unsigned long ip, unsigned long addr)
mutex_lock(&direct_mutex);
+ ip = ftrace_location(ip);
+ if (!ip)
+ goto out_unlock;
+
entry = find_direct_entry(&ip, NULL);
if (!entry)
goto out_unlock;
@@ -5354,6 +5402,11 @@ int modify_ftrace_direct(unsigned long ip,
mutex_lock(&direct_mutex);
mutex_lock(&ftrace_lock);
+
+ ip = ftrace_location(ip);
+ if (!ip)
+ goto out_unlock;
+
entry = find_direct_entry(&ip, &rec);
if (!entry)
goto out_unlock;
@@ -5634,11 +5687,30 @@ int ftrace_set_filter_ip(struct ftrace_ops *ops, unsigned long ip,
int remove, int reset)
{
ftrace_ops_init(ops);
- return ftrace_set_addr(ops, ip, remove, reset, 1);
+ return ftrace_set_addr(ops, &ip, 1, remove, reset, 1);
}
EXPORT_SYMBOL_GPL(ftrace_set_filter_ip);
/**
+ * ftrace_set_filter_ips - set functions to filter on in ftrace by addresses
+ * @ops - the ops to set the filter with
+ * @ips - the array of addresses to add to or remove from the filter.
+ * @cnt - the number of addresses in @ips
+ * @remove - non zero to remove ips from the filter
+ * @reset - non zero to reset all filters before applying this filter.
+ *
+ * Filters denote which functions should be enabled when tracing is enabled
+ * If @ips array or any ip specified within is NULL , it fails to update filter.
+ */
+int ftrace_set_filter_ips(struct ftrace_ops *ops, unsigned long *ips,
+ unsigned int cnt, int remove, int reset)
+{
+ ftrace_ops_init(ops);
+ return ftrace_set_addr(ops, ips, cnt, remove, reset, 1);
+}
+EXPORT_SYMBOL_GPL(ftrace_set_filter_ips);
+
+/**
* ftrace_ops_set_global_filter - setup ops to use global filters
* @ops - the ops which will use the global filters
*
@@ -5659,7 +5731,7 @@ static int
ftrace_set_regex(struct ftrace_ops *ops, unsigned char *buf, int len,
int reset, int enable)
{
- return ftrace_set_hash(ops, buf, len, 0, 0, reset, enable);
+ return ftrace_set_hash(ops, buf, len, NULL, 0, 0, reset, enable);
}
/**
@@ -7096,6 +7168,8 @@ void __init ftrace_free_init_mem(void)
void *start = (void *)(&__init_begin);
void *end = (void *)(&__init_end);
+ ftrace_boot_snapshot();
+
ftrace_free_mem(NULL, start, end);
}
@@ -7346,9 +7420,9 @@ ftrace_func_t ftrace_ops_get_func(struct ftrace_ops *ops)
static void
ftrace_filter_pid_sched_switch_probe(void *data, bool preempt,
- unsigned int prev_state,
struct task_struct *prev,
- struct task_struct *next)
+ struct task_struct *next,
+ unsigned int prev_state)
{
struct trace_array *tr = data;
struct trace_pid_list *pid_list;
diff --git a/kernel/trace/rethook.c b/kernel/trace/rethook.c
new file mode 100644
index 000000000000..b56833700d23
--- /dev/null
+++ b/kernel/trace/rethook.c
@@ -0,0 +1,317 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#define pr_fmt(fmt) "rethook: " fmt
+
+#include <linux/bug.h>
+#include <linux/kallsyms.h>
+#include <linux/kprobes.h>
+#include <linux/preempt.h>
+#include <linux/rethook.h>
+#include <linux/slab.h>
+#include <linux/sort.h>
+
+/* Return hook list (shadow stack by list) */
+
+/*
+ * This function is called from delayed_put_task_struct() when a task is
+ * dead and cleaned up to recycle any kretprobe instances associated with
+ * this task. These left over instances represent probed functions that
+ * have been called but will never return.
+ */
+void rethook_flush_task(struct task_struct *tk)
+{
+ struct rethook_node *rhn;
+ struct llist_node *node;
+
+ node = __llist_del_all(&tk->rethooks);
+ while (node) {
+ rhn = container_of(node, struct rethook_node, llist);
+ node = node->next;
+ preempt_disable();
+ rethook_recycle(rhn);
+ preempt_enable();
+ }
+}
+
+static void rethook_free_rcu(struct rcu_head *head)
+{
+ struct rethook *rh = container_of(head, struct rethook, rcu);
+ struct rethook_node *rhn;
+ struct freelist_node *node;
+ int count = 1;
+
+ node = rh->pool.head;
+ while (node) {
+ rhn = container_of(node, struct rethook_node, freelist);
+ node = node->next;
+ kfree(rhn);
+ count++;
+ }
+
+ /* The rh->ref is the number of pooled node + 1 */
+ if (refcount_sub_and_test(count, &rh->ref))
+ kfree(rh);
+}
+
+/**
+ * rethook_free() - Free struct rethook.
+ * @rh: the struct rethook to be freed.
+ *
+ * Free the rethook. Before calling this function, user must ensure the
+ * @rh::data is cleaned if needed (or, the handler can access it after
+ * calling this function.) This function will set the @rh to be freed
+ * after all rethook_node are freed (not soon). And the caller must
+ * not touch @rh after calling this.
+ */
+void rethook_free(struct rethook *rh)
+{
+ WRITE_ONCE(rh->handler, NULL);
+
+ call_rcu(&rh->rcu, rethook_free_rcu);
+}
+
+/**
+ * rethook_alloc() - Allocate struct rethook.
+ * @data: a data to pass the @handler when hooking the return.
+ * @handler: the return hook callback function.
+ *
+ * Allocate and initialize a new rethook with @data and @handler.
+ * Return NULL if memory allocation fails or @handler is NULL.
+ * Note that @handler == NULL means this rethook is going to be freed.
+ */
+struct rethook *rethook_alloc(void *data, rethook_handler_t handler)
+{
+ struct rethook *rh = kzalloc(sizeof(struct rethook), GFP_KERNEL);
+
+ if (!rh || !handler)
+ return NULL;
+
+ rh->data = data;
+ rh->handler = handler;
+ rh->pool.head = NULL;
+ refcount_set(&rh->ref, 1);
+
+ return rh;
+}
+
+/**
+ * rethook_add_node() - Add a new node to the rethook.
+ * @rh: the struct rethook.
+ * @node: the struct rethook_node to be added.
+ *
+ * Add @node to @rh. User must allocate @node (as a part of user's
+ * data structure.) The @node fields are initialized in this function.
+ */
+void rethook_add_node(struct rethook *rh, struct rethook_node *node)
+{
+ node->rethook = rh;
+ freelist_add(&node->freelist, &rh->pool);
+ refcount_inc(&rh->ref);
+}
+
+static void free_rethook_node_rcu(struct rcu_head *head)
+{
+ struct rethook_node *node = container_of(head, struct rethook_node, rcu);
+
+ if (refcount_dec_and_test(&node->rethook->ref))
+ kfree(node->rethook);
+ kfree(node);
+}
+
+/**
+ * rethook_recycle() - return the node to rethook.
+ * @node: The struct rethook_node to be returned.
+ *
+ * Return back the @node to @node::rethook. If the @node::rethook is already
+ * marked as freed, this will free the @node.
+ */
+void rethook_recycle(struct rethook_node *node)
+{
+ lockdep_assert_preemption_disabled();
+
+ if (likely(READ_ONCE(node->rethook->handler)))
+ freelist_add(&node->freelist, &node->rethook->pool);
+ else
+ call_rcu(&node->rcu, free_rethook_node_rcu);
+}
+NOKPROBE_SYMBOL(rethook_recycle);
+
+/**
+ * rethook_try_get() - get an unused rethook node.
+ * @rh: The struct rethook which pools the nodes.
+ *
+ * Get an unused rethook node from @rh. If the node pool is empty, this
+ * will return NULL. Caller must disable preemption.
+ */
+struct rethook_node *rethook_try_get(struct rethook *rh)
+{
+ rethook_handler_t handler = READ_ONCE(rh->handler);
+ struct freelist_node *fn;
+
+ lockdep_assert_preemption_disabled();
+
+ /* Check whether @rh is going to be freed. */
+ if (unlikely(!handler))
+ return NULL;
+
+ fn = freelist_try_get(&rh->pool);
+ if (!fn)
+ return NULL;
+
+ return container_of(fn, struct rethook_node, freelist);
+}
+NOKPROBE_SYMBOL(rethook_try_get);
+
+/**
+ * rethook_hook() - Hook the current function return.
+ * @node: The struct rethook node to hook the function return.
+ * @regs: The struct pt_regs for the function entry.
+ * @mcount: True if this is called from mcount(ftrace) context.
+ *
+ * Hook the current running function return. This must be called when the
+ * function entry (or at least @regs must be the registers of the function
+ * entry.) @mcount is used for identifying the context. If this is called
+ * from ftrace (mcount) callback, @mcount must be set true. If this is called
+ * from the real function entry (e.g. kprobes) @mcount must be set false.
+ * This is because the way to hook the function return depends on the context.
+ */
+void rethook_hook(struct rethook_node *node, struct pt_regs *regs, bool mcount)
+{
+ arch_rethook_prepare(node, regs, mcount);
+ __llist_add(&node->llist, &current->rethooks);
+}
+NOKPROBE_SYMBOL(rethook_hook);
+
+/* This assumes the 'tsk' is the current task or is not running. */
+static unsigned long __rethook_find_ret_addr(struct task_struct *tsk,
+ struct llist_node **cur)
+{
+ struct rethook_node *rh = NULL;
+ struct llist_node *node = *cur;
+
+ if (!node)
+ node = tsk->rethooks.first;
+ else
+ node = node->next;
+
+ while (node) {
+ rh = container_of(node, struct rethook_node, llist);
+ if (rh->ret_addr != (unsigned long)arch_rethook_trampoline) {
+ *cur = node;
+ return rh->ret_addr;
+ }
+ node = node->next;
+ }
+ return 0;
+}
+NOKPROBE_SYMBOL(__rethook_find_ret_addr);
+
+/**
+ * rethook_find_ret_addr -- Find correct return address modified by rethook
+ * @tsk: Target task
+ * @frame: A frame pointer
+ * @cur: a storage of the loop cursor llist_node pointer for next call
+ *
+ * Find the correct return address modified by a rethook on @tsk in unsigned
+ * long type.
+ * The @tsk must be 'current' or a task which is not running. @frame is a hint
+ * to get the currect return address - which is compared with the
+ * rethook::frame field. The @cur is a loop cursor for searching the
+ * kretprobe return addresses on the @tsk. The '*@cur' should be NULL at the
+ * first call, but '@cur' itself must NOT NULL.
+ *
+ * Returns found address value or zero if not found.
+ */
+unsigned long rethook_find_ret_addr(struct task_struct *tsk, unsigned long frame,
+ struct llist_node **cur)
+{
+ struct rethook_node *rhn = NULL;
+ unsigned long ret;
+
+ if (WARN_ON_ONCE(!cur))
+ return 0;
+
+ if (WARN_ON_ONCE(tsk != current && task_is_running(tsk)))
+ return 0;
+
+ do {
+ ret = __rethook_find_ret_addr(tsk, cur);
+ if (!ret)
+ break;
+ rhn = container_of(*cur, struct rethook_node, llist);
+ } while (rhn->frame != frame);
+
+ return ret;
+}
+NOKPROBE_SYMBOL(rethook_find_ret_addr);
+
+void __weak arch_rethook_fixup_return(struct pt_regs *regs,
+ unsigned long correct_ret_addr)
+{
+ /*
+ * Do nothing by default. If the architecture which uses a
+ * frame pointer to record real return address on the stack,
+ * it should fill this function to fixup the return address
+ * so that stacktrace works from the rethook handler.
+ */
+}
+
+/* This function will be called from each arch-defined trampoline. */
+unsigned long rethook_trampoline_handler(struct pt_regs *regs,
+ unsigned long frame)
+{
+ struct llist_node *first, *node = NULL;
+ unsigned long correct_ret_addr;
+ rethook_handler_t handler;
+ struct rethook_node *rhn;
+
+ correct_ret_addr = __rethook_find_ret_addr(current, &node);
+ if (!correct_ret_addr) {
+ pr_err("rethook: Return address not found! Maybe there is a bug in the kernel\n");
+ BUG_ON(1);
+ }
+
+ instruction_pointer_set(regs, correct_ret_addr);
+
+ /*
+ * These loops must be protected from rethook_free_rcu() because those
+ * are accessing 'rhn->rethook'.
+ */
+ preempt_disable();
+
+ /*
+ * Run the handler on the shadow stack. Do not unlink the list here because
+ * stackdump inside the handlers needs to decode it.
+ */
+ first = current->rethooks.first;
+ while (first) {
+ rhn = container_of(first, struct rethook_node, llist);
+ if (WARN_ON_ONCE(rhn->frame != frame))
+ break;
+ handler = READ_ONCE(rhn->rethook->handler);
+ if (handler)
+ handler(rhn, rhn->rethook->data, regs);
+
+ if (first == node)
+ break;
+ first = first->next;
+ }
+
+ /* Fixup registers for returning to correct address. */
+ arch_rethook_fixup_return(regs, correct_ret_addr);
+
+ /* Unlink used shadow stack */
+ first = current->rethooks.first;
+ current->rethooks.first = node->next;
+ node->next = NULL;
+
+ while (first) {
+ rhn = container_of(first, struct rethook_node, llist);
+ first = first->next;
+ rethook_recycle(rhn);
+ }
+ preempt_enable();
+
+ return correct_ret_addr;
+}
+NOKPROBE_SYMBOL(rethook_trampoline_handler);
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index eb44418574f9..124f1897fd56 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -185,6 +185,7 @@ static char bootup_tracer_buf[MAX_TRACER_SIZE] __initdata;
static char *default_bootup_tracer;
static bool allocate_snapshot;
+static bool snapshot_at_boot;
static int __init set_cmdline_ftrace(char *str)
{
@@ -230,6 +231,15 @@ static int __init boot_alloc_snapshot(char *str)
__setup("alloc_snapshot", boot_alloc_snapshot);
+static int __init boot_snapshot(char *str)
+{
+ snapshot_at_boot = true;
+ boot_alloc_snapshot(str);
+ return 1;
+}
+__setup("ftrace_boot_snapshot", boot_snapshot);
+
+
static char trace_boot_options_buf[MAX_TRACER_SIZE] __initdata;
static int __init set_trace_boot_options(char *str)
@@ -3663,12 +3673,17 @@ static char *trace_iter_expand_format(struct trace_iterator *iter)
}
/* Returns true if the string is safe to dereference from an event */
-static bool trace_safe_str(struct trace_iterator *iter, const char *str)
+static bool trace_safe_str(struct trace_iterator *iter, const char *str,
+ bool star, int len)
{
unsigned long addr = (unsigned long)str;
struct trace_event *trace_event;
struct trace_event_call *event;
+ /* Ignore strings with no length */
+ if (star && !len)
+ return true;
+
/* OK if part of the event data */
if ((addr >= (unsigned long)iter->ent) &&
(addr < (unsigned long)iter->ent + iter->ent_size))
@@ -3854,7 +3869,7 @@ void trace_check_vprintf(struct trace_iterator *iter, const char *fmt,
* instead. See samples/trace_events/trace-events-sample.h
* for reference.
*/
- if (WARN_ONCE(!trace_safe_str(iter, str),
+ if (WARN_ONCE(!trace_safe_str(iter, str, star, len),
"fmt: '%s' current_buffer: '%s'",
fmt, show_buffer(&iter->seq))) {
int ret;
@@ -4274,17 +4289,11 @@ print_trace_header(struct seq_file *m, struct trace_iterator *iter)
entries,
total,
buf->cpu,
-#if defined(CONFIG_PREEMPT_NONE)
- "server",
-#elif defined(CONFIG_PREEMPT_VOLUNTARY)
- "desktop",
-#elif defined(CONFIG_PREEMPT)
- "preempt",
-#elif defined(CONFIG_PREEMPT_RT)
- "preempt_rt",
-#else
+ preempt_model_none() ? "server" :
+ preempt_model_voluntary() ? "desktop" :
+ preempt_model_full() ? "preempt" :
+ preempt_model_rt() ? "preempt_rt" :
"unknown",
-#endif
/* These are reserved for later use */
0, 0, 0, 0);
#ifdef CONFIG_SMP
@@ -7725,7 +7734,7 @@ const struct file_operations trace_min_max_fops = {
struct err_info {
const char **errs; /* ptr to loc-specific array of err strings */
u8 type; /* index into errs -> specific err string */
- u8 pos; /* MAX_FILTER_STR_VAL = 256 */
+ u16 pos; /* caret position */
u64 ts;
};
@@ -7733,26 +7742,52 @@ struct tracing_log_err {
struct list_head list;
struct err_info info;
char loc[TRACING_LOG_LOC_MAX]; /* err location */
- char cmd[MAX_FILTER_STR_VAL]; /* what caused err */
+ char *cmd; /* what caused err */
};
static DEFINE_MUTEX(tracing_err_log_lock);
-static struct tracing_log_err *get_tracing_log_err(struct trace_array *tr)
+static struct tracing_log_err *alloc_tracing_log_err(int len)
+{
+ struct tracing_log_err *err;
+
+ err = kzalloc(sizeof(*err), GFP_KERNEL);
+ if (!err)
+ return ERR_PTR(-ENOMEM);
+
+ err->cmd = kzalloc(len, GFP_KERNEL);
+ if (!err->cmd) {
+ kfree(err);
+ return ERR_PTR(-ENOMEM);
+ }
+
+ return err;
+}
+
+static void free_tracing_log_err(struct tracing_log_err *err)
+{
+ kfree(err->cmd);
+ kfree(err);
+}
+
+static struct tracing_log_err *get_tracing_log_err(struct trace_array *tr,
+ int len)
{
struct tracing_log_err *err;
if (tr->n_err_log_entries < TRACING_LOG_ERRS_MAX) {
- err = kzalloc(sizeof(*err), GFP_KERNEL);
- if (!err)
- err = ERR_PTR(-ENOMEM);
- else
+ err = alloc_tracing_log_err(len);
+ if (PTR_ERR(err) != -ENOMEM)
tr->n_err_log_entries++;
return err;
}
err = list_first_entry(&tr->err_log, struct tracing_log_err, list);
+ kfree(err->cmd);
+ err->cmd = kzalloc(len, GFP_KERNEL);
+ if (!err->cmd)
+ return ERR_PTR(-ENOMEM);
list_del(&err->list);
return err;
@@ -7813,22 +7848,25 @@ unsigned int err_pos(char *cmd, const char *str)
*/
void tracing_log_err(struct trace_array *tr,
const char *loc, const char *cmd,
- const char **errs, u8 type, u8 pos)
+ const char **errs, u8 type, u16 pos)
{
struct tracing_log_err *err;
+ int len = 0;
if (!tr)
tr = &global_trace;
+ len += sizeof(CMD_PREFIX) + 2 * sizeof("\n") + strlen(cmd) + 1;
+
mutex_lock(&tracing_err_log_lock);
- err = get_tracing_log_err(tr);
+ err = get_tracing_log_err(tr, len);
if (PTR_ERR(err) == -ENOMEM) {
mutex_unlock(&tracing_err_log_lock);
return;
}
snprintf(err->loc, TRACING_LOG_LOC_MAX, "%s: error: ", loc);
- snprintf(err->cmd, MAX_FILTER_STR_VAL,"\n" CMD_PREFIX "%s\n", cmd);
+ snprintf(err->cmd, len, "\n" CMD_PREFIX "%s\n", cmd);
err->info.errs = errs;
err->info.type = type;
@@ -7846,7 +7884,7 @@ static void clear_tracing_err_log(struct trace_array *tr)
mutex_lock(&tracing_err_log_lock);
list_for_each_entry_safe(err, next, &tr->err_log, list) {
list_del(&err->list);
- kfree(err);
+ free_tracing_log_err(err);
}
tr->n_err_log_entries = 0;
@@ -7874,9 +7912,9 @@ static void tracing_err_log_seq_stop(struct seq_file *m, void *v)
mutex_unlock(&tracing_err_log_lock);
}
-static void tracing_err_log_show_pos(struct seq_file *m, u8 pos)
+static void tracing_err_log_show_pos(struct seq_file *m, u16 pos)
{
- u8 i;
+ u16 i;
for (i = 0; i < sizeof(CMD_PREFIX) - 1; i++)
seq_putc(m, ' ');
@@ -10122,6 +10160,14 @@ out:
return ret;
}
+void __init ftrace_boot_snapshot(void)
+{
+ if (snapshot_at_boot) {
+ tracing_snapshot();
+ internal_trace_puts("** Boot snapshot taken **\n");
+ }
+}
+
void __init early_trace_init(void)
{
if (tracepoint_printk) {
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index c5b09c31e077..07d990270e2a 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -1877,7 +1877,7 @@ extern ssize_t trace_parse_run_command(struct file *file,
extern unsigned int err_pos(char *cmd, const char *str);
extern void tracing_log_err(struct trace_array *tr,
const char *loc, const char *cmd,
- const char **errs, u8 type, u8 pos);
+ const char **errs, u8 type, u16 pos);
/*
* Normal trace_printk() and friends allocates special buffers
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index da69519c4905..f97de82d1342 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -40,6 +40,14 @@ static LIST_HEAD(ftrace_generic_fields);
static LIST_HEAD(ftrace_common_fields);
static bool eventdir_initialized;
+static LIST_HEAD(module_strings);
+
+struct module_string {
+ struct list_head next;
+ struct module *module;
+ char *str;
+};
+
#define GFP_TRACE (GFP_KERNEL | __GFP_ZERO)
static struct kmem_cache *field_cachep;
@@ -765,9 +773,9 @@ void trace_event_follow_fork(struct trace_array *tr, bool enable)
static void
event_filter_pid_sched_switch_probe_pre(void *data, bool preempt,
- unsigned int prev_state,
struct task_struct *prev,
- struct task_struct *next)
+ struct task_struct *next,
+ unsigned int prev_state)
{
struct trace_array *tr = data;
struct trace_pid_list *no_pid_list;
@@ -791,9 +799,9 @@ event_filter_pid_sched_switch_probe_pre(void *data, bool preempt,
static void
event_filter_pid_sched_switch_probe_post(void *data, bool preempt,
- unsigned int prev_state,
struct task_struct *prev,
- struct task_struct *next)
+ struct task_struct *next,
+ unsigned int prev_state)
{
struct trace_array *tr = data;
struct trace_pid_list *no_pid_list;
@@ -2643,6 +2651,76 @@ static void update_event_printk(struct trace_event_call *call,
}
}
+static void add_str_to_module(struct module *module, char *str)
+{
+ struct module_string *modstr;
+
+ modstr = kmalloc(sizeof(*modstr), GFP_KERNEL);
+
+ /*
+ * If we failed to allocate memory here, then we'll just
+ * let the str memory leak when the module is removed.
+ * If this fails to allocate, there's worse problems than
+ * a leaked string on module removal.
+ */
+ if (WARN_ON_ONCE(!modstr))
+ return;
+
+ modstr->module = module;
+ modstr->str = str;
+
+ list_add(&modstr->next, &module_strings);
+}
+
+static void update_event_fields(struct trace_event_call *call,
+ struct trace_eval_map *map)
+{
+ struct ftrace_event_field *field;
+ struct list_head *head;
+ char *ptr;
+ char *str;
+ int len = strlen(map->eval_string);
+
+ /* Dynamic events should never have field maps */
+ if (WARN_ON_ONCE(call->flags & TRACE_EVENT_FL_DYNAMIC))
+ return;
+
+ head = trace_get_fields(call);
+ list_for_each_entry(field, head, link) {
+ ptr = strchr(field->type, '[');
+ if (!ptr)
+ continue;
+ ptr++;
+
+ if (!isalpha(*ptr) && *ptr != '_')
+ continue;
+
+ if (strncmp(map->eval_string, ptr, len) != 0)
+ continue;
+
+ str = kstrdup(field->type, GFP_KERNEL);
+ if (WARN_ON_ONCE(!str))
+ return;
+ ptr = str + (ptr - field->type);
+ ptr = eval_replace(ptr, map, len);
+ /* enum/sizeof string smaller than value */
+ if (WARN_ON_ONCE(!ptr)) {
+ kfree(str);
+ continue;
+ }
+
+ /*
+ * If the event is part of a module, then we need to free the string
+ * when the module is removed. Otherwise, it will stay allocated
+ * until a reboot.
+ */
+ if (call->module)
+ add_str_to_module(call->module, str);
+
+ field->type = str;
+ }
+}
+
void trace_event_eval_update(struct trace_eval_map **map, int len)
{
struct trace_event_call *call, *p;
@@ -2678,6 +2756,7 @@ void trace_event_eval_update(struct trace_eval_map **map, int len)
first = false;
}
update_event_printk(call, map[i]);
+ update_event_fields(call, map[i]);
}
}
}
@@ -2768,6 +2847,7 @@ int trace_add_event_call(struct trace_event_call *call)
mutex_unlock(&trace_types_lock);
return ret;
}
+EXPORT_SYMBOL_GPL(trace_add_event_call);
/*
* Must be called under locking of trace_types_lock, event_mutex and
@@ -2829,6 +2909,7 @@ int trace_remove_event_call(struct trace_event_call *call)
return ret;
}
+EXPORT_SYMBOL_GPL(trace_remove_event_call);
#define for_each_event(event, start, end) \
for (event = start; \
@@ -2863,6 +2944,7 @@ static void trace_module_add_events(struct module *mod)
static void trace_module_remove_events(struct module *mod)
{
struct trace_event_call *call, *p;
+ struct module_string *modstr, *m;
down_write(&trace_event_sem);
list_for_each_entry_safe(call, p, &ftrace_events, list) {
@@ -2871,6 +2953,14 @@ static void trace_module_remove_events(struct module *mod)
if (call->module == mod)
__trace_remove_event_call(call);
}
+ /* Check for any strings allocade for this module */
+ list_for_each_entry_safe(modstr, m, &module_strings, next) {
+ if (modstr->module != mod)
+ continue;
+ list_del(&modstr->next);
+ kfree(modstr->str);
+ kfree(modstr);
+ }
up_write(&trace_event_sem);
/*
diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c
index dc7f733b4cb3..44db5ba9cabb 100644
--- a/kernel/trace/trace_events_hist.c
+++ b/kernel/trace/trace_events_hist.c
@@ -727,11 +727,16 @@ static struct track_data *track_data_alloc(unsigned int key_len,
return data;
}
-static char last_cmd[MAX_FILTER_STR_VAL];
+#define HIST_PREFIX "hist:"
+
+static char *last_cmd;
static char last_cmd_loc[MAX_FILTER_STR_VAL];
static int errpos(char *str)
{
+ if (!str || !last_cmd)
+ return 0;
+
return err_pos(last_cmd, str);
}
@@ -739,12 +744,22 @@ static void last_cmd_set(struct trace_event_file *file, char *str)
{
const char *system = NULL, *name = NULL;
struct trace_event_call *call;
+ int len;
if (!str)
return;
- strcpy(last_cmd, "hist:");
- strncat(last_cmd, str, MAX_FILTER_STR_VAL - 1 - sizeof("hist:"));
+ /* sizeof() contains the nul byte */
+ len = sizeof(HIST_PREFIX) + strlen(str);
+ kfree(last_cmd);
+ last_cmd = kzalloc(len, GFP_KERNEL);
+ if (!last_cmd)
+ return;
+
+ strcpy(last_cmd, HIST_PREFIX);
+ /* Again, sizeof() contains the nul byte */
+ len -= sizeof(HIST_PREFIX);
+ strncat(last_cmd, str, len);
if (file) {
call = file->event_call;
@@ -757,18 +772,22 @@ static void last_cmd_set(struct trace_event_file *file, char *str)
}
if (system)
- snprintf(last_cmd_loc, MAX_FILTER_STR_VAL, "hist:%s:%s", system, name);
+ snprintf(last_cmd_loc, MAX_FILTER_STR_VAL, HIST_PREFIX "%s:%s", system, name);
}
-static void hist_err(struct trace_array *tr, u8 err_type, u8 err_pos)
+static void hist_err(struct trace_array *tr, u8 err_type, u16 err_pos)
{
+ if (!last_cmd)
+ return;
+
tracing_log_err(tr, last_cmd_loc, last_cmd, err_text,
err_type, err_pos);
}
static void hist_err_clear(void)
{
- last_cmd[0] = '\0';
+ if (last_cmd)
+ last_cmd[0] = '\0';
last_cmd_loc[0] = '\0';
}
@@ -5610,7 +5629,7 @@ static int event_hist_trigger_print(struct seq_file *m,
bool have_var = false;
unsigned int i;
- seq_puts(m, "hist:");
+ seq_puts(m, HIST_PREFIX);
if (data->name)
seq_printf(m, "%s:", data->name);
diff --git a/kernel/trace/trace_events_synth.c b/kernel/trace/trace_events_synth.c
index 154db74dadbc..5e8c07aef071 100644
--- a/kernel/trace/trace_events_synth.c
+++ b/kernel/trace/trace_events_synth.c
@@ -42,10 +42,13 @@ enum { ERRORS };
static const char *err_text[] = { ERRORS };
-static char last_cmd[MAX_FILTER_STR_VAL];
+static char *last_cmd;
static int errpos(const char *str)
{
+ if (!str || !last_cmd)
+ return 0;
+
return err_pos(last_cmd, str);
}
@@ -54,11 +57,16 @@ static void last_cmd_set(const char *str)
if (!str)
return;
- strncpy(last_cmd, str, MAX_FILTER_STR_VAL - 1);
+ kfree(last_cmd);
+
+ last_cmd = kstrdup(str, GFP_KERNEL);
}
-static void synth_err(u8 err_type, u8 err_pos)
+static void synth_err(u8 err_type, u16 err_pos)
{
+ if (!last_cmd)
+ return;
+
tracing_log_err(NULL, "synthetic_events", last_cmd, err_text,
err_type, err_pos);
}
diff --git a/kernel/trace/trace_events_user.c b/kernel/trace/trace_events_user.c
new file mode 100644
index 000000000000..706e1686b5eb
--- /dev/null
+++ b/kernel/trace/trace_events_user.c
@@ -0,0 +1,1628 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (c) 2021, Microsoft Corporation.
+ *
+ * Authors:
+ * Beau Belgrave <beaub@linux.microsoft.com>
+ */
+
+#include <linux/bitmap.h>
+#include <linux/cdev.h>
+#include <linux/hashtable.h>
+#include <linux/list.h>
+#include <linux/io.h>
+#include <linux/uio.h>
+#include <linux/ioctl.h>
+#include <linux/jhash.h>
+#include <linux/trace_events.h>
+#include <linux/tracefs.h>
+#include <linux/types.h>
+#include <linux/uaccess.h>
+/* Reminder to move to uapi when everything works */
+#ifdef CONFIG_COMPILE_TEST
+#include <linux/user_events.h>
+#else
+#include <uapi/linux/user_events.h>
+#endif
+#include "trace.h"
+#include "trace_dynevent.h"
+
+#define USER_EVENTS_PREFIX_LEN (sizeof(USER_EVENTS_PREFIX)-1)
+
+#define FIELD_DEPTH_TYPE 0
+#define FIELD_DEPTH_NAME 1
+#define FIELD_DEPTH_SIZE 2
+
+/*
+ * Limits how many trace_event calls user processes can create:
+ * Must be a power of two of PAGE_SIZE.
+ */
+#define MAX_PAGE_ORDER 0
+#define MAX_PAGES (1 << MAX_PAGE_ORDER)
+#define MAX_EVENTS (MAX_PAGES * PAGE_SIZE)
+
+/* Limit how long of an event name plus args within the subsystem. */
+#define MAX_EVENT_DESC 512
+#define EVENT_NAME(user_event) ((user_event)->tracepoint.name)
+#define MAX_FIELD_ARRAY_SIZE 1024
+#define MAX_FIELD_ARG_NAME 256
+
+static char *register_page_data;
+
+static DEFINE_MUTEX(reg_mutex);
+static DEFINE_HASHTABLE(register_table, 4);
+static DECLARE_BITMAP(page_bitmap, MAX_EVENTS);
+
+/*
+ * Stores per-event properties, as users register events
+ * within a file a user_event might be created if it does not
+ * already exist. These are globally used and their lifetime
+ * is tied to the refcnt member. These cannot go away until the
+ * refcnt reaches zero.
+ */
+struct user_event {
+ struct tracepoint tracepoint;
+ struct trace_event_call call;
+ struct trace_event_class class;
+ struct dyn_event devent;
+ struct hlist_node node;
+ struct list_head fields;
+ struct list_head validators;
+ atomic_t refcnt;
+ int index;
+ int flags;
+ int min_size;
+};
+
+/*
+ * Stores per-file events references, as users register events
+ * within a file this structure is modified and freed via RCU.
+ * The lifetime of this struct is tied to the lifetime of the file.
+ * These are not shared and only accessible by the file that created it.
+ */
+struct user_event_refs {
+ struct rcu_head rcu;
+ int count;
+ struct user_event *events[];
+};
+
+#define VALIDATOR_ENSURE_NULL (1 << 0)
+#define VALIDATOR_REL (1 << 1)
+
+struct user_event_validator {
+ struct list_head link;
+ int offset;
+ int flags;
+};
+
+typedef void (*user_event_func_t) (struct user_event *user, struct iov_iter *i,
+ void *tpdata, bool *faulted);
+
+static int user_event_parse(char *name, char *args, char *flags,
+ struct user_event **newuser);
+
+static u32 user_event_key(char *name)
+{
+ return jhash(name, strlen(name), 0);
+}
+
+static __always_inline __must_check
+size_t copy_nofault(void *addr, size_t bytes, struct iov_iter *i)
+{
+ size_t ret;
+
+ pagefault_disable();
+
+ ret = copy_from_iter_nocache(addr, bytes, i);
+
+ pagefault_enable();
+
+ return ret;
+}
+
+static struct list_head *user_event_get_fields(struct trace_event_call *call)
+{
+ struct user_event *user = (struct user_event *)call->data;
+
+ return &user->fields;
+}
+
+/*
+ * Parses a register command for user_events
+ * Format: event_name[:FLAG1[,FLAG2...]] [field1[;field2...]]
+ *
+ * Example event named 'test' with a 20 char 'msg' field with an unsigned int
+ * 'id' field after:
+ * test char[20] msg;unsigned int id
+ *
+ * NOTE: Offsets are from the user data perspective, they are not from the
+ * trace_entry/buffer perspective. We automatically add the common properties
+ * sizes to the offset for the user.
+ *
+ * Upon success user_event has its ref count increased by 1.
+ */
+static int user_event_parse_cmd(char *raw_command, struct user_event **newuser)
+{
+ char *name = raw_command;
+ char *args = strpbrk(name, " ");
+ char *flags;
+
+ if (args)
+ *args++ = '\0';
+
+ flags = strpbrk(name, ":");
+
+ if (flags)
+ *flags++ = '\0';
+
+ return user_event_parse(name, args, flags, newuser);
+}
+
+static int user_field_array_size(const char *type)
+{
+ const char *start = strchr(type, '[');
+ char val[8];
+ char *bracket;
+ int size = 0;
+
+ if (start == NULL)
+ return -EINVAL;
+
+ if (strscpy(val, start + 1, sizeof(val)) <= 0)
+ return -EINVAL;
+
+ bracket = strchr(val, ']');
+
+ if (!bracket)
+ return -EINVAL;
+
+ *bracket = '\0';
+
+ if (kstrtouint(val, 0, &size))
+ return -EINVAL;
+
+ if (size > MAX_FIELD_ARRAY_SIZE)
+ return -EINVAL;
+
+ return size;
+}
+
+static int user_field_size(const char *type)
+{
+ /* long is not allowed from a user, since it's ambigious in size */
+ if (strcmp(type, "s64") == 0)
+ return sizeof(s64);
+ if (strcmp(type, "u64") == 0)
+ return sizeof(u64);
+ if (strcmp(type, "s32") == 0)
+ return sizeof(s32);
+ if (strcmp(type, "u32") == 0)
+ return sizeof(u32);
+ if (strcmp(type, "int") == 0)
+ return sizeof(int);
+ if (strcmp(type, "unsigned int") == 0)
+ return sizeof(unsigned int);
+ if (strcmp(type, "s16") == 0)
+ return sizeof(s16);
+ if (strcmp(type, "u16") == 0)
+ return sizeof(u16);
+ if (strcmp(type, "short") == 0)
+ return sizeof(short);
+ if (strcmp(type, "unsigned short") == 0)
+ return sizeof(unsigned short);
+ if (strcmp(type, "s8") == 0)
+ return sizeof(s8);
+ if (strcmp(type, "u8") == 0)
+ return sizeof(u8);
+ if (strcmp(type, "char") == 0)
+ return sizeof(char);
+ if (strcmp(type, "unsigned char") == 0)
+ return sizeof(unsigned char);
+ if (str_has_prefix(type, "char["))
+ return user_field_array_size(type);
+ if (str_has_prefix(type, "unsigned char["))
+ return user_field_array_size(type);
+ if (str_has_prefix(type, "__data_loc "))
+ return sizeof(u32);
+ if (str_has_prefix(type, "__rel_loc "))
+ return sizeof(u32);
+
+ /* Uknown basic type, error */
+ return -EINVAL;
+}
+
+static void user_event_destroy_validators(struct user_event *user)
+{
+ struct user_event_validator *validator, *next;
+ struct list_head *head = &user->validators;
+
+ list_for_each_entry_safe(validator, next, head, link) {
+ list_del(&validator->link);
+ kfree(validator);
+ }
+}
+
+static void user_event_destroy_fields(struct user_event *user)
+{
+ struct ftrace_event_field *field, *next;
+ struct list_head *head = &user->fields;
+
+ list_for_each_entry_safe(field, next, head, link) {
+ list_del(&field->link);
+ kfree(field);
+ }
+}
+
+static int user_event_add_field(struct user_event *user, const char *type,
+ const char *name, int offset, int size,
+ int is_signed, int filter_type)
+{
+ struct user_event_validator *validator;
+ struct ftrace_event_field *field;
+ int validator_flags = 0;
+
+ field = kmalloc(sizeof(*field), GFP_KERNEL);
+
+ if (!field)
+ return -ENOMEM;
+
+ if (str_has_prefix(type, "__data_loc "))
+ goto add_validator;
+
+ if (str_has_prefix(type, "__rel_loc ")) {
+ validator_flags |= VALIDATOR_REL;
+ goto add_validator;
+ }
+
+ goto add_field;
+
+add_validator:
+ if (strstr(type, "char") != 0)
+ validator_flags |= VALIDATOR_ENSURE_NULL;
+
+ validator = kmalloc(sizeof(*validator), GFP_KERNEL);
+
+ if (!validator) {
+ kfree(field);
+ return -ENOMEM;
+ }
+
+ validator->flags = validator_flags;
+ validator->offset = offset;
+
+ /* Want sequential access when validating */
+ list_add_tail(&validator->link, &user->validators);
+
+add_field:
+ field->type = type;
+ field->name = name;
+ field->offset = offset;
+ field->size = size;
+ field->is_signed = is_signed;
+ field->filter_type = filter_type;
+
+ list_add(&field->link, &user->fields);
+
+ /*
+ * Min size from user writes that are required, this does not include
+ * the size of trace_entry (common fields).
+ */
+ user->min_size = (offset + size) - sizeof(struct trace_entry);
+
+ return 0;
+}
+
+/*
+ * Parses the values of a field within the description
+ * Format: type name [size]
+ */
+static int user_event_parse_field(char *field, struct user_event *user,
+ u32 *offset)
+{
+ char *part, *type, *name;
+ u32 depth = 0, saved_offset = *offset;
+ int len, size = -EINVAL;
+ bool is_struct = false;
+
+ field = skip_spaces(field);
+
+ if (*field == '\0')
+ return 0;
+
+ /* Handle types that have a space within */
+ len = str_has_prefix(field, "unsigned ");
+ if (len)
+ goto skip_next;
+
+ len = str_has_prefix(field, "struct ");
+ if (len) {
+ is_struct = true;
+ goto skip_next;
+ }
+
+ len = str_has_prefix(field, "__data_loc unsigned ");
+ if (len)
+ goto skip_next;
+
+ len = str_has_prefix(field, "__data_loc ");
+ if (len)
+ goto skip_next;
+
+ len = str_has_prefix(field, "__rel_loc unsigned ");
+ if (len)
+ goto skip_next;
+
+ len = str_has_prefix(field, "__rel_loc ");
+ if (len)
+ goto skip_next;
+
+ goto parse;
+skip_next:
+ type = field;
+ field = strpbrk(field + len, " ");
+
+ if (field == NULL)
+ return -EINVAL;
+
+ *field++ = '\0';
+ depth++;
+parse:
+ name = NULL;
+
+ while ((part = strsep(&field, " ")) != NULL) {
+ switch (depth++) {
+ case FIELD_DEPTH_TYPE:
+ type = part;
+ break;
+ case FIELD_DEPTH_NAME:
+ name = part;
+ break;
+ case FIELD_DEPTH_SIZE:
+ if (!is_struct)
+ return -EINVAL;
+
+ if (kstrtou32(part, 10, &size))
+ return -EINVAL;
+ break;
+ default:
+ return -EINVAL;
+ }
+ }
+
+ if (depth < FIELD_DEPTH_SIZE || !name)
+ return -EINVAL;
+
+ if (depth == FIELD_DEPTH_SIZE)
+ size = user_field_size(type);
+
+ if (size == 0)
+ return -EINVAL;
+
+ if (size < 0)
+ return size;
+
+ *offset = saved_offset + size;
+
+ return user_event_add_field(user, type, name, saved_offset, size,
+ type[0] != 'u', FILTER_OTHER);
+}
+
+static int user_event_parse_fields(struct user_event *user, char *args)
+{
+ char *field;
+ u32 offset = sizeof(struct trace_entry);
+ int ret = -EINVAL;
+
+ if (args == NULL)
+ return 0;
+
+ while ((field = strsep(&args, ";")) != NULL) {
+ ret = user_event_parse_field(field, user, &offset);
+
+ if (ret)
+ break;
+ }
+
+ return ret;
+}
+
+static struct trace_event_fields user_event_fields_array[1];
+
+static const char *user_field_format(const char *type)
+{
+ if (strcmp(type, "s64") == 0)
+ return "%lld";
+ if (strcmp(type, "u64") == 0)
+ return "%llu";
+ if (strcmp(type, "s32") == 0)
+ return "%d";
+ if (strcmp(type, "u32") == 0)
+ return "%u";
+ if (strcmp(type, "int") == 0)
+ return "%d";
+ if (strcmp(type, "unsigned int") == 0)
+ return "%u";
+ if (strcmp(type, "s16") == 0)
+ return "%d";
+ if (strcmp(type, "u16") == 0)
+ return "%u";
+ if (strcmp(type, "short") == 0)
+ return "%d";
+ if (strcmp(type, "unsigned short") == 0)
+ return "%u";
+ if (strcmp(type, "s8") == 0)
+ return "%d";
+ if (strcmp(type, "u8") == 0)
+ return "%u";
+ if (strcmp(type, "char") == 0)
+ return "%d";
+ if (strcmp(type, "unsigned char") == 0)
+ return "%u";
+ if (strstr(type, "char[") != 0)
+ return "%s";
+
+ /* Unknown, likely struct, allowed treat as 64-bit */
+ return "%llu";
+}
+
+static bool user_field_is_dyn_string(const char *type, const char **str_func)
+{
+ if (str_has_prefix(type, "__data_loc ")) {
+ *str_func = "__get_str";
+ goto check;
+ }
+
+ if (str_has_prefix(type, "__rel_loc ")) {
+ *str_func = "__get_rel_str";
+ goto check;
+ }
+
+ return false;
+check:
+ return strstr(type, "char") != 0;
+}
+
+#define LEN_OR_ZERO (len ? len - pos : 0)
+static int user_event_set_print_fmt(struct user_event *user, char *buf, int len)
+{
+ struct ftrace_event_field *field, *next;
+ struct list_head *head = &user->fields;
+ int pos = 0, depth = 0;
+ const char *str_func;
+
+ pos += snprintf(buf + pos, LEN_OR_ZERO, "\"");
+
+ list_for_each_entry_safe_reverse(field, next, head, link) {
+ if (depth != 0)
+ pos += snprintf(buf + pos, LEN_OR_ZERO, " ");
+
+ pos += snprintf(buf + pos, LEN_OR_ZERO, "%s=%s",
+ field->name, user_field_format(field->type));
+
+ depth++;
+ }
+
+ pos += snprintf(buf + pos, LEN_OR_ZERO, "\"");
+
+ list_for_each_entry_safe_reverse(field, next, head, link) {
+ if (user_field_is_dyn_string(field->type, &str_func))
+ pos += snprintf(buf + pos, LEN_OR_ZERO,
+ ", %s(%s)", str_func, field->name);
+ else
+ pos += snprintf(buf + pos, LEN_OR_ZERO,
+ ", REC->%s", field->name);
+ }
+
+ return pos + 1;
+}
+#undef LEN_OR_ZERO
+
+static int user_event_create_print_fmt(struct user_event *user)
+{
+ char *print_fmt;
+ int len;
+
+ len = user_event_set_print_fmt(user, NULL, 0);
+
+ print_fmt = kmalloc(len, GFP_KERNEL);
+
+ if (!print_fmt)
+ return -ENOMEM;
+
+ user_event_set_print_fmt(user, print_fmt, len);
+
+ user->call.print_fmt = print_fmt;
+
+ return 0;
+}
+
+static enum print_line_t user_event_print_trace(struct trace_iterator *iter,
+ int flags,
+ struct trace_event *event)
+{
+ /* Unsafe to try to decode user provided print_fmt, use hex */
+ trace_print_hex_dump_seq(&iter->seq, "", DUMP_PREFIX_OFFSET, 16,
+ 1, iter->ent, iter->ent_size, true);
+
+ return trace_handle_return(&iter->seq);
+}
+
+static struct trace_event_functions user_event_funcs = {
+ .trace = user_event_print_trace,
+};
+
+static int user_event_set_call_visible(struct user_event *user, bool visible)
+{
+ int ret;
+ const struct cred *old_cred;
+ struct cred *cred;
+
+ cred = prepare_creds();
+
+ if (!cred)
+ return -ENOMEM;
+
+ /*
+ * While by default tracefs is locked down, systems can be configured
+ * to allow user_event files to be less locked down. The extreme case
+ * being "other" has read/write access to user_events_data/status.
+ *
+ * When not locked down, processes may not have have permissions to
+ * add/remove calls themselves to tracefs. We need to temporarily
+ * switch to root file permission to allow for this scenario.
+ */
+ cred->fsuid = GLOBAL_ROOT_UID;
+
+ old_cred = override_creds(cred);
+
+ if (visible)
+ ret = trace_add_event_call(&user->call);
+ else
+ ret = trace_remove_event_call(&user->call);
+
+ revert_creds(old_cred);
+ put_cred(cred);
+
+ return ret;
+}
+
+static int destroy_user_event(struct user_event *user)
+{
+ int ret = 0;
+
+ /* Must destroy fields before call removal */
+ user_event_destroy_fields(user);
+
+ ret = user_event_set_call_visible(user, false);
+
+ if (ret)
+ return ret;
+
+ dyn_event_remove(&user->devent);
+
+ register_page_data[user->index] = 0;
+ clear_bit(user->index, page_bitmap);
+ hash_del(&user->node);
+
+ user_event_destroy_validators(user);
+ kfree(user->call.print_fmt);
+ kfree(EVENT_NAME(user));
+ kfree(user);
+
+ return ret;
+}
+
+static struct user_event *find_user_event(char *name, u32 *outkey)
+{
+ struct user_event *user;
+ u32 key = user_event_key(name);
+
+ *outkey = key;
+
+ hash_for_each_possible(register_table, user, node, key)
+ if (!strcmp(EVENT_NAME(user), name)) {
+ atomic_inc(&user->refcnt);
+ return user;
+ }
+
+ return NULL;
+}
+
+static int user_event_validate(struct user_event *user, void *data, int len)
+{
+ struct list_head *head = &user->validators;
+ struct user_event_validator *validator;
+ void *pos, *end = data + len;
+ u32 loc, offset, size;
+
+ list_for_each_entry(validator, head, link) {
+ pos = data + validator->offset;
+
+ /* Already done min_size check, no bounds check here */
+ loc = *(u32 *)pos;
+ offset = loc & 0xffff;
+ size = loc >> 16;
+
+ if (likely(validator->flags & VALIDATOR_REL))
+ pos += offset + sizeof(loc);
+ else
+ pos = data + offset;
+
+ pos += size;
+
+ if (unlikely(pos > end))
+ return -EFAULT;
+
+ if (likely(validator->flags & VALIDATOR_ENSURE_NULL))
+ if (unlikely(*(char *)(pos - 1) != '\0'))
+ return -EFAULT;
+ }
+
+ return 0;
+}
+
+/*
+ * Writes the user supplied payload out to a trace file.
+ */
+static void user_event_ftrace(struct user_event *user, struct iov_iter *i,
+ void *tpdata, bool *faulted)
+{
+ struct trace_event_file *file;
+ struct trace_entry *entry;
+ struct trace_event_buffer event_buffer;
+ size_t size = sizeof(*entry) + i->count;
+
+ file = (struct trace_event_file *)tpdata;
+
+ if (!file ||
+ !(file->flags & EVENT_FILE_FL_ENABLED) ||
+ trace_trigger_soft_disabled(file))
+ return;
+
+ /* Allocates and fills trace_entry, + 1 of this is data payload */
+ entry = trace_event_buffer_reserve(&event_buffer, file, size);
+
+ if (unlikely(!entry))
+ return;
+
+ if (unlikely(!copy_nofault(entry + 1, i->count, i)))
+ goto discard;
+
+ if (!list_empty(&user->validators) &&
+ unlikely(user_event_validate(user, entry, size)))
+ goto discard;
+
+ trace_event_buffer_commit(&event_buffer);
+
+ return;
+discard:
+ *faulted = true;
+ __trace_event_discard_commit(event_buffer.buffer,
+ event_buffer.event);
+}
+
+#ifdef CONFIG_PERF_EVENTS
+/*
+ * Writes the user supplied payload out to perf ring buffer.
+ */
+static void user_event_perf(struct user_event *user, struct iov_iter *i,
+ void *tpdata, bool *faulted)
+{
+ struct hlist_head *perf_head;
+
+ perf_head = this_cpu_ptr(user->call.perf_events);
+
+ if (perf_head && !hlist_empty(perf_head)) {
+ struct trace_entry *perf_entry;
+ struct pt_regs *regs;
+ size_t size = sizeof(*perf_entry) + i->count;
+ int context;
+
+ perf_entry = perf_trace_buf_alloc(ALIGN(size, 8),
+ &regs, &context);
+
+ if (unlikely(!perf_entry))
+ return;
+
+ perf_fetch_caller_regs(regs);
+
+ if (unlikely(!copy_nofault(perf_entry + 1, i->count, i)))
+ goto discard;
+
+ if (!list_empty(&user->validators) &&
+ unlikely(user_event_validate(user, perf_entry, size)))
+ goto discard;
+
+ perf_trace_buf_submit(perf_entry, size, context,
+ user->call.event.type, 1, regs,
+ perf_head, NULL);
+
+ return;
+discard:
+ *faulted = true;
+ perf_swevent_put_recursion_context(context);
+ }
+}
+#endif
+
+/*
+ * Update the register page that is shared between user processes.
+ */
+static void update_reg_page_for(struct user_event *user)
+{
+ struct tracepoint *tp = &user->tracepoint;
+ char status = 0;
+
+ if (atomic_read(&tp->key.enabled) > 0) {
+ struct tracepoint_func *probe_func_ptr;
+ user_event_func_t probe_func;
+
+ rcu_read_lock_sched();
+
+ probe_func_ptr = rcu_dereference_sched(tp->funcs);
+
+ if (probe_func_ptr) {
+ do {
+ probe_func = probe_func_ptr->func;
+
+ if (probe_func == user_event_ftrace)
+ status |= EVENT_STATUS_FTRACE;
+#ifdef CONFIG_PERF_EVENTS
+ else if (probe_func == user_event_perf)
+ status |= EVENT_STATUS_PERF;
+#endif
+ else
+ status |= EVENT_STATUS_OTHER;
+ } while ((++probe_func_ptr)->func);
+ }
+
+ rcu_read_unlock_sched();
+ }
+
+ register_page_data[user->index] = status;
+}
+
+/*
+ * Register callback for our events from tracing sub-systems.
+ */
+static int user_event_reg(struct trace_event_call *call,
+ enum trace_reg type,
+ void *data)
+{
+ struct user_event *user = (struct user_event *)call->data;
+ int ret = 0;
+
+ if (!user)
+ return -ENOENT;
+
+ switch (type) {
+ case TRACE_REG_REGISTER:
+ ret = tracepoint_probe_register(call->tp,
+ call->class->probe,
+ data);
+ if (!ret)
+ goto inc;
+ break;
+
+ case TRACE_REG_UNREGISTER:
+ tracepoint_probe_unregister(call->tp,
+ call->class->probe,
+ data);
+ goto dec;
+
+#ifdef CONFIG_PERF_EVENTS
+ case TRACE_REG_PERF_REGISTER:
+ ret = tracepoint_probe_register(call->tp,
+ call->class->perf_probe,
+ data);
+ if (!ret)
+ goto inc;
+ break;
+
+ case TRACE_REG_PERF_UNREGISTER:
+ tracepoint_probe_unregister(call->tp,
+ call->class->perf_probe,
+ data);
+ goto dec;
+
+ case TRACE_REG_PERF_OPEN:
+ case TRACE_REG_PERF_CLOSE:
+ case TRACE_REG_PERF_ADD:
+ case TRACE_REG_PERF_DEL:
+ break;
+#endif
+ }
+
+ return ret;
+inc:
+ atomic_inc(&user->refcnt);
+ update_reg_page_for(user);
+ return 0;
+dec:
+ update_reg_page_for(user);
+ atomic_dec(&user->refcnt);
+ return 0;
+}
+
+static int user_event_create(const char *raw_command)
+{
+ struct user_event *user;
+ char *name;
+ int ret;
+
+ if (!str_has_prefix(raw_command, USER_EVENTS_PREFIX))
+ return -ECANCELED;
+
+ raw_command += USER_EVENTS_PREFIX_LEN;
+ raw_command = skip_spaces(raw_command);
+
+ name = kstrdup(raw_command, GFP_KERNEL);
+
+ if (!name)
+ return -ENOMEM;
+
+ mutex_lock(&reg_mutex);
+
+ ret = user_event_parse_cmd(name, &user);
+
+ if (!ret)
+ atomic_dec(&user->refcnt);
+
+ mutex_unlock(&reg_mutex);
+
+ if (ret)
+ kfree(name);
+
+ return ret;
+}
+
+static int user_event_show(struct seq_file *m, struct dyn_event *ev)
+{
+ struct user_event *user = container_of(ev, struct user_event, devent);
+ struct ftrace_event_field *field, *next;
+ struct list_head *head;
+ int depth = 0;
+
+ seq_printf(m, "%s%s", USER_EVENTS_PREFIX, EVENT_NAME(user));
+
+ head = trace_get_fields(&user->call);
+
+ list_for_each_entry_safe_reverse(field, next, head, link) {
+ if (depth == 0)
+ seq_puts(m, " ");
+ else
+ seq_puts(m, "; ");
+
+ seq_printf(m, "%s %s", field->type, field->name);
+
+ if (str_has_prefix(field->type, "struct "))
+ seq_printf(m, " %d", field->size);
+
+ depth++;
+ }
+
+ seq_puts(m, "\n");
+
+ return 0;
+}
+
+static bool user_event_is_busy(struct dyn_event *ev)
+{
+ struct user_event *user = container_of(ev, struct user_event, devent);
+
+ return atomic_read(&user->refcnt) != 0;
+}
+
+static int user_event_free(struct dyn_event *ev)
+{
+ struct user_event *user = container_of(ev, struct user_event, devent);
+
+ if (atomic_read(&user->refcnt) != 0)
+ return -EBUSY;
+
+ return destroy_user_event(user);
+}
+
+static bool user_field_match(struct ftrace_event_field *field, int argc,
+ const char **argv, int *iout)
+{
+ char *field_name, *arg_name;
+ int len, pos, i = *iout;
+ bool colon = false, match = false;
+
+ if (i >= argc)
+ return false;
+
+ len = MAX_FIELD_ARG_NAME;
+ field_name = kmalloc(len, GFP_KERNEL);
+ arg_name = kmalloc(len, GFP_KERNEL);
+
+ if (!arg_name || !field_name)
+ goto out;
+
+ pos = 0;
+
+ for (; i < argc; ++i) {
+ if (i != *iout)
+ pos += snprintf(arg_name + pos, len - pos, " ");
+
+ pos += snprintf(arg_name + pos, len - pos, argv[i]);
+
+ if (strchr(argv[i], ';')) {
+ ++i;
+ colon = true;
+ break;
+ }
+ }
+
+ pos = 0;
+
+ pos += snprintf(field_name + pos, len - pos, field->type);
+ pos += snprintf(field_name + pos, len - pos, " ");
+ pos += snprintf(field_name + pos, len - pos, field->name);
+
+ if (colon)
+ pos += snprintf(field_name + pos, len - pos, ";");
+
+ *iout = i;
+
+ match = strcmp(arg_name, field_name) == 0;
+out:
+ kfree(arg_name);
+ kfree(field_name);
+
+ return match;
+}
+
+static bool user_fields_match(struct user_event *user, int argc,
+ const char **argv)
+{
+ struct ftrace_event_field *field, *next;
+ struct list_head *head = &user->fields;
+ int i = 0;
+
+ list_for_each_entry_safe_reverse(field, next, head, link)
+ if (!user_field_match(field, argc, argv, &i))
+ return false;
+
+ if (i != argc)
+ return false;
+
+ return true;
+}
+
+static bool user_event_match(const char *system, const char *event,
+ int argc, const char **argv, struct dyn_event *ev)
+{
+ struct user_event *user = container_of(ev, struct user_event, devent);
+ bool match;
+
+ match = strcmp(EVENT_NAME(user), event) == 0 &&
+ (!system || strcmp(system, USER_EVENTS_SYSTEM) == 0);
+
+ if (match && argc > 0)
+ match = user_fields_match(user, argc, argv);
+
+ return match;
+}
+
+static struct dyn_event_operations user_event_dops = {
+ .create = user_event_create,
+ .show = user_event_show,
+ .is_busy = user_event_is_busy,
+ .free = user_event_free,
+ .match = user_event_match,
+};
+
+static int user_event_trace_register(struct user_event *user)
+{
+ int ret;
+
+ ret = register_trace_event(&user->call.event);
+
+ if (!ret)
+ return -ENODEV;
+
+ ret = user_event_set_call_visible(user, true);
+
+ if (ret)
+ unregister_trace_event(&user->call.event);
+
+ return ret;
+}
+
+/*
+ * Parses the event name, arguments and flags then registers if successful.
+ * The name buffer lifetime is owned by this method for success cases only.
+ * Upon success the returned user_event has its ref count increased by 1.
+ */
+static int user_event_parse(char *name, char *args, char *flags,
+ struct user_event **newuser)
+{
+ int ret;
+ int index;
+ u32 key;
+ struct user_event *user;
+
+ /* Prevent dyn_event from racing */
+ mutex_lock(&event_mutex);
+ user = find_user_event(name, &key);
+ mutex_unlock(&event_mutex);
+
+ if (user) {
+ *newuser = user;
+ /*
+ * Name is allocated by caller, free it since it already exists.
+ * Caller only worries about failure cases for freeing.
+ */
+ kfree(name);
+ return 0;
+ }
+
+ index = find_first_zero_bit(page_bitmap, MAX_EVENTS);
+
+ if (index == MAX_EVENTS)
+ return -EMFILE;
+
+ user = kzalloc(sizeof(*user), GFP_KERNEL);
+
+ if (!user)
+ return -ENOMEM;
+
+ INIT_LIST_HEAD(&user->class.fields);
+ INIT_LIST_HEAD(&user->fields);
+ INIT_LIST_HEAD(&user->validators);
+
+ user->tracepoint.name = name;
+
+ ret = user_event_parse_fields(user, args);
+
+ if (ret)
+ goto put_user;
+
+ ret = user_event_create_print_fmt(user);
+
+ if (ret)
+ goto put_user;
+
+ user->call.data = user;
+ user->call.class = &user->class;
+ user->call.name = name;
+ user->call.flags = TRACE_EVENT_FL_TRACEPOINT;
+ user->call.tp = &user->tracepoint;
+ user->call.event.funcs = &user_event_funcs;
+
+ user->class.system = USER_EVENTS_SYSTEM;
+ user->class.fields_array = user_event_fields_array;
+ user->class.get_fields = user_event_get_fields;
+ user->class.reg = user_event_reg;
+ user->class.probe = user_event_ftrace;
+#ifdef CONFIG_PERF_EVENTS
+ user->class.perf_probe = user_event_perf;
+#endif
+
+ mutex_lock(&event_mutex);
+
+ ret = user_event_trace_register(user);
+
+ if (ret)
+ goto put_user_lock;
+
+ user->index = index;
+
+ /* Ensure we track ref */
+ atomic_inc(&user->refcnt);
+
+ dyn_event_init(&user->devent, &user_event_dops);
+ dyn_event_add(&user->devent, &user->call);
+ set_bit(user->index, page_bitmap);
+ hash_add(register_table, &user->node, key);
+
+ mutex_unlock(&event_mutex);
+
+ *newuser = user;
+ return 0;
+put_user_lock:
+ mutex_unlock(&event_mutex);
+put_user:
+ user_event_destroy_fields(user);
+ user_event_destroy_validators(user);
+ kfree(user);
+ return ret;
+}
+
+/*
+ * Deletes a previously created event if it is no longer being used.
+ */
+static int delete_user_event(char *name)
+{
+ u32 key;
+ int ret;
+ struct user_event *user = find_user_event(name, &key);
+
+ if (!user)
+ return -ENOENT;
+
+ /* Ensure we are the last ref */
+ if (atomic_read(&user->refcnt) != 1) {
+ ret = -EBUSY;
+ goto put_ref;
+ }
+
+ ret = destroy_user_event(user);
+
+ if (ret)
+ goto put_ref;
+
+ return ret;
+put_ref:
+ /* No longer have this ref */
+ atomic_dec(&user->refcnt);
+
+ return ret;
+}
+
+/*
+ * Validates the user payload and writes via iterator.
+ */
+static ssize_t user_events_write_core(struct file *file, struct iov_iter *i)
+{
+ struct user_event_refs *refs;
+ struct user_event *user = NULL;
+ struct tracepoint *tp;
+ ssize_t ret = i->count;
+ int idx;
+
+ if (unlikely(copy_from_iter(&idx, sizeof(idx), i) != sizeof(idx)))
+ return -EFAULT;
+
+ rcu_read_lock_sched();
+
+ refs = rcu_dereference_sched(file->private_data);
+
+ /*
+ * The refs->events array is protected by RCU, and new items may be
+ * added. But the user retrieved from indexing into the events array
+ * shall be immutable while the file is opened.
+ */
+ if (likely(refs && idx < refs->count))
+ user = refs->events[idx];
+
+ rcu_read_unlock_sched();
+
+ if (unlikely(user == NULL))
+ return -ENOENT;
+
+ if (unlikely(i->count < user->min_size))
+ return -EINVAL;
+
+ tp = &user->tracepoint;
+
+ /*
+ * It's possible key.enabled disables after this check, however
+ * we don't mind if a few events are included in this condition.
+ */
+ if (likely(atomic_read(&tp->key.enabled) > 0)) {
+ struct tracepoint_func *probe_func_ptr;
+ user_event_func_t probe_func;
+ struct iov_iter copy;
+ void *tpdata;
+ bool faulted;
+
+ if (unlikely(fault_in_iov_iter_readable(i, i->count)))
+ return -EFAULT;
+
+ faulted = false;
+
+ rcu_read_lock_sched();
+
+ probe_func_ptr = rcu_dereference_sched(tp->funcs);
+
+ if (probe_func_ptr) {
+ do {
+ copy = *i;
+ probe_func = probe_func_ptr->func;
+ tpdata = probe_func_ptr->data;
+ probe_func(user, &copy, tpdata, &faulted);
+ } while ((++probe_func_ptr)->func);
+ }
+
+ rcu_read_unlock_sched();
+
+ if (unlikely(faulted))
+ return -EFAULT;
+ }
+
+ return ret;
+}
+
+static ssize_t user_events_write(struct file *file, const char __user *ubuf,
+ size_t count, loff_t *ppos)
+{
+ struct iovec iov;
+ struct iov_iter i;
+
+ if (unlikely(*ppos != 0))
+ return -EFAULT;
+
+ if (unlikely(import_single_range(READ, (char *)ubuf, count, &iov, &i)))
+ return -EFAULT;
+
+ return user_events_write_core(file, &i);
+}
+
+static ssize_t user_events_write_iter(struct kiocb *kp, struct iov_iter *i)
+{
+ return user_events_write_core(kp->ki_filp, i);
+}
+
+static int user_events_ref_add(struct file *file, struct user_event *user)
+{
+ struct user_event_refs *refs, *new_refs;
+ int i, size, count = 0;
+
+ refs = rcu_dereference_protected(file->private_data,
+ lockdep_is_held(&reg_mutex));
+
+ if (refs) {
+ count = refs->count;
+
+ for (i = 0; i < count; ++i)
+ if (refs->events[i] == user)
+ return i;
+ }
+
+ size = struct_size(refs, events, count + 1);
+
+ new_refs = kzalloc(size, GFP_KERNEL);
+
+ if (!new_refs)
+ return -ENOMEM;
+
+ new_refs->count = count + 1;
+
+ for (i = 0; i < count; ++i)
+ new_refs->events[i] = refs->events[i];
+
+ new_refs->events[i] = user;
+
+ atomic_inc(&user->refcnt);
+
+ rcu_assign_pointer(file->private_data, new_refs);
+
+ if (refs)
+ kfree_rcu(refs, rcu);
+
+ return i;
+}
+
+static long user_reg_get(struct user_reg __user *ureg, struct user_reg *kreg)
+{
+ u32 size;
+ long ret;
+
+ ret = get_user(size, &ureg->size);
+
+ if (ret)
+ return ret;
+
+ if (size > PAGE_SIZE)
+ return -E2BIG;
+
+ return copy_struct_from_user(kreg, sizeof(*kreg), ureg, size);
+}
+
+/*
+ * Registers a user_event on behalf of a user process.
+ */
+static long user_events_ioctl_reg(struct file *file, unsigned long uarg)
+{
+ struct user_reg __user *ureg = (struct user_reg __user *)uarg;
+ struct user_reg reg;
+ struct user_event *user;
+ char *name;
+ long ret;
+
+ ret = user_reg_get(ureg, &reg);
+
+ if (ret)
+ return ret;
+
+ name = strndup_user((const char __user *)(uintptr_t)reg.name_args,
+ MAX_EVENT_DESC);
+
+ if (IS_ERR(name)) {
+ ret = PTR_ERR(name);
+ return ret;
+ }
+
+ ret = user_event_parse_cmd(name, &user);
+
+ if (ret) {
+ kfree(name);
+ return ret;
+ }
+
+ ret = user_events_ref_add(file, user);
+
+ /* No longer need parse ref, ref_add either worked or not */
+ atomic_dec(&user->refcnt);
+
+ /* Positive number is index and valid */
+ if (ret < 0)
+ return ret;
+
+ put_user((u32)ret, &ureg->write_index);
+ put_user(user->index, &ureg->status_index);
+
+ return 0;
+}
+
+/*
+ * Deletes a user_event on behalf of a user process.
+ */
+static long user_events_ioctl_del(struct file *file, unsigned long uarg)
+{
+ void __user *ubuf = (void __user *)uarg;
+ char *name;
+ long ret;
+
+ name = strndup_user(ubuf, MAX_EVENT_DESC);
+
+ if (IS_ERR(name))
+ return PTR_ERR(name);
+
+ /* event_mutex prevents dyn_event from racing */
+ mutex_lock(&event_mutex);
+ ret = delete_user_event(name);
+ mutex_unlock(&event_mutex);
+
+ kfree(name);
+
+ return ret;
+}
+
+/*
+ * Handles the ioctl from user mode to register or alter operations.
+ */
+static long user_events_ioctl(struct file *file, unsigned int cmd,
+ unsigned long uarg)
+{
+ long ret = -ENOTTY;
+
+ switch (cmd) {
+ case DIAG_IOCSREG:
+ mutex_lock(&reg_mutex);
+ ret = user_events_ioctl_reg(file, uarg);
+ mutex_unlock(&reg_mutex);
+ break;
+
+ case DIAG_IOCSDEL:
+ mutex_lock(&reg_mutex);
+ ret = user_events_ioctl_del(file, uarg);
+ mutex_unlock(&reg_mutex);
+ break;
+ }
+
+ return ret;
+}
+
+/*
+ * Handles the final close of the file from user mode.
+ */
+static int user_events_release(struct inode *node, struct file *file)
+{
+ struct user_event_refs *refs;
+ struct user_event *user;
+ int i;
+
+ /*
+ * Ensure refs cannot change under any situation by taking the
+ * register mutex during the final freeing of the references.
+ */
+ mutex_lock(&reg_mutex);
+
+ refs = file->private_data;
+
+ if (!refs)
+ goto out;
+
+ /*
+ * The lifetime of refs has reached an end, it's tied to this file.
+ * The underlying user_events are ref counted, and cannot be freed.
+ * After this decrement, the user_events may be freed elsewhere.
+ */
+ for (i = 0; i < refs->count; ++i) {
+ user = refs->events[i];
+
+ if (user)
+ atomic_dec(&user->refcnt);
+ }
+out:
+ file->private_data = NULL;
+
+ mutex_unlock(&reg_mutex);
+
+ kfree(refs);
+
+ return 0;
+}
+
+static const struct file_operations user_data_fops = {
+ .write = user_events_write,
+ .write_iter = user_events_write_iter,
+ .unlocked_ioctl = user_events_ioctl,
+ .release = user_events_release,
+};
+
+/*
+ * Maps the shared page into the user process for checking if event is enabled.
+ */
+static int user_status_mmap(struct file *file, struct vm_area_struct *vma)
+{
+ unsigned long size = vma->vm_end - vma->vm_start;
+
+ if (size != MAX_EVENTS)
+ return -EINVAL;
+
+ return remap_pfn_range(vma, vma->vm_start,
+ virt_to_phys(register_page_data) >> PAGE_SHIFT,
+ size, vm_get_page_prot(VM_READ));
+}
+
+static void *user_seq_start(struct seq_file *m, loff_t *pos)
+{
+ if (*pos)
+ return NULL;
+
+ return (void *)1;
+}
+
+static void *user_seq_next(struct seq_file *m, void *p, loff_t *pos)
+{
+ ++*pos;
+ return NULL;
+}
+
+static void user_seq_stop(struct seq_file *m, void *p)
+{
+}
+
+static int user_seq_show(struct seq_file *m, void *p)
+{
+ struct user_event *user;
+ char status;
+ int i, active = 0, busy = 0, flags;
+
+ mutex_lock(&reg_mutex);
+
+ hash_for_each(register_table, i, user, node) {
+ status = register_page_data[user->index];
+ flags = user->flags;
+
+ seq_printf(m, "%d:%s", user->index, EVENT_NAME(user));
+
+ if (flags != 0 || status != 0)
+ seq_puts(m, " #");
+
+ if (status != 0) {
+ seq_puts(m, " Used by");
+ if (status & EVENT_STATUS_FTRACE)
+ seq_puts(m, " ftrace");
+ if (status & EVENT_STATUS_PERF)
+ seq_puts(m, " perf");
+ if (status & EVENT_STATUS_OTHER)
+ seq_puts(m, " other");
+ busy++;
+ }
+
+ seq_puts(m, "\n");
+ active++;
+ }
+
+ mutex_unlock(&reg_mutex);
+
+ seq_puts(m, "\n");
+ seq_printf(m, "Active: %d\n", active);
+ seq_printf(m, "Busy: %d\n", busy);
+ seq_printf(m, "Max: %ld\n", MAX_EVENTS);
+
+ return 0;
+}
+
+static const struct seq_operations user_seq_ops = {
+ .start = user_seq_start,
+ .next = user_seq_next,
+ .stop = user_seq_stop,
+ .show = user_seq_show,
+};
+
+static int user_status_open(struct inode *node, struct file *file)
+{
+ return seq_open(file, &user_seq_ops);
+}
+
+static const struct file_operations user_status_fops = {
+ .open = user_status_open,
+ .mmap = user_status_mmap,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release,
+};
+
+/*
+ * Creates a set of tracefs files to allow user mode interactions.
+ */
+static int create_user_tracefs(void)
+{
+ struct dentry *edata, *emmap;
+
+ edata = tracefs_create_file("user_events_data", TRACE_MODE_WRITE,
+ NULL, NULL, &user_data_fops);
+
+ if (!edata) {
+ pr_warn("Could not create tracefs 'user_events_data' entry\n");
+ goto err;
+ }
+
+ /* mmap with MAP_SHARED requires writable fd */
+ emmap = tracefs_create_file("user_events_status", TRACE_MODE_WRITE,
+ NULL, NULL, &user_status_fops);
+
+ if (!emmap) {
+ tracefs_remove(edata);
+ pr_warn("Could not create tracefs 'user_events_mmap' entry\n");
+ goto err;
+ }
+
+ return 0;
+err:
+ return -ENODEV;
+}
+
+static void set_page_reservations(bool set)
+{
+ int page;
+
+ for (page = 0; page < MAX_PAGES; ++page) {
+ void *addr = register_page_data + (PAGE_SIZE * page);
+
+ if (set)
+ SetPageReserved(virt_to_page(addr));
+ else
+ ClearPageReserved(virt_to_page(addr));
+ }
+}
+
+static int __init trace_events_user_init(void)
+{
+ struct page *pages;
+ int ret;
+
+ /* Zero all bits beside 0 (which is reserved for failures) */
+ bitmap_zero(page_bitmap, MAX_EVENTS);
+ set_bit(0, page_bitmap);
+
+ pages = alloc_pages(GFP_KERNEL | __GFP_ZERO, MAX_PAGE_ORDER);
+ if (!pages)
+ return -ENOMEM;
+ register_page_data = page_address(pages);
+
+ set_page_reservations(true);
+
+ ret = create_user_tracefs();
+
+ if (ret) {
+ pr_warn("user_events could not register with tracefs\n");
+ set_page_reservations(false);
+ __free_pages(pages, MAX_PAGE_ORDER);
+ return ret;
+ }
+
+ if (dyn_event_register(&user_event_dops))
+ pr_warn("user_events could not register with dyn_events\n");
+
+ return 0;
+}
+
+fs_initcall(trace_events_user_init);
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index b62fd785b599..47cebef78532 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -1433,7 +1433,7 @@ __kretprobe_trace_func(struct trace_kprobe *tk, struct kretprobe_instance *ri,
fbuffer.regs = regs;
entry = fbuffer.entry = ring_buffer_event_data(fbuffer.event);
entry->func = (unsigned long)tk->rp.kp.addr;
- entry->ret_ip = (unsigned long)ri->ret_addr;
+ entry->ret_ip = get_kretprobe_retaddr(ri);
store_trace_args(&entry[1], &tk->tp, regs, sizeof(*entry), dsize);
trace_event_buffer_commit(&fbuffer);
@@ -1628,7 +1628,7 @@ kretprobe_perf_func(struct trace_kprobe *tk, struct kretprobe_instance *ri,
return;
entry->func = (unsigned long)tk->rp.kp.addr;
- entry->ret_ip = (unsigned long)ri->ret_addr;
+ entry->ret_ip = get_kretprobe_retaddr(ri);
store_trace_args(&entry[1], &tk->tp, regs, sizeof(*entry), dsize);
perf_trace_buf_submit(entry, size, rctx, call->event.type, 1, regs,
head, NULL);
diff --git a/kernel/trace/trace_osnoise.c b/kernel/trace/trace_osnoise.c
index e9ae1f33a7f0..afb92e2f0aea 100644
--- a/kernel/trace/trace_osnoise.c
+++ b/kernel/trace/trace_osnoise.c
@@ -1168,9 +1168,9 @@ thread_exit(struct osnoise_variables *osn_var, struct task_struct *t)
*/
static void
trace_sched_switch_callback(void *data, bool preempt,
- unsigned int prev_state,
struct task_struct *p,
- struct task_struct *n)
+ struct task_struct *n,
+ unsigned int prev_state)
{
struct osnoise_variables *osn_var = this_cpu_osn_var();
diff --git a/kernel/trace/trace_preemptirq.c b/kernel/trace/trace_preemptirq.c
index f4938040c228..95b58bd757ce 100644
--- a/kernel/trace/trace_preemptirq.c
+++ b/kernel/trace/trace_preemptirq.c
@@ -46,7 +46,7 @@ void trace_hardirqs_on(void)
this_cpu_write(tracing_irq_cpu, 0);
}
- lockdep_hardirqs_on_prepare(CALLER_ADDR0);
+ lockdep_hardirqs_on_prepare();
lockdep_hardirqs_on(CALLER_ADDR0);
}
EXPORT_SYMBOL(trace_hardirqs_on);
@@ -94,7 +94,7 @@ __visible void trace_hardirqs_on_caller(unsigned long caller_addr)
this_cpu_write(tracing_irq_cpu, 0);
}
- lockdep_hardirqs_on_prepare(CALLER_ADDR0);
+ lockdep_hardirqs_on_prepare();
lockdep_hardirqs_on(CALLER_ADDR0);
}
EXPORT_SYMBOL(trace_hardirqs_on_caller);
diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c
index 993b0ed10d8c..c9ffdcfe622e 100644
--- a/kernel/trace/trace_sched_switch.c
+++ b/kernel/trace/trace_sched_switch.c
@@ -22,8 +22,8 @@ static DEFINE_MUTEX(sched_register_mutex);
static void
probe_sched_switch(void *ignore, bool preempt,
- unsigned int prev_state,
- struct task_struct *prev, struct task_struct *next)
+ struct task_struct *prev, struct task_struct *next,
+ unsigned int prev_state)
{
int flags;
@@ -45,7 +45,7 @@ probe_sched_wakeup(void *ignore, struct task_struct *wakee)
if (!flags)
return;
- tracing_record_taskinfo(current, flags);
+ tracing_record_taskinfo_sched_switch(current, wakee, flags);
}
static int tracing_sched_register(void)
diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c
index 46429f9a96fa..330aee1c1a49 100644
--- a/kernel/trace/trace_sched_wakeup.c
+++ b/kernel/trace/trace_sched_wakeup.c
@@ -426,8 +426,8 @@ tracing_sched_wakeup_trace(struct trace_array *tr,
static void notrace
probe_wakeup_sched_switch(void *ignore, bool preempt,
- unsigned int prev_state,
- struct task_struct *prev, struct task_struct *next)
+ struct task_struct *prev, struct task_struct *next,
+ unsigned int prev_state)
{
struct trace_array_cpu *data;
u64 T0, T1, delta;
diff --git a/kernel/watch_queue.c b/kernel/watch_queue.c
index 3990e4df3d7b..230038d4f908 100644
--- a/kernel/watch_queue.c
+++ b/kernel/watch_queue.c
@@ -370,6 +370,7 @@ static void __put_watch_queue(struct kref *kref)
for (i = 0; i < wqueue->nr_pages; i++)
__free_page(wqueue->notes[i]);
+ kfree(wqueue->notes);
bitmap_free(wqueue->notes_bitmap);
wfilter = rcu_access_pointer(wqueue->filter);
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 52e9abbb7759..0d2514b4ff0d 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -154,15 +154,20 @@ struct worker_pool {
unsigned long watchdog_ts; /* L: watchdog timestamp */
- /* The current concurrency level. */
- atomic_t nr_running;
+ /*
+ * The counter is incremented in a process context on the associated CPU
+ * w/ preemption disabled, and decremented or reset in the same context
+ * but w/ pool->lock held. The readers grab pool->lock and are
+ * guaranteed to see if the counter reached zero.
+ */
+ int nr_running;
struct list_head worklist; /* L: list of pending works */
int nr_workers; /* L: total number of workers */
int nr_idle; /* L: currently idle workers */
- struct list_head idle_list; /* X: list of idle workers */
+ struct list_head idle_list; /* L: list of idle workers */
struct timer_list idle_timer; /* L: worker idle timeout */
struct timer_list mayday_timer; /* L: SOS timer for workers */
@@ -777,7 +782,7 @@ static bool work_is_canceling(struct work_struct *work)
static bool __need_more_worker(struct worker_pool *pool)
{
- return !atomic_read(&pool->nr_running);
+ return !pool->nr_running;
}
/*
@@ -802,8 +807,7 @@ static bool may_start_working(struct worker_pool *pool)
/* Do I need to keep working? Called from currently running workers. */
static bool keep_working(struct worker_pool *pool)
{
- return !list_empty(&pool->worklist) &&
- atomic_read(&pool->nr_running) <= 1;
+ return !list_empty(&pool->worklist) && (pool->nr_running <= 1);
}
/* Do we need a new worker? Called from manager. */
@@ -826,7 +830,7 @@ static bool too_many_workers(struct worker_pool *pool)
* Wake up functions.
*/
-/* Return the first idle worker. Safe with preemption disabled */
+/* Return the first idle worker. Called with pool->lock held. */
static struct worker *first_idle_worker(struct worker_pool *pool)
{
if (unlikely(list_empty(&pool->idle_list)))
@@ -873,7 +877,7 @@ void wq_worker_running(struct task_struct *task)
*/
preempt_disable();
if (!(worker->flags & WORKER_NOT_RUNNING))
- atomic_inc(&worker->pool->nr_running);
+ worker->pool->nr_running++;
preempt_enable();
worker->sleeping = 0;
}
@@ -887,7 +891,7 @@ void wq_worker_running(struct task_struct *task)
*/
void wq_worker_sleeping(struct task_struct *task)
{
- struct worker *next, *worker = kthread_data(task);
+ struct worker *worker = kthread_data(task);
struct worker_pool *pool;
/*
@@ -917,23 +921,9 @@ void wq_worker_sleeping(struct task_struct *task)
return;
}
- /*
- * The counterpart of the following dec_and_test, implied mb,
- * worklist not empty test sequence is in insert_work().
- * Please read comment there.
- *
- * NOT_RUNNING is clear. This means that we're bound to and
- * running on the local cpu w/ rq lock held and preemption
- * disabled, which in turn means that none else could be
- * manipulating idle_list, so dereferencing idle_list without pool
- * lock is safe.
- */
- if (atomic_dec_and_test(&pool->nr_running) &&
- !list_empty(&pool->worklist)) {
- next = first_idle_worker(pool);
- if (next)
- wake_up_process(next->task);
- }
+ pool->nr_running--;
+ if (need_more_worker(pool))
+ wake_up_worker(pool);
raw_spin_unlock_irq(&pool->lock);
}
@@ -987,7 +977,7 @@ static inline void worker_set_flags(struct worker *worker, unsigned int flags)
/* If transitioning into NOT_RUNNING, adjust nr_running. */
if ((flags & WORKER_NOT_RUNNING) &&
!(worker->flags & WORKER_NOT_RUNNING)) {
- atomic_dec(&pool->nr_running);
+ pool->nr_running--;
}
worker->flags |= flags;
@@ -1019,7 +1009,7 @@ static inline void worker_clr_flags(struct worker *worker, unsigned int flags)
*/
if ((flags & WORKER_NOT_RUNNING) && (oflags & WORKER_NOT_RUNNING))
if (!(worker->flags & WORKER_NOT_RUNNING))
- atomic_inc(&pool->nr_running);
+ pool->nr_running++;
}
/**
@@ -1372,13 +1362,6 @@ static void insert_work(struct pool_workqueue *pwq, struct work_struct *work,
list_add_tail(&work->entry, head);
get_pwq(pwq);
- /*
- * Ensure either wq_worker_sleeping() sees the above
- * list_add_tail() or we see zero nr_running to avoid workers lying
- * around lazily while there are works to be processed.
- */
- smp_mb();
-
if (__need_more_worker(pool))
wake_up_worker(pool);
}
@@ -1827,8 +1810,7 @@ static void worker_enter_idle(struct worker *worker)
mod_timer(&pool->idle_timer, jiffies + IDLE_WORKER_TIMEOUT);
/* Sanity check nr_running. */
- WARN_ON_ONCE(pool->nr_workers == pool->nr_idle &&
- atomic_read(&pool->nr_running));
+ WARN_ON_ONCE(pool->nr_workers == pool->nr_idle && pool->nr_running);
}
/**
@@ -5006,7 +4988,7 @@ static void unbind_workers(int cpu)
* an unbound (in terms of concurrency management) pool which
* are served by workers tied to the pool.
*/
- atomic_set(&pool->nr_running, 0);
+ pool->nr_running = 0;
/*
* With concurrency management just turned off, a busy