summaryrefslogtreecommitdiffstats
path: root/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'kernel')
-rw-r--r--kernel/bpf/Makefile2
-rw-r--r--kernel/bpf/arena.c25
-rw-r--r--kernel/bpf/bloom_filter.c13
-rw-r--r--kernel/bpf/helpers.c2
-rw-r--r--kernel/bpf/syscall.c35
-rw-r--r--kernel/bpf/verifier.c30
-rw-r--r--kernel/crash_reserve.c7
-rw-r--r--kernel/irq/manage.c9
-rw-r--r--kernel/kprobes.c18
-rw-r--r--kernel/module/Kconfig5
-rw-r--r--kernel/power/suspend.c6
-rw-r--r--kernel/printk/printk.c6
-rw-r--r--kernel/sys.c7
-rw-r--r--kernel/time/posix-clock.c16
-rw-r--r--kernel/time/tick-sched.c18
-rw-r--r--kernel/time/tick-sched.h2
-rw-r--r--kernel/time/timer.c22
-rw-r--r--kernel/time/timer_migration.c32
-rw-r--r--kernel/trace/bpf_trace.c10
-rw-r--r--kernel/trace/trace_probe.c2
20 files changed, 213 insertions, 54 deletions
diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile
index 368c5d86b5b7..e497011261b8 100644
--- a/kernel/bpf/Makefile
+++ b/kernel/bpf/Makefile
@@ -4,7 +4,7 @@ ifneq ($(CONFIG_BPF_JIT_ALWAYS_ON),y)
# ___bpf_prog_run() needs GCSE disabled on x86; see 3193c0836f203 for details
cflags-nogcse-$(CONFIG_X86)$(CONFIG_CC_IS_GCC) := -fno-gcse
endif
-CFLAGS_core.o += $(call cc-disable-warning, override-init) $(cflags-nogcse-yy)
+CFLAGS_core.o += -Wno-override-init $(cflags-nogcse-yy)
obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o log.o token.o
obj-$(CONFIG_BPF_SYSCALL) += bpf_iter.o map_iter.o task_iter.o prog_iter.o link_iter.o
diff --git a/kernel/bpf/arena.c b/kernel/bpf/arena.c
index 86571e760dd6..343c3456c8dd 100644
--- a/kernel/bpf/arena.c
+++ b/kernel/bpf/arena.c
@@ -38,7 +38,7 @@
/* number of bytes addressable by LDX/STX insn with 16-bit 'off' field */
#define GUARD_SZ (1ull << sizeof(((struct bpf_insn *)0)->off) * 8)
-#define KERN_VM_SZ ((1ull << 32) + GUARD_SZ)
+#define KERN_VM_SZ (SZ_4G + GUARD_SZ)
struct bpf_arena {
struct bpf_map map;
@@ -110,7 +110,7 @@ static struct bpf_map *arena_map_alloc(union bpf_attr *attr)
return ERR_PTR(-EINVAL);
vm_range = (u64)attr->max_entries * PAGE_SIZE;
- if (vm_range > (1ull << 32))
+ if (vm_range > SZ_4G)
return ERR_PTR(-E2BIG);
if ((attr->map_extra >> 32) != ((attr->map_extra + vm_range - 1) >> 32))
@@ -301,7 +301,7 @@ static unsigned long arena_get_unmapped_area(struct file *filp, unsigned long ad
if (pgoff)
return -EINVAL;
- if (len > (1ull << 32))
+ if (len > SZ_4G)
return -E2BIG;
/* if user_vm_start was specified at arena creation time */
@@ -322,7 +322,7 @@ static unsigned long arena_get_unmapped_area(struct file *filp, unsigned long ad
if (WARN_ON_ONCE(arena->user_vm_start))
/* checks at map creation time should prevent this */
return -EFAULT;
- return round_up(ret, 1ull << 32);
+ return round_up(ret, SZ_4G);
}
static int arena_map_mmap(struct bpf_map *map, struct vm_area_struct *vma)
@@ -346,7 +346,7 @@ static int arena_map_mmap(struct bpf_map *map, struct vm_area_struct *vma)
return -EBUSY;
/* Earlier checks should prevent this */
- if (WARN_ON_ONCE(vma->vm_end - vma->vm_start > (1ull << 32) || vma->vm_pgoff))
+ if (WARN_ON_ONCE(vma->vm_end - vma->vm_start > SZ_4G || vma->vm_pgoff))
return -EFAULT;
if (remember_vma(arena, vma))
@@ -420,7 +420,7 @@ static long arena_alloc_pages(struct bpf_arena *arena, long uaddr, long page_cnt
if (uaddr & ~PAGE_MASK)
return 0;
pgoff = compute_pgoff(arena, uaddr);
- if (pgoff + page_cnt > page_cnt_max)
+ if (pgoff > page_cnt_max - page_cnt)
/* requested address will be outside of user VMA */
return 0;
}
@@ -447,7 +447,13 @@ static long arena_alloc_pages(struct bpf_arena *arena, long uaddr, long page_cnt
goto out;
uaddr32 = (u32)(arena->user_vm_start + pgoff * PAGE_SIZE);
- /* Earlier checks make sure that uaddr32 + page_cnt * PAGE_SIZE will not overflow 32-bit */
+ /* Earlier checks made sure that uaddr32 + page_cnt * PAGE_SIZE - 1
+ * will not overflow 32-bit. Lower 32-bit need to represent
+ * contiguous user address range.
+ * Map these pages at kern_vm_start base.
+ * kern_vm_start + uaddr32 + page_cnt * PAGE_SIZE - 1 can overflow
+ * lower 32-bit and it's ok.
+ */
ret = vm_area_map_pages(arena->kern_vm, kern_vm_start + uaddr32,
kern_vm_start + uaddr32 + page_cnt * PAGE_SIZE, pages);
if (ret) {
@@ -510,6 +516,11 @@ static void arena_free_pages(struct bpf_arena *arena, long uaddr, long page_cnt)
if (!page)
continue;
if (page_cnt == 1 && page_mapped(page)) /* mapped by some user process */
+ /* Optimization for the common case of page_cnt==1:
+ * If page wasn't mapped into some user vma there
+ * is no need to call zap_pages which is slow. When
+ * page_cnt is big it's faster to do the batched zap.
+ */
zap_pages(arena, full_uaddr, 1);
vm_area_unmap_pages(arena->kern_vm, kaddr, kaddr + PAGE_SIZE);
__free_page(page);
diff --git a/kernel/bpf/bloom_filter.c b/kernel/bpf/bloom_filter.c
index addf3dd57b59..35e1ddca74d2 100644
--- a/kernel/bpf/bloom_filter.c
+++ b/kernel/bpf/bloom_filter.c
@@ -80,6 +80,18 @@ static int bloom_map_get_next_key(struct bpf_map *map, void *key, void *next_key
return -EOPNOTSUPP;
}
+/* Called from syscall */
+static int bloom_map_alloc_check(union bpf_attr *attr)
+{
+ if (attr->value_size > KMALLOC_MAX_SIZE)
+ /* if value_size is bigger, the user space won't be able to
+ * access the elements.
+ */
+ return -E2BIG;
+
+ return 0;
+}
+
static struct bpf_map *bloom_map_alloc(union bpf_attr *attr)
{
u32 bitset_bytes, bitset_mask, nr_hash_funcs, nr_bits;
@@ -191,6 +203,7 @@ static u64 bloom_map_mem_usage(const struct bpf_map *map)
BTF_ID_LIST_SINGLE(bpf_bloom_map_btf_ids, struct, bpf_bloom_filter)
const struct bpf_map_ops bloom_filter_map_ops = {
.map_meta_equal = bpf_map_meta_equal,
+ .map_alloc_check = bloom_map_alloc_check,
.map_alloc = bloom_map_alloc,
.map_free = bloom_map_free,
.map_get_next_key = bloom_map_get_next_key,
diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index a89587859571..449b9a5d3fe3 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -2548,7 +2548,7 @@ __bpf_kfunc void bpf_throw(u64 cookie)
__bpf_kfunc_end_defs();
BTF_KFUNCS_START(generic_btf_ids)
-#ifdef CONFIG_KEXEC_CORE
+#ifdef CONFIG_CRASH_DUMP
BTF_ID_FLAGS(func, crash_kexec, KF_DESTRUCTIVE)
#endif
BTF_ID_FLAGS(func, bpf_obj_new_impl, KF_ACQUIRE | KF_RET_NULL)
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index ae2ff73bde7e..c287925471f6 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -3024,17 +3024,46 @@ void bpf_link_inc(struct bpf_link *link)
atomic64_inc(&link->refcnt);
}
+static void bpf_link_defer_dealloc_rcu_gp(struct rcu_head *rcu)
+{
+ struct bpf_link *link = container_of(rcu, struct bpf_link, rcu);
+
+ /* free bpf_link and its containing memory */
+ link->ops->dealloc_deferred(link);
+}
+
+static void bpf_link_defer_dealloc_mult_rcu_gp(struct rcu_head *rcu)
+{
+ if (rcu_trace_implies_rcu_gp())
+ bpf_link_defer_dealloc_rcu_gp(rcu);
+ else
+ call_rcu(rcu, bpf_link_defer_dealloc_rcu_gp);
+}
+
/* bpf_link_free is guaranteed to be called from process context */
static void bpf_link_free(struct bpf_link *link)
{
+ bool sleepable = false;
+
bpf_link_free_id(link->id);
if (link->prog) {
+ sleepable = link->prog->sleepable;
/* detach BPF program, clean up used resources */
link->ops->release(link);
bpf_prog_put(link->prog);
}
- /* free bpf_link and its containing memory */
- link->ops->dealloc(link);
+ if (link->ops->dealloc_deferred) {
+ /* schedule BPF link deallocation; if underlying BPF program
+ * is sleepable, we need to first wait for RCU tasks trace
+ * sync, then go through "classic" RCU grace period
+ */
+ if (sleepable)
+ call_rcu_tasks_trace(&link->rcu, bpf_link_defer_dealloc_mult_rcu_gp);
+ else
+ call_rcu(&link->rcu, bpf_link_defer_dealloc_rcu_gp);
+ }
+ if (link->ops->dealloc)
+ link->ops->dealloc(link);
}
static void bpf_link_put_deferred(struct work_struct *work)
@@ -3544,7 +3573,7 @@ static int bpf_raw_tp_link_fill_link_info(const struct bpf_link *link,
static const struct bpf_link_ops bpf_raw_tp_link_lops = {
.release = bpf_raw_tp_link_release,
- .dealloc = bpf_raw_tp_link_dealloc,
+ .dealloc_deferred = bpf_raw_tp_link_dealloc,
.show_fdinfo = bpf_raw_tp_link_show_fdinfo,
.fill_link_info = bpf_raw_tp_link_fill_link_info,
};
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 63749ad5ac6b..98188379d5c7 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -5682,6 +5682,13 @@ static bool is_flow_key_reg(struct bpf_verifier_env *env, int regno)
return reg->type == PTR_TO_FLOW_KEYS;
}
+static bool is_arena_reg(struct bpf_verifier_env *env, int regno)
+{
+ const struct bpf_reg_state *reg = reg_state(env, regno);
+
+ return reg->type == PTR_TO_ARENA;
+}
+
static u32 *reg2btf_ids[__BPF_REG_TYPE_MAX] = {
#ifdef CONFIG_NET
[PTR_TO_SOCKET] = &btf_sock_ids[BTF_SOCK_TYPE_SOCK],
@@ -6694,6 +6701,11 @@ static int check_stack_access_within_bounds(
err = check_stack_slot_within_bounds(env, min_off, state, type);
if (!err && max_off > 0)
err = -EINVAL; /* out of stack access into non-negative offsets */
+ if (!err && access_size < 0)
+ /* access_size should not be negative (or overflow an int); others checks
+ * along the way should have prevented such an access.
+ */
+ err = -EFAULT; /* invalid negative access size; integer overflow? */
if (err) {
if (tnum_is_const(reg->var_off)) {
@@ -7019,7 +7031,8 @@ static int check_atomic(struct bpf_verifier_env *env, int insn_idx, struct bpf_i
if (is_ctx_reg(env, insn->dst_reg) ||
is_pkt_reg(env, insn->dst_reg) ||
is_flow_key_reg(env, insn->dst_reg) ||
- is_sk_reg(env, insn->dst_reg)) {
+ is_sk_reg(env, insn->dst_reg) ||
+ is_arena_reg(env, insn->dst_reg)) {
verbose(env, "BPF_ATOMIC stores into R%d %s is not allowed\n",
insn->dst_reg,
reg_type_str(env, reg_state(env, insn->dst_reg)->type));
@@ -14014,6 +14027,10 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
verbose(env, "addr_space_cast insn can only convert between address space 1 and 0\n");
return -EINVAL;
}
+ if (!env->prog->aux->arena) {
+ verbose(env, "addr_space_cast insn can only be used in a program that has an associated arena\n");
+ return -EINVAL;
+ }
} else {
if ((insn->off != 0 && insn->off != 8 && insn->off != 16 &&
insn->off != 32) || insn->imm) {
@@ -14046,8 +14063,11 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
if (insn->imm) {
/* off == BPF_ADDR_SPACE_CAST */
mark_reg_unknown(env, regs, insn->dst_reg);
- if (insn->imm == 1) /* cast from as(1) to as(0) */
+ if (insn->imm == 1) { /* cast from as(1) to as(0) */
dst_reg->type = PTR_TO_ARENA;
+ /* PTR_TO_ARENA is 32-bit */
+ dst_reg->subreg_def = env->insn_idx + 1;
+ }
} else if (insn->off == 0) {
/* case: R1 = R2
* copy register state to dest reg
@@ -18359,15 +18379,18 @@ static int resolve_pseudo_ldimm64(struct bpf_verifier_env *env)
}
if (!env->prog->jit_requested) {
verbose(env, "JIT is required to use arena\n");
+ fdput(f);
return -EOPNOTSUPP;
}
if (!bpf_jit_supports_arena()) {
verbose(env, "JIT doesn't support arena\n");
+ fdput(f);
return -EOPNOTSUPP;
}
env->prog->aux->arena = (void *)map;
if (!bpf_arena_get_user_vm_start(env->prog->aux->arena)) {
verbose(env, "arena's user address must be set via map_extra or mmap()\n");
+ fdput(f);
return -EINVAL;
}
}
@@ -19601,8 +19624,9 @@ static int do_misc_fixups(struct bpf_verifier_env *env)
(((struct bpf_map *)env->prog->aux->arena)->map_flags & BPF_F_NO_USER_CONV)) {
/* convert to 32-bit mov that clears upper 32-bit */
insn->code = BPF_ALU | BPF_MOV | BPF_X;
- /* clear off, so it's a normal 'wX = wY' from JIT pov */
+ /* clear off and imm, so it's a normal 'wX = wY' from JIT pov */
insn->off = 0;
+ insn->imm = 0;
} /* cast from as(0) to as(1) should be handled by JIT */
goto next_insn;
}
diff --git a/kernel/crash_reserve.c b/kernel/crash_reserve.c
index bbb6c3cb00e4..066668799f75 100644
--- a/kernel/crash_reserve.c
+++ b/kernel/crash_reserve.c
@@ -366,8 +366,10 @@ static int __init reserve_crashkernel_low(unsigned long long low_size)
crashk_low_res.start = low_base;
crashk_low_res.end = low_base + low_size - 1;
+#ifdef HAVE_ARCH_ADD_CRASH_RES_TO_IOMEM_EARLY
insert_resource(&iomem_resource, &crashk_low_res);
#endif
+#endif
return 0;
}
@@ -448,8 +450,12 @@ retry:
crashk_res.start = crash_base;
crashk_res.end = crash_base + crash_size - 1;
+#ifdef HAVE_ARCH_ADD_CRASH_RES_TO_IOMEM_EARLY
+ insert_resource(&iomem_resource, &crashk_res);
+#endif
}
+#ifndef HAVE_ARCH_ADD_CRASH_RES_TO_IOMEM_EARLY
static __init int insert_crashkernel_resources(void)
{
if (crashk_res.start < crashk_res.end)
@@ -462,3 +468,4 @@ static __init int insert_crashkernel_resources(void)
}
early_initcall(insert_crashkernel_resources);
#endif
+#endif
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index ad3eaf2ab959..bf9ae8a8686f 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -1643,8 +1643,13 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
}
if (!((old->flags & new->flags) & IRQF_SHARED) ||
- (oldtype != (new->flags & IRQF_TRIGGER_MASK)) ||
- ((old->flags ^ new->flags) & IRQF_ONESHOT))
+ (oldtype != (new->flags & IRQF_TRIGGER_MASK)))
+ goto mismatch;
+
+ if ((old->flags & IRQF_ONESHOT) &&
+ (new->flags & IRQF_COND_ONESHOT))
+ new->flags |= IRQF_ONESHOT;
+ else if ((old->flags ^ new->flags) & IRQF_ONESHOT)
goto mismatch;
/* All handlers must agree on per-cpuness */
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index 9d9095e81792..65adc815fc6e 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -1567,10 +1567,17 @@ static int check_kprobe_address_safe(struct kprobe *p,
jump_label_lock();
preempt_disable();
- /* Ensure it is not in reserved area nor out of text */
- if (!(core_kernel_text((unsigned long) p->addr) ||
- is_module_text_address((unsigned long) p->addr)) ||
- in_gate_area_no_mm((unsigned long) p->addr) ||
+ /* Ensure the address is in a text area, and find a module if exists. */
+ *probed_mod = NULL;
+ if (!core_kernel_text((unsigned long) p->addr)) {
+ *probed_mod = __module_text_address((unsigned long) p->addr);
+ if (!(*probed_mod)) {
+ ret = -EINVAL;
+ goto out;
+ }
+ }
+ /* Ensure it is not in reserved area. */
+ if (in_gate_area_no_mm((unsigned long) p->addr) ||
within_kprobe_blacklist((unsigned long) p->addr) ||
jump_label_text_reserved(p->addr, p->addr) ||
static_call_text_reserved(p->addr, p->addr) ||
@@ -1580,8 +1587,7 @@ static int check_kprobe_address_safe(struct kprobe *p,
goto out;
}
- /* Check if 'p' is probing a module. */
- *probed_mod = __module_text_address((unsigned long) p->addr);
+ /* Get module refcount and reject __init functions for loaded modules. */
if (*probed_mod) {
/*
* We must hold a refcount of the probed module while updating
diff --git a/kernel/module/Kconfig b/kernel/module/Kconfig
index c3ced519e14b..f3e0329337f6 100644
--- a/kernel/module/Kconfig
+++ b/kernel/module/Kconfig
@@ -236,6 +236,10 @@ choice
possible to load a signed module containing the algorithm to check
the signature on that module.
+config MODULE_SIG_SHA1
+ bool "Sign modules with SHA-1"
+ select CRYPTO_SHA1
+
config MODULE_SIG_SHA256
bool "Sign modules with SHA-256"
select CRYPTO_SHA256
@@ -265,6 +269,7 @@ endchoice
config MODULE_SIG_HASH
string
depends on MODULE_SIG || IMA_APPRAISE_MODSIG
+ default "sha1" if MODULE_SIG_SHA1
default "sha256" if MODULE_SIG_SHA256
default "sha384" if MODULE_SIG_SHA384
default "sha512" if MODULE_SIG_SHA512
diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c
index e3ae93bbcb9b..09f8397bae15 100644
--- a/kernel/power/suspend.c
+++ b/kernel/power/suspend.c
@@ -106,6 +106,12 @@ static void s2idle_enter(void)
swait_event_exclusive(s2idle_wait_head,
s2idle_state == S2IDLE_STATE_WAKE);
+ /*
+ * Kick all CPUs to ensure that they resume their timers and restore
+ * consistent system state.
+ */
+ wake_up_all_idle_cpus();
+
cpus_read_unlock();
raw_spin_lock_irq(&s2idle_lock);
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index ca5146006b94..adf99c05adca 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -2009,6 +2009,12 @@ static int console_trylock_spinning(void)
*/
mutex_acquire(&console_lock_dep_map, 0, 1, _THIS_IP_);
+ /*
+ * Update @console_may_schedule for trylock because the previous
+ * owner may have been schedulable.
+ */
+ console_may_schedule = 0;
+
return 1;
}
diff --git a/kernel/sys.c b/kernel/sys.c
index f8e543f1e38a..8bb106a56b3a 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -2408,8 +2408,11 @@ static inline int prctl_set_mdwe(unsigned long bits, unsigned long arg3,
if (bits & PR_MDWE_NO_INHERIT && !(bits & PR_MDWE_REFUSE_EXEC_GAIN))
return -EINVAL;
- /* PARISC cannot allow mdwe as it needs writable stacks */
- if (IS_ENABLED(CONFIG_PARISC))
+ /*
+ * EOPNOTSUPP might be more appropriate here in principle, but
+ * existing userspace depends on EINVAL specifically.
+ */
+ if (!arch_memory_deny_write_exec_supported())
return -EINVAL;
current_bits = get_current_mdwe();
diff --git a/kernel/time/posix-clock.c b/kernel/time/posix-clock.c
index 9de66bbbb3d1..4782edcbe7b9 100644
--- a/kernel/time/posix-clock.c
+++ b/kernel/time/posix-clock.c
@@ -129,15 +129,17 @@ static int posix_clock_open(struct inode *inode, struct file *fp)
goto out;
}
pccontext->clk = clk;
- fp->private_data = pccontext;
- if (clk->ops.open)
+ if (clk->ops.open) {
err = clk->ops.open(pccontext, fp->f_mode);
- else
- err = 0;
-
- if (!err) {
- get_device(clk->dev);
+ if (err) {
+ kfree(pccontext);
+ goto out;
+ }
}
+
+ fp->private_data = pccontext;
+ get_device(clk->dev);
+ err = 0;
out:
up_read(&clk->rwsem);
return err;
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 269e21590df5..1331216a9cae 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -697,6 +697,7 @@ bool tick_nohz_tick_stopped_cpu(int cpu)
/**
* tick_nohz_update_jiffies - update jiffies when idle was interrupted
+ * @now: current ktime_t
*
* Called from interrupt entry when the CPU was idle
*
@@ -794,7 +795,7 @@ static u64 get_cpu_sleep_time_us(struct tick_sched *ts, ktime_t *sleeptime,
* This time is measured via accounting rather than sampling,
* and is as accurate as ktime_get() is.
*
- * This function returns -1 if NOHZ is not enabled.
+ * Return: -1 if NOHZ is not enabled, else total idle time of the @cpu
*/
u64 get_cpu_idle_time_us(int cpu, u64 *last_update_time)
{
@@ -820,7 +821,7 @@ EXPORT_SYMBOL_GPL(get_cpu_idle_time_us);
* This time is measured via accounting rather than sampling,
* and is as accurate as ktime_get() is.
*
- * This function returns -1 if NOHZ is not enabled.
+ * Return: -1 if NOHZ is not enabled, else total iowait time of @cpu
*/
u64 get_cpu_iowait_time_us(int cpu, u64 *last_update_time)
{
@@ -1287,6 +1288,8 @@ void tick_nohz_irq_exit(void)
/**
* tick_nohz_idle_got_tick - Check whether or not the tick handler has run
+ *
+ * Return: %true if the tick handler has run, otherwise %false
*/
bool tick_nohz_idle_got_tick(void)
{
@@ -1305,6 +1308,8 @@ bool tick_nohz_idle_got_tick(void)
* stopped, it returns the next hrtimer.
*
* Called from power state control code with interrupts disabled
+ *
+ * Return: the next expiration time
*/
ktime_t tick_nohz_get_next_hrtimer(void)
{
@@ -1320,6 +1325,8 @@ ktime_t tick_nohz_get_next_hrtimer(void)
* The return value of this function and/or the value returned by it through the
* @delta_next pointer can be negative which must be taken into account by its
* callers.
+ *
+ * Return: the expected length of the current sleep
*/
ktime_t tick_nohz_get_sleep_length(ktime_t *delta_next)
{
@@ -1357,8 +1364,11 @@ ktime_t tick_nohz_get_sleep_length(ktime_t *delta_next)
/**
* tick_nohz_get_idle_calls_cpu - return the current idle calls counter value
* for a particular CPU.
+ * @cpu: target CPU number
*
* Called from the schedutil frequency scaling governor in scheduler context.
+ *
+ * Return: the current idle calls counter value for @cpu
*/
unsigned long tick_nohz_get_idle_calls_cpu(int cpu)
{
@@ -1371,6 +1381,8 @@ unsigned long tick_nohz_get_idle_calls_cpu(int cpu)
* tick_nohz_get_idle_calls - return the current idle calls counter value
*
* Called from the schedutil frequency scaling governor in scheduler context.
+ *
+ * Return: the current idle calls counter value for the current CPU
*/
unsigned long tick_nohz_get_idle_calls(void)
{
@@ -1559,7 +1571,7 @@ early_param("skew_tick", skew_tick);
/**
* tick_setup_sched_timer - setup the tick emulation timer
- * @mode: tick_nohz_mode to setup for
+ * @hrtimer: whether to use the hrtimer or not
*/
void tick_setup_sched_timer(bool hrtimer)
{
diff --git a/kernel/time/tick-sched.h b/kernel/time/tick-sched.h
index e11c4dc65bcb..b4a7822f495d 100644
--- a/kernel/time/tick-sched.h
+++ b/kernel/time/tick-sched.h
@@ -46,8 +46,8 @@ struct tick_device {
* @next_tick: Next tick to be fired when in dynticks mode.
* @idle_jiffies: jiffies at the entry to idle for idle time accounting
* @idle_waketime: Time when the idle was interrupted
+ * @idle_sleeptime_seq: sequence counter for data consistency
* @idle_entrytime: Time when the idle call was entered
- * @nohz_mode: Mode - one state of tick_nohz_mode
* @last_jiffies: Base jiffies snapshot when next event was last computed
* @timer_expires_base: Base time clock monotonic for @timer_expires
* @timer_expires: Anticipated timer expiration time (in case sched tick is stopped)
diff --git a/kernel/time/timer.c b/kernel/time/timer.c
index dee29f1f5b75..3baf2fbe6848 100644
--- a/kernel/time/timer.c
+++ b/kernel/time/timer.c
@@ -64,15 +64,15 @@ EXPORT_SYMBOL(jiffies_64);
/*
* The timer wheel has LVL_DEPTH array levels. Each level provides an array of
- * LVL_SIZE buckets. Each level is driven by its own clock and therefor each
+ * LVL_SIZE buckets. Each level is driven by its own clock and therefore each
* level has a different granularity.
*
- * The level granularity is: LVL_CLK_DIV ^ lvl
+ * The level granularity is: LVL_CLK_DIV ^ level
* The level clock frequency is: HZ / (LVL_CLK_DIV ^ level)
*
* The array level of a newly armed timer depends on the relative expiry
* time. The farther the expiry time is away the higher the array level and
- * therefor the granularity becomes.
+ * therefore the granularity becomes.
*
* Contrary to the original timer wheel implementation, which aims for 'exact'
* expiry of the timers, this implementation removes the need for recascading
@@ -207,7 +207,7 @@ EXPORT_SYMBOL(jiffies_64);
* struct timer_base - Per CPU timer base (number of base depends on config)
* @lock: Lock protecting the timer_base
* @running_timer: When expiring timers, the lock is dropped. To make
- * sure not to race agains deleting/modifying a
+ * sure not to race against deleting/modifying a
* currently running timer, the pointer is set to the
* timer, which expires at the moment. If no timer is
* running, the pointer is NULL.
@@ -737,7 +737,7 @@ static bool timer_is_static_object(void *addr)
}
/*
- * fixup_init is called when:
+ * timer_fixup_init is called when:
* - an active object is initialized
*/
static bool timer_fixup_init(void *addr, enum debug_obj_state state)
@@ -761,7 +761,7 @@ static void stub_timer(struct timer_list *unused)
}
/*
- * fixup_activate is called when:
+ * timer_fixup_activate is called when:
* - an active object is activated
* - an unknown non-static object is activated
*/
@@ -783,7 +783,7 @@ static bool timer_fixup_activate(void *addr, enum debug_obj_state state)
}
/*
- * fixup_free is called when:
+ * timer_fixup_free is called when:
* - an active object is freed
*/
static bool timer_fixup_free(void *addr, enum debug_obj_state state)
@@ -801,7 +801,7 @@ static bool timer_fixup_free(void *addr, enum debug_obj_state state)
}
/*
- * fixup_assert_init is called when:
+ * timer_fixup_assert_init is called when:
* - an untracked/uninit-ed object is found
*/
static bool timer_fixup_assert_init(void *addr, enum debug_obj_state state)
@@ -914,7 +914,7 @@ static void do_init_timer(struct timer_list *timer,
* @key: lockdep class key of the fake lock used for tracking timer
* sync lock dependencies
*
- * init_timer_key() must be done to a timer prior calling *any* of the
+ * init_timer_key() must be done to a timer prior to calling *any* of the
* other timer functions.
*/
void init_timer_key(struct timer_list *timer,
@@ -1417,7 +1417,7 @@ static int __timer_delete(struct timer_list *timer, bool shutdown)
* If @shutdown is set then the lock has to be taken whether the
* timer is pending or not to protect against a concurrent rearm
* which might hit between the lockless pending check and the lock
- * aquisition. By taking the lock it is ensured that such a newly
+ * acquisition. By taking the lock it is ensured that such a newly
* enqueued timer is dequeued and cannot end up with
* timer->function == NULL in the expiry code.
*
@@ -2306,7 +2306,7 @@ static inline u64 __get_next_timer_interrupt(unsigned long basej, u64 basem,
/*
* When timer base is not set idle, undo the effect of
- * tmigr_cpu_deactivate() to prevent inconsitent states - active
+ * tmigr_cpu_deactivate() to prevent inconsistent states - active
* timer base but inactive timer migration hierarchy.
*
* When timer base was already marked idle, nothing will be
diff --git a/kernel/time/timer_migration.c b/kernel/time/timer_migration.c
index c63a0afdcebe..ccba875d2234 100644
--- a/kernel/time/timer_migration.c
+++ b/kernel/time/timer_migration.c
@@ -751,6 +751,33 @@ bool tmigr_update_events(struct tmigr_group *group, struct tmigr_group *child,
first_childevt = evt = data->evt;
+ /*
+ * Walking the hierarchy is required in any case when a
+ * remote expiry was done before. This ensures to not lose
+ * already queued events in non active groups (see section
+ * "Required event and timerqueue update after a remote
+ * expiry" in the documentation at the top).
+ *
+ * The two call sites which are executed without a remote expiry
+ * before, are not prevented from propagating changes through
+ * the hierarchy by the return:
+ * - When entering this path by tmigr_new_timer(), @evt->ignore
+ * is never set.
+ * - tmigr_inactive_up() takes care of the propagation by
+ * itself and ignores the return value. But an immediate
+ * return is possible if there is a parent, sparing group
+ * locking at this level, because the upper walking call to
+ * the parent will take care about removing this event from
+ * within the group and update next_expiry accordingly.
+ *
+ * However if there is no parent, ie: the hierarchy has only a
+ * single level so @group is the top level group, make sure the
+ * first event information of the group is updated properly and
+ * also handled properly, so skip this fast return path.
+ */
+ if (evt->ignore && !remote && group->parent)
+ return true;
+
raw_spin_lock(&group->lock);
childstate.state = 0;
@@ -762,8 +789,11 @@ bool tmigr_update_events(struct tmigr_group *group, struct tmigr_group *child,
* queue when the expiry time changed only or when it could be ignored.
*/
if (timerqueue_node_queued(&evt->nextevt)) {
- if ((evt->nextevt.expires == nextexp) && !evt->ignore)
+ if ((evt->nextevt.expires == nextexp) && !evt->ignore) {
+ /* Make sure not to miss a new CPU event with the same expiry */
+ evt->cpu = first_childevt->cpu;
goto check_toplvl;
+ }
if (!timerqueue_del(&group->events, &evt->nextevt))
WRITE_ONCE(group->next_expiry, KTIME_MAX);
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index 0a5c4efc73c3..9dc605f08a23 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -2728,7 +2728,7 @@ static int bpf_kprobe_multi_link_fill_link_info(const struct bpf_link *link,
static const struct bpf_link_ops bpf_kprobe_multi_link_lops = {
.release = bpf_kprobe_multi_link_release,
- .dealloc = bpf_kprobe_multi_link_dealloc,
+ .dealloc_deferred = bpf_kprobe_multi_link_dealloc,
.fill_link_info = bpf_kprobe_multi_link_fill_link_info,
};
@@ -3157,6 +3157,9 @@ static void bpf_uprobe_multi_link_release(struct bpf_link *link)
umulti_link = container_of(link, struct bpf_uprobe_multi_link, link);
bpf_uprobe_unregister(&umulti_link->path, umulti_link->uprobes, umulti_link->cnt);
+ if (umulti_link->task)
+ put_task_struct(umulti_link->task);
+ path_put(&umulti_link->path);
}
static void bpf_uprobe_multi_link_dealloc(struct bpf_link *link)
@@ -3164,9 +3167,6 @@ static void bpf_uprobe_multi_link_dealloc(struct bpf_link *link)
struct bpf_uprobe_multi_link *umulti_link;
umulti_link = container_of(link, struct bpf_uprobe_multi_link, link);
- if (umulti_link->task)
- put_task_struct(umulti_link->task);
- path_put(&umulti_link->path);
kvfree(umulti_link->uprobes);
kfree(umulti_link);
}
@@ -3242,7 +3242,7 @@ static int bpf_uprobe_multi_link_fill_link_info(const struct bpf_link *link,
static const struct bpf_link_ops bpf_uprobe_multi_link_lops = {
.release = bpf_uprobe_multi_link_release,
- .dealloc = bpf_uprobe_multi_link_dealloc,
+ .dealloc_deferred = bpf_uprobe_multi_link_dealloc,
.fill_link_info = bpf_uprobe_multi_link_fill_link_info,
};
diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c
index 217169de0920..dfe3ee6035ec 100644
--- a/kernel/trace/trace_probe.c
+++ b/kernel/trace/trace_probe.c
@@ -839,7 +839,7 @@ out:
void store_trace_entry_data(void *edata, struct trace_probe *tp, struct pt_regs *regs)
{
struct probe_entry_arg *earg = tp->entry_arg;
- unsigned long val;
+ unsigned long val = 0;
int i;
if (!earg)