Diffstat (limited to 'kernel')
150 files changed, 7849 insertions, 2996 deletions
diff --git a/kernel/Makefile b/kernel/Makefile index b69c95315480..3947122d618b 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -10,7 +10,7 @@ obj-y = fork.o exec_domain.o panic.o \ extable.o params.o \ kthread.o sys_ni.o nsproxy.o \ notifier.o ksysfs.o cred.o reboot.o \ - async.o range.o smpboot.o ucount.o regset.o + async.o range.o smpboot.o ucount.o regset.o ksyms_common.o obj-$(CONFIG_USERMODE_DRIVER) += usermode_driver.o obj-$(CONFIG_MULTIUSER) += groups.o @@ -91,7 +91,8 @@ obj-$(CONFIG_FAIL_FUNCTION) += fail_function.o obj-$(CONFIG_KGDB) += debug/ obj-$(CONFIG_DETECT_HUNG_TASK) += hung_task.o obj-$(CONFIG_LOCKUP_DETECTOR) += watchdog.o -obj-$(CONFIG_HARDLOCKUP_DETECTOR_PERF) += watchdog_hld.o +obj-$(CONFIG_HARDLOCKUP_DETECTOR_BUDDY) += watchdog_buddy.o +obj-$(CONFIG_HARDLOCKUP_DETECTOR_PERF) += watchdog_perf.o obj-$(CONFIG_SECCOMP) += seccomp.o obj-$(CONFIG_RELAY) += relay.o obj-$(CONFIG_SYSCTL) += utsname_sysctl.o diff --git a/kernel/audit.h b/kernel/audit.h index c57b008b9914..94738bce40b2 100644 --- a/kernel/audit.h +++ b/kernel/audit.h @@ -259,8 +259,8 @@ extern struct tty_struct *audit_get_tty(void); extern void audit_put_tty(struct tty_struct *tty); /* audit watch/mark/tree functions */ -#ifdef CONFIG_AUDITSYSCALL extern unsigned int audit_serial(void); +#ifdef CONFIG_AUDITSYSCALL extern int auditsc_get_stamp(struct audit_context *ctx, struct timespec64 *t, unsigned int *serial); diff --git a/kernel/bpf/bloom_filter.c b/kernel/bpf/bloom_filter.c index 540331b610a9..addf3dd57b59 100644 --- a/kernel/bpf/bloom_filter.c +++ b/kernel/bpf/bloom_filter.c @@ -86,9 +86,6 @@ static struct bpf_map *bloom_map_alloc(union bpf_attr *attr) int numa_node = bpf_map_attr_numa_node(attr); struct bpf_bloom_filter *bloom; - if (!bpf_capable()) - return ERR_PTR(-EPERM); - if (attr->key_size != 0 || attr->value_size == 0 || attr->max_entries == 0 || attr->map_flags & ~BLOOM_CREATE_FLAG_MASK || diff --git a/kernel/bpf/bpf_local_storage.c b/kernel/bpf/bpf_local_storage.c index 47d9948d768f..b5149cfce7d4 100644 --- a/kernel/bpf/bpf_local_storage.c +++ b/kernel/bpf/bpf_local_storage.c @@ -723,9 +723,6 @@ int bpf_local_storage_map_alloc_check(union bpf_attr *attr) !attr->btf_key_type_id || !attr->btf_value_type_id) return -EINVAL; - if (!bpf_capable()) - return -EPERM; - if (attr->value_size > BPF_LOCAL_STORAGE_MAX_VALUE_SIZE) return -E2BIG; diff --git a/kernel/bpf/bpf_lru_list.c b/kernel/bpf/bpf_lru_list.c index d99e89f113c4..3dabdd137d10 100644 --- a/kernel/bpf/bpf_lru_list.c +++ b/kernel/bpf/bpf_lru_list.c @@ -41,7 +41,12 @@ static struct list_head *local_pending_list(struct bpf_lru_locallist *loc_l) /* bpf_lru_node helpers */ static bool bpf_lru_node_is_ref(const struct bpf_lru_node *node) { - return node->ref; + return READ_ONCE(node->ref); +} + +static void bpf_lru_node_clear_ref(struct bpf_lru_node *node) +{ + WRITE_ONCE(node->ref, 0); } static void bpf_lru_list_count_inc(struct bpf_lru_list *l, @@ -89,7 +94,7 @@ static void __bpf_lru_node_move_in(struct bpf_lru_list *l, bpf_lru_list_count_inc(l, tgt_type); node->type = tgt_type; - node->ref = 0; + bpf_lru_node_clear_ref(node); list_move(&node->list, &l->lists[tgt_type]); } @@ -110,7 +115,7 @@ static void __bpf_lru_node_move(struct bpf_lru_list *l, bpf_lru_list_count_inc(l, tgt_type); node->type = tgt_type; } - node->ref = 0; + bpf_lru_node_clear_ref(node); /* If the moving node is the next_inactive_rotation candidate, * move the next_inactive_rotation pointer also. 
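
The bpf_lru_list.c hunks above (together with the bpf_lru_list.h change further down) stop doing plain loads and stores of node->ref and go through READ_ONCE()/WRITE_ONCE(), adding a bpf_lru_node_clear_ref() helper for the store side: the reference bit is set from lookup paths and cleared by the shrinker without a shared lock, so the accesses are intentionally racy and are now marked as such. A small userspace analogue of that pattern, using C11 relaxed atomics in place of the kernel macros (struct and function names here are illustrative, not taken from the patch):

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

/* Illustrative stand-in for struct bpf_lru_node and its ref bit. */
struct lru_node {
	_Atomic int ref;	/* approximate "recently used" flag */
};

/* Lookup path: mark the node referenced. The race with the clearing
 * side is tolerated by design; marking the access (KCSAN-style) just
 * makes that explicit without adding ordering. */
static void lru_node_set_ref(struct lru_node *node)
{
	if (!atomic_load_explicit(&node->ref, memory_order_relaxed))
		atomic_store_explicit(&node->ref, 1, memory_order_relaxed);
}

/* Rotation/shrink path: consume and clear the reference bit. */
static bool lru_node_test_and_clear_ref(struct lru_node *node)
{
	bool was_ref = atomic_load_explicit(&node->ref, memory_order_relaxed);

	atomic_store_explicit(&node->ref, 0, memory_order_relaxed);
	return was_ref;
}

int main(void)
{
	struct lru_node n = { .ref = 0 };

	lru_node_set_ref(&n);
	printf("referenced: %d\n", lru_node_test_and_clear_ref(&n));
	return 0;
}

As in the kernel code, ref stays an approximation of access frequency; the marked accesses document the benign race rather than serialize it.
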
@@ -353,7 +358,7 @@ static void __local_list_add_pending(struct bpf_lru *lru, *(u32 *)((void *)node + lru->hash_offset) = hash; node->cpu = cpu; node->type = BPF_LRU_LOCAL_LIST_T_PENDING; - node->ref = 0; + bpf_lru_node_clear_ref(node); list_add(&node->list, local_pending_list(loc_l)); } @@ -419,7 +424,7 @@ static struct bpf_lru_node *bpf_percpu_lru_pop_free(struct bpf_lru *lru, if (!list_empty(free_list)) { node = list_first_entry(free_list, struct bpf_lru_node, list); *(u32 *)((void *)node + lru->hash_offset) = hash; - node->ref = 0; + bpf_lru_node_clear_ref(node); __bpf_lru_node_move(l, node, BPF_LRU_LIST_T_INACTIVE); } @@ -522,7 +527,7 @@ static void bpf_common_lru_push_free(struct bpf_lru *lru, } node->type = BPF_LRU_LOCAL_LIST_T_FREE; - node->ref = 0; + bpf_lru_node_clear_ref(node); list_move(&node->list, local_free_list(loc_l)); raw_spin_unlock_irqrestore(&loc_l->lock, flags); @@ -568,7 +573,7 @@ static void bpf_common_lru_populate(struct bpf_lru *lru, void *buf, node = (struct bpf_lru_node *)(buf + node_offset); node->type = BPF_LRU_LIST_T_FREE; - node->ref = 0; + bpf_lru_node_clear_ref(node); list_add(&node->list, &l->lists[BPF_LRU_LIST_T_FREE]); buf += elem_size; } @@ -594,7 +599,7 @@ again: node = (struct bpf_lru_node *)(buf + node_offset); node->cpu = cpu; node->type = BPF_LRU_LIST_T_FREE; - node->ref = 0; + bpf_lru_node_clear_ref(node); list_add(&node->list, &l->lists[BPF_LRU_LIST_T_FREE]); i++; buf += elem_size; diff --git a/kernel/bpf/bpf_lru_list.h b/kernel/bpf/bpf_lru_list.h index 4ea227c9c1ad..8f3c8b2b4490 100644 --- a/kernel/bpf/bpf_lru_list.h +++ b/kernel/bpf/bpf_lru_list.h @@ -64,11 +64,8 @@ struct bpf_lru { static inline void bpf_lru_node_set_ref(struct bpf_lru_node *node) { - /* ref is an approximation on access frequency. It does not - * have to be very accurate. Hence, no protection is used. 
- */ - if (!node->ref) - node->ref = 1; + if (!READ_ONCE(node->ref)) + WRITE_ONCE(node->ref, 1); } int bpf_lru_init(struct bpf_lru *lru, bool percpu, u32 hash_offset, diff --git a/kernel/bpf/bpf_struct_ops.c b/kernel/bpf/bpf_struct_ops.c index d3f0a4825fa6..116a0ce378ec 100644 --- a/kernel/bpf/bpf_struct_ops.c +++ b/kernel/bpf/bpf_struct_ops.c @@ -655,9 +655,6 @@ static struct bpf_map *bpf_struct_ops_map_alloc(union bpf_attr *attr) const struct btf_type *t, *vt; struct bpf_map *map; - if (!bpf_capable()) - return ERR_PTR(-EPERM); - st_ops = bpf_struct_ops_find_value(attr->btf_vmlinux_value_type_id); if (!st_ops) return ERR_PTR(-ENOTSUPP); diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 6b682b8e4b50..29fe21099298 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -222,10 +222,17 @@ enum btf_kfunc_hook { enum { BTF_KFUNC_SET_MAX_CNT = 256, BTF_DTOR_KFUNC_MAX_CNT = 256, + BTF_KFUNC_FILTER_MAX_CNT = 16, +}; + +struct btf_kfunc_hook_filter { + btf_kfunc_filter_t filters[BTF_KFUNC_FILTER_MAX_CNT]; + u32 nr_filters; }; struct btf_kfunc_set_tab { struct btf_id_set8 *sets[BTF_KFUNC_HOOK_MAX]; + struct btf_kfunc_hook_filter hook_filters[BTF_KFUNC_HOOK_MAX]; }; struct btf_id_dtor_kfunc_tab { @@ -485,25 +492,26 @@ static bool btf_type_is_fwd(const struct btf_type *t) return BTF_INFO_KIND(t->info) == BTF_KIND_FWD; } -static bool btf_type_nosize(const struct btf_type *t) +static bool btf_type_is_datasec(const struct btf_type *t) { - return btf_type_is_void(t) || btf_type_is_fwd(t) || - btf_type_is_func(t) || btf_type_is_func_proto(t); + return BTF_INFO_KIND(t->info) == BTF_KIND_DATASEC; } -static bool btf_type_nosize_or_null(const struct btf_type *t) +static bool btf_type_is_decl_tag(const struct btf_type *t) { - return !t || btf_type_nosize(t); + return BTF_INFO_KIND(t->info) == BTF_KIND_DECL_TAG; } -static bool btf_type_is_datasec(const struct btf_type *t) +static bool btf_type_nosize(const struct btf_type *t) { - return BTF_INFO_KIND(t->info) == BTF_KIND_DATASEC; + return btf_type_is_void(t) || btf_type_is_fwd(t) || + btf_type_is_func(t) || btf_type_is_func_proto(t) || + btf_type_is_decl_tag(t); } -static bool btf_type_is_decl_tag(const struct btf_type *t) +static bool btf_type_nosize_or_null(const struct btf_type *t) { - return BTF_INFO_KIND(t->info) == BTF_KIND_DECL_TAG; + return !t || btf_type_nosize(t); } static bool btf_type_is_decl_tag_target(const struct btf_type *t) @@ -744,13 +752,12 @@ static bool btf_name_offset_valid(const struct btf *btf, u32 offset) return offset < btf->hdr.str_len; } -static bool __btf_name_char_ok(char c, bool first, bool dot_ok) +static bool __btf_name_char_ok(char c, bool first) { if ((first ? !isalpha(c) : !isalnum(c)) && c != '_' && - ((c == '.' 
&& !dot_ok) || - c != '.')) + c != '.') return false; return true; } @@ -767,20 +774,20 @@ static const char *btf_str_by_offset(const struct btf *btf, u32 offset) return NULL; } -static bool __btf_name_valid(const struct btf *btf, u32 offset, bool dot_ok) +static bool __btf_name_valid(const struct btf *btf, u32 offset) { /* offset must be valid */ const char *src = btf_str_by_offset(btf, offset); const char *src_limit; - if (!__btf_name_char_ok(*src, true, dot_ok)) + if (!__btf_name_char_ok(*src, true)) return false; /* set a limit on identifier length */ src_limit = src + KSYM_NAME_LEN; src++; while (*src && src < src_limit) { - if (!__btf_name_char_ok(*src, false, dot_ok)) + if (!__btf_name_char_ok(*src, false)) return false; src++; } @@ -788,17 +795,14 @@ static bool __btf_name_valid(const struct btf *btf, u32 offset, bool dot_ok) return !*src; } -/* Only C-style identifier is permitted. This can be relaxed if - * necessary. - */ static bool btf_name_valid_identifier(const struct btf *btf, u32 offset) { - return __btf_name_valid(btf, offset, false); + return __btf_name_valid(btf, offset); } static bool btf_name_valid_section(const struct btf *btf, u32 offset) { - return __btf_name_valid(btf, offset, true); + return __btf_name_valid(btf, offset); } static const char *__btf_name_by_offset(const struct btf *btf, u32 offset) @@ -4422,7 +4426,7 @@ static s32 btf_var_check_meta(struct btf_verifier_env *env, } if (!t->name_off || - !__btf_name_valid(env->btf, t->name_off, true)) { + !__btf_name_valid(env->btf, t->name_off)) { btf_verifier_log_type(env, t, "Invalid name"); return -EINVAL; } @@ -7669,9 +7673,12 @@ static int btf_check_kfunc_protos(struct btf *btf, u32 func_id, u32 func_flags) /* Kernel Function (kfunc) BTF ID set registration API */ static int btf_populate_kfunc_set(struct btf *btf, enum btf_kfunc_hook hook, - struct btf_id_set8 *add_set) + const struct btf_kfunc_id_set *kset) { + struct btf_kfunc_hook_filter *hook_filter; + struct btf_id_set8 *add_set = kset->set; bool vmlinux_set = !btf_is_module(btf); + bool add_filter = !!kset->filter; struct btf_kfunc_set_tab *tab; struct btf_id_set8 *set; u32 set_cnt; @@ -7686,6 +7693,24 @@ static int btf_populate_kfunc_set(struct btf *btf, enum btf_kfunc_hook hook, return 0; tab = btf->kfunc_set_tab; + + if (tab && add_filter) { + u32 i; + + hook_filter = &tab->hook_filters[hook]; + for (i = 0; i < hook_filter->nr_filters; i++) { + if (hook_filter->filters[i] == kset->filter) { + add_filter = false; + break; + } + } + + if (add_filter && hook_filter->nr_filters == BTF_KFUNC_FILTER_MAX_CNT) { + ret = -E2BIG; + goto end; + } + } + if (!tab) { tab = kzalloc(sizeof(*tab), GFP_KERNEL | __GFP_NOWARN); if (!tab) @@ -7708,7 +7733,7 @@ static int btf_populate_kfunc_set(struct btf *btf, enum btf_kfunc_hook hook, */ if (!vmlinux_set) { tab->sets[hook] = add_set; - return 0; + goto do_add_filter; } /* In case of vmlinux sets, there may be more than one set being @@ -7750,6 +7775,11 @@ static int btf_populate_kfunc_set(struct btf *btf, enum btf_kfunc_hook hook, sort(set->pairs, set->cnt, sizeof(set->pairs[0]), btf_id_cmp_func, NULL); +do_add_filter: + if (add_filter) { + hook_filter = &tab->hook_filters[hook]; + hook_filter->filters[hook_filter->nr_filters++] = kset->filter; + } return 0; end: btf_free_kfunc_set_tab(btf); @@ -7758,15 +7788,22 @@ end: static u32 *__btf_kfunc_id_set_contains(const struct btf *btf, enum btf_kfunc_hook hook, - u32 kfunc_btf_id) + u32 kfunc_btf_id, + const struct bpf_prog *prog) { + struct btf_kfunc_hook_filter 
*hook_filter; struct btf_id_set8 *set; - u32 *id; + u32 *id, i; if (hook >= BTF_KFUNC_HOOK_MAX) return NULL; if (!btf->kfunc_set_tab) return NULL; + hook_filter = &btf->kfunc_set_tab->hook_filters[hook]; + for (i = 0; i < hook_filter->nr_filters; i++) { + if (hook_filter->filters[i](prog, kfunc_btf_id)) + return NULL; + } set = btf->kfunc_set_tab->sets[hook]; if (!set) return NULL; @@ -7821,23 +7858,25 @@ static int bpf_prog_type_to_kfunc_hook(enum bpf_prog_type prog_type) * protection for looking up a well-formed btf->kfunc_set_tab. */ u32 *btf_kfunc_id_set_contains(const struct btf *btf, - enum bpf_prog_type prog_type, - u32 kfunc_btf_id) + u32 kfunc_btf_id, + const struct bpf_prog *prog) { + enum bpf_prog_type prog_type = resolve_prog_type(prog); enum btf_kfunc_hook hook; u32 *kfunc_flags; - kfunc_flags = __btf_kfunc_id_set_contains(btf, BTF_KFUNC_HOOK_COMMON, kfunc_btf_id); + kfunc_flags = __btf_kfunc_id_set_contains(btf, BTF_KFUNC_HOOK_COMMON, kfunc_btf_id, prog); if (kfunc_flags) return kfunc_flags; hook = bpf_prog_type_to_kfunc_hook(prog_type); - return __btf_kfunc_id_set_contains(btf, hook, kfunc_btf_id); + return __btf_kfunc_id_set_contains(btf, hook, kfunc_btf_id, prog); } -u32 *btf_kfunc_is_modify_return(const struct btf *btf, u32 kfunc_btf_id) +u32 *btf_kfunc_is_modify_return(const struct btf *btf, u32 kfunc_btf_id, + const struct bpf_prog *prog) { - return __btf_kfunc_id_set_contains(btf, BTF_KFUNC_HOOK_FMODRET, kfunc_btf_id); + return __btf_kfunc_id_set_contains(btf, BTF_KFUNC_HOOK_FMODRET, kfunc_btf_id, prog); } static int __register_btf_kfunc_id_set(enum btf_kfunc_hook hook, @@ -7868,7 +7907,8 @@ static int __register_btf_kfunc_id_set(enum btf_kfunc_hook hook, goto err_out; } - ret = btf_populate_kfunc_set(btf, hook, kset->set); + ret = btf_populate_kfunc_set(btf, hook, kset); + err_out: btf_put(btf); return ret; diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index 517b6a5928cc..5b2741aa0d9b 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -1826,6 +1826,12 @@ int __cgroup_bpf_run_filter_setsockopt(struct sock *sk, int *level, ret = 1; } else if (ctx.optlen > max_optlen || ctx.optlen < -1) { /* optlen is out of bounds */ + if (*optlen > PAGE_SIZE && ctx.optlen >= 0) { + pr_info_once("bpf setsockopt: ignoring program buffer with optlen=%d (max_optlen=%d)\n", + ctx.optlen, max_optlen); + ret = 0; + goto out; + } ret = -EFAULT; } else { /* optlen within bounds, run kernel handler */ @@ -1881,8 +1887,10 @@ int __cgroup_bpf_run_filter_getsockopt(struct sock *sk, int level, .optname = optname, .current_task = current, }; + int orig_optlen; int ret; + orig_optlen = max_optlen; ctx.optlen = max_optlen; max_optlen = sockopt_alloc_buf(&ctx, max_optlen, &buf); if (max_optlen < 0) @@ -1905,6 +1913,7 @@ int __cgroup_bpf_run_filter_getsockopt(struct sock *sk, int level, ret = -EFAULT; goto out; } + orig_optlen = ctx.optlen; if (copy_from_user(ctx.optval, optval, min(ctx.optlen, max_optlen)) != 0) { @@ -1922,6 +1931,12 @@ int __cgroup_bpf_run_filter_getsockopt(struct sock *sk, int level, goto out; if (optval && (ctx.optlen > max_optlen || ctx.optlen < 0)) { + if (orig_optlen > PAGE_SIZE && ctx.optlen >= 0) { + pr_info_once("bpf getsockopt: ignoring program buffer with optlen=%d (max_optlen=%d)\n", + ctx.optlen, max_optlen); + ret = retval; + goto out; + } ret = -EFAULT; goto out; } diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 7421487422d4..dc85240a0134 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -2064,14 +2064,16 @@ EVAL4(PROG_NAME_LIST, 416, 
448, 480, 512) }; #undef PROG_NAME_LIST #define PROG_NAME_LIST(stack_size) PROG_NAME_ARGS(stack_size), -static u64 (*interpreters_args[])(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5, - const struct bpf_insn *insn) = { +static __maybe_unused +u64 (*interpreters_args[])(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5, + const struct bpf_insn *insn) = { EVAL6(PROG_NAME_LIST, 32, 64, 96, 128, 160, 192) EVAL6(PROG_NAME_LIST, 224, 256, 288, 320, 352, 384) EVAL4(PROG_NAME_LIST, 416, 448, 480, 512) }; #undef PROG_NAME_LIST +#ifdef CONFIG_BPF_SYSCALL void bpf_patch_call_args(struct bpf_insn *insn, u32 stack_depth) { stack_depth = max_t(u32, stack_depth, 1); @@ -2080,7 +2082,7 @@ void bpf_patch_call_args(struct bpf_insn *insn, u32 stack_depth) __bpf_call_base_args; insn->code = BPF_JMP | BPF_CALL_ARGS; } - +#endif #else static unsigned int __bpf_prog_ret0_warn(const void *ctx, const struct bpf_insn *insn) diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c index 8ec18faa74ac..8a33e8747a0e 100644 --- a/kernel/bpf/cpumap.c +++ b/kernel/bpf/cpumap.c @@ -28,7 +28,6 @@ #include <linux/sched.h> #include <linux/workqueue.h> #include <linux/kthread.h> -#include <linux/capability.h> #include <trace/events/xdp.h> #include <linux/btf_ids.h> @@ -89,9 +88,6 @@ static struct bpf_map *cpu_map_alloc(union bpf_attr *attr) u32 value_size = attr->value_size; struct bpf_cpu_map *cmap; - if (!bpf_capable()) - return ERR_PTR(-EPERM); - /* check sanity of attributes */ if (attr->max_entries == 0 || attr->key_size != 4 || (value_size != offsetofend(struct bpf_cpumap_val, qsize) && diff --git a/kernel/bpf/cpumask.c b/kernel/bpf/cpumask.c index 7efdf5d770ca..938a60ff4295 100644 --- a/kernel/bpf/cpumask.c +++ b/kernel/bpf/cpumask.c @@ -132,6 +132,21 @@ __bpf_kfunc u32 bpf_cpumask_first_zero(const struct cpumask *cpumask) } /** + * bpf_cpumask_first_and() - Return the index of the first nonzero bit from the + * AND of two cpumasks. + * @src1: The first cpumask. + * @src2: The second cpumask. + * + * Find the index of the first nonzero bit of the AND of two cpumasks. + * struct bpf_cpumask pointers may be safely passed to @src1 and @src2. + */ +__bpf_kfunc u32 bpf_cpumask_first_and(const struct cpumask *src1, + const struct cpumask *src2) +{ + return cpumask_first_and(src1, src2); +} + +/** * bpf_cpumask_set_cpu() - Set a bit for a CPU in a BPF cpumask. * @cpu: The CPU to be set in the cpumask. * @cpumask: The BPF cpumask in which a bit is being set. @@ -367,7 +382,7 @@ __bpf_kfunc void bpf_cpumask_copy(struct bpf_cpumask *dst, const struct cpumask } /** - * bpf_cpumask_any() - Return a random set CPU from a cpumask. + * bpf_cpumask_any_distribute() - Return a random set CPU from a cpumask. * @cpumask: The cpumask being queried. * * Return: @@ -376,26 +391,28 @@ __bpf_kfunc void bpf_cpumask_copy(struct bpf_cpumask *dst, const struct cpumask * * A struct bpf_cpumask pointer may be safely passed to @src. */ -__bpf_kfunc u32 bpf_cpumask_any(const struct cpumask *cpumask) +__bpf_kfunc u32 bpf_cpumask_any_distribute(const struct cpumask *cpumask) { - return cpumask_any(cpumask); + return cpumask_any_distribute(cpumask); } /** - * bpf_cpumask_any_and() - Return a random set CPU from the AND of two - * cpumasks. + * bpf_cpumask_any_and_distribute() - Return a random set CPU from the AND of + * two cpumasks. * @src1: The first cpumask. * @src2: The second cpumask. * * Return: - * * A random set bit within [0, num_cpus) if at least one bit is set. 
+ * * A random set bit within [0, num_cpus) from the AND of two cpumasks, if at + * least one bit is set. * * >= num_cpus if no bit is set. * * struct bpf_cpumask pointers may be safely passed to @src1 and @src2. */ -__bpf_kfunc u32 bpf_cpumask_any_and(const struct cpumask *src1, const struct cpumask *src2) +__bpf_kfunc u32 bpf_cpumask_any_and_distribute(const struct cpumask *src1, + const struct cpumask *src2) { - return cpumask_any_and(src1, src2); + return cpumask_any_and_distribute(src1, src2); } __diag_pop(); @@ -406,6 +423,7 @@ BTF_ID_FLAGS(func, bpf_cpumask_release, KF_RELEASE) BTF_ID_FLAGS(func, bpf_cpumask_acquire, KF_ACQUIRE | KF_TRUSTED_ARGS) BTF_ID_FLAGS(func, bpf_cpumask_first, KF_RCU) BTF_ID_FLAGS(func, bpf_cpumask_first_zero, KF_RCU) +BTF_ID_FLAGS(func, bpf_cpumask_first_and, KF_RCU) BTF_ID_FLAGS(func, bpf_cpumask_set_cpu, KF_RCU) BTF_ID_FLAGS(func, bpf_cpumask_clear_cpu, KF_RCU) BTF_ID_FLAGS(func, bpf_cpumask_test_cpu, KF_RCU) @@ -422,8 +440,8 @@ BTF_ID_FLAGS(func, bpf_cpumask_subset, KF_RCU) BTF_ID_FLAGS(func, bpf_cpumask_empty, KF_RCU) BTF_ID_FLAGS(func, bpf_cpumask_full, KF_RCU) BTF_ID_FLAGS(func, bpf_cpumask_copy, KF_RCU) -BTF_ID_FLAGS(func, bpf_cpumask_any, KF_RCU) -BTF_ID_FLAGS(func, bpf_cpumask_any_and, KF_RCU) +BTF_ID_FLAGS(func, bpf_cpumask_any_distribute, KF_RCU) +BTF_ID_FLAGS(func, bpf_cpumask_any_and_distribute, KF_RCU) BTF_SET8_END(cpumask_kfunc_btf_ids) static const struct btf_kfunc_id_set cpumask_kfunc_set = { diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c index 802692fa3905..49cc0b5671c6 100644 --- a/kernel/bpf/devmap.c +++ b/kernel/bpf/devmap.c @@ -160,9 +160,6 @@ static struct bpf_map *dev_map_alloc(union bpf_attr *attr) struct bpf_dtab *dtab; int err; - if (!capable(CAP_NET_ADMIN)) - return ERR_PTR(-EPERM); - dtab = bpf_map_area_alloc(sizeof(*dtab), NUMA_NO_NODE); if (!dtab) return ERR_PTR(-ENOMEM); diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index 00c253b84bf5..56d3da7d0bc6 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -422,12 +422,6 @@ static int htab_map_alloc_check(union bpf_attr *attr) BUILD_BUG_ON(offsetof(struct htab_elem, fnode.next) != offsetof(struct htab_elem, hash_node.pprev)); - if (lru && !bpf_capable()) - /* LRU implementation is much complicated than other - * maps. Hence, limit to CAP_BPF. - */ - return -EPERM; - if (zero_seed && !capable(CAP_SYS_ADMIN)) /* Guard against local DoS, and discourage production use. 
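
The cpumask kfuncs added above (bpf_cpumask_first_and() plus the *_distribute() replacements for bpf_cpumask_any()/bpf_cpumask_any_and()) are callable from BPF programs in the usual way. A rough sketch of a tracing program that builds two masks and takes the first CPU in their intersection; the SEC() hook, the extern kfunc declarations and the surrounding boilerplate follow common selftest conventions and are illustrative, not part of this diff:

#include "vmlinux.h"
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>

/* Hand-written kfunc declarations; bpf_cpumask_first_and() is the one
 * introduced above, the others are pre-existing cpumask kfuncs. */
struct bpf_cpumask *bpf_cpumask_create(void) __ksym;
void bpf_cpumask_release(struct bpf_cpumask *cpumask) __ksym;
void bpf_cpumask_set_cpu(u32 cpu, struct bpf_cpumask *cpumask) __ksym;
u32 bpf_cpumask_first_and(const struct cpumask *src1,
			  const struct cpumask *src2) __ksym;

SEC("tp_btf/task_newtask")
int BPF_PROG(pick_cpu, struct task_struct *task, u64 clone_flags)
{
	struct bpf_cpumask *a = bpf_cpumask_create();
	struct bpf_cpumask *b = bpf_cpumask_create();
	u32 cpu;

	if (!a || !b)
		goto out;

	bpf_cpumask_set_cpu(0, a);
	bpf_cpumask_set_cpu(2, a);
	bpf_cpumask_set_cpu(2, b);
	bpf_cpumask_set_cpu(3, b);

	/* First bit set in both masks; here that is CPU 2. */
	cpu = bpf_cpumask_first_and((const struct cpumask *)a,
				    (const struct cpumask *)b);
	bpf_printk("first common cpu: %u", cpu);

out:
	if (a)
		bpf_cpumask_release(a);
	if (b)
		bpf_cpumask_release(b);
	return 0;
}

char _license[] SEC("license") = "GPL";
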
*/ return -EPERM; @@ -1215,7 +1209,7 @@ static long htab_lru_map_update_elem(struct bpf_map *map, void *key, void *value ret = htab_lock_bucket(htab, b, hash, &flags); if (ret) - return ret; + goto err_lock_bucket; l_old = lookup_elem_raw(head, hash, key, key_size); @@ -1236,6 +1230,7 @@ static long htab_lru_map_update_elem(struct bpf_map *map, void *key, void *value err: htab_unlock_bucket(htab, b, hash, flags); +err_lock_bucket: if (ret) htab_lru_push_free(htab, l_new); else if (l_old) @@ -1338,7 +1333,7 @@ static long __htab_lru_percpu_map_update_elem(struct bpf_map *map, void *key, ret = htab_lock_bucket(htab, b, hash, &flags); if (ret) - return ret; + goto err_lock_bucket; l_old = lookup_elem_raw(head, hash, key, key_size); @@ -1361,6 +1356,7 @@ static long __htab_lru_percpu_map_update_elem(struct bpf_map *map, void *key, ret = 0; err: htab_unlock_bucket(htab, b, hash, flags); +err_lock_bucket: if (l_new) bpf_lru_push_free(&htab->lru, &l_new->lru_node); return ret; diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 8d368fa353f9..9e80efa59a5d 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -1423,7 +1423,7 @@ static const struct bpf_func_proto bpf_kptr_xchg_proto = { #define DYNPTR_SIZE_MASK 0xFFFFFF #define DYNPTR_RDONLY_BIT BIT(31) -static bool bpf_dynptr_is_rdonly(const struct bpf_dynptr_kern *ptr) +static bool __bpf_dynptr_is_rdonly(const struct bpf_dynptr_kern *ptr) { return ptr->size & DYNPTR_RDONLY_BIT; } @@ -1443,11 +1443,18 @@ static enum bpf_dynptr_type bpf_dynptr_get_type(const struct bpf_dynptr_kern *pt return (ptr->size & ~(DYNPTR_RDONLY_BIT)) >> DYNPTR_TYPE_SHIFT; } -u32 bpf_dynptr_get_size(const struct bpf_dynptr_kern *ptr) +u32 __bpf_dynptr_size(const struct bpf_dynptr_kern *ptr) { return ptr->size & DYNPTR_SIZE_MASK; } +static void bpf_dynptr_set_size(struct bpf_dynptr_kern *ptr, u32 new_size) +{ + u32 metadata = ptr->size & ~DYNPTR_SIZE_MASK; + + ptr->size = new_size | metadata; +} + int bpf_dynptr_check_size(u32 size) { return size > DYNPTR_MAX_SIZE ? 
-E2BIG : 0; @@ -1469,7 +1476,7 @@ void bpf_dynptr_set_null(struct bpf_dynptr_kern *ptr) static int bpf_dynptr_check_off_len(const struct bpf_dynptr_kern *ptr, u32 offset, u32 len) { - u32 size = bpf_dynptr_get_size(ptr); + u32 size = __bpf_dynptr_size(ptr); if (len > size || offset > size - len) return -E2BIG; @@ -1563,7 +1570,7 @@ BPF_CALL_5(bpf_dynptr_write, const struct bpf_dynptr_kern *, dst, u32, offset, v enum bpf_dynptr_type type; int err; - if (!dst->data || bpf_dynptr_is_rdonly(dst)) + if (!dst->data || __bpf_dynptr_is_rdonly(dst)) return -EINVAL; err = bpf_dynptr_check_off_len(dst, offset, len); @@ -1619,7 +1626,7 @@ BPF_CALL_3(bpf_dynptr_data, const struct bpf_dynptr_kern *, ptr, u32, offset, u3 if (err) return 0; - if (bpf_dynptr_is_rdonly(ptr)) + if (__bpf_dynptr_is_rdonly(ptr)) return 0; type = bpf_dynptr_get_type(ptr); @@ -1926,8 +1933,12 @@ __bpf_kfunc void *bpf_refcount_acquire_impl(void *p__refcounted_kptr, void *meta * bpf_refcount type so that it is emitted in vmlinux BTF */ ref = (struct bpf_refcount *)(p__refcounted_kptr + meta->record->refcount_off); + if (!refcount_inc_not_zero((refcount_t *)ref)) + return NULL; - refcount_inc((refcount_t *)ref); + /* Verifier strips KF_RET_NULL if input is owned ref, see is_kfunc_ret_null + * in verifier.c + */ return (void *)p__refcounted_kptr; } @@ -1943,7 +1954,7 @@ static int __bpf_list_add(struct bpf_list_node *node, struct bpf_list_head *head INIT_LIST_HEAD(h); if (!list_empty(n)) { /* Only called from BPF prog, no need to migrate_disable */ - __bpf_obj_drop_impl(n - off, rec); + __bpf_obj_drop_impl((void *)n - off, rec); return -EINVAL; } @@ -2025,7 +2036,7 @@ static int __bpf_rbtree_add(struct bpf_rb_root *root, struct bpf_rb_node *node, if (!RB_EMPTY_NODE(n)) { /* Only called from BPF prog, no need to migrate_disable */ - __bpf_obj_drop_impl(n - off, rec); + __bpf_obj_drop_impl((void *)n - off, rec); return -EINVAL; } @@ -2142,6 +2153,22 @@ __bpf_kfunc struct cgroup *bpf_cgroup_from_id(u64 cgid) return NULL; return cgrp; } + +/** + * bpf_task_under_cgroup - wrap task_under_cgroup_hierarchy() as a kfunc, test + * task's membership of cgroup ancestry. + * @task: the task to be tested + * @ancestor: possible ancestor of @task's cgroup + * + * Tests whether @task's default cgroup hierarchy is a descendant of @ancestor. + * It follows all the same rules as cgroup_is_descendant, and only applies + * to the default hierarchy. + */ +__bpf_kfunc long bpf_task_under_cgroup(struct task_struct *task, + struct cgroup *ancestor) +{ + return task_under_cgroup_hierarchy(task, ancestor); +} #endif /* CONFIG_CGROUPS */ /** @@ -2167,13 +2194,15 @@ __bpf_kfunc struct task_struct *bpf_task_from_pid(s32 pid) * bpf_dynptr_slice() - Obtain a read-only pointer to the dynptr data. * @ptr: The dynptr whose data slice to retrieve * @offset: Offset into the dynptr - * @buffer: User-provided buffer to copy contents into - * @buffer__szk: Size (in bytes) of the buffer. This is the length of the - * requested slice. This must be a constant. + * @buffer__opt: User-provided buffer to copy contents into. May be NULL + * @buffer__szk: Size (in bytes) of the buffer if present. This is the + * length of the requested slice. This must be a constant. * * For non-skb and non-xdp type dynptrs, there is no difference between * bpf_dynptr_slice and bpf_dynptr_data. * + * If buffer__opt is NULL, the call will fail if buffer_opt was needed. + * * If the intention is to write to the data slice, please use * bpf_dynptr_slice_rdwr. 
* @@ -2190,7 +2219,7 @@ __bpf_kfunc struct task_struct *bpf_task_from_pid(s32 pid) * direct pointer) */ __bpf_kfunc void *bpf_dynptr_slice(const struct bpf_dynptr_kern *ptr, u32 offset, - void *buffer, u32 buffer__szk) + void *buffer__opt, u32 buffer__szk) { enum bpf_dynptr_type type; u32 len = buffer__szk; @@ -2210,15 +2239,17 @@ __bpf_kfunc void *bpf_dynptr_slice(const struct bpf_dynptr_kern *ptr, u32 offset case BPF_DYNPTR_TYPE_RINGBUF: return ptr->data + ptr->offset + offset; case BPF_DYNPTR_TYPE_SKB: - return skb_header_pointer(ptr->data, ptr->offset + offset, len, buffer); + return skb_header_pointer(ptr->data, ptr->offset + offset, len, buffer__opt); case BPF_DYNPTR_TYPE_XDP: { void *xdp_ptr = bpf_xdp_pointer(ptr->data, ptr->offset + offset, len); if (xdp_ptr) return xdp_ptr; - bpf_xdp_copy_buf(ptr->data, ptr->offset + offset, buffer, len, false); - return buffer; + if (!buffer__opt) + return NULL; + bpf_xdp_copy_buf(ptr->data, ptr->offset + offset, buffer__opt, len, false); + return buffer__opt; } default: WARN_ONCE(true, "unknown dynptr type %d\n", type); @@ -2230,13 +2261,15 @@ __bpf_kfunc void *bpf_dynptr_slice(const struct bpf_dynptr_kern *ptr, u32 offset * bpf_dynptr_slice_rdwr() - Obtain a writable pointer to the dynptr data. * @ptr: The dynptr whose data slice to retrieve * @offset: Offset into the dynptr - * @buffer: User-provided buffer to copy contents into - * @buffer__szk: Size (in bytes) of the buffer. This is the length of the - * requested slice. This must be a constant. + * @buffer__opt: User-provided buffer to copy contents into. May be NULL + * @buffer__szk: Size (in bytes) of the buffer if present. This is the + * length of the requested slice. This must be a constant. * * For non-skb and non-xdp type dynptrs, there is no difference between * bpf_dynptr_slice and bpf_dynptr_data. * + * If buffer__opt is NULL, the call will fail if buffer_opt was needed. + * * The returned pointer is writable and may point to either directly the dynptr * data at the requested offset or to the buffer if unable to obtain a direct * data pointer to (example: the requested slice is to the paged area of an skb @@ -2267,9 +2300,9 @@ __bpf_kfunc void *bpf_dynptr_slice(const struct bpf_dynptr_kern *ptr, u32 offset * direct pointer) */ __bpf_kfunc void *bpf_dynptr_slice_rdwr(const struct bpf_dynptr_kern *ptr, u32 offset, - void *buffer, u32 buffer__szk) + void *buffer__opt, u32 buffer__szk) { - if (!ptr->data || bpf_dynptr_is_rdonly(ptr)) + if (!ptr->data || __bpf_dynptr_is_rdonly(ptr)) return NULL; /* bpf_dynptr_slice_rdwr is the same logic as bpf_dynptr_slice. @@ -2294,7 +2327,59 @@ __bpf_kfunc void *bpf_dynptr_slice_rdwr(const struct bpf_dynptr_kern *ptr, u32 o * will be copied out into the buffer and the user will need to call * bpf_dynptr_write() to commit changes. 
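
With the buffer__opt change above, bpf_dynptr_slice() (and the rdwr variant) accepts a NULL buffer: the call then only succeeds when the requested bytes are already contiguous, for example in the linear part of an skb, and returns NULL instead of copying. A sketch of a tc program relying on that, with the kfunc declarations written out by hand (in-tree code would normally pull them from a shared header such as bpf_kfuncs.h, and struct bpf_dynptr is assumed to be visible via vmlinux.h):

#include "vmlinux.h"
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_endian.h>

extern int bpf_dynptr_from_skb(struct __sk_buff *skb, __u64 flags,
			       struct bpf_dynptr *ptr__uninit) __ksym;
extern void *bpf_dynptr_slice(const struct bpf_dynptr *ptr, __u32 offset,
			      void *buffer__opt, __u32 buffer__szk) __ksym;

SEC("tc")
int parse_eth(struct __sk_buff *skb)
{
	struct bpf_dynptr dptr;
	const struct ethhdr *eth;

	if (bpf_dynptr_from_skb(skb, 0, &dptr))
		return 0;

	/* NULL buffer: succeed only if the header sits in the linear area,
	 * never fall back to copying into a stack buffer. */
	eth = bpf_dynptr_slice(&dptr, 0, NULL, sizeof(*eth));
	if (!eth)
		return 0;

	bpf_printk("proto: 0x%x", bpf_ntohs(eth->h_proto));
	return 0;
}

char _license[] SEC("license") = "GPL";
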
*/ - return bpf_dynptr_slice(ptr, offset, buffer, buffer__szk); + return bpf_dynptr_slice(ptr, offset, buffer__opt, buffer__szk); +} + +__bpf_kfunc int bpf_dynptr_adjust(struct bpf_dynptr_kern *ptr, u32 start, u32 end) +{ + u32 size; + + if (!ptr->data || start > end) + return -EINVAL; + + size = __bpf_dynptr_size(ptr); + + if (start > size || end > size) + return -ERANGE; + + ptr->offset += start; + bpf_dynptr_set_size(ptr, end - start); + + return 0; +} + +__bpf_kfunc bool bpf_dynptr_is_null(struct bpf_dynptr_kern *ptr) +{ + return !ptr->data; +} + +__bpf_kfunc bool bpf_dynptr_is_rdonly(struct bpf_dynptr_kern *ptr) +{ + if (!ptr->data) + return false; + + return __bpf_dynptr_is_rdonly(ptr); +} + +__bpf_kfunc __u32 bpf_dynptr_size(const struct bpf_dynptr_kern *ptr) +{ + if (!ptr->data) + return -EINVAL; + + return __bpf_dynptr_size(ptr); +} + +__bpf_kfunc int bpf_dynptr_clone(struct bpf_dynptr_kern *ptr, + struct bpf_dynptr_kern *clone__uninit) +{ + if (!ptr->data) { + bpf_dynptr_set_null(clone__uninit); + return -EINVAL; + } + + *clone__uninit = *ptr; + + return 0; } __bpf_kfunc void *bpf_cast_to_kern_ctx(void *obj) @@ -2325,7 +2410,7 @@ BTF_ID_FLAGS(func, crash_kexec, KF_DESTRUCTIVE) #endif BTF_ID_FLAGS(func, bpf_obj_new_impl, KF_ACQUIRE | KF_RET_NULL) BTF_ID_FLAGS(func, bpf_obj_drop_impl, KF_RELEASE) -BTF_ID_FLAGS(func, bpf_refcount_acquire_impl, KF_ACQUIRE) +BTF_ID_FLAGS(func, bpf_refcount_acquire_impl, KF_ACQUIRE | KF_RET_NULL) BTF_ID_FLAGS(func, bpf_list_push_front_impl) BTF_ID_FLAGS(func, bpf_list_push_back_impl) BTF_ID_FLAGS(func, bpf_list_pop_front, KF_ACQUIRE | KF_RET_NULL) @@ -2341,6 +2426,7 @@ BTF_ID_FLAGS(func, bpf_cgroup_acquire, KF_ACQUIRE | KF_RCU | KF_RET_NULL) BTF_ID_FLAGS(func, bpf_cgroup_release, KF_RELEASE) BTF_ID_FLAGS(func, bpf_cgroup_ancestor, KF_ACQUIRE | KF_RCU | KF_RET_NULL) BTF_ID_FLAGS(func, bpf_cgroup_from_id, KF_ACQUIRE | KF_RET_NULL) +BTF_ID_FLAGS(func, bpf_task_under_cgroup, KF_RCU) #endif BTF_ID_FLAGS(func, bpf_task_from_pid, KF_ACQUIRE | KF_RET_NULL) BTF_SET8_END(generic_btf_ids) @@ -2369,6 +2455,11 @@ BTF_ID_FLAGS(func, bpf_dynptr_slice_rdwr, KF_RET_NULL) BTF_ID_FLAGS(func, bpf_iter_num_new, KF_ITER_NEW) BTF_ID_FLAGS(func, bpf_iter_num_next, KF_ITER_NEXT | KF_RET_NULL) BTF_ID_FLAGS(func, bpf_iter_num_destroy, KF_ITER_DESTROY) +BTF_ID_FLAGS(func, bpf_dynptr_adjust) +BTF_ID_FLAGS(func, bpf_dynptr_is_null) +BTF_ID_FLAGS(func, bpf_dynptr_is_rdonly) +BTF_ID_FLAGS(func, bpf_dynptr_size) +BTF_ID_FLAGS(func, bpf_dynptr_clone) BTF_SET8_END(common_btf_ids) static const struct btf_kfunc_id_set common_kfunc_set = { diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c index 9948b542a470..4174f76133df 100644 --- a/kernel/bpf/inode.c +++ b/kernel/bpf/inode.c @@ -435,7 +435,7 @@ static int bpf_iter_link_pin_kernel(struct dentry *parent, return ret; } -static int bpf_obj_do_pin(const char __user *pathname, void *raw, +static int bpf_obj_do_pin(int path_fd, const char __user *pathname, void *raw, enum bpf_type type) { struct dentry *dentry; @@ -444,22 +444,21 @@ static int bpf_obj_do_pin(const char __user *pathname, void *raw, umode_t mode; int ret; - dentry = user_path_create(AT_FDCWD, pathname, &path, 0); + dentry = user_path_create(path_fd, pathname, &path, 0); if (IS_ERR(dentry)) return PTR_ERR(dentry); - mode = S_IFREG | ((S_IRUSR | S_IWUSR) & ~current_umask()); - - ret = security_path_mknod(&path, dentry, mode, 0); - if (ret) - goto out; - dir = d_inode(path.dentry); if (dir->i_op != &bpf_dir_iops) { ret = -EPERM; goto out; } + mode = S_IFREG | ((S_IRUSR | S_IWUSR) 
& ~current_umask()); + ret = security_path_mknod(&path, dentry, mode, 0); + if (ret) + goto out; + switch (type) { case BPF_TYPE_PROG: ret = vfs_mkobj(dentry, mode, bpf_mkprog, raw); @@ -478,7 +477,7 @@ out: return ret; } -int bpf_obj_pin_user(u32 ufd, const char __user *pathname) +int bpf_obj_pin_user(u32 ufd, int path_fd, const char __user *pathname) { enum bpf_type type; void *raw; @@ -488,14 +487,14 @@ int bpf_obj_pin_user(u32 ufd, const char __user *pathname) if (IS_ERR(raw)) return PTR_ERR(raw); - ret = bpf_obj_do_pin(pathname, raw, type); + ret = bpf_obj_do_pin(path_fd, pathname, raw, type); if (ret != 0) bpf_any_put(raw, type); return ret; } -static void *bpf_obj_do_get(const char __user *pathname, +static void *bpf_obj_do_get(int path_fd, const char __user *pathname, enum bpf_type *type, int flags) { struct inode *inode; @@ -503,7 +502,7 @@ static void *bpf_obj_do_get(const char __user *pathname, void *raw; int ret; - ret = user_path_at(AT_FDCWD, pathname, LOOKUP_FOLLOW, &path); + ret = user_path_at(path_fd, pathname, LOOKUP_FOLLOW, &path); if (ret) return ERR_PTR(ret); @@ -527,7 +526,7 @@ out: return ERR_PTR(ret); } -int bpf_obj_get_user(const char __user *pathname, int flags) +int bpf_obj_get_user(int path_fd, const char __user *pathname, int flags) { enum bpf_type type = BPF_TYPE_UNSPEC; int f_flags; @@ -538,7 +537,7 @@ int bpf_obj_get_user(const char __user *pathname, int flags) if (f_flags < 0) return f_flags; - raw = bpf_obj_do_get(pathname, &type, f_flags); + raw = bpf_obj_do_get(path_fd, pathname, &type, f_flags); if (IS_ERR(raw)) return PTR_ERR(raw); diff --git a/kernel/bpf/log.c b/kernel/bpf/log.c index 046ddff37a76..850494423530 100644 --- a/kernel/bpf/log.c +++ b/kernel/bpf/log.c @@ -62,9 +62,6 @@ void bpf_verifier_vlog(struct bpf_verifier_log *log, const char *fmt, n = vscnprintf(log->kbuf, BPF_VERIFIER_TMP_LOG_SIZE, fmt, args); - WARN_ONCE(n >= BPF_VERIFIER_TMP_LOG_SIZE - 1, - "verifier log line truncated - local buffer too short\n"); - if (log->level == BPF_LOG_KERNEL) { bool newline = n > 0 && log->kbuf[n - 1] == '\n'; diff --git a/kernel/bpf/lpm_trie.c b/kernel/bpf/lpm_trie.c index e0d3ddf2037a..17c7e7782a1f 100644 --- a/kernel/bpf/lpm_trie.c +++ b/kernel/bpf/lpm_trie.c @@ -544,9 +544,6 @@ static struct bpf_map *trie_alloc(union bpf_attr *attr) { struct lpm_trie *trie; - if (!bpf_capable()) - return ERR_PTR(-EPERM); - /* check sanity of attributes */ if (attr->max_entries == 0 || !(attr->map_flags & BPF_F_NO_PREALLOC) || diff --git a/kernel/bpf/map_in_map.c b/kernel/bpf/map_in_map.c index 2c5c64c2a53b..cd5eafaba97e 100644 --- a/kernel/bpf/map_in_map.c +++ b/kernel/bpf/map_in_map.c @@ -69,9 +69,13 @@ struct bpf_map *bpf_map_meta_alloc(int inner_map_ufd) /* Misc members not needed in bpf_map_meta_equal() check. 
*/ inner_map_meta->ops = inner_map->ops; if (inner_map->ops == &array_map_ops) { + struct bpf_array *inner_array_meta = + container_of(inner_map_meta, struct bpf_array, map); + struct bpf_array *inner_array = container_of(inner_map, struct bpf_array, map); + + inner_array_meta->index_mask = inner_array->index_mask; + inner_array_meta->elem_size = inner_array->elem_size; inner_map_meta->bypass_spec_v1 = inner_map->bypass_spec_v1; - container_of(inner_map_meta, struct bpf_array, map)->index_mask = - container_of(inner_map, struct bpf_array, map)->index_mask; } fdput(f); diff --git a/kernel/bpf/memalloc.c b/kernel/bpf/memalloc.c index 410637c225fb..0668bcd7c926 100644 --- a/kernel/bpf/memalloc.c +++ b/kernel/bpf/memalloc.c @@ -211,9 +211,9 @@ static void alloc_bulk(struct bpf_mem_cache *c, int cnt, int node) mem_cgroup_put(memcg); } -static void free_one(struct bpf_mem_cache *c, void *obj) +static void free_one(void *obj, bool percpu) { - if (c->percpu_size) { + if (percpu) { free_percpu(((void **)obj)[1]); kfree(obj); return; @@ -222,14 +222,19 @@ static void free_one(struct bpf_mem_cache *c, void *obj) kfree(obj); } -static void __free_rcu(struct rcu_head *head) +static void free_all(struct llist_node *llnode, bool percpu) { - struct bpf_mem_cache *c = container_of(head, struct bpf_mem_cache, rcu); - struct llist_node *llnode = llist_del_all(&c->waiting_for_gp); struct llist_node *pos, *t; llist_for_each_safe(pos, t, llnode) - free_one(c, pos); + free_one(pos, percpu); +} + +static void __free_rcu(struct rcu_head *head) +{ + struct bpf_mem_cache *c = container_of(head, struct bpf_mem_cache, rcu); + + free_all(llist_del_all(&c->waiting_for_gp), !!c->percpu_size); atomic_set(&c->call_rcu_in_progress, 0); } @@ -432,7 +437,7 @@ int bpf_mem_alloc_init(struct bpf_mem_alloc *ma, int size, bool percpu) static void drain_mem_cache(struct bpf_mem_cache *c) { - struct llist_node *llnode, *t; + bool percpu = !!c->percpu_size; /* No progs are using this bpf_mem_cache, but htab_map_free() called * bpf_mem_cache_free() for all remaining elements and they can be in @@ -441,14 +446,10 @@ static void drain_mem_cache(struct bpf_mem_cache *c) * Except for waiting_for_gp list, there are no concurrent operations * on these lists, so it is safe to use __llist_del_all(). 
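
The bpf_map_meta_alloc() change above copies the inner array's index_mask and elem_size into the meta map, so the verifier sees the same layout for the declared template and for the real inner maps. That matters for the usual map-in-map setup, sketched below with the standard libbpf declaration conventions (names are illustrative):

#include "vmlinux.h"
#include <bpf/bpf_helpers.h>

struct inner_array {
	__uint(type, BPF_MAP_TYPE_ARRAY);
	__uint(max_entries, 1);
	__type(key, __u32);
	__type(value, __u64);
} inner SEC(".maps");

struct {
	__uint(type, BPF_MAP_TYPE_ARRAY_OF_MAPS);
	__uint(max_entries, 4);
	__type(key, __u32);
	__array(values, struct inner_array);
} outer SEC(".maps") = {
	.values = { [0] = &inner },
};

SEC("socket")
int bump_counter(void *ctx)
{
	__u32 key = 0;
	__u64 *val;
	void *inner_map;

	inner_map = bpf_map_lookup_elem(&outer, &key);
	if (!inner_map)
		return 0;

	/* Value accesses here are checked against the inner map metadata,
	 * which is what the elem_size/index_mask copy above keeps accurate. */
	val = bpf_map_lookup_elem(inner_map, &key);
	if (val)
		__sync_fetch_and_add(val, 1);
	return 0;
}

char _license[] SEC("license") = "GPL";
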
*/ - llist_for_each_safe(llnode, t, __llist_del_all(&c->free_by_rcu)) - free_one(c, llnode); - llist_for_each_safe(llnode, t, llist_del_all(&c->waiting_for_gp)) - free_one(c, llnode); - llist_for_each_safe(llnode, t, __llist_del_all(&c->free_llist)) - free_one(c, llnode); - llist_for_each_safe(llnode, t, __llist_del_all(&c->free_llist_extra)) - free_one(c, llnode); + free_all(__llist_del_all(&c->free_by_rcu), percpu); + free_all(llist_del_all(&c->waiting_for_gp), percpu); + free_all(__llist_del_all(&c->free_llist), percpu); + free_all(__llist_del_all(&c->free_llist_extra), percpu); } static void free_mem_alloc_no_barrier(struct bpf_mem_alloc *ma) diff --git a/kernel/bpf/offload.c b/kernel/bpf/offload.c index d9c9f45e3529..8a26cd8814c1 100644 --- a/kernel/bpf/offload.c +++ b/kernel/bpf/offload.c @@ -859,4 +859,4 @@ static int __init bpf_offload_init(void) return rhashtable_init(&offdevs, &offdevs_params); } -late_initcall(bpf_offload_init); +core_initcall(bpf_offload_init); diff --git a/kernel/bpf/preload/bpf_preload_kern.c b/kernel/bpf/preload/bpf_preload_kern.c index b56f9f3314fd..0c63bc2cd895 100644 --- a/kernel/bpf/preload/bpf_preload_kern.c +++ b/kernel/bpf/preload/bpf_preload_kern.c @@ -23,9 +23,9 @@ static void free_links_and_skel(void) static int preload(struct bpf_preload_info *obj) { - strlcpy(obj[0].link_name, "maps.debug", sizeof(obj[0].link_name)); + strscpy(obj[0].link_name, "maps.debug", sizeof(obj[0].link_name)); obj[0].link = maps_link; - strlcpy(obj[1].link_name, "progs.debug", sizeof(obj[1].link_name)); + strscpy(obj[1].link_name, "progs.debug", sizeof(obj[1].link_name)); obj[1].link = progs_link; return 0; } diff --git a/kernel/bpf/queue_stack_maps.c b/kernel/bpf/queue_stack_maps.c index 601609164ef3..8d2ddcb7566b 100644 --- a/kernel/bpf/queue_stack_maps.c +++ b/kernel/bpf/queue_stack_maps.c @@ -7,7 +7,6 @@ #include <linux/bpf.h> #include <linux/list.h> #include <linux/slab.h> -#include <linux/capability.h> #include <linux/btf_ids.h> #include "percpu_freelist.h" @@ -46,9 +45,6 @@ static bool queue_stack_map_is_full(struct bpf_queue_stack *qs) /* Called from syscall */ static int queue_stack_map_alloc_check(union bpf_attr *attr) { - if (!bpf_capable()) - return -EPERM; - /* check sanity of attributes */ if (attr->max_entries == 0 || attr->key_size != 0 || attr->value_size == 0 || diff --git a/kernel/bpf/reuseport_array.c b/kernel/bpf/reuseport_array.c index cbf2d8d784b8..4b4f9670f1a9 100644 --- a/kernel/bpf/reuseport_array.c +++ b/kernel/bpf/reuseport_array.c @@ -151,9 +151,6 @@ static struct bpf_map *reuseport_array_alloc(union bpf_attr *attr) int numa_node = bpf_map_attr_numa_node(attr); struct reuseport_array *array; - if (!bpf_capable()) - return ERR_PTR(-EPERM); - /* allocate all map elements and zero-initialize them */ array = bpf_map_area_alloc(struct_size(array, ptrs, attr->max_entries), numa_node); if (!array) diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c index b25fce425b2c..458bb80b14d5 100644 --- a/kernel/bpf/stackmap.c +++ b/kernel/bpf/stackmap.c @@ -74,9 +74,6 @@ static struct bpf_map *stack_map_alloc(union bpf_attr *attr) u64 cost, n_buckets; int err; - if (!bpf_capable()) - return ERR_PTR(-EPERM); - if (attr->map_flags & ~STACK_CREATE_FLAG_MASK) return ERR_PTR(-EINVAL); diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 14f39c1e573e..a2aef900519c 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -109,37 +109,6 @@ const struct bpf_map_ops bpf_map_offload_ops = { .map_mem_usage = bpf_map_offload_map_mem_usage, }; -static 
struct bpf_map *find_and_alloc_map(union bpf_attr *attr) -{ - const struct bpf_map_ops *ops; - u32 type = attr->map_type; - struct bpf_map *map; - int err; - - if (type >= ARRAY_SIZE(bpf_map_types)) - return ERR_PTR(-EINVAL); - type = array_index_nospec(type, ARRAY_SIZE(bpf_map_types)); - ops = bpf_map_types[type]; - if (!ops) - return ERR_PTR(-EINVAL); - - if (ops->map_alloc_check) { - err = ops->map_alloc_check(attr); - if (err) - return ERR_PTR(err); - } - if (attr->map_ifindex) - ops = &bpf_map_offload_ops; - if (!ops->map_mem_usage) - return ERR_PTR(-EINVAL); - map = ops->map_alloc(attr); - if (IS_ERR(map)) - return map; - map->ops = ops; - map->map_type = type; - return map; -} - static void bpf_map_write_active_inc(struct bpf_map *map) { atomic64_inc(&map->writecnt); @@ -1127,7 +1096,9 @@ free_map_tab: /* called via syscall */ static int map_create(union bpf_attr *attr) { + const struct bpf_map_ops *ops; int numa_node = bpf_map_attr_numa_node(attr); + u32 map_type = attr->map_type; struct bpf_map *map; int f_flags; int err; @@ -1158,9 +1129,85 @@ static int map_create(union bpf_attr *attr) return -EINVAL; /* find map type and init map: hashtable vs rbtree vs bloom vs ... */ - map = find_and_alloc_map(attr); + map_type = attr->map_type; + if (map_type >= ARRAY_SIZE(bpf_map_types)) + return -EINVAL; + map_type = array_index_nospec(map_type, ARRAY_SIZE(bpf_map_types)); + ops = bpf_map_types[map_type]; + if (!ops) + return -EINVAL; + + if (ops->map_alloc_check) { + err = ops->map_alloc_check(attr); + if (err) + return err; + } + if (attr->map_ifindex) + ops = &bpf_map_offload_ops; + if (!ops->map_mem_usage) + return -EINVAL; + + /* Intent here is for unprivileged_bpf_disabled to block BPF map + * creation for unprivileged users; other actions depend + * on fd availability and access to bpffs, so are dependent on + * object creation success. Even with unprivileged BPF disabled, + * capability checks are still carried out. 
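
The map_create() rework above (and the matching removal of the per-map !bpf_capable()/!capable(CAP_NET_ADMIN) checks from the individual allocators earlier in this diff) centralizes the permission policy: a first group of map types can be created by unprivileged users when the sysctl allows it, the storage/LRU/trie style types still require CAP_BPF, and the networking map types require CAP_NET_ADMIN, as the switch that follows spells out. The userspace API is unchanged; creation still looks like this minimal libbpf sketch:

#include <bpf/bpf.h>
#include <errno.h>
#include <stdio.h>
#include <string.h>

int main(void)
{
	/* BPF_MAP_TYPE_ARRAY is in the "unprivileged" group of the switch
	 * below, so with kernel.unprivileged_bpf_disabled=0 it no longer
	 * needs CAP_BPF; an LRU hash or LPM trie still would. */
	int fd = bpf_map_create(BPF_MAP_TYPE_ARRAY, "example_arr",
				sizeof(__u32), sizeof(__u64), 16, NULL);

	if (fd < 0) {
		fprintf(stderr, "bpf_map_create: %s\n", strerror(errno));
		return 1;
	}
	printf("created array map, fd=%d\n", fd);
	return 0;
}
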
+ */ + if (sysctl_unprivileged_bpf_disabled && !bpf_capable()) + return -EPERM; + + /* check privileged map type permissions */ + switch (map_type) { + case BPF_MAP_TYPE_ARRAY: + case BPF_MAP_TYPE_PERCPU_ARRAY: + case BPF_MAP_TYPE_PROG_ARRAY: + case BPF_MAP_TYPE_PERF_EVENT_ARRAY: + case BPF_MAP_TYPE_CGROUP_ARRAY: + case BPF_MAP_TYPE_ARRAY_OF_MAPS: + case BPF_MAP_TYPE_HASH: + case BPF_MAP_TYPE_PERCPU_HASH: + case BPF_MAP_TYPE_HASH_OF_MAPS: + case BPF_MAP_TYPE_RINGBUF: + case BPF_MAP_TYPE_USER_RINGBUF: + case BPF_MAP_TYPE_CGROUP_STORAGE: + case BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE: + /* unprivileged */ + break; + case BPF_MAP_TYPE_SK_STORAGE: + case BPF_MAP_TYPE_INODE_STORAGE: + case BPF_MAP_TYPE_TASK_STORAGE: + case BPF_MAP_TYPE_CGRP_STORAGE: + case BPF_MAP_TYPE_BLOOM_FILTER: + case BPF_MAP_TYPE_LPM_TRIE: + case BPF_MAP_TYPE_REUSEPORT_SOCKARRAY: + case BPF_MAP_TYPE_STACK_TRACE: + case BPF_MAP_TYPE_QUEUE: + case BPF_MAP_TYPE_STACK: + case BPF_MAP_TYPE_LRU_HASH: + case BPF_MAP_TYPE_LRU_PERCPU_HASH: + case BPF_MAP_TYPE_STRUCT_OPS: + case BPF_MAP_TYPE_CPUMAP: + if (!bpf_capable()) + return -EPERM; + break; + case BPF_MAP_TYPE_SOCKMAP: + case BPF_MAP_TYPE_SOCKHASH: + case BPF_MAP_TYPE_DEVMAP: + case BPF_MAP_TYPE_DEVMAP_HASH: + case BPF_MAP_TYPE_XSKMAP: + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + break; + default: + WARN(1, "unsupported map type %d", map_type); + return -EPERM; + } + + map = ops->map_alloc(attr); if (IS_ERR(map)) return PTR_ERR(map); + map->ops = ops; + map->map_type = map_type; err = bpf_obj_name_cpy(map->name, attr->map_name, sizeof(attr->map_name)); @@ -1931,6 +1978,11 @@ static int map_freeze(const union bpf_attr *attr) return -ENOTSUPP; } + if (!(map_get_sys_perms(map, f) & FMODE_CAN_WRITE)) { + fdput(f); + return -EPERM; + } + mutex_lock(&map->freeze_mutex); if (bpf_map_write_active(map)) { err = -EBUSY; @@ -1940,10 +1992,6 @@ static int map_freeze(const union bpf_attr *attr) err = -EBUSY; goto err_put; } - if (!bpf_capable()) { - err = -EPERM; - goto err_put; - } WRITE_ONCE(map->frozen, true); err_put: @@ -2433,6 +2481,10 @@ bpf_prog_load_check_attach(enum bpf_prog_type prog_type, default: return -EINVAL; } + case BPF_PROG_TYPE_NETFILTER: + if (expected_attach_type == BPF_NETFILTER) + return 0; + return -EINVAL; case BPF_PROG_TYPE_SYSCALL: case BPF_PROG_TYPE_EXT: if (expected_attach_type) @@ -2502,7 +2554,6 @@ static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr, u32 uattr_size) struct btf *attach_btf = NULL; int err; char license[128]; - bool is_gpl; if (CHECK_ATTR(BPF_PROG_LOAD)) return -EINVAL; @@ -2521,15 +2572,15 @@ static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr, u32 uattr_size) !bpf_capable()) return -EPERM; - /* copy eBPF program license from user space */ - if (strncpy_from_bpfptr(license, - make_bpfptr(attr->license, uattr.is_kernel), - sizeof(license) - 1) < 0) - return -EFAULT; - license[sizeof(license) - 1] = 0; - - /* eBPF programs must be GPL compatible to use GPL-ed functions */ - is_gpl = license_is_gpl_compatible(license); + /* Intent here is for unprivileged_bpf_disabled to block BPF program + * creation for unprivileged users; other actions depend + * on fd availability and access to bpffs, so are dependent on + * object creation success. Even with unprivileged BPF disabled, + * capability checks are still carried out for these + * and other operations. + */ + if (sysctl_unprivileged_bpf_disabled && !bpf_capable()) + return -EPERM; if (attr->insn_cnt == 0 || attr->insn_cnt > (bpf_capable() ? 
BPF_COMPLEXITY_LIMIT_INSNS : BPF_MAXINSNS)) @@ -2613,12 +2664,20 @@ static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr, u32 uattr_size) make_bpfptr(attr->insns, uattr.is_kernel), bpf_prog_insn_size(prog)) != 0) goto free_prog_sec; + /* copy eBPF program license from user space */ + if (strncpy_from_bpfptr(license, + make_bpfptr(attr->license, uattr.is_kernel), + sizeof(license) - 1) < 0) + goto free_prog_sec; + license[sizeof(license) - 1] = 0; + + /* eBPF programs must be GPL compatible to use GPL-ed functions */ + prog->gpl_compatible = license_is_gpl_compatible(license) ? 1 : 0; prog->orig_prog = NULL; prog->jited = 0; atomic64_set(&prog->aux->refcnt, 1); - prog->gpl_compatible = is_gpl ? 1 : 0; if (bpf_prog_is_dev_bound(prog->aux)) { err = bpf_prog_dev_bound_init(prog, attr); @@ -2697,23 +2756,38 @@ free_prog: return err; } -#define BPF_OBJ_LAST_FIELD file_flags +#define BPF_OBJ_LAST_FIELD path_fd static int bpf_obj_pin(const union bpf_attr *attr) { - if (CHECK_ATTR(BPF_OBJ) || attr->file_flags != 0) + int path_fd; + + if (CHECK_ATTR(BPF_OBJ) || attr->file_flags & ~BPF_F_PATH_FD) + return -EINVAL; + + /* path_fd has to be accompanied by BPF_F_PATH_FD flag */ + if (!(attr->file_flags & BPF_F_PATH_FD) && attr->path_fd) return -EINVAL; - return bpf_obj_pin_user(attr->bpf_fd, u64_to_user_ptr(attr->pathname)); + path_fd = attr->file_flags & BPF_F_PATH_FD ? attr->path_fd : AT_FDCWD; + return bpf_obj_pin_user(attr->bpf_fd, path_fd, + u64_to_user_ptr(attr->pathname)); } static int bpf_obj_get(const union bpf_attr *attr) { + int path_fd; + if (CHECK_ATTR(BPF_OBJ) || attr->bpf_fd != 0 || - attr->file_flags & ~BPF_OBJ_FLAG_MASK) + attr->file_flags & ~(BPF_OBJ_FLAG_MASK | BPF_F_PATH_FD)) + return -EINVAL; + + /* path_fd has to be accompanied by BPF_F_PATH_FD flag */ + if (!(attr->file_flags & BPF_F_PATH_FD) && attr->path_fd) return -EINVAL; - return bpf_obj_get_user(u64_to_user_ptr(attr->pathname), + path_fd = attr->file_flags & BPF_F_PATH_FD ? attr->path_fd : AT_FDCWD; + return bpf_obj_get_user(path_fd, u64_to_user_ptr(attr->pathname), attr->file_flags); } @@ -2777,28 +2851,31 @@ static void bpf_link_put_deferred(struct work_struct *work) bpf_link_free(link); } -/* bpf_link_put can be called from atomic context, but ensures that resources - * are freed from process context +/* bpf_link_put might be called from atomic context. It needs to be called + * from sleepable context in order to acquire sleeping locks during the process. 
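
The BPF_OBJ_PIN/BPF_OBJ_GET changes above add a path_fd attribute, gated by the new BPF_F_PATH_FD flag, so the pathname is resolved relative to a directory fd instead of AT_FDCWD, in the spirit of the *at() syscalls. A raw-syscall fragment showing a pin relative to an already-open bpffs directory fd (it assumes uapi headers new enough to carry BPF_F_PATH_FD and path_fd, and is meant to be dropped into a loader rather than run on its own):

#include <linux/bpf.h>
#include <stdint.h>
#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>

/* Pin an already-loaded object (prog, map or link fd) as "myprog" inside
 * the bpffs directory referred to by dir_fd. */
int pin_at(int dir_fd, int obj_fd)
{
	union bpf_attr attr;

	memset(&attr, 0, sizeof(attr));
	attr.bpf_fd = obj_fd;
	attr.pathname = (uint64_t)(unsigned long)"myprog"; /* relative to dir_fd */
	attr.file_flags = BPF_F_PATH_FD;	/* path_fd is in use */
	attr.path_fd = dir_fd;

	return syscall(__NR_bpf, BPF_OBJ_PIN, &attr, sizeof(attr));
}

Recent libbpf releases expose the same knob through the *_opts variants of its pin/get APIs.
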
*/ void bpf_link_put(struct bpf_link *link) { if (!atomic64_dec_and_test(&link->refcnt)) return; - if (in_atomic()) { - INIT_WORK(&link->work, bpf_link_put_deferred); - schedule_work(&link->work); - } else { - bpf_link_free(link); - } + INIT_WORK(&link->work, bpf_link_put_deferred); + schedule_work(&link->work); } EXPORT_SYMBOL(bpf_link_put); +static void bpf_link_put_direct(struct bpf_link *link) +{ + if (!atomic64_dec_and_test(&link->refcnt)) + return; + bpf_link_free(link); +} + static int bpf_link_release(struct inode *inode, struct file *filp) { struct bpf_link *link = filp->private_data; - bpf_link_put(link); + bpf_link_put_direct(link); return 0; } @@ -2968,10 +3045,17 @@ static void bpf_tracing_link_show_fdinfo(const struct bpf_link *link, { struct bpf_tracing_link *tr_link = container_of(link, struct bpf_tracing_link, link.link); + u32 target_btf_id, target_obj_id; + bpf_trampoline_unpack_key(tr_link->trampoline->key, + &target_obj_id, &target_btf_id); seq_printf(seq, - "attach_type:\t%d\n", - tr_link->attach_type); + "attach_type:\t%d\n" + "target_obj_id:\t%u\n" + "target_btf_id:\t%u\n", + tr_link->attach_type, + target_obj_id, + target_btf_id); } static int bpf_tracing_link_fill_link_info(const struct bpf_link *link, @@ -3436,6 +3520,11 @@ static int bpf_prog_attach_check_attach_type(const struct bpf_prog *prog, return prog->enforce_expected_attach_type && prog->expected_attach_type != attach_type ? -EINVAL : 0; + case BPF_PROG_TYPE_KPROBE: + if (prog->expected_attach_type == BPF_TRACE_KPROBE_MULTI && + attach_type != BPF_TRACE_KPROBE_MULTI) + return -EINVAL; + return 0; default: return 0; } @@ -4590,7 +4679,12 @@ static int link_create(union bpf_attr *attr, bpfptr_t uattr) switch (prog->type) { case BPF_PROG_TYPE_EXT: + break; case BPF_PROG_TYPE_NETFILTER: + if (attr->link_create.attach_type != BPF_NETFILTER) { + ret = -EINVAL; + goto out; + } break; case BPF_PROG_TYPE_PERF_EVENT: case BPF_PROG_TYPE_TRACEPOINT: @@ -4764,7 +4858,7 @@ out_put_progs: if (ret) bpf_prog_put(new_prog); out_put_link: - bpf_link_put(link); + bpf_link_put_direct(link); return ret; } @@ -4787,7 +4881,7 @@ static int link_detach(union bpf_attr *attr) else ret = -EOPNOTSUPP; - bpf_link_put(link); + bpf_link_put_direct(link); return ret; } @@ -4857,7 +4951,7 @@ static int bpf_link_get_fd_by_id(const union bpf_attr *attr) fd = bpf_link_new_fd(link); if (fd < 0) - bpf_link_put(link); + bpf_link_put_direct(link); return fd; } @@ -4934,7 +5028,7 @@ static int bpf_iter_create(union bpf_attr *attr) return PTR_ERR(link); err = bpf_iter_new_fd(link); - bpf_link_put(link); + bpf_link_put_direct(link); return err; } @@ -5004,23 +5098,8 @@ out_prog_put: static int __sys_bpf(int cmd, bpfptr_t uattr, unsigned int size) { union bpf_attr attr; - bool capable; int err; - capable = bpf_capable() || !sysctl_unprivileged_bpf_disabled; - - /* Intent here is for unprivileged_bpf_disabled to block key object - * creation commands for unprivileged users; other actions depend - * of fd availability and access to bpffs, so are dependent on - * object creation success. Capabilities are later verified for - * operations such as load and map create, so even with unprivileged - * BPF disabled, capability checks are still carried out for these - * and other operations. 
- */ - if (!capable && - (cmd == BPF_MAP_CREATE || cmd == BPF_PROG_LOAD)) - return -EPERM; - err = bpf_check_uarg_tail_zero(uattr, sizeof(attr), size); if (err) return err; @@ -5380,7 +5459,8 @@ static int bpf_unpriv_handler(struct ctl_table *table, int write, *(int *)table->data = unpriv_enable; } - unpriv_ebpf_notify(unpriv_enable); + if (write) + unpriv_ebpf_notify(unpriv_enable); return ret; } diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c index ac021bc43a66..78acf28d4873 100644 --- a/kernel/bpf/trampoline.c +++ b/kernel/bpf/trampoline.c @@ -251,11 +251,8 @@ bpf_trampoline_get_progs(const struct bpf_trampoline *tr, int *total, bool *ip_a return tlinks; } -static void __bpf_tramp_image_put_deferred(struct work_struct *work) +static void bpf_tramp_image_free(struct bpf_tramp_image *im) { - struct bpf_tramp_image *im; - - im = container_of(work, struct bpf_tramp_image, work); bpf_image_ksym_del(&im->ksym); bpf_jit_free_exec(im->image); bpf_jit_uncharge_modmem(PAGE_SIZE); @@ -263,6 +260,14 @@ static void __bpf_tramp_image_put_deferred(struct work_struct *work) kfree_rcu(im, rcu); } +static void __bpf_tramp_image_put_deferred(struct work_struct *work) +{ + struct bpf_tramp_image *im; + + im = container_of(work, struct bpf_tramp_image, work); + bpf_tramp_image_free(im); +} + /* callback, fexit step 3 or fentry step 2 */ static void __bpf_tramp_image_put_rcu(struct rcu_head *rcu) { @@ -344,7 +349,7 @@ static void bpf_tramp_image_put(struct bpf_tramp_image *im) call_rcu_tasks_trace(&im->rcu, __bpf_tramp_image_put_rcu_tasks); } -static struct bpf_tramp_image *bpf_tramp_image_alloc(u64 key, u32 idx) +static struct bpf_tramp_image *bpf_tramp_image_alloc(u64 key) { struct bpf_tramp_image *im; struct bpf_ksym *ksym; @@ -371,7 +376,7 @@ static struct bpf_tramp_image *bpf_tramp_image_alloc(u64 key, u32 idx) ksym = &im->ksym; INIT_LIST_HEAD_RCU(&ksym->lnode); - snprintf(ksym->name, KSYM_NAME_LEN, "bpf_trampoline_%llu_%u", key, idx); + snprintf(ksym->name, KSYM_NAME_LEN, "bpf_trampoline_%llu", key); bpf_image_ksym_add(image, ksym); return im; @@ -401,11 +406,10 @@ static int bpf_trampoline_update(struct bpf_trampoline *tr, bool lock_direct_mut err = unregister_fentry(tr, tr->cur_image->image); bpf_tramp_image_put(tr->cur_image); tr->cur_image = NULL; - tr->selector = 0; goto out; } - im = bpf_tramp_image_alloc(tr->key, tr->selector); + im = bpf_tramp_image_alloc(tr->key); if (IS_ERR(im)) { err = PTR_ERR(im); goto out; @@ -438,12 +442,11 @@ again: &tr->func.model, tr->flags, tlinks, tr->func.addr); if (err < 0) - goto out; + goto out_free; set_memory_rox((long)im->image, 1); - WARN_ON(tr->cur_image && tr->selector == 0); - WARN_ON(!tr->cur_image && tr->selector); + WARN_ON(tr->cur_image && total == 0); if (tr->cur_image) /* progs already running at this address */ err = modify_fentry(tr, tr->cur_image->image, im->image, lock_direct_mutex); @@ -468,18 +471,21 @@ again: } #endif if (err) - goto out; + goto out_free; if (tr->cur_image) bpf_tramp_image_put(tr->cur_image); tr->cur_image = im; - tr->selector++; out: /* If any error happens, restore previous flags */ if (err) tr->flags = orig_flags; kfree(tlinks); return err; + +out_free: + bpf_tramp_image_free(im); + goto out; } static enum bpf_tramp_prog_type bpf_attach_type_to_tramp(struct bpf_prog *prog) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index fbcf5a4e2fcd..11e54dd8b6dd 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -197,6 +197,7 @@ static int ref_set_non_owning(struct bpf_verifier_env *env, 
struct bpf_reg_state *reg); static void specialize_kfunc(struct bpf_verifier_env *env, u32 func_id, u16 offset, unsigned long *addr); +static bool is_trusted_reg(const struct bpf_reg_state *reg); static bool bpf_map_ptr_poisoned(const struct bpf_insn_aux_data *aux) { @@ -240,6 +241,12 @@ static void bpf_map_key_store(struct bpf_insn_aux_data *aux, u64 state) (poisoned ? BPF_MAP_KEY_POISON : 0ULL); } +static bool bpf_helper_call(const struct bpf_insn *insn) +{ + return insn->code == (BPF_JMP | BPF_CALL) && + insn->src_reg == 0; +} + static bool bpf_pseudo_call(const struct bpf_insn *insn) { return insn->code == (BPF_JMP | BPF_CALL) && @@ -273,11 +280,6 @@ struct bpf_call_arg_meta { struct btf_field *kptr_field; }; -struct btf_and_id { - struct btf *btf; - u32 btf_id; -}; - struct bpf_kfunc_call_arg_meta { /* In parameters */ struct btf *btf; @@ -296,10 +298,21 @@ struct bpf_kfunc_call_arg_meta { u64 value; bool found; } arg_constant; - union { - struct btf_and_id arg_obj_drop; - struct btf_and_id arg_refcount_acquire; - }; + + /* arg_{btf,btf_id,owning_ref} are used by kfunc-specific handling, + * generally to pass info about user-defined local kptr types to later + * verification logic + * bpf_obj_drop + * Record the local kptr type to be drop'd + * bpf_refcount_acquire (via KF_ARG_PTR_TO_REFCOUNTED_KPTR arg type) + * Record the local kptr type to be refcount_incr'd and use + * arg_owning_ref to determine whether refcount_acquire should be + * fallible + */ + struct btf *arg_btf; + u32 arg_btf_id; + bool arg_owning_ref; + struct { struct btf_field *field; } arg_list_head; @@ -309,6 +322,7 @@ struct bpf_kfunc_call_arg_meta { struct { enum bpf_dynptr_type type; u32 id; + u32 ref_obj_id; } initialized_dynptr; struct { u8 spi; @@ -429,8 +443,11 @@ static bool type_may_be_null(u32 type) return type & PTR_MAYBE_NULL; } -static bool reg_type_not_null(enum bpf_reg_type type) +static bool reg_not_null(const struct bpf_reg_state *reg) { + enum bpf_reg_type type; + + type = reg->type; if (type_may_be_null(type)) return false; @@ -440,6 +457,7 @@ static bool reg_type_not_null(enum bpf_reg_type type) type == PTR_TO_MAP_VALUE || type == PTR_TO_MAP_KEY || type == PTR_TO_SOCK_COMMON || + (type == PTR_TO_BTF_ID && is_trusted_reg(reg)) || type == PTR_TO_MEM; } @@ -468,6 +486,13 @@ static struct btf_record *reg_btf_record(const struct bpf_reg_state *reg) return rec; } +static bool subprog_is_global(const struct bpf_verifier_env *env, int subprog) +{ + struct bpf_func_info_aux *aux = env->prog->aux->func_info_aux; + + return aux && aux[subprog].linkage == BTF_FUNC_GLOBAL; +} + static bool reg_may_point_to_spin_lock(const struct bpf_reg_state *reg) { return btf_record_has_field(reg_btf_record(reg), BPF_SPIN_LOCK); @@ -515,6 +540,8 @@ static bool is_dynptr_ref_function(enum bpf_func_id func_id) return func_id == BPF_FUNC_dynptr_data; } +static bool is_callback_calling_kfunc(u32 btf_id); + static bool is_callback_calling_function(enum bpf_func_id func_id) { return func_id == BPF_FUNC_for_each_map_elem || @@ -524,6 +551,11 @@ static bool is_callback_calling_function(enum bpf_func_id func_id) func_id == BPF_FUNC_user_ringbuf_drain; } +static bool is_async_callback_calling_function(enum bpf_func_id func_id) +{ + return func_id == BPF_FUNC_timer_set_callback; +} + static bool is_storage_get_function(enum bpf_func_id func_id) { return func_id == BPF_FUNC_sk_storage_get || @@ -604,9 +636,9 @@ static const char *reg_type_str(struct bpf_verifier_env *env, type & PTR_TRUSTED ? 
"trusted_" : "" ); - snprintf(env->type_str_buf, TYPE_STR_BUF_LEN, "%s%s%s", + snprintf(env->tmp_str_buf, TMP_STR_BUF_LEN, "%s%s%s", prefix, str[base_type(type)], postfix); - return env->type_str_buf; + return env->tmp_str_buf; } static char slot_type_char[] = { @@ -847,11 +879,11 @@ static int destroy_if_dynptr_stack_slot(struct bpf_verifier_env *env, struct bpf_func_state *state, int spi); static int mark_stack_slots_dynptr(struct bpf_verifier_env *env, struct bpf_reg_state *reg, - enum bpf_arg_type arg_type, int insn_idx) + enum bpf_arg_type arg_type, int insn_idx, int clone_ref_obj_id) { struct bpf_func_state *state = func(env, reg); enum bpf_dynptr_type type; - int spi, i, id, err; + int spi, i, err; spi = dynptr_get_spi(env, reg); if (spi < 0) @@ -887,7 +919,13 @@ static int mark_stack_slots_dynptr(struct bpf_verifier_env *env, struct bpf_reg_ if (dynptr_type_refcounted(type)) { /* The id is used to track proper releasing */ - id = acquire_reference_state(env, insn_idx); + int id; + + if (clone_ref_obj_id) + id = clone_ref_obj_id; + else + id = acquire_reference_state(env, insn_idx); + if (id < 0) return id; @@ -901,24 +939,15 @@ static int mark_stack_slots_dynptr(struct bpf_verifier_env *env, struct bpf_reg_ return 0; } -static int unmark_stack_slots_dynptr(struct bpf_verifier_env *env, struct bpf_reg_state *reg) +static void invalidate_dynptr(struct bpf_verifier_env *env, struct bpf_func_state *state, int spi) { - struct bpf_func_state *state = func(env, reg); - int spi, i; - - spi = dynptr_get_spi(env, reg); - if (spi < 0) - return spi; + int i; for (i = 0; i < BPF_REG_SIZE; i++) { state->stack[spi].slot_type[i] = STACK_INVALID; state->stack[spi - 1].slot_type[i] = STACK_INVALID; } - /* Invalidate any slices associated with this dynptr */ - if (dynptr_type_refcounted(state->stack[spi].spilled_ptr.dynptr.type)) - WARN_ON_ONCE(release_reference(env, state->stack[spi].spilled_ptr.ref_obj_id)); - __mark_reg_not_init(env, &state->stack[spi].spilled_ptr); __mark_reg_not_init(env, &state->stack[spi - 1].spilled_ptr); @@ -945,6 +974,50 @@ static int unmark_stack_slots_dynptr(struct bpf_verifier_env *env, struct bpf_re */ state->stack[spi].spilled_ptr.live |= REG_LIVE_WRITTEN; state->stack[spi - 1].spilled_ptr.live |= REG_LIVE_WRITTEN; +} + +static int unmark_stack_slots_dynptr(struct bpf_verifier_env *env, struct bpf_reg_state *reg) +{ + struct bpf_func_state *state = func(env, reg); + int spi, ref_obj_id, i; + + spi = dynptr_get_spi(env, reg); + if (spi < 0) + return spi; + + if (!dynptr_type_refcounted(state->stack[spi].spilled_ptr.dynptr.type)) { + invalidate_dynptr(env, state, spi); + return 0; + } + + ref_obj_id = state->stack[spi].spilled_ptr.ref_obj_id; + + /* If the dynptr has a ref_obj_id, then we need to invalidate + * two things: + * + * 1) Any dynptrs with a matching ref_obj_id (clones) + * 2) Any slices derived from this dynptr. 
+ */ + + /* Invalidate any slices associated with this dynptr */ + WARN_ON_ONCE(release_reference(env, ref_obj_id)); + + /* Invalidate any dynptr clones */ + for (i = 1; i < state->allocated_stack / BPF_REG_SIZE; i++) { + if (state->stack[i].spilled_ptr.ref_obj_id != ref_obj_id) + continue; + + /* it should always be the case that if the ref obj id + * matches then the stack slot also belongs to a + * dynptr + */ + if (state->stack[i].slot_type[0] != STACK_DYNPTR) { + verbose(env, "verifier internal error: misconfigured ref_obj_id\n"); + return -EFAULT; + } + if (state->stack[i].spilled_ptr.dynptr.first_slot) + invalidate_dynptr(env, state, i); + } return 0; } @@ -1254,6 +1327,12 @@ static bool is_spilled_reg(const struct bpf_stack_state *stack) return stack->slot_type[BPF_REG_SIZE - 1] == STACK_SPILL; } +static bool is_spilled_scalar_reg(const struct bpf_stack_state *stack) +{ + return stack->slot_type[BPF_REG_SIZE - 1] == STACK_SPILL && + stack->spilled_ptr.type == SCALAR_VALUE; +} + static void scrub_spilled_slot(u8 *stype) { if (*stype != STACK_INVALID) @@ -3144,12 +3223,172 @@ static const char *disasm_kfunc_name(void *data, const struct bpf_insn *insn) return btf_name_by_offset(desc_btf, func->name_off); } +static inline void bt_init(struct backtrack_state *bt, u32 frame) +{ + bt->frame = frame; +} + +static inline void bt_reset(struct backtrack_state *bt) +{ + struct bpf_verifier_env *env = bt->env; + + memset(bt, 0, sizeof(*bt)); + bt->env = env; +} + +static inline u32 bt_empty(struct backtrack_state *bt) +{ + u64 mask = 0; + int i; + + for (i = 0; i <= bt->frame; i++) + mask |= bt->reg_masks[i] | bt->stack_masks[i]; + + return mask == 0; +} + +static inline int bt_subprog_enter(struct backtrack_state *bt) +{ + if (bt->frame == MAX_CALL_FRAMES - 1) { + verbose(bt->env, "BUG subprog enter from frame %d\n", bt->frame); + WARN_ONCE(1, "verifier backtracking bug"); + return -EFAULT; + } + bt->frame++; + return 0; +} + +static inline int bt_subprog_exit(struct backtrack_state *bt) +{ + if (bt->frame == 0) { + verbose(bt->env, "BUG subprog exit from frame 0\n"); + WARN_ONCE(1, "verifier backtracking bug"); + return -EFAULT; + } + bt->frame--; + return 0; +} + +static inline void bt_set_frame_reg(struct backtrack_state *bt, u32 frame, u32 reg) +{ + bt->reg_masks[frame] |= 1 << reg; +} + +static inline void bt_clear_frame_reg(struct backtrack_state *bt, u32 frame, u32 reg) +{ + bt->reg_masks[frame] &= ~(1 << reg); +} + +static inline void bt_set_reg(struct backtrack_state *bt, u32 reg) +{ + bt_set_frame_reg(bt, bt->frame, reg); +} + +static inline void bt_clear_reg(struct backtrack_state *bt, u32 reg) +{ + bt_clear_frame_reg(bt, bt->frame, reg); +} + +static inline void bt_set_frame_slot(struct backtrack_state *bt, u32 frame, u32 slot) +{ + bt->stack_masks[frame] |= 1ull << slot; +} + +static inline void bt_clear_frame_slot(struct backtrack_state *bt, u32 frame, u32 slot) +{ + bt->stack_masks[frame] &= ~(1ull << slot); +} + +static inline void bt_set_slot(struct backtrack_state *bt, u32 slot) +{ + bt_set_frame_slot(bt, bt->frame, slot); +} + +static inline void bt_clear_slot(struct backtrack_state *bt, u32 slot) +{ + bt_clear_frame_slot(bt, bt->frame, slot); +} + +static inline u32 bt_frame_reg_mask(struct backtrack_state *bt, u32 frame) +{ + return bt->reg_masks[frame]; +} + +static inline u32 bt_reg_mask(struct backtrack_state *bt) +{ + return bt->reg_masks[bt->frame]; +} + +static inline u64 bt_frame_stack_mask(struct backtrack_state *bt, u32 frame) +{ + return bt->stack_masks[frame]; 
+} + +static inline u64 bt_stack_mask(struct backtrack_state *bt) +{ + return bt->stack_masks[bt->frame]; +} + +static inline bool bt_is_reg_set(struct backtrack_state *bt, u32 reg) +{ + return bt->reg_masks[bt->frame] & (1 << reg); +} + +static inline bool bt_is_slot_set(struct backtrack_state *bt, u32 slot) +{ + return bt->stack_masks[bt->frame] & (1ull << slot); +} + +/* format registers bitmask, e.g., "r0,r2,r4" for 0x15 mask */ +static void fmt_reg_mask(char *buf, ssize_t buf_sz, u32 reg_mask) +{ + DECLARE_BITMAP(mask, 64); + bool first = true; + int i, n; + + buf[0] = '\0'; + + bitmap_from_u64(mask, reg_mask); + for_each_set_bit(i, mask, 32) { + n = snprintf(buf, buf_sz, "%sr%d", first ? "" : ",", i); + first = false; + buf += n; + buf_sz -= n; + if (buf_sz < 0) + break; + } +} +/* format stack slots bitmask, e.g., "-8,-24,-40" for 0x15 mask */ +static void fmt_stack_mask(char *buf, ssize_t buf_sz, u64 stack_mask) +{ + DECLARE_BITMAP(mask, 64); + bool first = true; + int i, n; + + buf[0] = '\0'; + + bitmap_from_u64(mask, stack_mask); + for_each_set_bit(i, mask, 64) { + n = snprintf(buf, buf_sz, "%s%d", first ? "" : ",", -(i + 1) * 8); + first = false; + buf += n; + buf_sz -= n; + if (buf_sz < 0) + break; + } +} + /* For given verifier state backtrack_insn() is called from the last insn to * the first insn. Its purpose is to compute a bitmask of registers and * stack slots that needs precision in the parent verifier state. + * + * @idx is an index of the instruction we are currently processing; + * @subseq_idx is an index of the subsequent instruction that: + * - *would be* executed next, if jump history is viewed in forward order; + * - *was* processed previously during backtracking. */ -static int backtrack_insn(struct bpf_verifier_env *env, int idx, - u32 *reg_mask, u64 *stack_mask) +static int backtrack_insn(struct bpf_verifier_env *env, int idx, int subseq_idx, + struct backtrack_state *bt) { const struct bpf_insn_cbs cbs = { .cb_call = disasm_kfunc_name, @@ -3160,20 +3399,24 @@ static int backtrack_insn(struct bpf_verifier_env *env, int idx, u8 class = BPF_CLASS(insn->code); u8 opcode = BPF_OP(insn->code); u8 mode = BPF_MODE(insn->code); - u32 dreg = 1u << insn->dst_reg; - u32 sreg = 1u << insn->src_reg; - u32 spi; + u32 dreg = insn->dst_reg; + u32 sreg = insn->src_reg; + u32 spi, i; if (insn->code == 0) return 0; if (env->log.level & BPF_LOG_LEVEL2) { - verbose(env, "regs=%x stack=%llx before ", *reg_mask, *stack_mask); + fmt_reg_mask(env->tmp_str_buf, TMP_STR_BUF_LEN, bt_reg_mask(bt)); + verbose(env, "mark_precise: frame%d: regs=%s ", + bt->frame, env->tmp_str_buf); + fmt_stack_mask(env->tmp_str_buf, TMP_STR_BUF_LEN, bt_stack_mask(bt)); + verbose(env, "stack=%s before ", env->tmp_str_buf); verbose(env, "%d: ", idx); print_bpf_insn(&cbs, insn, env->allow_ptr_leaks); } if (class == BPF_ALU || class == BPF_ALU64) { - if (!(*reg_mask & dreg)) + if (!bt_is_reg_set(bt, dreg)) return 0; if (opcode == BPF_MOV) { if (BPF_SRC(insn->code) == BPF_X) { @@ -3181,8 +3424,8 @@ static int backtrack_insn(struct bpf_verifier_env *env, int idx, * dreg needs precision after this insn * sreg needs precision before this insn */ - *reg_mask &= ~dreg; - *reg_mask |= sreg; + bt_clear_reg(bt, dreg); + bt_set_reg(bt, sreg); } else { /* dreg = K * dreg needs precision after this insn. @@ -3190,7 +3433,7 @@ static int backtrack_insn(struct bpf_verifier_env *env, int idx, * as precise=true in this verifier state. 
* No further markings in parent are necessary */ - *reg_mask &= ~dreg; + bt_clear_reg(bt, dreg); } } else { if (BPF_SRC(insn->code) == BPF_X) { @@ -3198,15 +3441,15 @@ static int backtrack_insn(struct bpf_verifier_env *env, int idx, * both dreg and sreg need precision * before this insn */ - *reg_mask |= sreg; + bt_set_reg(bt, sreg); } /* else dreg += K * dreg still needs precision before this insn */ } } else if (class == BPF_LDX) { - if (!(*reg_mask & dreg)) + if (!bt_is_reg_set(bt, dreg)) return 0; - *reg_mask &= ~dreg; + bt_clear_reg(bt, dreg); /* scalars can only be spilled into stack w/o losing precision. * Load from any other memory can be zero extended. @@ -3227,9 +3470,9 @@ static int backtrack_insn(struct bpf_verifier_env *env, int idx, WARN_ONCE(1, "verifier backtracking bug"); return -EFAULT; } - *stack_mask |= 1ull << spi; + bt_set_slot(bt, spi); } else if (class == BPF_STX || class == BPF_ST) { - if (*reg_mask & dreg) + if (bt_is_reg_set(bt, dreg)) /* stx & st shouldn't be using _scalar_ dst_reg * to access memory. It means backtracking * encountered a case of pointer subtraction. @@ -3244,20 +3487,92 @@ static int backtrack_insn(struct bpf_verifier_env *env, int idx, WARN_ONCE(1, "verifier backtracking bug"); return -EFAULT; } - if (!(*stack_mask & (1ull << spi))) + if (!bt_is_slot_set(bt, spi)) return 0; - *stack_mask &= ~(1ull << spi); + bt_clear_slot(bt, spi); if (class == BPF_STX) - *reg_mask |= sreg; + bt_set_reg(bt, sreg); } else if (class == BPF_JMP || class == BPF_JMP32) { - if (opcode == BPF_CALL) { - if (insn->src_reg == BPF_PSEUDO_CALL) - return -ENOTSUPP; - /* BPF helpers that invoke callback subprogs are - * equivalent to BPF_PSEUDO_CALL above + if (bpf_pseudo_call(insn)) { + int subprog_insn_idx, subprog; + + subprog_insn_idx = idx + insn->imm + 1; + subprog = find_subprog(env, subprog_insn_idx); + if (subprog < 0) + return -EFAULT; + + if (subprog_is_global(env, subprog)) { + /* check that jump history doesn't have any + * extra instructions from subprog; the next + * instruction after call to global subprog + * should be literally next instruction in + * caller program + */ + WARN_ONCE(idx + 1 != subseq_idx, "verifier backtracking bug"); + /* r1-r5 are invalidated after subprog call, + * so for global func call it shouldn't be set + * anymore + */ + if (bt_reg_mask(bt) & BPF_REGMASK_ARGS) { + verbose(env, "BUG regs %x\n", bt_reg_mask(bt)); + WARN_ONCE(1, "verifier backtracking bug"); + return -EFAULT; + } + /* global subprog always sets R0 */ + bt_clear_reg(bt, BPF_REG_0); + return 0; + } else { + /* static subprog call instruction, which + * means that we are exiting current subprog, + * so only r1-r5 could be still requested as + * precise, r0 and r6-r10 or any stack slot in + * the current frame should be zero by now + */ + if (bt_reg_mask(bt) & ~BPF_REGMASK_ARGS) { + verbose(env, "BUG regs %x\n", bt_reg_mask(bt)); + WARN_ONCE(1, "verifier backtracking bug"); + return -EFAULT; + } + /* we don't track register spills perfectly, + * so fallback to force-precise instead of failing */ + if (bt_stack_mask(bt) != 0) + return -ENOTSUPP; + /* propagate r1-r5 to the caller */ + for (i = BPF_REG_1; i <= BPF_REG_5; i++) { + if (bt_is_reg_set(bt, i)) { + bt_clear_reg(bt, i); + bt_set_frame_reg(bt, bt->frame - 1, i); + } + } + if (bt_subprog_exit(bt)) + return -EFAULT; + return 0; + } + } else if ((bpf_helper_call(insn) && + is_callback_calling_function(insn->imm) && + !is_async_callback_calling_function(insn->imm)) || + (bpf_pseudo_kfunc_call(insn) && 
is_callback_calling_kfunc(insn->imm))) { + /* callback-calling helper or kfunc call, which means + * we are exiting from subprog, but unlike the subprog + * call handling above, we shouldn't propagate + * precision of r1-r5 (if any requested), as they are + * not actually arguments passed directly to callback + * subprogs */ - if (insn->src_reg == 0 && is_callback_calling_function(insn->imm)) + if (bt_reg_mask(bt) & ~BPF_REGMASK_ARGS) { + verbose(env, "BUG regs %x\n", bt_reg_mask(bt)); + WARN_ONCE(1, "verifier backtracking bug"); + return -EFAULT; + } + if (bt_stack_mask(bt) != 0) return -ENOTSUPP; + /* clear r1-r5 in callback subprog's mask */ + for (i = BPF_REG_1; i <= BPF_REG_5; i++) + bt_clear_reg(bt, i); + if (bt_subprog_exit(bt)) + return -EFAULT; + return 0; + } else if (opcode == BPF_CALL) { /* kfunc with imm==0 is invalid and fixup_kfunc_call will * catch this error later. Make backtracking conservative * with ENOTSUPP. @@ -3265,19 +3580,51 @@ static int backtrack_insn(struct bpf_verifier_env *env, int idx, if (insn->src_reg == BPF_PSEUDO_KFUNC_CALL && insn->imm == 0) return -ENOTSUPP; /* regular helper call sets R0 */ - *reg_mask &= ~1; - if (*reg_mask & 0x3f) { + bt_clear_reg(bt, BPF_REG_0); + if (bt_reg_mask(bt) & BPF_REGMASK_ARGS) { /* if backtracing was looking for registers R1-R5 * they should have been found already. */ - verbose(env, "BUG regs %x\n", *reg_mask); + verbose(env, "BUG regs %x\n", bt_reg_mask(bt)); WARN_ONCE(1, "verifier backtracking bug"); return -EFAULT; } } else if (opcode == BPF_EXIT) { - return -ENOTSUPP; + bool r0_precise; + + if (bt_reg_mask(bt) & BPF_REGMASK_ARGS) { + /* if backtracing was looking for registers R1-R5 + * they should have been found already. + */ + verbose(env, "BUG regs %x\n", bt_reg_mask(bt)); + WARN_ONCE(1, "verifier backtracking bug"); + return -EFAULT; + } + + /* BPF_EXIT in subprog or callback always returns + * right after the call instruction, so by checking + * whether the instruction at subseq_idx-1 is subprog + * call or not we can distinguish actual exit from + * *subprog* from exit from *callback*. In the former + * case, we need to propagate r0 precision, if + * necessary. In the former we never do that. + */ + r0_precise = subseq_idx - 1 >= 0 && + bpf_pseudo_call(&env->prog->insnsi[subseq_idx - 1]) && + bt_is_reg_set(bt, BPF_REG_0); + + bt_clear_reg(bt, BPF_REG_0); + if (bt_subprog_enter(bt)) + return -EFAULT; + + if (r0_precise) + bt_set_reg(bt, BPF_REG_0); + /* r6-r9 and stack slots will stay set in caller frame + * bitmasks until we return back from callee(s) + */ + return 0; } else if (BPF_SRC(insn->code) == BPF_X) { - if (!(*reg_mask & (dreg | sreg))) + if (!bt_is_reg_set(bt, dreg) && !bt_is_reg_set(bt, sreg)) return 0; /* dreg <cond> sreg * Both dreg and sreg need precision before @@ -3285,7 +3632,8 @@ static int backtrack_insn(struct bpf_verifier_env *env, int idx, * before it would be equally necessary to * propagate it to dreg. */ - *reg_mask |= (sreg | dreg); + bt_set_reg(bt, dreg); + bt_set_reg(bt, sreg); /* else dreg <cond> K * Only dreg still needs precision before * this insn, so for the K-based conditional @@ -3293,9 +3641,9 @@ static int backtrack_insn(struct bpf_verifier_env *env, int idx, */ } } else if (class == BPF_LD) { - if (!(*reg_mask & dreg)) + if (!bt_is_reg_set(bt, dreg)) return 0; - *reg_mask &= ~dreg; + bt_clear_reg(bt, dreg); /* It's ld_imm64 or ld_abs or ld_ind. 
* For ld_imm64 no further tracking of precision * into parent is necessary @@ -3366,6 +3714,11 @@ static void mark_all_scalars_precise(struct bpf_verifier_env *env, struct bpf_reg_state *reg; int i, j; + if (env->log.level & BPF_LOG_LEVEL2) { + verbose(env, "mark_precise: frame%d: falling back to forcing all scalars precise\n", + st->curframe); + } + /* big hammer: mark all scalars precise in this path. * pop_stack may still get !precise scalars. * We also skip current state and go straight to first parent state, @@ -3377,17 +3730,25 @@ static void mark_all_scalars_precise(struct bpf_verifier_env *env, func = st->frame[i]; for (j = 0; j < BPF_REG_FP; j++) { reg = &func->regs[j]; - if (reg->type != SCALAR_VALUE) + if (reg->type != SCALAR_VALUE || reg->precise) continue; reg->precise = true; + if (env->log.level & BPF_LOG_LEVEL2) { + verbose(env, "force_precise: frame%d: forcing r%d to be precise\n", + i, j); + } } for (j = 0; j < func->allocated_stack / BPF_REG_SIZE; j++) { if (!is_spilled_reg(&func->stack[j])) continue; reg = &func->stack[j].spilled_ptr; - if (reg->type != SCALAR_VALUE) + if (reg->type != SCALAR_VALUE || reg->precise) continue; reg->precise = true; + if (env->log.level & BPF_LOG_LEVEL2) { + verbose(env, "force_precise: frame%d: forcing fp%d to be precise\n", + i, -(j + 1) * 8); + } } } } @@ -3418,6 +3779,96 @@ static void mark_all_scalars_imprecise(struct bpf_verifier_env *env, struct bpf_ } } +static bool idset_contains(struct bpf_idset *s, u32 id) +{ + u32 i; + + for (i = 0; i < s->count; ++i) + if (s->ids[i] == id) + return true; + + return false; +} + +static int idset_push(struct bpf_idset *s, u32 id) +{ + if (WARN_ON_ONCE(s->count >= ARRAY_SIZE(s->ids))) + return -EFAULT; + s->ids[s->count++] = id; + return 0; +} + +static void idset_reset(struct bpf_idset *s) +{ + s->count = 0; +} + +/* Collect a set of IDs for all registers currently marked as precise in env->bt. + * Mark all registers with these IDs as precise. 
+ */ +static int mark_precise_scalar_ids(struct bpf_verifier_env *env, struct bpf_verifier_state *st) +{ + struct bpf_idset *precise_ids = &env->idset_scratch; + struct backtrack_state *bt = &env->bt; + struct bpf_func_state *func; + struct bpf_reg_state *reg; + DECLARE_BITMAP(mask, 64); + int i, fr; + + idset_reset(precise_ids); + + for (fr = bt->frame; fr >= 0; fr--) { + func = st->frame[fr]; + + bitmap_from_u64(mask, bt_frame_reg_mask(bt, fr)); + for_each_set_bit(i, mask, 32) { + reg = &func->regs[i]; + if (!reg->id || reg->type != SCALAR_VALUE) + continue; + if (idset_push(precise_ids, reg->id)) + return -EFAULT; + } + + bitmap_from_u64(mask, bt_frame_stack_mask(bt, fr)); + for_each_set_bit(i, mask, 64) { + if (i >= func->allocated_stack / BPF_REG_SIZE) + break; + if (!is_spilled_scalar_reg(&func->stack[i])) + continue; + reg = &func->stack[i].spilled_ptr; + if (!reg->id) + continue; + if (idset_push(precise_ids, reg->id)) + return -EFAULT; + } + } + + for (fr = 0; fr <= st->curframe; ++fr) { + func = st->frame[fr]; + + for (i = BPF_REG_0; i < BPF_REG_10; ++i) { + reg = &func->regs[i]; + if (!reg->id) + continue; + if (!idset_contains(precise_ids, reg->id)) + continue; + bt_set_frame_reg(bt, fr, i); + } + for (i = 0; i < func->allocated_stack / BPF_REG_SIZE; ++i) { + if (!is_spilled_scalar_reg(&func->stack[i])) + continue; + reg = &func->stack[i].spilled_ptr; + if (!reg->id) + continue; + if (!idset_contains(precise_ids, reg->id)) + continue; + bt_set_frame_slot(bt, fr, i); + } + } + + return 0; +} + /* * __mark_chain_precision() backtracks BPF program instruction sequence and * chain of verifier states making sure that register *regno* (if regno >= 0) @@ -3505,62 +3956,74 @@ static void mark_all_scalars_imprecise(struct bpf_verifier_env *env, struct bpf_ * mark_all_scalars_imprecise() to hopefully get more permissive and generic * finalized states which help in short circuiting more future states. */ -static int __mark_chain_precision(struct bpf_verifier_env *env, int frame, int regno, - int spi) +static int __mark_chain_precision(struct bpf_verifier_env *env, int regno) { + struct backtrack_state *bt = &env->bt; struct bpf_verifier_state *st = env->cur_state; int first_idx = st->first_insn_idx; int last_idx = env->insn_idx; + int subseq_idx = -1; struct bpf_func_state *func; struct bpf_reg_state *reg; - u32 reg_mask = regno >= 0 ? 1u << regno : 0; - u64 stack_mask = spi >= 0 ? 1ull << spi : 0; bool skip_first = true; - bool new_marks = false; - int i, err; + int i, fr, err; if (!env->bpf_capable) return 0; + /* set frame number from which we are starting to backtrack */ + bt_init(bt, env->cur_state->curframe); + /* Do sanity checks against current state of register and/or stack * slot, but don't set precise flag in current state, as precision * tracking in the current state is unnecessary. 
*/ - func = st->frame[frame]; + func = st->frame[bt->frame]; if (regno >= 0) { reg = &func->regs[regno]; if (reg->type != SCALAR_VALUE) { WARN_ONCE(1, "backtracing misuse"); return -EFAULT; } - new_marks = true; + bt_set_reg(bt, regno); } - while (spi >= 0) { - if (!is_spilled_reg(&func->stack[spi])) { - stack_mask = 0; - break; - } - reg = &func->stack[spi].spilled_ptr; - if (reg->type != SCALAR_VALUE) { - stack_mask = 0; - break; - } - new_marks = true; - break; - } - - if (!new_marks) - return 0; - if (!reg_mask && !stack_mask) + if (bt_empty(bt)) return 0; for (;;) { DECLARE_BITMAP(mask, 64); u32 history = st->jmp_history_cnt; - if (env->log.level & BPF_LOG_LEVEL2) - verbose(env, "last_idx %d first_idx %d\n", last_idx, first_idx); + if (env->log.level & BPF_LOG_LEVEL2) { + verbose(env, "mark_precise: frame%d: last_idx %d first_idx %d subseq_idx %d \n", + bt->frame, last_idx, first_idx, subseq_idx); + } + + /* If some register with scalar ID is marked as precise, + * make sure that all registers sharing this ID are also precise. + * This is needed to estimate effect of find_equal_scalars(). + * Do this at the last instruction of each state, + * bpf_reg_state::id fields are valid for these instructions. + * + * Allows to track precision in situation like below: + * + * r2 = unknown value + * ... + * --- state #0 --- + * ... + * r1 = r2 // r1 and r2 now share the same ID + * ... + * --- state #1 {r1.id = A, r2.id = A} --- + * ... + * if (r2 > 10) goto exit; // find_equal_scalars() assigns range to r1 + * ... + * --- state #2 {r1.id = A, r2.id = A} --- + * r3 = r10 + * r3 += r1 // need to mark both r1 and r2 + */ + if (mark_precise_scalar_ids(env, st)) + return -EFAULT; if (last_idx < 0) { /* we are at the entry into subprog, which @@ -3571,12 +4034,13 @@ static int __mark_chain_precision(struct bpf_verifier_env *env, int frame, int r if (st->curframe == 0 && st->frame[0]->subprogno > 0 && st->frame[0]->callsite == BPF_MAIN_FUNC && - stack_mask == 0 && (reg_mask & ~0x3e) == 0) { - bitmap_from_u64(mask, reg_mask); + bt_stack_mask(bt) == 0 && + (bt_reg_mask(bt) & ~BPF_REGMASK_ARGS) == 0) { + bitmap_from_u64(mask, bt_reg_mask(bt)); for_each_set_bit(i, mask, 32) { reg = &st->frame[0]->regs[i]; if (reg->type != SCALAR_VALUE) { - reg_mask &= ~(1u << i); + bt_clear_reg(bt, i); continue; } reg->precise = true; @@ -3584,8 +4048,8 @@ static int __mark_chain_precision(struct bpf_verifier_env *env, int frame, int r return 0; } - verbose(env, "BUG backtracing func entry subprog %d reg_mask %x stack_mask %llx\n", - st->frame[0]->subprogno, reg_mask, stack_mask); + verbose(env, "BUG backtracking func entry subprog %d reg_mask %x stack_mask %llx\n", + st->frame[0]->subprogno, bt_reg_mask(bt), bt_stack_mask(bt)); WARN_ONCE(1, "verifier backtracking bug"); return -EFAULT; } @@ -3595,15 +4059,16 @@ static int __mark_chain_precision(struct bpf_verifier_env *env, int frame, int r err = 0; skip_first = false; } else { - err = backtrack_insn(env, i, ®_mask, &stack_mask); + err = backtrack_insn(env, i, subseq_idx, bt); } if (err == -ENOTSUPP) { - mark_all_scalars_precise(env, st); + mark_all_scalars_precise(env, env->cur_state); + bt_reset(bt); return 0; } else if (err) { return err; } - if (!reg_mask && !stack_mask) + if (bt_empty(bt)) /* Found assignment(s) into tracked register in this state. * Since this state is already marked, just return. * Nothing to be tracked further in the parent state. 
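
(Aside, not part of the patch: the hunks above replace the old per-call reg_mask/stack_mask pair with a per-frame backtrack_state plus bt_*() helpers and the fmt_reg_mask()/fmt_stack_mask() log formatters. Below is a minimal, user-space-only C sketch of that bookkeeping; MAX_FRAMES, struct bt_state and main() are illustrative stand-ins, and only the mask layout and the "r0,r2,r4"-style formatting are meant to mirror the helpers shown above.)

/*
 * Illustrative sketch only, NOT part of the diff above: a tiny
 * user-space model of per-frame precision masks, one u32 register
 * mask and one u64 stack-slot mask per call frame.
 */
#include <stdint.h>
#include <stdio.h>

#define MAX_FRAMES 8	/* stand-in for the kernel's MAX_CALL_FRAMES */

struct bt_state {
	uint32_t reg_masks[MAX_FRAMES];		/* bit i set: register ri needs precision */
	uint64_t stack_masks[MAX_FRAMES];	/* bit i set: stack slot fp-8*(i+1) needs precision */
	int frame;				/* frame currently being backtracked */
};

static void bt_set_frame_reg(struct bt_state *bt, int frame, int reg)
{
	bt->reg_masks[frame] |= 1u << reg;
}

static void bt_clear_frame_reg(struct bt_state *bt, int frame, int reg)
{
	bt->reg_masks[frame] &= ~(1u << reg);
}

static int bt_empty(const struct bt_state *bt)
{
	uint64_t mask = 0;
	int i;

	/* nothing left to track once every frame's masks are clear */
	for (i = 0; i <= bt->frame; i++)
		mask |= bt->reg_masks[i] | bt->stack_masks[i];
	return mask == 0;
}

/* format a register mask as "r0,r2,r4" for 0x15, like fmt_reg_mask() above */
static void fmt_reg_mask(char *buf, size_t sz, uint32_t mask)
{
	size_t off = 0;
	int first = 1, i;

	buf[0] = '\0';
	for (i = 0; i < 32; i++) {
		if (!(mask & (1u << i)))
			continue;
		off += snprintf(buf + off, sz - off, "%sr%d", first ? "" : ",", i);
		first = 0;
		if (off >= sz)
			break;
	}
}

int main(void)
{
	struct bt_state bt = { .frame = 1 };
	char buf[64];

	/* caller frame wants r6 precise, current frame wants r1 and r2 */
	bt_set_frame_reg(&bt, 0, 6);
	bt_set_frame_reg(&bt, 1, 1);
	bt_set_frame_reg(&bt, 1, 2);

	fmt_reg_mask(buf, sizeof(buf), bt.reg_masks[1]);
	printf("frame1 regs=%s\n", buf);	/* prints: frame1 regs=r1,r2 */

	bt_clear_frame_reg(&bt, 1, 1);
	bt_clear_frame_reg(&bt, 1, 2);
	bt_clear_frame_reg(&bt, 0, 6);
	printf("empty=%d\n", bt_empty(&bt));	/* prints: empty=1 */

	return 0;
}

Running the sketch prints "frame1 regs=r1,r2", which is the same shape of output the new "mark_precise: frame%d: regs=%s" log lines in the patch produce.
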
@@ -3611,6 +4076,7 @@ static int __mark_chain_precision(struct bpf_verifier_env *env, int frame, int r return 0; if (i == first_idx) break; + subseq_idx = i; i = get_prev_insn_idx(st, i, &history); if (i >= env->prog->len) { /* This can happen if backtracking reached insn 0 @@ -3628,84 +4094,95 @@ static int __mark_chain_precision(struct bpf_verifier_env *env, int frame, int r if (!st) break; - new_marks = false; - func = st->frame[frame]; - bitmap_from_u64(mask, reg_mask); - for_each_set_bit(i, mask, 32) { - reg = &func->regs[i]; - if (reg->type != SCALAR_VALUE) { - reg_mask &= ~(1u << i); - continue; + for (fr = bt->frame; fr >= 0; fr--) { + func = st->frame[fr]; + bitmap_from_u64(mask, bt_frame_reg_mask(bt, fr)); + for_each_set_bit(i, mask, 32) { + reg = &func->regs[i]; + if (reg->type != SCALAR_VALUE) { + bt_clear_frame_reg(bt, fr, i); + continue; + } + if (reg->precise) + bt_clear_frame_reg(bt, fr, i); + else + reg->precise = true; } - if (!reg->precise) - new_marks = true; - reg->precise = true; - } - bitmap_from_u64(mask, stack_mask); - for_each_set_bit(i, mask, 64) { - if (i >= func->allocated_stack / BPF_REG_SIZE) { - /* the sequence of instructions: - * 2: (bf) r3 = r10 - * 3: (7b) *(u64 *)(r3 -8) = r0 - * 4: (79) r4 = *(u64 *)(r10 -8) - * doesn't contain jmps. It's backtracked - * as a single block. - * During backtracking insn 3 is not recognized as - * stack access, so at the end of backtracking - * stack slot fp-8 is still marked in stack_mask. - * However the parent state may not have accessed - * fp-8 and it's "unallocated" stack space. - * In such case fallback to conservative. - */ - mark_all_scalars_precise(env, st); - return 0; - } + bitmap_from_u64(mask, bt_frame_stack_mask(bt, fr)); + for_each_set_bit(i, mask, 64) { + if (i >= func->allocated_stack / BPF_REG_SIZE) { + /* the sequence of instructions: + * 2: (bf) r3 = r10 + * 3: (7b) *(u64 *)(r3 -8) = r0 + * 4: (79) r4 = *(u64 *)(r10 -8) + * doesn't contain jmps. It's backtracked + * as a single block. + * During backtracking insn 3 is not recognized as + * stack access, so at the end of backtracking + * stack slot fp-8 is still marked in stack_mask. + * However the parent state may not have accessed + * fp-8 and it's "unallocated" stack space. + * In such case fallback to conservative. + */ + mark_all_scalars_precise(env, env->cur_state); + bt_reset(bt); + return 0; + } - if (!is_spilled_reg(&func->stack[i])) { - stack_mask &= ~(1ull << i); - continue; + if (!is_spilled_scalar_reg(&func->stack[i])) { + bt_clear_frame_slot(bt, fr, i); + continue; + } + reg = &func->stack[i].spilled_ptr; + if (reg->precise) + bt_clear_frame_slot(bt, fr, i); + else + reg->precise = true; } - reg = &func->stack[i].spilled_ptr; - if (reg->type != SCALAR_VALUE) { - stack_mask &= ~(1ull << i); - continue; + if (env->log.level & BPF_LOG_LEVEL2) { + fmt_reg_mask(env->tmp_str_buf, TMP_STR_BUF_LEN, + bt_frame_reg_mask(bt, fr)); + verbose(env, "mark_precise: frame%d: parent state regs=%s ", + fr, env->tmp_str_buf); + fmt_stack_mask(env->tmp_str_buf, TMP_STR_BUF_LEN, + bt_frame_stack_mask(bt, fr)); + verbose(env, "stack=%s: ", env->tmp_str_buf); + print_verifier_state(env, func, true); } - if (!reg->precise) - new_marks = true; - reg->precise = true; - } - if (env->log.level & BPF_LOG_LEVEL2) { - verbose(env, "parent %s regs=%x stack=%llx marks:", - new_marks ? 
"didn't have" : "already had", - reg_mask, stack_mask); - print_verifier_state(env, func, true); } - if (!reg_mask && !stack_mask) - break; - if (!new_marks) - break; + if (bt_empty(bt)) + return 0; + subseq_idx = first_idx; last_idx = st->last_insn_idx; first_idx = st->first_insn_idx; } + + /* if we still have requested precise regs or slots, we missed + * something (e.g., stack access through non-r10 register), so + * fallback to marking all precise + */ + if (!bt_empty(bt)) { + mark_all_scalars_precise(env, env->cur_state); + bt_reset(bt); + } + return 0; } int mark_chain_precision(struct bpf_verifier_env *env, int regno) { - return __mark_chain_precision(env, env->cur_state->curframe, regno, -1); -} - -static int mark_chain_precision_frame(struct bpf_verifier_env *env, int frame, int regno) -{ - return __mark_chain_precision(env, frame, regno, -1); + return __mark_chain_precision(env, regno); } -static int mark_chain_precision_stack_frame(struct bpf_verifier_env *env, int frame, int spi) +/* mark_chain_precision_batch() assumes that env->bt is set in the caller to + * desired reg and stack masks across all relevant frames + */ +static int mark_chain_precision_batch(struct bpf_verifier_env *env) { - return __mark_chain_precision(env, frame, -1, spi); + return __mark_chain_precision(env, -1); } static bool is_spillable_regtype(enum bpf_reg_type type) @@ -3868,6 +4345,9 @@ static int check_stack_write_fixed_off(struct bpf_verifier_env *env, return err; } save_register_state(state, spi, reg, size); + /* Break the relation on a narrowing spill. */ + if (fls64(reg->umax_value) > BITS_PER_BYTE * size) + state->stack[spi].spilled_ptr.id = 0; } else if (!reg && !(off % BPF_REG_SIZE) && is_bpf_st_mem(insn) && insn->imm != 0 && env->bpf_capable) { struct bpf_reg_state fake_reg = {}; @@ -4067,6 +4547,7 @@ static void mark_reg_stack_read(struct bpf_verifier_env *env, for (i = min_off; i < max_off; i++) { slot = -i - 1; spi = slot / BPF_REG_SIZE; + mark_stack_slot_scratched(env, spi); stype = ptr_state->stack[spi].slot_type; if (stype[slot % BPF_REG_SIZE] != STACK_ZERO) break; @@ -4118,6 +4599,8 @@ static int check_stack_read_fixed_off(struct bpf_verifier_env *env, stype = reg_state->stack[spi].slot_type; reg = ®_state->stack[spi].spilled_ptr; + mark_stack_slot_scratched(env, spi); + if (is_spilled_reg(®_state->stack[spi])) { u8 spill_size = 1; @@ -5534,7 +6017,7 @@ static int check_ptr_to_btf_access(struct bpf_verifier_env *env, * program allocated objects (which always have ref_obj_id > 0), * but not for untrusted PTR_TO_BTF_ID | MEM_ALLOC. */ - if (atype != BPF_READ && reg->type != (PTR_TO_BTF_ID | MEM_ALLOC)) { + if (atype != BPF_READ && !type_is_ptr_alloc_obj(reg->type)) { verbose(env, "only read is supported\n"); return -EACCES; } @@ -6677,7 +7160,7 @@ static int process_kptr_func(struct bpf_verifier_env *env, int regno, * type, and declare it as 'const struct bpf_dynptr *' in their prototype. 
*/ static int process_dynptr_func(struct bpf_verifier_env *env, int regno, int insn_idx, - enum bpf_arg_type arg_type) + enum bpf_arg_type arg_type, int clone_ref_obj_id) { struct bpf_reg_state *regs = cur_regs(env), *reg = ®s[regno]; int err; @@ -6721,7 +7204,7 @@ static int process_dynptr_func(struct bpf_verifier_env *env, int regno, int insn return err; } - err = mark_stack_slots_dynptr(env, reg, arg_type, insn_idx); + err = mark_stack_slots_dynptr(env, reg, arg_type, insn_idx, clone_ref_obj_id); } else /* MEM_RDONLY and None case from above */ { /* For the reg->type == PTR_TO_STACK case, bpf_dynptr is never const */ if (reg->type == CONST_PTR_TO_DYNPTR && !(arg_type & MEM_RDONLY)) { @@ -7143,14 +7626,18 @@ static int check_reg_type(struct bpf_verifier_env *env, u32 regno, * ARG_PTR_TO_MEM + MAYBE_NULL is compatible with PTR_TO_MEM and PTR_TO_MEM + MAYBE_NULL, * but ARG_PTR_TO_MEM is compatible only with PTR_TO_MEM but NOT with PTR_TO_MEM + MAYBE_NULL * + * ARG_PTR_TO_MEM is compatible with PTR_TO_MEM that is tagged with a dynptr type. + * * Therefore we fold these flags depending on the arg_type before comparison. */ if (arg_type & MEM_RDONLY) type &= ~MEM_RDONLY; if (arg_type & PTR_MAYBE_NULL) type &= ~PTR_MAYBE_NULL; + if (base_type(arg_type) == ARG_PTR_TO_MEM) + type &= ~DYNPTR_TYPE_FLAG_MASK; - if (meta->func_id == BPF_FUNC_kptr_xchg && type & MEM_ALLOC) + if (meta->func_id == BPF_FUNC_kptr_xchg && type_is_alloc(type)) type &= ~MEM_ALLOC; for (i = 0; i < ARRAY_SIZE(compatible->types); i++) { @@ -7631,7 +8118,7 @@ skip_type_check: err = check_mem_size_reg(env, reg, regno, true, meta); break; case ARG_PTR_TO_DYNPTR: - err = process_dynptr_func(env, regno, insn_idx, arg_type); + err = process_dynptr_func(env, regno, insn_idx, arg_type, 0); if (err) return err; break; @@ -8178,17 +8665,13 @@ static int set_callee_state(struct bpf_verifier_env *env, struct bpf_func_state *caller, struct bpf_func_state *callee, int insn_idx); -static bool is_callback_calling_kfunc(u32 btf_id); - static int __check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn, int *insn_idx, int subprog, set_callee_state_fn set_callee_state_cb) { struct bpf_verifier_state *state = env->cur_state; - struct bpf_func_info_aux *func_info_aux; struct bpf_func_state *caller, *callee; int err; - bool is_global = false; if (state->curframe + 1 >= MAX_CALL_FRAMES) { verbose(env, "the call stack of %d frames is too deep\n", @@ -8203,13 +8686,10 @@ static int __check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn return -EFAULT; } - func_info_aux = env->prog->aux->func_info_aux; - if (func_info_aux) - is_global = func_info_aux[subprog].linkage == BTF_FUNC_GLOBAL; err = btf_check_subprog_call(env, subprog, caller->regs); if (err == -EFAULT) return err; - if (is_global) { + if (subprog_is_global(env, subprog)) { if (err) { verbose(env, "Caller passes invalid args into func#%d\n", subprog); @@ -9324,11 +9804,6 @@ static bool is_kfunc_acquire(struct bpf_kfunc_call_arg_meta *meta) return meta->kfunc_flags & KF_ACQUIRE; } -static bool is_kfunc_ret_null(struct bpf_kfunc_call_arg_meta *meta) -{ - return meta->kfunc_flags & KF_RET_NULL; -} - static bool is_kfunc_release(struct bpf_kfunc_call_arg_meta *meta) { return meta->kfunc_flags & KF_RELEASE; @@ -9398,6 +9873,11 @@ static bool is_kfunc_arg_const_mem_size(const struct btf *btf, return __kfunc_param_match_suffix(btf, arg, "__szk"); } +static bool is_kfunc_arg_optional(const struct btf *btf, const struct btf_param *arg) +{ + return 
__kfunc_param_match_suffix(btf, arg, "__opt"); +} + static bool is_kfunc_arg_constant(const struct btf *btf, const struct btf_param *arg) { return __kfunc_param_match_suffix(btf, arg, "__k"); @@ -9595,6 +10075,7 @@ enum special_kfunc_type { KF_bpf_dynptr_from_xdp, KF_bpf_dynptr_slice, KF_bpf_dynptr_slice_rdwr, + KF_bpf_dynptr_clone, }; BTF_SET_START(special_kfunc_set) @@ -9614,6 +10095,7 @@ BTF_ID(func, bpf_dynptr_from_skb) BTF_ID(func, bpf_dynptr_from_xdp) BTF_ID(func, bpf_dynptr_slice) BTF_ID(func, bpf_dynptr_slice_rdwr) +BTF_ID(func, bpf_dynptr_clone) BTF_SET_END(special_kfunc_set) BTF_ID_LIST(special_kfunc_list) @@ -9635,6 +10117,17 @@ BTF_ID(func, bpf_dynptr_from_skb) BTF_ID(func, bpf_dynptr_from_xdp) BTF_ID(func, bpf_dynptr_slice) BTF_ID(func, bpf_dynptr_slice_rdwr) +BTF_ID(func, bpf_dynptr_clone) + +static bool is_kfunc_ret_null(struct bpf_kfunc_call_arg_meta *meta) +{ + if (meta->func_id == special_kfunc_list[KF_bpf_refcount_acquire_impl] && + meta->arg_owning_ref) { + return false; + } + + return meta->kfunc_flags & KF_RET_NULL; +} static bool is_kfunc_bpf_rcu_read_lock(struct bpf_kfunc_call_arg_meta *meta) { @@ -10113,6 +10606,8 @@ __process_kf_arg_ptr_to_graph_node(struct bpf_verifier_env *env, node_off, btf_name_by_offset(reg->btf, t->name_off)); return -EINVAL; } + meta->arg_btf = reg->btf; + meta->arg_btf_id = reg->btf_id; if (node_off != field->graph_root.node_offset) { verbose(env, "arg#1 offset=%d, but expected %s at offset=%d in struct %s\n", @@ -10323,13 +10818,14 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ } if (meta->btf == btf_vmlinux && meta->func_id == special_kfunc_list[KF_bpf_obj_drop_impl]) { - meta->arg_obj_drop.btf = reg->btf; - meta->arg_obj_drop.btf_id = reg->btf_id; + meta->arg_btf = reg->btf; + meta->arg_btf_id = reg->btf_id; } break; case KF_ARG_PTR_TO_DYNPTR: { enum bpf_arg_type dynptr_arg_type = ARG_PTR_TO_DYNPTR; + int clone_ref_obj_id = 0; if (reg->type != PTR_TO_STACK && reg->type != CONST_PTR_TO_DYNPTR) { @@ -10343,12 +10839,28 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ if (is_kfunc_arg_uninit(btf, &args[i])) dynptr_arg_type |= MEM_UNINIT; - if (meta->func_id == special_kfunc_list[KF_bpf_dynptr_from_skb]) + if (meta->func_id == special_kfunc_list[KF_bpf_dynptr_from_skb]) { dynptr_arg_type |= DYNPTR_TYPE_SKB; - else if (meta->func_id == special_kfunc_list[KF_bpf_dynptr_from_xdp]) + } else if (meta->func_id == special_kfunc_list[KF_bpf_dynptr_from_xdp]) { dynptr_arg_type |= DYNPTR_TYPE_XDP; + } else if (meta->func_id == special_kfunc_list[KF_bpf_dynptr_clone] && + (dynptr_arg_type & MEM_UNINIT)) { + enum bpf_dynptr_type parent_type = meta->initialized_dynptr.type; - ret = process_dynptr_func(env, regno, insn_idx, dynptr_arg_type); + if (parent_type == BPF_DYNPTR_TYPE_INVALID) { + verbose(env, "verifier internal error: no dynptr type for parent of clone\n"); + return -EFAULT; + } + + dynptr_arg_type |= (unsigned int)get_dynptr_type_flag(parent_type); + clone_ref_obj_id = meta->initialized_dynptr.ref_obj_id; + if (dynptr_type_refcounted(parent_type) && !clone_ref_obj_id) { + verbose(env, "verifier internal error: missing ref obj id for parent of clone\n"); + return -EFAULT; + } + } + + ret = process_dynptr_func(env, regno, insn_idx, dynptr_arg_type, clone_ref_obj_id); if (ret < 0) return ret; @@ -10361,6 +10873,7 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ } meta->initialized_dynptr.id = id; meta->initialized_dynptr.type = 
dynptr_get_type(env, reg); + meta->initialized_dynptr.ref_obj_id = dynptr_ref_obj_id(env, reg); } break; @@ -10464,13 +10977,17 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ break; case KF_ARG_PTR_TO_MEM_SIZE: { + struct bpf_reg_state *buff_reg = ®s[regno]; + const struct btf_param *buff_arg = &args[i]; struct bpf_reg_state *size_reg = ®s[regno + 1]; const struct btf_param *size_arg = &args[i + 1]; - ret = check_kfunc_mem_size_reg(env, size_reg, regno + 1); - if (ret < 0) { - verbose(env, "arg#%d arg#%d memory, len pair leads to invalid memory access\n", i, i + 1); - return ret; + if (!register_is_null(buff_reg) || !is_kfunc_arg_optional(meta->btf, buff_arg)) { + ret = check_kfunc_mem_size_reg(env, size_reg, regno + 1); + if (ret < 0) { + verbose(env, "arg#%d arg#%d memory, len pair leads to invalid memory access\n", i, i + 1); + return ret; + } } if (is_kfunc_arg_const_mem_size(meta->btf, size_arg, size_reg)) { @@ -10494,10 +11011,12 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ meta->subprogno = reg->subprogno; break; case KF_ARG_PTR_TO_REFCOUNTED_KPTR: - if (!type_is_ptr_alloc_obj(reg->type) && !type_is_non_owning_ref(reg->type)) { + if (!type_is_ptr_alloc_obj(reg->type)) { verbose(env, "arg#%d is neither owning or non-owning ref\n", i); return -EINVAL; } + if (!type_is_non_owning_ref(reg->type)) + meta->arg_owning_ref = true; rec = reg_btf_record(reg); if (!rec) { @@ -10513,8 +11032,8 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ verbose(env, "bpf_refcount_acquire calls are disabled for now\n"); return -EINVAL; } - meta->arg_refcount_acquire.btf = reg->btf; - meta->arg_refcount_acquire.btf_id = reg->btf_id; + meta->arg_btf = reg->btf; + meta->arg_btf_id = reg->btf_id; break; } } @@ -10555,7 +11074,7 @@ static int fetch_kfunc_meta(struct bpf_verifier_env *env, *kfunc_name = func_name; func_proto = btf_type_by_id(desc_btf, func->type); - kfunc_flags = btf_kfunc_id_set_contains(desc_btf, resolve_prog_type(env->prog), func_id); + kfunc_flags = btf_kfunc_id_set_contains(desc_btf, func_id, env->prog); if (!kfunc_flags) { return -EACCES; } @@ -10660,6 +11179,7 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, meta.func_id == special_kfunc_list[KF_bpf_rbtree_add_impl]) { release_ref_obj_id = regs[BPF_REG_2].ref_obj_id; insn_aux->insert_off = regs[BPF_REG_2].off; + insn_aux->kptr_struct_meta = btf_find_struct_meta(meta.arg_btf, meta.arg_btf_id); err = ref_convert_owning_non_owning(env, release_ref_obj_id); if (err) { verbose(env, "kfunc %s#%d conversion of owning ref to non-owning failed\n", @@ -10746,12 +11266,12 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, } else if (meta.func_id == special_kfunc_list[KF_bpf_refcount_acquire_impl]) { mark_reg_known_zero(env, regs, BPF_REG_0); regs[BPF_REG_0].type = PTR_TO_BTF_ID | MEM_ALLOC; - regs[BPF_REG_0].btf = meta.arg_refcount_acquire.btf; - regs[BPF_REG_0].btf_id = meta.arg_refcount_acquire.btf_id; + regs[BPF_REG_0].btf = meta.arg_btf; + regs[BPF_REG_0].btf_id = meta.arg_btf_id; insn_aux->kptr_struct_meta = - btf_find_struct_meta(meta.arg_refcount_acquire.btf, - meta.arg_refcount_acquire.btf_id); + btf_find_struct_meta(meta.arg_btf, + meta.arg_btf_id); } else if (meta.func_id == special_kfunc_list[KF_bpf_list_pop_front] || meta.func_id == special_kfunc_list[KF_bpf_list_pop_back]) { struct btf_field *field = meta.arg_list_head.field; @@ -10881,8 +11401,8 @@ static int 
check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, if (meta.btf == btf_vmlinux && btf_id_set_contains(&special_kfunc_set, meta.func_id)) { if (meta.func_id == special_kfunc_list[KF_bpf_obj_drop_impl]) { insn_aux->kptr_struct_meta = - btf_find_struct_meta(meta.arg_obj_drop.btf, - meta.arg_obj_drop.btf_id); + btf_find_struct_meta(meta.arg_btf, + meta.arg_btf_id); } } } @@ -12417,12 +12937,14 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) if (BPF_SRC(insn->code) == BPF_X) { struct bpf_reg_state *src_reg = regs + insn->src_reg; struct bpf_reg_state *dst_reg = regs + insn->dst_reg; + bool need_id = src_reg->type == SCALAR_VALUE && !src_reg->id && + !tnum_is_const(src_reg->var_off); if (BPF_CLASS(insn->code) == BPF_ALU64) { /* case: R1 = R2 * copy register state to dest reg */ - if (src_reg->type == SCALAR_VALUE && !src_reg->id) + if (need_id) /* Assign src and dst registers the same ID * that will be used by find_equal_scalars() * to propagate min/max range. @@ -12441,7 +12963,7 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) } else if (src_reg->type == SCALAR_VALUE) { bool is_src_reg_u32 = src_reg->umax_value <= U32_MAX; - if (is_src_reg_u32 && !src_reg->id) + if (is_src_reg_u32 && need_id) src_reg->id = ++env->id_gen; copy_register_state(dst_reg, src_reg); /* Make sure ID is cleared if src_reg is not in u32 range otherwise @@ -12773,7 +13295,7 @@ static int is_branch_taken(struct bpf_reg_state *reg, u64 val, u8 opcode, bool is_jmp32) { if (__is_pointer_value(false, reg)) { - if (!reg_type_not_null(reg->type)) + if (!reg_not_null(reg)) return -1; /* If pointer is valid tests against zero will fail so we can @@ -14597,8 +15119,9 @@ static bool range_within(struct bpf_reg_state *old, * So we look through our idmap to see if this old id has been seen before. If * so, we require the new id to match; otherwise, we add the id pair to the map. */ -static bool check_ids(u32 old_id, u32 cur_id, struct bpf_id_pair *idmap) +static bool check_ids(u32 old_id, u32 cur_id, struct bpf_idmap *idmap) { + struct bpf_id_pair *map = idmap->map; unsigned int i; /* either both IDs should be set or both should be zero */ @@ -14609,20 +15132,34 @@ static bool check_ids(u32 old_id, u32 cur_id, struct bpf_id_pair *idmap) return true; for (i = 0; i < BPF_ID_MAP_SIZE; i++) { - if (!idmap[i].old) { + if (!map[i].old) { /* Reached an empty slot; haven't seen this id before */ - idmap[i].old = old_id; - idmap[i].cur = cur_id; + map[i].old = old_id; + map[i].cur = cur_id; return true; } - if (idmap[i].old == old_id) - return idmap[i].cur == cur_id; + if (map[i].old == old_id) + return map[i].cur == cur_id; + if (map[i].cur == cur_id) + return false; } /* We ran out of idmap slots, which should be impossible */ WARN_ON_ONCE(1); return false; } +/* Similar to check_ids(), but allocate a unique temporary ID + * for 'old_id' or 'cur_id' of zero. + * This makes pairs like '0 vs unique ID', 'unique ID vs 0' valid. + */ +static bool check_scalar_ids(u32 old_id, u32 cur_id, struct bpf_idmap *idmap) +{ + old_id = old_id ? old_id : ++idmap->tmp_id_gen; + cur_id = cur_id ? 
cur_id : ++idmap->tmp_id_gen; + + return check_ids(old_id, cur_id, idmap); +} + static void clean_func_state(struct bpf_verifier_env *env, struct bpf_func_state *st) { @@ -14721,7 +15258,7 @@ next: static bool regs_exact(const struct bpf_reg_state *rold, const struct bpf_reg_state *rcur, - struct bpf_id_pair *idmap) + struct bpf_idmap *idmap) { return memcmp(rold, rcur, offsetof(struct bpf_reg_state, id)) == 0 && check_ids(rold->id, rcur->id, idmap) && @@ -14730,7 +15267,7 @@ static bool regs_exact(const struct bpf_reg_state *rold, /* Returns true if (rold safe implies rcur safe) */ static bool regsafe(struct bpf_verifier_env *env, struct bpf_reg_state *rold, - struct bpf_reg_state *rcur, struct bpf_id_pair *idmap) + struct bpf_reg_state *rcur, struct bpf_idmap *idmap) { if (!(rold->live & REG_LIVE_READ)) /* explored state didn't use this */ @@ -14767,15 +15304,42 @@ static bool regsafe(struct bpf_verifier_env *env, struct bpf_reg_state *rold, switch (base_type(rold->type)) { case SCALAR_VALUE: - if (regs_exact(rold, rcur, idmap)) - return true; - if (env->explore_alu_limits) - return false; + if (env->explore_alu_limits) { + /* explore_alu_limits disables tnum_in() and range_within() + * logic and requires everything to be strict + */ + return memcmp(rold, rcur, offsetof(struct bpf_reg_state, id)) == 0 && + check_scalar_ids(rold->id, rcur->id, idmap); + } if (!rold->precise) return true; - /* new val must satisfy old val knowledge */ + /* Why check_ids() for scalar registers? + * + * Consider the following BPF code: + * 1: r6 = ... unbound scalar, ID=a ... + * 2: r7 = ... unbound scalar, ID=b ... + * 3: if (r6 > r7) goto +1 + * 4: r6 = r7 + * 5: if (r6 > X) goto ... + * 6: ... memory operation using r7 ... + * + * First verification path is [1-6]: + * - at (4) same bpf_reg_state::id (b) would be assigned to r6 and r7; + * - at (5) r6 would be marked <= X, find_equal_scalars() would also mark + * r7 <= X, because r6 and r7 share same id. + * Next verification path is [1-4, 6]. + * + * Instruction (6) would be reached in two states: + * I. r6{.id=b}, r7{.id=b} via path 1-6; + * II. r6{.id=a}, r7{.id=b} via path 1-4, 6. + * + * Use check_ids() to distinguish these states. + * --- + * Also verify that new value satisfies old value range knowledge. 
+ */ return range_within(rold, rcur) && - tnum_in(rold->var_off, rcur->var_off); + tnum_in(rold->var_off, rcur->var_off) && + check_scalar_ids(rold->id, rcur->id, idmap); case PTR_TO_MAP_KEY: case PTR_TO_MAP_VALUE: case PTR_TO_MEM: @@ -14821,7 +15385,7 @@ static bool regsafe(struct bpf_verifier_env *env, struct bpf_reg_state *rold, } static bool stacksafe(struct bpf_verifier_env *env, struct bpf_func_state *old, - struct bpf_func_state *cur, struct bpf_id_pair *idmap) + struct bpf_func_state *cur, struct bpf_idmap *idmap) { int i, spi; @@ -14924,7 +15488,7 @@ static bool stacksafe(struct bpf_verifier_env *env, struct bpf_func_state *old, } static bool refsafe(struct bpf_func_state *old, struct bpf_func_state *cur, - struct bpf_id_pair *idmap) + struct bpf_idmap *idmap) { int i; @@ -14972,13 +15536,13 @@ static bool func_states_equal(struct bpf_verifier_env *env, struct bpf_func_stat for (i = 0; i < MAX_BPF_REG; i++) if (!regsafe(env, &old->regs[i], &cur->regs[i], - env->idmap_scratch)) + &env->idmap_scratch)) return false; - if (!stacksafe(env, old, cur, env->idmap_scratch)) + if (!stacksafe(env, old, cur, &env->idmap_scratch)) return false; - if (!refsafe(old, cur, env->idmap_scratch)) + if (!refsafe(old, cur, &env->idmap_scratch)) return false; return true; @@ -14993,7 +15557,8 @@ static bool states_equal(struct bpf_verifier_env *env, if (old->curframe != cur->curframe) return false; - memset(env->idmap_scratch, 0, sizeof(env->idmap_scratch)); + env->idmap_scratch.tmp_id_gen = env->id_gen; + memset(&env->idmap_scratch.map, 0, sizeof(env->idmap_scratch.map)); /* Verification state from speculative execution simulation * must never prune a non-speculative execution one. @@ -15011,7 +15576,7 @@ static bool states_equal(struct bpf_verifier_env *env, return false; if (old->active_lock.id && - !check_ids(old->active_lock.id, cur->active_lock.id, env->idmap_scratch)) + !check_ids(old->active_lock.id, cur->active_lock.id, &env->idmap_scratch)) return false; if (old->active_rcu_lock != cur->active_rcu_lock) @@ -15118,20 +15683,25 @@ static int propagate_precision(struct bpf_verifier_env *env, struct bpf_reg_state *state_reg; struct bpf_func_state *state; int i, err = 0, fr; + bool first; for (fr = old->curframe; fr >= 0; fr--) { state = old->frame[fr]; state_reg = state->regs; + first = true; for (i = 0; i < BPF_REG_FP; i++, state_reg++) { if (state_reg->type != SCALAR_VALUE || !state_reg->precise || !(state_reg->live & REG_LIVE_READ)) continue; - if (env->log.level & BPF_LOG_LEVEL2) - verbose(env, "frame %d: propagating r%d\n", fr, i); - err = mark_chain_precision_frame(env, fr, i); - if (err < 0) - return err; + if (env->log.level & BPF_LOG_LEVEL2) { + if (first) + verbose(env, "frame %d: propagating r%d", fr, i); + else + verbose(env, ",r%d", i); + } + bt_set_frame_reg(&env->bt, fr, i); + first = false; } for (i = 0; i < state->allocated_stack / BPF_REG_SIZE; i++) { @@ -15142,14 +15712,24 @@ static int propagate_precision(struct bpf_verifier_env *env, !state_reg->precise || !(state_reg->live & REG_LIVE_READ)) continue; - if (env->log.level & BPF_LOG_LEVEL2) - verbose(env, "frame %d: propagating fp%d\n", - fr, (-i - 1) * BPF_REG_SIZE); - err = mark_chain_precision_stack_frame(env, fr, i); - if (err < 0) - return err; + if (env->log.level & BPF_LOG_LEVEL2) { + if (first) + verbose(env, "frame %d: propagating fp%d", + fr, (-i - 1) * BPF_REG_SIZE); + else + verbose(env, ",fp%d", (-i - 1) * BPF_REG_SIZE); + } + bt_set_frame_slot(&env->bt, fr, i); + first = false; } + if (!first) + verbose(env, 
"\n"); } + + err = mark_chain_precision_batch(env); + if (err < 0) + return err; + return 0; } @@ -17033,7 +17613,7 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env) insn_buf[cnt++] = BPF_ALU64_IMM(BPF_RSH, insn->dst_reg, shift); - insn_buf[cnt++] = BPF_ALU64_IMM(BPF_AND, insn->dst_reg, + insn_buf[cnt++] = BPF_ALU32_IMM(BPF_AND, insn->dst_reg, (1ULL << size * 8) - 1); } } @@ -17214,9 +17794,10 @@ static int jit_subprogs(struct bpf_verifier_env *env) } /* finally lock prog and jit images for all functions and - * populate kallsysm + * populate kallsysm. Begin at the first subprogram, since + * bpf_prog_load will add the kallsyms for the main program. */ - for (i = 0; i < env->subprog_cnt; i++) { + for (i = 1; i < env->subprog_cnt; i++) { bpf_prog_lock_ro(func[i]); bpf_prog_kallsyms_add(func[i]); } @@ -17242,6 +17823,8 @@ static int jit_subprogs(struct bpf_verifier_env *env) prog->jited = 1; prog->bpf_func = func[0]->bpf_func; prog->jited_len = func[0]->jited_len; + prog->aux->extable = func[0]->aux->extable; + prog->aux->num_exentries = func[0]->aux->num_exentries; prog->aux->func = func; prog->aux->func_cnt = env->subprog_cnt; bpf_prog_jit_attempt_done(prog); @@ -18611,7 +19194,8 @@ int bpf_check_attach_target(struct bpf_verifier_log *log, * in the fmodret id set with the KF_SLEEPABLE flag. */ else { - u32 *flags = btf_kfunc_is_modify_return(btf, btf_id); + u32 *flags = btf_kfunc_is_modify_return(btf, btf_id, + prog); if (flags && (*flags & KF_SLEEPABLE)) ret = 0; @@ -18639,7 +19223,7 @@ int bpf_check_attach_target(struct bpf_verifier_log *log, return -EINVAL; } ret = -EINVAL; - if (btf_kfunc_is_modify_return(btf, btf_id) || + if (btf_kfunc_is_modify_return(btf, btf_id, prog) || !check_attach_modify_return(addr, tname)) ret = 0; if (ret) { @@ -18806,6 +19390,8 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, bpfptr_t uattr, __u3 if (!env) return -ENOMEM; + env->bt.env = env; + len = (*prog)->len; env->insn_aux_data = vzalloc(array_size(sizeof(struct bpf_insn_aux_data), len)); diff --git a/kernel/capability.c b/kernel/capability.c index 3e058f41df32..1a2795102ae4 100644 --- a/kernel/capability.c +++ b/kernel/capability.c @@ -467,6 +467,7 @@ EXPORT_SYMBOL(file_ns_capable); /** * privileged_wrt_inode_uidgid - Do capabilities in the namespace work over the inode? * @ns: The user namespace in question + * @idmap: idmap of the mount @inode was found from * @inode: The inode in question * * Return true if the inode uid and gid are within the namespace. 
@@ -481,6 +482,7 @@ bool privileged_wrt_inode_uidgid(struct user_namespace *ns, /** * capable_wrt_inode_uidgid - Check nsown_capable and uid and gid mapped + * @idmap: idmap of the mount @inode was found from * @inode: The inode in question * @cap: The capability in question * diff --git a/kernel/cgroup/cgroup-internal.h b/kernel/cgroup/cgroup-internal.h index 367b0a42ada9..c56071f150f2 100644 --- a/kernel/cgroup/cgroup-internal.h +++ b/kernel/cgroup/cgroup-internal.h @@ -220,8 +220,6 @@ static inline void get_css_set(struct css_set *cset) bool cgroup_ssid_enabled(int ssid); bool cgroup_on_dfl(const struct cgroup *cgrp); -bool cgroup_is_thread_root(struct cgroup *cgrp); -bool cgroup_is_threaded(struct cgroup *cgrp); struct cgroup_root *cgroup_root_from_kf(struct kernfs_root *kf_root); struct cgroup *task_cgroup_from_root(struct task_struct *task, diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c index aeef06c465ef..83044312bc41 100644 --- a/kernel/cgroup/cgroup-v1.c +++ b/kernel/cgroup/cgroup-v1.c @@ -108,7 +108,7 @@ int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from) cgroup_lock(); - percpu_down_write(&cgroup_threadgroup_rwsem); + cgroup_attach_lock(true); /* all tasks in @from are being moved, all csets are source */ spin_lock_irq(&css_set_lock); @@ -144,7 +144,7 @@ int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from) } while (task && !ret); out_err: cgroup_migrate_finish(&mgctx); - percpu_up_write(&cgroup_threadgroup_rwsem); + cgroup_attach_unlock(true); cgroup_unlock(); return ret; } @@ -563,7 +563,7 @@ static ssize_t cgroup_release_agent_write(struct kernfs_open_file *of, if (!cgrp) return -ENODEV; spin_lock(&release_agent_path_lock); - strlcpy(cgrp->root->release_agent_path, strstrip(buf), + strscpy(cgrp->root->release_agent_path, strstrip(buf), sizeof(cgrp->root->release_agent_path)); spin_unlock(&release_agent_path_lock); cgroup_kn_unlock(of->kn); @@ -797,7 +797,7 @@ void cgroup1_release_agent(struct work_struct *work) goto out_free; spin_lock(&release_agent_path_lock); - strlcpy(agentbuf, cgrp->root->release_agent_path, PATH_MAX); + strscpy(agentbuf, cgrp->root->release_agent_path, PATH_MAX); spin_unlock(&release_agent_path_lock); if (!agentbuf[0]) goto out_free; diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 625d7483951c..bfe3cd8ccf36 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -57,6 +57,7 @@ #include <linux/file.h> #include <linux/fs_parser.h> #include <linux/sched/cputime.h> +#include <linux/sched/deadline.h> #include <linux/psi.h> #include <net/sock.h> @@ -312,8 +313,6 @@ bool cgroup_ssid_enabled(int ssid) * masks of ancestors. * * - blkcg: blk-throttle becomes properly hierarchical. - * - * - debug: disallowed on the default hierarchy. */ bool cgroup_on_dfl(const struct cgroup *cgrp) { @@ -356,7 +355,7 @@ static bool cgroup_has_tasks(struct cgroup *cgrp) return cgrp->nr_populated_csets; } -bool cgroup_is_threaded(struct cgroup *cgrp) +static bool cgroup_is_threaded(struct cgroup *cgrp) { return cgrp->dom_cgrp != cgrp; } @@ -395,7 +394,7 @@ static bool cgroup_can_be_thread_root(struct cgroup *cgrp) } /* is @cgrp root of a threaded subtree? 
*/ -bool cgroup_is_thread_root(struct cgroup *cgrp) +static bool cgroup_is_thread_root(struct cgroup *cgrp) { /* thread root should be a domain */ if (cgroup_is_threaded(cgrp)) @@ -618,7 +617,7 @@ EXPORT_SYMBOL_GPL(cgroup_get_e_css); static void cgroup_get_live(struct cgroup *cgrp) { WARN_ON_ONCE(cgroup_is_dead(cgrp)); - css_get(&cgrp->self); + cgroup_get(cgrp); } /** @@ -690,21 +689,6 @@ EXPORT_SYMBOL_GPL(of_css); else /** - * for_each_e_css - iterate all effective css's of a cgroup - * @css: the iteration cursor - * @ssid: the index of the subsystem, CGROUP_SUBSYS_COUNT after reaching the end - * @cgrp: the target cgroup to iterate css's of - * - * Should be called under cgroup_[tree_]mutex. - */ -#define for_each_e_css(css, ssid, cgrp) \ - for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT; (ssid)++) \ - if (!((css) = cgroup_e_css_by_mask(cgrp, \ - cgroup_subsys[(ssid)]))) \ - ; \ - else - -/** * do_each_subsys_mask - filter for_each_subsys with a bitmask * @ss: the iteration cursor * @ssid: the index of @ss, CGROUP_SUBSYS_COUNT after reaching the end @@ -1798,7 +1782,7 @@ int rebind_subsystems(struct cgroup_root *dst_root, u16 ss_mask) { struct cgroup *dcgrp = &dst_root->cgrp; struct cgroup_subsys *ss; - int ssid, i, ret; + int ssid, ret; u16 dfl_disable_ss_mask = 0; lockdep_assert_held(&cgroup_mutex); @@ -1842,7 +1826,8 @@ int rebind_subsystems(struct cgroup_root *dst_root, u16 ss_mask) struct cgroup_root *src_root = ss->root; struct cgroup *scgrp = &src_root->cgrp; struct cgroup_subsys_state *css = cgroup_css(scgrp, ss); - struct css_set *cset; + struct css_set *cset, *cset_pos; + struct css_task_iter *it; WARN_ON(!css || cgroup_css(dcgrp, ss)); @@ -1860,9 +1845,22 @@ int rebind_subsystems(struct cgroup_root *dst_root, u16 ss_mask) css->cgroup = dcgrp; spin_lock_irq(&css_set_lock); - hash_for_each(css_set_table, i, cset, hlist) + WARN_ON(!list_empty(&dcgrp->e_csets[ss->id])); + list_for_each_entry_safe(cset, cset_pos, &scgrp->e_csets[ss->id], + e_cset_node[ss->id]) { list_move_tail(&cset->e_cset_node[ss->id], &dcgrp->e_csets[ss->id]); + /* + * all css_sets of scgrp together in same order to dcgrp, + * patch in-flight iterators to preserve correct iteration. + * since the iterator is always advanced right away and + * finished when it->cset_pos meets it->cset_head, so only + * update it->cset_head is enough here. + */ + list_for_each_entry(it, &cset->task_iters, iters_node) + if (it->cset_head == &scgrp->e_csets[ss->id]) + it->cset_head = &dcgrp->e_csets[ss->id]; + } spin_unlock_irq(&css_set_lock); if (ss->css_rstat_flush) { @@ -2379,45 +2377,6 @@ int cgroup_path_ns(struct cgroup *cgrp, char *buf, size_t buflen, EXPORT_SYMBOL_GPL(cgroup_path_ns); /** - * task_cgroup_path - cgroup path of a task in the first cgroup hierarchy - * @task: target task - * @buf: the buffer to write the path into - * @buflen: the length of the buffer - * - * Determine @task's cgroup on the first (the one with the lowest non-zero - * hierarchy_id) cgroup hierarchy and copy its path into @buf. This - * function grabs cgroup_mutex and shouldn't be used inside locks used by - * cgroup controller callbacks. - * - * Return value is the same as kernfs_path(). 
- */ -int task_cgroup_path(struct task_struct *task, char *buf, size_t buflen) -{ - struct cgroup_root *root; - struct cgroup *cgrp; - int hierarchy_id = 1; - int ret; - - cgroup_lock(); - spin_lock_irq(&css_set_lock); - - root = idr_get_next(&cgroup_hierarchy_idr, &hierarchy_id); - - if (root) { - cgrp = task_cgroup_from_root(task, root); - ret = cgroup_path_ns_locked(cgrp, buf, buflen, &init_cgroup_ns); - } else { - /* if no hierarchy exists, everyone is in "/" */ - ret = strscpy(buf, "/", buflen); - } - - spin_unlock_irq(&css_set_lock); - cgroup_unlock(); - return ret; -} -EXPORT_SYMBOL_GPL(task_cgroup_path); - -/** * cgroup_attach_lock - Lock for ->attach() * @lock_threadgroup: whether to down_write cgroup_threadgroup_rwsem * @@ -2871,9 +2830,9 @@ int cgroup_migrate(struct task_struct *leader, bool threadgroup, struct task_struct *task; /* - * Prevent freeing of tasks while we take a snapshot. Tasks that are - * already PF_EXITING could be freed from underneath us unless we - * take an rcu_read_lock. + * The following thread iteration should be inside an RCU critical + * section to prevent tasks from being freed while taking the snapshot. + * spin_lock_irq() implies RCU critical section here. */ spin_lock_irq(&css_set_lock); task = leader; @@ -3877,6 +3836,14 @@ static __poll_t cgroup_pressure_poll(struct kernfs_open_file *of, return psi_trigger_poll(&ctx->psi.trigger, of->file, pt); } +static int cgroup_pressure_open(struct kernfs_open_file *of) +{ + if (of->file->f_mode & FMODE_WRITE && !capable(CAP_SYS_RESOURCE)) + return -EPERM; + + return 0; +} + static void cgroup_pressure_release(struct kernfs_open_file *of) { struct cgroup_file_ctx *ctx = of->priv; @@ -5276,6 +5243,7 @@ static struct cftype cgroup_psi_files[] = { { .name = "io.pressure", .file_offset = offsetof(struct cgroup, psi_files[PSI_IO]), + .open = cgroup_pressure_open, .seq_show = cgroup_io_pressure_show, .write = cgroup_io_pressure_write, .poll = cgroup_pressure_poll, @@ -5284,6 +5252,7 @@ static struct cftype cgroup_psi_files[] = { { .name = "memory.pressure", .file_offset = offsetof(struct cgroup, psi_files[PSI_MEM]), + .open = cgroup_pressure_open, .seq_show = cgroup_memory_pressure_show, .write = cgroup_memory_pressure_write, .poll = cgroup_pressure_poll, @@ -5292,6 +5261,7 @@ static struct cftype cgroup_psi_files[] = { { .name = "cpu.pressure", .file_offset = offsetof(struct cgroup, psi_files[PSI_CPU]), + .open = cgroup_pressure_open, .seq_show = cgroup_cpu_pressure_show, .write = cgroup_cpu_pressure_write, .poll = cgroup_pressure_poll, @@ -5301,6 +5271,7 @@ static struct cftype cgroup_psi_files[] = { { .name = "irq.pressure", .file_offset = offsetof(struct cgroup, psi_files[PSI_IRQ]), + .open = cgroup_pressure_open, .seq_show = cgroup_irq_pressure_show, .write = cgroup_irq_pressure_write, .poll = cgroup_pressure_poll, @@ -6486,19 +6457,18 @@ err: static void cgroup_css_set_put_fork(struct kernel_clone_args *kargs) __releases(&cgroup_threadgroup_rwsem) __releases(&cgroup_mutex) { + struct cgroup *cgrp = kargs->cgrp; + struct css_set *cset = kargs->cset; + cgroup_threadgroup_change_end(current); - if (kargs->flags & CLONE_INTO_CGROUP) { - struct cgroup *cgrp = kargs->cgrp; - struct css_set *cset = kargs->cset; + if (cset) { + put_css_set(cset); + kargs->cset = NULL; + } + if (kargs->flags & CLONE_INTO_CGROUP) { cgroup_unlock(); - - if (cset) { - put_css_set(cset); - kargs->cset = NULL; - } - if (cgrp) { cgroup_put(cgrp); kargs->cgrp = NULL; @@ -6683,6 +6653,9 @@ void cgroup_exit(struct task_struct *tsk) 
list_add_tail(&tsk->cg_list, &cset->dying_tasks); cset->nr_tasks--; + if (dl_task(tsk)) + dec_dl_tasks_cs(tsk); + WARN_ON_ONCE(cgroup_task_frozen(tsk)); if (unlikely(!(tsk->flags & PF_KTHREAD) && test_bit(CGRP_FREEZE, &task_dfl_cgroup(tsk)->flags))) diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index e4ca2dd2b764..58e6f18f01c1 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -25,45 +25,22 @@ #include <linux/cpu.h> #include <linux/cpumask.h> #include <linux/cpuset.h> -#include <linux/err.h> -#include <linux/errno.h> -#include <linux/file.h> -#include <linux/fs.h> #include <linux/init.h> #include <linux/interrupt.h> #include <linux/kernel.h> -#include <linux/kmod.h> -#include <linux/kthread.h> -#include <linux/list.h> #include <linux/mempolicy.h> #include <linux/mm.h> #include <linux/memory.h> #include <linux/export.h> -#include <linux/mount.h> -#include <linux/fs_context.h> -#include <linux/namei.h> -#include <linux/pagemap.h> -#include <linux/proc_fs.h> #include <linux/rcupdate.h> #include <linux/sched.h> #include <linux/sched/deadline.h> #include <linux/sched/mm.h> #include <linux/sched/task.h> -#include <linux/seq_file.h> #include <linux/security.h> -#include <linux/slab.h> #include <linux/spinlock.h> -#include <linux/stat.h> -#include <linux/string.h> -#include <linux/time.h> -#include <linux/time64.h> -#include <linux/backing-dev.h> -#include <linux/sort.h> #include <linux/oom.h> #include <linux/sched/isolation.h> -#include <linux/uaccess.h> -#include <linux/atomic.h> -#include <linux/mutex.h> #include <linux/cgroup.h> #include <linux/wait.h> @@ -193,6 +170,14 @@ struct cpuset { int use_parent_ecpus; int child_ecpus_count; + /* + * number of SCHED_DEADLINE tasks attached to this cpuset, so that we + * know when to rebuild associated root domain bandwidth information. + */ + int nr_deadline_tasks; + int nr_migrate_dl_tasks; + u64 sum_migrate_dl_bw; + /* Invalid partition error code, not lock protected */ enum prs_errcode prs_err; @@ -245,6 +230,20 @@ static inline struct cpuset *parent_cs(struct cpuset *cs) return css_cs(cs->css.parent); } +void inc_dl_tasks_cs(struct task_struct *p) +{ + struct cpuset *cs = task_cs(p); + + cs->nr_deadline_tasks++; +} + +void dec_dl_tasks_cs(struct task_struct *p) +{ + struct cpuset *cs = task_cs(p); + + cs->nr_deadline_tasks--; +} + /* bits in struct cpuset flags field */ typedef enum { CS_ONLINE, @@ -366,22 +365,23 @@ static struct cpuset top_cpuset = { if (is_cpuset_online(((des_cs) = css_cs((pos_css))))) /* - * There are two global locks guarding cpuset structures - cpuset_rwsem and + * There are two global locks guarding cpuset structures - cpuset_mutex and * callback_lock. We also require taking task_lock() when dereferencing a * task's cpuset pointer. See "The task_lock() exception", at the end of this - * comment. The cpuset code uses only cpuset_rwsem write lock. Other - * kernel subsystems can use cpuset_read_lock()/cpuset_read_unlock() to - * prevent change to cpuset structures. + * comment. The cpuset code uses only cpuset_mutex. Other kernel subsystems + * can use cpuset_lock()/cpuset_unlock() to prevent change to cpuset + * structures. Note that cpuset_mutex needs to be a mutex as it is used in + * paths that rely on priority inheritance (e.g. scheduler - on RT) for + * correctness. * * A task must hold both locks to modify cpusets. 
If a task holds - * cpuset_rwsem, it blocks others wanting that rwsem, ensuring that it - * is the only task able to also acquire callback_lock and be able to - * modify cpusets. It can perform various checks on the cpuset structure - * first, knowing nothing will change. It can also allocate memory while - * just holding cpuset_rwsem. While it is performing these checks, various - * callback routines can briefly acquire callback_lock to query cpusets. - * Once it is ready to make the changes, it takes callback_lock, blocking - * everyone else. + * cpuset_mutex, it blocks others, ensuring that it is the only task able to + * also acquire callback_lock and be able to modify cpusets. It can perform + * various checks on the cpuset structure first, knowing nothing will change. + * It can also allocate memory while just holding cpuset_mutex. While it is + * performing these checks, various callback routines can briefly acquire + * callback_lock to query cpusets. Once it is ready to make the changes, it + * takes callback_lock, blocking everyone else. * * Calls to the kernel memory allocator can not be made while holding * callback_lock, as that would risk double tripping on callback_lock @@ -403,16 +403,16 @@ static struct cpuset top_cpuset = { * guidelines for accessing subsystem state in kernel/cgroup.c */ -DEFINE_STATIC_PERCPU_RWSEM(cpuset_rwsem); +static DEFINE_MUTEX(cpuset_mutex); -void cpuset_read_lock(void) +void cpuset_lock(void) { - percpu_down_read(&cpuset_rwsem); + mutex_lock(&cpuset_mutex); } -void cpuset_read_unlock(void) +void cpuset_unlock(void) { - percpu_up_read(&cpuset_rwsem); + mutex_unlock(&cpuset_mutex); } static DEFINE_SPINLOCK(callback_lock); @@ -496,7 +496,7 @@ static inline bool partition_is_populated(struct cpuset *cs, * One way or another, we guarantee to return some non-empty subset * of cpu_online_mask. * - * Call with callback_lock or cpuset_rwsem held. + * Call with callback_lock or cpuset_mutex held. */ static void guarantee_online_cpus(struct task_struct *tsk, struct cpumask *pmask) @@ -538,7 +538,7 @@ out_unlock: * One way or another, we guarantee to return some non-empty subset * of node_states[N_MEMORY]. * - * Call with callback_lock or cpuset_rwsem held. + * Call with callback_lock or cpuset_mutex held. */ static void guarantee_online_mems(struct cpuset *cs, nodemask_t *pmask) { @@ -550,7 +550,7 @@ static void guarantee_online_mems(struct cpuset *cs, nodemask_t *pmask) /* * update task's spread flag if cpuset's page/slab spread flag is set * - * Call with callback_lock or cpuset_rwsem held. The check can be skipped + * Call with callback_lock or cpuset_mutex held. The check can be skipped * if on default hierarchy. */ static void cpuset_update_task_spread_flags(struct cpuset *cs, @@ -575,7 +575,7 @@ static void cpuset_update_task_spread_flags(struct cpuset *cs, * * One cpuset is a subset of another if all its allowed CPUs and * Memory Nodes are a subset of the other, and its exclusive flags - * are only set if the other's are set. Call holding cpuset_rwsem. + * are only set if the other's are set. Call holding cpuset_mutex. */ static int is_cpuset_subset(const struct cpuset *p, const struct cpuset *q) @@ -713,7 +713,7 @@ out: * If we replaced the flag and mask values of the current cpuset * (cur) with those values in the trial cpuset (trial), would * our various subset and exclusive rules still be valid? Presumes - * cpuset_rwsem held. + * cpuset_mutex held. * * 'cur' is the address of an actual, in-use cpuset. 
Operations * such as list traversal that depend on the actual address of the @@ -829,7 +829,7 @@ static void update_domain_attr_tree(struct sched_domain_attr *dattr, rcu_read_unlock(); } -/* Must be called with cpuset_rwsem held. */ +/* Must be called with cpuset_mutex held. */ static inline int nr_cpusets(void) { /* jump label reference count + the top-level cpuset */ @@ -855,7 +855,7 @@ static inline int nr_cpusets(void) * domains when operating in the severe memory shortage situations * that could cause allocation failures below. * - * Must be called with cpuset_rwsem held. + * Must be called with cpuset_mutex held. * * The three key local variables below are: * cp - cpuset pointer, used (together with pos_css) to perform a @@ -1066,11 +1066,14 @@ done: return ndoms; } -static void update_tasks_root_domain(struct cpuset *cs) +static void dl_update_tasks_root_domain(struct cpuset *cs) { struct css_task_iter it; struct task_struct *task; + if (cs->nr_deadline_tasks == 0) + return; + css_task_iter_start(&cs->css, 0, &it); while ((task = css_task_iter_next(&it))) @@ -1079,12 +1082,12 @@ static void update_tasks_root_domain(struct cpuset *cs) css_task_iter_end(&it); } -static void rebuild_root_domains(void) +static void dl_rebuild_rd_accounting(void) { struct cpuset *cs = NULL; struct cgroup_subsys_state *pos_css; - percpu_rwsem_assert_held(&cpuset_rwsem); + lockdep_assert_held(&cpuset_mutex); lockdep_assert_cpus_held(); lockdep_assert_held(&sched_domains_mutex); @@ -1107,7 +1110,7 @@ static void rebuild_root_domains(void) rcu_read_unlock(); - update_tasks_root_domain(cs); + dl_update_tasks_root_domain(cs); rcu_read_lock(); css_put(&cs->css); @@ -1121,7 +1124,7 @@ partition_and_rebuild_sched_domains(int ndoms_new, cpumask_var_t doms_new[], { mutex_lock(&sched_domains_mutex); partition_sched_domains_locked(ndoms_new, doms_new, dattr_new); - rebuild_root_domains(); + dl_rebuild_rd_accounting(); mutex_unlock(&sched_domains_mutex); } @@ -1134,7 +1137,7 @@ partition_and_rebuild_sched_domains(int ndoms_new, cpumask_var_t doms_new[], * 'cpus' is removed, then call this routine to rebuild the * scheduler's dynamic sched domains. * - * Call with cpuset_rwsem held. Takes cpus_read_lock(). + * Call with cpuset_mutex held. Takes cpus_read_lock(). */ static void rebuild_sched_domains_locked(void) { @@ -1145,7 +1148,7 @@ static void rebuild_sched_domains_locked(void) int ndoms; lockdep_assert_cpus_held(); - percpu_rwsem_assert_held(&cpuset_rwsem); + lockdep_assert_held(&cpuset_mutex); /* * If we have raced with CPU hotplug, return early to avoid @@ -1196,9 +1199,9 @@ static void rebuild_sched_domains_locked(void) void rebuild_sched_domains(void) { cpus_read_lock(); - percpu_down_write(&cpuset_rwsem); + mutex_lock(&cpuset_mutex); rebuild_sched_domains_locked(); - percpu_up_write(&cpuset_rwsem); + mutex_unlock(&cpuset_mutex); cpus_read_unlock(); } @@ -1208,7 +1211,7 @@ void rebuild_sched_domains(void) * @new_cpus: the temp variable for the new effective_cpus mask * * Iterate through each task of @cs updating its cpus_allowed to the - * effective cpuset's. As this function is called with cpuset_rwsem held, + * effective cpuset's. As this function is called with cpuset_mutex held, * cpuset membership stays stable. For top_cpuset, task_cpu_possible_mask() * is used instead of effective_cpus to make sure all offline CPUs are also * included as hotplug code won't update cpumasks for tasks in top_cpuset. 
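The cpuset hunks above replace the percpu rwsem with a plain cpuset_mutex while keeping callback_lock as a spinlock: a writer takes cpuset_mutex (where it may sleep, run its checks and allocate memory), then holds callback_lock only for the short window in which it publishes the new state, while readers in callback context take callback_lock alone. The series' own comment notes the mutex is required because some of these paths rely on priority inheritance on RT. A minimal userspace sketch of that two-lock shape follows; the pthread-based program and its names (writer, reader, effective_cpus) are illustrative stand-ins, not code from this series.

/*
 * Two-lock pattern from the cpuset comment above, modelled in userspace:
 * a sleeping lock serializes writers end-to-end, a spinlock covers only
 * the brief publish/read window.  Build with: cc -pthread sketch.c
 */
#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t cpuset_mutex = PTHREAD_MUTEX_INITIALIZER; /* writers only    */
static pthread_spinlock_t callback_lock;                         /* writers+readers */
static int effective_cpus = 4;                                   /* protected state */

static void *writer(void *arg)
{
	int next;

	(void)arg;
	pthread_mutex_lock(&cpuset_mutex);	/* blocks other writers; sleeping is fine here */
	next = effective_cpus * 2;		/* validation/allocation happens under the mutex */
	pthread_spin_lock(&callback_lock);	/* publish: readers held off only for this window */
	effective_cpus = next;
	pthread_spin_unlock(&callback_lock);
	pthread_mutex_unlock(&cpuset_mutex);
	return NULL;
}

static void *reader(void *arg)
{
	int snapshot;

	(void)arg;
	pthread_spin_lock(&callback_lock);	/* readers never take cpuset_mutex */
	snapshot = effective_cpus;
	pthread_spin_unlock(&callback_lock);
	printf("reader saw %d cpus\n", snapshot);
	return NULL;
}

int main(void)
{
	pthread_t w, r;

	pthread_spin_init(&callback_lock, PTHREAD_PROCESS_PRIVATE);
	pthread_create(&w, NULL, writer, NULL);
	pthread_create(&r, NULL, reader, NULL);
	pthread_join(w, NULL);
	pthread_join(r, NULL);
	pthread_spin_destroy(&callback_lock);
	return 0;
}

The same shape is why the conversion matters beyond the rename: per the comment in the diff, a mutex can take part in priority inheritance on RT, which the percpu rwsem it replaces cannot, and other subsystems now pin cpusets with cpuset_lock()/cpuset_unlock() rather than the old cpuset_read_lock()/cpuset_read_unlock() read-side helpers.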
@@ -1322,7 +1325,7 @@ static int update_parent_subparts_cpumask(struct cpuset *cs, int cmd, int old_prs, new_prs; int part_error = PERR_NONE; /* Partition error? */ - percpu_rwsem_assert_held(&cpuset_rwsem); + lockdep_assert_held(&cpuset_mutex); /* * The parent must be a partition root. @@ -1545,7 +1548,7 @@ static int update_parent_subparts_cpumask(struct cpuset *cs, int cmd, * * On legacy hierarchy, effective_cpus will be the same with cpu_allowed. * - * Called with cpuset_rwsem held + * Called with cpuset_mutex held */ static void update_cpumasks_hier(struct cpuset *cs, struct tmpmasks *tmp, bool force) @@ -1705,7 +1708,7 @@ static void update_sibling_cpumasks(struct cpuset *parent, struct cpuset *cs, struct cpuset *sibling; struct cgroup_subsys_state *pos_css; - percpu_rwsem_assert_held(&cpuset_rwsem); + lockdep_assert_held(&cpuset_mutex); /* * Check all its siblings and call update_cpumasks_hier() @@ -1955,12 +1958,12 @@ static void *cpuset_being_rebound; * @cs: the cpuset in which each task's mems_allowed mask needs to be changed * * Iterate through each task of @cs updating its mems_allowed to the - * effective cpuset's. As this function is called with cpuset_rwsem held, + * effective cpuset's. As this function is called with cpuset_mutex held, * cpuset membership stays stable. */ static void update_tasks_nodemask(struct cpuset *cs) { - static nodemask_t newmems; /* protected by cpuset_rwsem */ + static nodemask_t newmems; /* protected by cpuset_mutex */ struct css_task_iter it; struct task_struct *task; @@ -1973,7 +1976,7 @@ static void update_tasks_nodemask(struct cpuset *cs) * take while holding tasklist_lock. Forks can happen - the * mpol_dup() cpuset_being_rebound check will catch such forks, * and rebind their vma mempolicies too. Because we still hold - * the global cpuset_rwsem, we know that no other rebind effort + * the global cpuset_mutex, we know that no other rebind effort * will be contending for the global variable cpuset_being_rebound. * It's ok if we rebind the same mm twice; mpol_rebind_mm() * is idempotent. Also migrate pages in each mm to new nodes. @@ -2019,7 +2022,7 @@ static void update_tasks_nodemask(struct cpuset *cs) * * On legacy hierarchy, effective_mems will be the same with mems_allowed. * - * Called with cpuset_rwsem held + * Called with cpuset_mutex held */ static void update_nodemasks_hier(struct cpuset *cs, nodemask_t *new_mems) { @@ -2072,7 +2075,7 @@ static void update_nodemasks_hier(struct cpuset *cs, nodemask_t *new_mems) * mempolicies and if the cpuset is marked 'memory_migrate', * migrate the tasks pages to the new memory. * - * Call with cpuset_rwsem held. May take callback_lock during call. + * Call with cpuset_mutex held. May take callback_lock during call. * Will take tasklist_lock, scan tasklist for tasks in cpuset cs, * lock each such tasks mm->mmap_lock, scan its vma's and rebind * their mempolicies to the cpusets new mems_allowed. @@ -2164,7 +2167,7 @@ static int update_relax_domain_level(struct cpuset *cs, s64 val) * @cs: the cpuset in which each task's spread flags needs to be changed * * Iterate through each task of @cs updating its spread flags. As this - * function is called with cpuset_rwsem held, cpuset membership stays + * function is called with cpuset_mutex held, cpuset membership stays * stable. 
*/ static void update_tasks_flags(struct cpuset *cs) @@ -2184,7 +2187,7 @@ static void update_tasks_flags(struct cpuset *cs) * cs: the cpuset to update * turning_on: whether the flag is being set or cleared * - * Call with cpuset_rwsem held. + * Call with cpuset_mutex held. */ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, @@ -2234,7 +2237,7 @@ out: * @new_prs: new partition root state * Return: 0 if successful, != 0 if error * - * Call with cpuset_rwsem held. + * Call with cpuset_mutex held. */ static int update_prstate(struct cpuset *cs, int new_prs) { @@ -2472,19 +2475,26 @@ static int cpuset_can_attach_check(struct cpuset *cs) return 0; } -/* Called by cgroups to determine if a cpuset is usable; cpuset_rwsem held */ +static void reset_migrate_dl_data(struct cpuset *cs) +{ + cs->nr_migrate_dl_tasks = 0; + cs->sum_migrate_dl_bw = 0; +} + +/* Called by cgroups to determine if a cpuset is usable; cpuset_mutex held */ static int cpuset_can_attach(struct cgroup_taskset *tset) { struct cgroup_subsys_state *css; - struct cpuset *cs; + struct cpuset *cs, *oldcs; struct task_struct *task; int ret; /* used later by cpuset_attach() */ cpuset_attach_old_cs = task_cs(cgroup_taskset_first(tset, &css)); + oldcs = cpuset_attach_old_cs; cs = css_cs(css); - percpu_down_write(&cpuset_rwsem); + mutex_lock(&cpuset_mutex); /* Check to see if task is allowed in the cpuset */ ret = cpuset_can_attach_check(cs); @@ -2492,21 +2502,46 @@ static int cpuset_can_attach(struct cgroup_taskset *tset) goto out_unlock; cgroup_taskset_for_each(task, css, tset) { - ret = task_can_attach(task, cs->effective_cpus); + ret = task_can_attach(task); if (ret) goto out_unlock; ret = security_task_setscheduler(task); if (ret) goto out_unlock; + + if (dl_task(task)) { + cs->nr_migrate_dl_tasks++; + cs->sum_migrate_dl_bw += task->dl.dl_bw; + } } + if (!cs->nr_migrate_dl_tasks) + goto out_success; + + if (!cpumask_intersects(oldcs->effective_cpus, cs->effective_cpus)) { + int cpu = cpumask_any_and(cpu_active_mask, cs->effective_cpus); + + if (unlikely(cpu >= nr_cpu_ids)) { + reset_migrate_dl_data(cs); + ret = -EINVAL; + goto out_unlock; + } + + ret = dl_bw_alloc(cpu, cs->sum_migrate_dl_bw); + if (ret) { + reset_migrate_dl_data(cs); + goto out_unlock; + } + } + +out_success: /* * Mark attach is in progress. This makes validate_change() fail * changes which zero cpus/mems_allowed. */ cs->attach_in_progress++; out_unlock: - percpu_up_write(&cpuset_rwsem); + mutex_unlock(&cpuset_mutex); return ret; } @@ -2518,15 +2553,23 @@ static void cpuset_cancel_attach(struct cgroup_taskset *tset) cgroup_taskset_first(tset, &css); cs = css_cs(css); - percpu_down_write(&cpuset_rwsem); + mutex_lock(&cpuset_mutex); cs->attach_in_progress--; if (!cs->attach_in_progress) wake_up(&cpuset_attach_wq); - percpu_up_write(&cpuset_rwsem); + + if (cs->nr_migrate_dl_tasks) { + int cpu = cpumask_any(cs->effective_cpus); + + dl_bw_free(cpu, cs->sum_migrate_dl_bw); + reset_migrate_dl_data(cs); + } + + mutex_unlock(&cpuset_mutex); } /* - * Protected by cpuset_rwsem. cpus_attach is used only by cpuset_attach_task() + * Protected by cpuset_mutex. cpus_attach is used only by cpuset_attach_task() * but we can't allocate it dynamically there. Define it global and * allocate from cpuset_init(). 
*/ @@ -2535,7 +2578,7 @@ static nodemask_t cpuset_attach_nodemask_to; static void cpuset_attach_task(struct cpuset *cs, struct task_struct *task) { - percpu_rwsem_assert_held(&cpuset_rwsem); + lockdep_assert_held(&cpuset_mutex); if (cs != &top_cpuset) guarantee_online_cpus(task, cpus_attach); @@ -2565,7 +2608,7 @@ static void cpuset_attach(struct cgroup_taskset *tset) cs = css_cs(css); lockdep_assert_cpus_held(); /* see cgroup_attach_lock() */ - percpu_down_write(&cpuset_rwsem); + mutex_lock(&cpuset_mutex); cpus_updated = !cpumask_equal(cs->effective_cpus, oldcs->effective_cpus); mems_updated = !nodes_equal(cs->effective_mems, oldcs->effective_mems); @@ -2622,11 +2665,17 @@ static void cpuset_attach(struct cgroup_taskset *tset) out: cs->old_mems_allowed = cpuset_attach_nodemask_to; + if (cs->nr_migrate_dl_tasks) { + cs->nr_deadline_tasks += cs->nr_migrate_dl_tasks; + oldcs->nr_deadline_tasks -= cs->nr_migrate_dl_tasks; + reset_migrate_dl_data(cs); + } + cs->attach_in_progress--; if (!cs->attach_in_progress) wake_up(&cpuset_attach_wq); - percpu_up_write(&cpuset_rwsem); + mutex_unlock(&cpuset_mutex); } /* The various types of files and directories in a cpuset file system */ @@ -2658,7 +2707,7 @@ static int cpuset_write_u64(struct cgroup_subsys_state *css, struct cftype *cft, int retval = 0; cpus_read_lock(); - percpu_down_write(&cpuset_rwsem); + mutex_lock(&cpuset_mutex); if (!is_cpuset_online(cs)) { retval = -ENODEV; goto out_unlock; @@ -2694,7 +2743,7 @@ static int cpuset_write_u64(struct cgroup_subsys_state *css, struct cftype *cft, break; } out_unlock: - percpu_up_write(&cpuset_rwsem); + mutex_unlock(&cpuset_mutex); cpus_read_unlock(); return retval; } @@ -2707,7 +2756,7 @@ static int cpuset_write_s64(struct cgroup_subsys_state *css, struct cftype *cft, int retval = -ENODEV; cpus_read_lock(); - percpu_down_write(&cpuset_rwsem); + mutex_lock(&cpuset_mutex); if (!is_cpuset_online(cs)) goto out_unlock; @@ -2720,7 +2769,7 @@ static int cpuset_write_s64(struct cgroup_subsys_state *css, struct cftype *cft, break; } out_unlock: - percpu_up_write(&cpuset_rwsem); + mutex_unlock(&cpuset_mutex); cpus_read_unlock(); return retval; } @@ -2753,7 +2802,7 @@ static ssize_t cpuset_write_resmask(struct kernfs_open_file *of, * operation like this one can lead to a deadlock through kernfs * active_ref protection. Let's break the protection. Losing the * protection is okay as we check whether @cs is online after - * grabbing cpuset_rwsem anyway. This only happens on the legacy + * grabbing cpuset_mutex anyway. This only happens on the legacy * hierarchies. 
*/ css_get(&cs->css); @@ -2761,7 +2810,7 @@ static ssize_t cpuset_write_resmask(struct kernfs_open_file *of, flush_work(&cpuset_hotplug_work); cpus_read_lock(); - percpu_down_write(&cpuset_rwsem); + mutex_lock(&cpuset_mutex); if (!is_cpuset_online(cs)) goto out_unlock; @@ -2785,7 +2834,7 @@ static ssize_t cpuset_write_resmask(struct kernfs_open_file *of, free_cpuset(trialcs); out_unlock: - percpu_up_write(&cpuset_rwsem); + mutex_unlock(&cpuset_mutex); cpus_read_unlock(); kernfs_unbreak_active_protection(of->kn); css_put(&cs->css); @@ -2933,13 +2982,13 @@ static ssize_t sched_partition_write(struct kernfs_open_file *of, char *buf, css_get(&cs->css); cpus_read_lock(); - percpu_down_write(&cpuset_rwsem); + mutex_lock(&cpuset_mutex); if (!is_cpuset_online(cs)) goto out_unlock; retval = update_prstate(cs, val); out_unlock: - percpu_up_write(&cpuset_rwsem); + mutex_unlock(&cpuset_mutex); cpus_read_unlock(); css_put(&cs->css); return retval ?: nbytes; @@ -3156,7 +3205,7 @@ static int cpuset_css_online(struct cgroup_subsys_state *css) return 0; cpus_read_lock(); - percpu_down_write(&cpuset_rwsem); + mutex_lock(&cpuset_mutex); set_bit(CS_ONLINE, &cs->flags); if (is_spread_page(parent)) @@ -3207,7 +3256,7 @@ static int cpuset_css_online(struct cgroup_subsys_state *css) cpumask_copy(cs->effective_cpus, parent->cpus_allowed); spin_unlock_irq(&callback_lock); out_unlock: - percpu_up_write(&cpuset_rwsem); + mutex_unlock(&cpuset_mutex); cpus_read_unlock(); return 0; } @@ -3228,7 +3277,7 @@ static void cpuset_css_offline(struct cgroup_subsys_state *css) struct cpuset *cs = css_cs(css); cpus_read_lock(); - percpu_down_write(&cpuset_rwsem); + mutex_lock(&cpuset_mutex); if (is_partition_valid(cs)) update_prstate(cs, 0); @@ -3247,7 +3296,7 @@ static void cpuset_css_offline(struct cgroup_subsys_state *css) cpuset_dec(); clear_bit(CS_ONLINE, &cs->flags); - percpu_up_write(&cpuset_rwsem); + mutex_unlock(&cpuset_mutex); cpus_read_unlock(); } @@ -3260,7 +3309,7 @@ static void cpuset_css_free(struct cgroup_subsys_state *css) static void cpuset_bind(struct cgroup_subsys_state *root_css) { - percpu_down_write(&cpuset_rwsem); + mutex_lock(&cpuset_mutex); spin_lock_irq(&callback_lock); if (is_in_v2_mode()) { @@ -3273,7 +3322,7 @@ static void cpuset_bind(struct cgroup_subsys_state *root_css) } spin_unlock_irq(&callback_lock); - percpu_up_write(&cpuset_rwsem); + mutex_unlock(&cpuset_mutex); } /* @@ -3294,14 +3343,14 @@ static int cpuset_can_fork(struct task_struct *task, struct css_set *cset) return 0; lockdep_assert_held(&cgroup_mutex); - percpu_down_write(&cpuset_rwsem); + mutex_lock(&cpuset_mutex); /* Check to see if task is allowed in the cpuset */ ret = cpuset_can_attach_check(cs); if (ret) goto out_unlock; - ret = task_can_attach(task, cs->effective_cpus); + ret = task_can_attach(task); if (ret) goto out_unlock; @@ -3315,7 +3364,7 @@ static int cpuset_can_fork(struct task_struct *task, struct css_set *cset) */ cs->attach_in_progress++; out_unlock: - percpu_up_write(&cpuset_rwsem); + mutex_unlock(&cpuset_mutex); return ret; } @@ -3331,11 +3380,11 @@ static void cpuset_cancel_fork(struct task_struct *task, struct css_set *cset) if (same_cs) return; - percpu_down_write(&cpuset_rwsem); + mutex_lock(&cpuset_mutex); cs->attach_in_progress--; if (!cs->attach_in_progress) wake_up(&cpuset_attach_wq); - percpu_up_write(&cpuset_rwsem); + mutex_unlock(&cpuset_mutex); } /* @@ -3363,7 +3412,7 @@ static void cpuset_fork(struct task_struct *task) } /* CLONE_INTO_CGROUP */ - percpu_down_write(&cpuset_rwsem); + 
mutex_lock(&cpuset_mutex); guarantee_online_mems(cs, &cpuset_attach_nodemask_to); cpuset_attach_task(cs, task); @@ -3371,7 +3420,7 @@ static void cpuset_fork(struct task_struct *task) if (!cs->attach_in_progress) wake_up(&cpuset_attach_wq); - percpu_up_write(&cpuset_rwsem); + mutex_unlock(&cpuset_mutex); } struct cgroup_subsys cpuset_cgrp_subsys = { @@ -3472,7 +3521,7 @@ hotplug_update_tasks_legacy(struct cpuset *cs, is_empty = cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed); - percpu_up_write(&cpuset_rwsem); + mutex_unlock(&cpuset_mutex); /* * Move tasks to the nearest ancestor with execution resources, @@ -3482,7 +3531,7 @@ hotplug_update_tasks_legacy(struct cpuset *cs, if (is_empty) remove_tasks_in_empty_cpuset(cs); - percpu_down_write(&cpuset_rwsem); + mutex_lock(&cpuset_mutex); } static void @@ -3533,14 +3582,14 @@ static void cpuset_hotplug_update_tasks(struct cpuset *cs, struct tmpmasks *tmp) retry: wait_event(cpuset_attach_wq, cs->attach_in_progress == 0); - percpu_down_write(&cpuset_rwsem); + mutex_lock(&cpuset_mutex); /* * We have raced with task attaching. We wait until attaching * is finished, so we won't attach a task to an empty cpuset. */ if (cs->attach_in_progress) { - percpu_up_write(&cpuset_rwsem); + mutex_unlock(&cpuset_mutex); goto retry; } @@ -3637,7 +3686,7 @@ update_tasks: cpus_updated, mems_updated); unlock: - percpu_up_write(&cpuset_rwsem); + mutex_unlock(&cpuset_mutex); } /** @@ -3667,7 +3716,7 @@ static void cpuset_hotplug_workfn(struct work_struct *work) if (on_dfl && !alloc_cpumasks(NULL, &tmp)) ptmp = &tmp; - percpu_down_write(&cpuset_rwsem); + mutex_lock(&cpuset_mutex); /* fetch the available cpus/mems and find out which changed how */ cpumask_copy(&new_cpus, cpu_active_mask); @@ -3724,7 +3773,7 @@ static void cpuset_hotplug_workfn(struct work_struct *work) update_tasks_nodemask(&top_cpuset); } - percpu_up_write(&cpuset_rwsem); + mutex_unlock(&cpuset_mutex); /* if cpus or mems changed, we need to propagate to descendants */ if (cpus_updated || mems_updated) { @@ -4155,7 +4204,7 @@ void __cpuset_memory_pressure_bump(void) * - Used for /proc/<pid>/cpuset. * - No need to task_lock(tsk) on this tsk->cpuset reference, as it * doesn't really matter if tsk->cpuset changes after we read it, - * and we take cpuset_rwsem, keeping cpuset_attach() from changing it + * and we take cpuset_mutex, keeping cpuset_attach() from changing it * anyway. 
*/ int proc_cpuset_show(struct seq_file *m, struct pid_namespace *ns, diff --git a/kernel/cgroup/legacy_freezer.c b/kernel/cgroup/legacy_freezer.c index 936473203a6b..122dacb3a443 100644 --- a/kernel/cgroup/legacy_freezer.c +++ b/kernel/cgroup/legacy_freezer.c @@ -108,16 +108,18 @@ static int freezer_css_online(struct cgroup_subsys_state *css) struct freezer *freezer = css_freezer(css); struct freezer *parent = parent_freezer(freezer); + cpus_read_lock(); mutex_lock(&freezer_mutex); freezer->state |= CGROUP_FREEZER_ONLINE; if (parent && (parent->state & CGROUP_FREEZING)) { freezer->state |= CGROUP_FREEZING_PARENT | CGROUP_FROZEN; - static_branch_inc(&freezer_active); + static_branch_inc_cpuslocked(&freezer_active); } mutex_unlock(&freezer_mutex); + cpus_read_unlock(); return 0; } @@ -132,14 +134,16 @@ static void freezer_css_offline(struct cgroup_subsys_state *css) { struct freezer *freezer = css_freezer(css); + cpus_read_lock(); mutex_lock(&freezer_mutex); if (freezer->state & CGROUP_FREEZING) - static_branch_dec(&freezer_active); + static_branch_dec_cpuslocked(&freezer_active); freezer->state = 0; mutex_unlock(&freezer_mutex); + cpus_read_unlock(); } static void freezer_css_free(struct cgroup_subsys_state *css) diff --git a/kernel/cgroup/misc.c b/kernel/cgroup/misc.c index fe3e8a0eb7ed..ae2f4dd47508 100644 --- a/kernel/cgroup/misc.c +++ b/kernel/cgroup/misc.c @@ -357,7 +357,6 @@ static struct cftype misc_cg_files[] = { { .name = "current", .seq_show = misc_cg_current_show, - .flags = CFTYPE_NOT_ON_ROOT, }, { .name = "capacity", diff --git a/kernel/cgroup/rdma.c b/kernel/cgroup/rdma.c index 3135406608c7..ef5878fb2005 100644 --- a/kernel/cgroup/rdma.c +++ b/kernel/cgroup/rdma.c @@ -197,6 +197,7 @@ uncharge_cg_locked(struct rdma_cgroup *cg, /** * rdmacg_uncharge_hierarchy - hierarchically uncharge rdma resource count + * @cg: pointer to cg to uncharge and all parents in hierarchy * @device: pointer to rdmacg device * @stop_cg: while traversing hirerchy, when meet with stop_cg cgroup * stop uncharging @@ -221,6 +222,7 @@ static void rdmacg_uncharge_hierarchy(struct rdma_cgroup *cg, /** * rdmacg_uncharge - hierarchically uncharge rdma resource count + * @cg: pointer to cg to uncharge and all parents in hierarchy * @device: pointer to rdmacg device * @index: index of the resource to uncharge in cgroup in given resource pool */ diff --git a/kernel/cgroup/rstat.c b/kernel/cgroup/rstat.c index 9c4c55228567..2542c21b6b6d 100644 --- a/kernel/cgroup/rstat.c +++ b/kernel/cgroup/rstat.c @@ -171,7 +171,7 @@ __weak noinline void bpf_rstat_flush(struct cgroup *cgrp, __diag_pop(); /* see cgroup_rstat_flush() */ -static void cgroup_rstat_flush_locked(struct cgroup *cgrp, bool may_sleep) +static void cgroup_rstat_flush_locked(struct cgroup *cgrp) __releases(&cgroup_rstat_lock) __acquires(&cgroup_rstat_lock) { int cpu; @@ -207,9 +207,8 @@ static void cgroup_rstat_flush_locked(struct cgroup *cgrp, bool may_sleep) } raw_spin_unlock_irqrestore(cpu_lock, flags); - /* if @may_sleep, play nice and yield if necessary */ - if (may_sleep && (need_resched() || - spin_needbreak(&cgroup_rstat_lock))) { + /* play nice and yield if necessary */ + if (need_resched() || spin_needbreak(&cgroup_rstat_lock)) { spin_unlock_irq(&cgroup_rstat_lock); if (!cond_resched()) cpu_relax(); @@ -236,26 +235,11 @@ __bpf_kfunc void cgroup_rstat_flush(struct cgroup *cgrp) might_sleep(); spin_lock_irq(&cgroup_rstat_lock); - cgroup_rstat_flush_locked(cgrp, true); + cgroup_rstat_flush_locked(cgrp); spin_unlock_irq(&cgroup_rstat_lock); } /** - * 
cgroup_rstat_flush_atomic- atomic version of cgroup_rstat_flush() - * @cgrp: target cgroup - * - * This function can be called from any context. - */ -void cgroup_rstat_flush_atomic(struct cgroup *cgrp) -{ - unsigned long flags; - - spin_lock_irqsave(&cgroup_rstat_lock, flags); - cgroup_rstat_flush_locked(cgrp, false); - spin_unlock_irqrestore(&cgroup_rstat_lock, flags); -} - -/** * cgroup_rstat_flush_hold - flush stats in @cgrp's subtree and hold * @cgrp: target cgroup * @@ -269,7 +253,7 @@ void cgroup_rstat_flush_hold(struct cgroup *cgrp) { might_sleep(); spin_lock_irq(&cgroup_rstat_lock); - cgroup_rstat_flush_locked(cgrp, true); + cgroup_rstat_flush_locked(cgrp); } /** diff --git a/kernel/configs/tiny.config b/kernel/configs/tiny.config index 144b2bd86b14..00009f7d0835 100644 --- a/kernel/configs/tiny.config +++ b/kernel/configs/tiny.config @@ -6,6 +6,5 @@ CONFIG_CC_OPTIMIZE_FOR_SIZE=y CONFIG_KERNEL_XZ=y # CONFIG_KERNEL_LZO is not set # CONFIG_KERNEL_LZ4 is not set -# CONFIG_SLAB is not set CONFIG_SLUB=y CONFIG_SLUB_TINY=y diff --git a/kernel/context_tracking.c b/kernel/context_tracking.c index a09f1c19336a..6ef0b35fc28c 100644 --- a/kernel/context_tracking.c +++ b/kernel/context_tracking.c @@ -510,7 +510,7 @@ void noinstr __ct_user_enter(enum ctx_state state) * In this we case we don't care about any concurrency/ordering. */ if (!IS_ENABLED(CONFIG_CONTEXT_TRACKING_IDLE)) - arch_atomic_set(&ct->state, state); + raw_atomic_set(&ct->state, state); } else { /* * Even if context tracking is disabled on this CPU, because it's outside @@ -527,7 +527,7 @@ void noinstr __ct_user_enter(enum ctx_state state) */ if (!IS_ENABLED(CONFIG_CONTEXT_TRACKING_IDLE)) { /* Tracking for vtime only, no concurrent RCU EQS accounting */ - arch_atomic_set(&ct->state, state); + raw_atomic_set(&ct->state, state); } else { /* * Tracking for vtime and RCU EQS. Make sure we don't race @@ -535,7 +535,7 @@ void noinstr __ct_user_enter(enum ctx_state state) * RCU only requires RCU_DYNTICKS_IDX increments to be fully * ordered. */ - arch_atomic_add(state, &ct->state); + raw_atomic_add(state, &ct->state); } } } @@ -630,12 +630,12 @@ void noinstr __ct_user_exit(enum ctx_state state) * In this we case we don't care about any concurrency/ordering. */ if (!IS_ENABLED(CONFIG_CONTEXT_TRACKING_IDLE)) - arch_atomic_set(&ct->state, CONTEXT_KERNEL); + raw_atomic_set(&ct->state, CONTEXT_KERNEL); } else { if (!IS_ENABLED(CONFIG_CONTEXT_TRACKING_IDLE)) { /* Tracking for vtime only, no concurrent RCU EQS accounting */ - arch_atomic_set(&ct->state, CONTEXT_KERNEL); + raw_atomic_set(&ct->state, CONTEXT_KERNEL); } else { /* * Tracking for vtime and RCU EQS. Make sure we don't race @@ -643,7 +643,7 @@ void noinstr __ct_user_exit(enum ctx_state state) * RCU only requires RCU_DYNTICKS_IDX increments to be fully * ordered. 
*/ - arch_atomic_sub(state, &ct->state); + raw_atomic_sub(state, &ct->state); } } } diff --git a/kernel/cpu.c b/kernel/cpu.c index f4a2c5845bcb..88a7ede322bd 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -17,6 +17,7 @@ #include <linux/cpu.h> #include <linux/oom.h> #include <linux/rcupdate.h> +#include <linux/delay.h> #include <linux/export.h> #include <linux/bug.h> #include <linux/kthread.h> @@ -59,6 +60,7 @@ * @last: For multi-instance rollback, remember how far we got * @cb_state: The state for a single callback (install/uninstall) * @result: Result of the operation + * @ap_sync_state: State for AP synchronization * @done_up: Signal completion to the issuer of the task for cpu-up * @done_down: Signal completion to the issuer of the task for cpu-down */ @@ -76,6 +78,7 @@ struct cpuhp_cpu_state { struct hlist_node *last; enum cpuhp_state cb_state; int result; + atomic_t ap_sync_state; struct completion done_up; struct completion done_down; #endif @@ -276,6 +279,182 @@ static bool cpuhp_is_atomic_state(enum cpuhp_state state) return CPUHP_AP_IDLE_DEAD <= state && state < CPUHP_AP_ONLINE; } +/* Synchronization state management */ +enum cpuhp_sync_state { + SYNC_STATE_DEAD, + SYNC_STATE_KICKED, + SYNC_STATE_SHOULD_DIE, + SYNC_STATE_ALIVE, + SYNC_STATE_SHOULD_ONLINE, + SYNC_STATE_ONLINE, +}; + +#ifdef CONFIG_HOTPLUG_CORE_SYNC +/** + * cpuhp_ap_update_sync_state - Update synchronization state during bringup/teardown + * @state: The synchronization state to set + * + * No synchronization point. Just update of the synchronization state, but implies + * a full barrier so that the AP changes are visible before the control CPU proceeds. + */ +static inline void cpuhp_ap_update_sync_state(enum cpuhp_sync_state state) +{ + atomic_t *st = this_cpu_ptr(&cpuhp_state.ap_sync_state); + + (void)atomic_xchg(st, state); +} + +void __weak arch_cpuhp_sync_state_poll(void) { cpu_relax(); } + +static bool cpuhp_wait_for_sync_state(unsigned int cpu, enum cpuhp_sync_state state, + enum cpuhp_sync_state next_state) +{ + atomic_t *st = per_cpu_ptr(&cpuhp_state.ap_sync_state, cpu); + ktime_t now, end, start = ktime_get(); + int sync; + + end = start + 10ULL * NSEC_PER_SEC; + + sync = atomic_read(st); + while (1) { + if (sync == state) { + if (!atomic_try_cmpxchg(st, &sync, next_state)) + continue; + return true; + } + + now = ktime_get(); + if (now > end) { + /* Timeout. Leave the state unchanged */ + return false; + } else if (now - start < NSEC_PER_MSEC) { + /* Poll for one millisecond */ + arch_cpuhp_sync_state_poll(); + } else { + usleep_range_state(USEC_PER_MSEC, 2 * USEC_PER_MSEC, TASK_UNINTERRUPTIBLE); + } + sync = atomic_read(st); + } + return true; +} +#else /* CONFIG_HOTPLUG_CORE_SYNC */ +static inline void cpuhp_ap_update_sync_state(enum cpuhp_sync_state state) { } +#endif /* !CONFIG_HOTPLUG_CORE_SYNC */ + +#ifdef CONFIG_HOTPLUG_CORE_SYNC_DEAD +/** + * cpuhp_ap_report_dead - Update synchronization state to DEAD + * + * No synchronization point. Just update of the synchronization state. + */ +void cpuhp_ap_report_dead(void) +{ + cpuhp_ap_update_sync_state(SYNC_STATE_DEAD); +} + +void __weak arch_cpuhp_cleanup_dead_cpu(unsigned int cpu) { } + +/* + * Late CPU shutdown synchronization point. Cannot use cpuhp_state::done_down + * because the AP cannot issue complete() at this stage. + */ +static void cpuhp_bp_sync_dead(unsigned int cpu) +{ + atomic_t *st = per_cpu_ptr(&cpuhp_state.ap_sync_state, cpu); + int sync = atomic_read(st); + + do { + /* CPU can have reported dead already. Don't overwrite that! 
*/ + if (sync == SYNC_STATE_DEAD) + break; + } while (!atomic_try_cmpxchg(st, &sync, SYNC_STATE_SHOULD_DIE)); + + if (cpuhp_wait_for_sync_state(cpu, SYNC_STATE_DEAD, SYNC_STATE_DEAD)) { + /* CPU reached dead state. Invoke the cleanup function */ + arch_cpuhp_cleanup_dead_cpu(cpu); + return; + } + + /* No further action possible. Emit message and give up. */ + pr_err("CPU%u failed to report dead state\n", cpu); +} +#else /* CONFIG_HOTPLUG_CORE_SYNC_DEAD */ +static inline void cpuhp_bp_sync_dead(unsigned int cpu) { } +#endif /* !CONFIG_HOTPLUG_CORE_SYNC_DEAD */ + +#ifdef CONFIG_HOTPLUG_CORE_SYNC_FULL +/** + * cpuhp_ap_sync_alive - Synchronize AP with the control CPU once it is alive + * + * Updates the AP synchronization state to SYNC_STATE_ALIVE and waits + * for the BP to release it. + */ +void cpuhp_ap_sync_alive(void) +{ + atomic_t *st = this_cpu_ptr(&cpuhp_state.ap_sync_state); + + cpuhp_ap_update_sync_state(SYNC_STATE_ALIVE); + + /* Wait for the control CPU to release it. */ + while (atomic_read(st) != SYNC_STATE_SHOULD_ONLINE) + cpu_relax(); +} + +static bool cpuhp_can_boot_ap(unsigned int cpu) +{ + atomic_t *st = per_cpu_ptr(&cpuhp_state.ap_sync_state, cpu); + int sync = atomic_read(st); + +again: + switch (sync) { + case SYNC_STATE_DEAD: + /* CPU is properly dead */ + break; + case SYNC_STATE_KICKED: + /* CPU did not come up in previous attempt */ + break; + case SYNC_STATE_ALIVE: + /* CPU is stuck cpuhp_ap_sync_alive(). */ + break; + default: + /* CPU failed to report online or dead and is in limbo state. */ + return false; + } + + /* Prepare for booting */ + if (!atomic_try_cmpxchg(st, &sync, SYNC_STATE_KICKED)) + goto again; + + return true; +} + +void __weak arch_cpuhp_cleanup_kick_cpu(unsigned int cpu) { } + +/* + * Early CPU bringup synchronization point. Cannot use cpuhp_state::done_up + * because the AP cannot issue complete() so early in the bringup. + */ +static int cpuhp_bp_sync_alive(unsigned int cpu) +{ + int ret = 0; + + if (!IS_ENABLED(CONFIG_HOTPLUG_CORE_SYNC_FULL)) + return 0; + + if (!cpuhp_wait_for_sync_state(cpu, SYNC_STATE_ALIVE, SYNC_STATE_SHOULD_ONLINE)) { + pr_err("CPU%u failed to report alive state\n", cpu); + ret = -EIO; + } + + /* Let the architecture cleanup the kick alive mechanics. 
*/ + arch_cpuhp_cleanup_kick_cpu(cpu); + return ret; +} +#else /* CONFIG_HOTPLUG_CORE_SYNC_FULL */ +static inline int cpuhp_bp_sync_alive(unsigned int cpu) { return 0; } +static inline bool cpuhp_can_boot_ap(unsigned int cpu) { return true; } +#endif /* !CONFIG_HOTPLUG_CORE_SYNC_FULL */ + /* Serializes the updates to cpu_online_mask, cpu_present_mask */ static DEFINE_MUTEX(cpu_add_remove_lock); bool cpuhp_tasks_frozen; @@ -470,8 +649,23 @@ bool cpu_smt_possible(void) cpu_smt_control != CPU_SMT_NOT_SUPPORTED; } EXPORT_SYMBOL_GPL(cpu_smt_possible); + +static inline bool cpuhp_smt_aware(void) +{ + return topology_smt_supported(); +} + +static inline const struct cpumask *cpuhp_get_primary_thread_mask(void) +{ + return cpu_primary_thread_mask; +} #else static inline bool cpu_smt_allowed(unsigned int cpu) { return true; } +static inline bool cpuhp_smt_aware(void) { return false; } +static inline const struct cpumask *cpuhp_get_primary_thread_mask(void) +{ + return cpu_present_mask; +} #endif static inline enum cpuhp_state @@ -558,7 +752,7 @@ static int cpuhp_kick_ap(int cpu, struct cpuhp_cpu_state *st, return ret; } -static int bringup_wait_for_ap(unsigned int cpu) +static int bringup_wait_for_ap_online(unsigned int cpu) { struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu); @@ -579,38 +773,94 @@ static int bringup_wait_for_ap(unsigned int cpu) */ if (!cpu_smt_allowed(cpu)) return -ECANCELED; + return 0; +} + +#ifdef CONFIG_HOTPLUG_SPLIT_STARTUP +static int cpuhp_kick_ap_alive(unsigned int cpu) +{ + if (!cpuhp_can_boot_ap(cpu)) + return -EAGAIN; + + return arch_cpuhp_kick_ap_alive(cpu, idle_thread_get(cpu)); +} + +static int cpuhp_bringup_ap(unsigned int cpu) +{ + struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu); + int ret; + + /* + * Some architectures have to walk the irq descriptors to + * setup the vector space for the cpu which comes online. + * Prevent irq alloc/free across the bringup. + */ + irq_lock_sparse(); + + ret = cpuhp_bp_sync_alive(cpu); + if (ret) + goto out_unlock; + + ret = bringup_wait_for_ap_online(cpu); + if (ret) + goto out_unlock; + + irq_unlock_sparse(); if (st->target <= CPUHP_AP_ONLINE_IDLE) return 0; return cpuhp_kick_ap(cpu, st, st->target); -} +out_unlock: + irq_unlock_sparse(); + return ret; +} +#else static int bringup_cpu(unsigned int cpu) { + struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu); struct task_struct *idle = idle_thread_get(cpu); int ret; - /* - * Reset stale stack state from the last time this CPU was online. - */ - scs_task_reset(idle); - kasan_unpoison_task_stack(idle); + if (!cpuhp_can_boot_ap(cpu)) + return -EAGAIN; /* * Some architectures have to walk the irq descriptors to * setup the vector space for the cpu which comes online. - * Prevent irq alloc/free across the bringup. + * + * Prevent irq alloc/free across the bringup by acquiring the + * sparse irq lock. Hold it until the upcoming CPU completes the + * startup in cpuhp_online_idle() which allows to avoid + * intermediate synchronization points in the architecture code. */ irq_lock_sparse(); - /* Arch-specific enabling code. 
*/ ret = __cpu_up(cpu, idle); - irq_unlock_sparse(); if (ret) - return ret; - return bringup_wait_for_ap(cpu); + goto out_unlock; + + ret = cpuhp_bp_sync_alive(cpu); + if (ret) + goto out_unlock; + + ret = bringup_wait_for_ap_online(cpu); + if (ret) + goto out_unlock; + + irq_unlock_sparse(); + + if (st->target <= CPUHP_AP_ONLINE_IDLE) + return 0; + + return cpuhp_kick_ap(cpu, st, st->target); + +out_unlock: + irq_unlock_sparse(); + return ret; } +#endif static int finish_cpu(unsigned int cpu) { @@ -1099,6 +1349,8 @@ static int takedown_cpu(unsigned int cpu) /* This actually kills the CPU. */ __cpu_die(cpu); + cpuhp_bp_sync_dead(cpu); + tick_cleanup_dead_cpu(cpu); rcutree_migrate_callbacks(cpu); return 0; @@ -1345,8 +1597,10 @@ void cpuhp_online_idle(enum cpuhp_state state) if (state != CPUHP_AP_ONLINE_IDLE) return; + cpuhp_ap_update_sync_state(SYNC_STATE_ONLINE); + /* - * Unpart the stopper thread before we start the idle loop (and start + * Unpark the stopper thread before we start the idle loop (and start * scheduling); this ensures the stopper task is always available. */ stop_machine_unpark(smp_processor_id()); @@ -1383,6 +1637,12 @@ static int _cpu_up(unsigned int cpu, int tasks_frozen, enum cpuhp_state target) ret = PTR_ERR(idle); goto out; } + + /* + * Reset stale stack state from the last time this CPU was online. + */ + scs_task_reset(idle); + kasan_unpoison_task_stack(idle); } cpuhp_tasks_frozen = tasks_frozen; @@ -1502,18 +1762,96 @@ int bringup_hibernate_cpu(unsigned int sleep_cpu) return 0; } -void bringup_nonboot_cpus(unsigned int setup_max_cpus) +static void __init cpuhp_bringup_mask(const struct cpumask *mask, unsigned int ncpus, + enum cpuhp_state target) { unsigned int cpu; - for_each_present_cpu(cpu) { - if (num_online_cpus() >= setup_max_cpus) + for_each_cpu(cpu, mask) { + struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu); + + if (cpu_up(cpu, target) && can_rollback_cpu(st)) { + /* + * If this failed then cpu_up() might have only + * rolled back to CPUHP_BP_KICK_AP for the final + * online. Clean it up. NOOP if already rolled back. + */ + WARN_ON(cpuhp_invoke_callback_range(false, cpu, st, CPUHP_OFFLINE)); + } + + if (!--ncpus) break; - if (!cpu_online(cpu)) - cpu_up(cpu, CPUHP_ONLINE); } } +#ifdef CONFIG_HOTPLUG_PARALLEL +static bool __cpuhp_parallel_bringup __ro_after_init = true; + +static int __init parallel_bringup_parse_param(char *arg) +{ + return kstrtobool(arg, &__cpuhp_parallel_bringup); +} +early_param("cpuhp.parallel", parallel_bringup_parse_param); + +/* + * On architectures which have enabled parallel bringup this invokes all BP + * prepare states for each of the to be onlined APs first. The last state + * sends the startup IPI to the APs. The APs proceed through the low level + * bringup code in parallel and then wait for the control CPU to release + * them one by one for the final onlining procedure. + * + * This avoids waiting for each AP to respond to the startup IPI in + * CPUHP_BRINGUP_CPU. + */ +static bool __init cpuhp_bringup_cpus_parallel(unsigned int ncpus) +{ + const struct cpumask *mask = cpu_present_mask; + + if (__cpuhp_parallel_bringup) + __cpuhp_parallel_bringup = arch_cpuhp_init_parallel_bringup(); + if (!__cpuhp_parallel_bringup) + return false; + + if (cpuhp_smt_aware()) { + const struct cpumask *pmask = cpuhp_get_primary_thread_mask(); + static struct cpumask tmp_mask __initdata; + + /* + * X86 requires to prevent that SMT siblings stopped while + * the primary thread does a microcode update for various + * reasons. 
Bring the primary threads up first. + */ + cpumask_and(&tmp_mask, mask, pmask); + cpuhp_bringup_mask(&tmp_mask, ncpus, CPUHP_BP_KICK_AP); + cpuhp_bringup_mask(&tmp_mask, ncpus, CPUHP_ONLINE); + /* Account for the online CPUs */ + ncpus -= num_online_cpus(); + if (!ncpus) + return true; + /* Create the mask for secondary CPUs */ + cpumask_andnot(&tmp_mask, mask, pmask); + mask = &tmp_mask; + } + + /* Bring the not-yet started CPUs up */ + cpuhp_bringup_mask(mask, ncpus, CPUHP_BP_KICK_AP); + cpuhp_bringup_mask(mask, ncpus, CPUHP_ONLINE); + return true; +} +#else +static inline bool cpuhp_bringup_cpus_parallel(unsigned int ncpus) { return false; } +#endif /* CONFIG_HOTPLUG_PARALLEL */ + +void __init bringup_nonboot_cpus(unsigned int setup_max_cpus) +{ + /* Try parallel bringup optimization if enabled */ + if (cpuhp_bringup_cpus_parallel(setup_max_cpus)) + return; + + /* Full per CPU serialized bringup */ + cpuhp_bringup_mask(cpu_present_mask, setup_max_cpus, CPUHP_ONLINE); +} + #ifdef CONFIG_PM_SLEEP_SMP static cpumask_var_t frozen_cpus; @@ -1740,13 +2078,38 @@ static struct cpuhp_step cpuhp_hp_states[] = { .startup.single = timers_prepare_cpu, .teardown.single = timers_dead_cpu, }, - /* Kicks the plugged cpu into life */ + +#ifdef CONFIG_HOTPLUG_SPLIT_STARTUP + /* + * Kicks the AP alive. AP will wait in cpuhp_ap_sync_alive() until + * the next step will release it. + */ + [CPUHP_BP_KICK_AP] = { + .name = "cpu:kick_ap", + .startup.single = cpuhp_kick_ap_alive, + }, + + /* + * Waits for the AP to reach cpuhp_ap_sync_alive() and then + * releases it for the complete bringup. + */ + [CPUHP_BRINGUP_CPU] = { + .name = "cpu:bringup", + .startup.single = cpuhp_bringup_ap, + .teardown.single = finish_cpu, + .cant_stop = true, + }, +#else + /* + * All-in-one CPU bringup state which includes the kick alive. + */ [CPUHP_BRINGUP_CPU] = { .name = "cpu:bringup", .startup.single = bringup_cpu, .teardown.single = finish_cpu, .cant_stop = true, }, +#endif /* Final state before CPU kills itself */ [CPUHP_AP_IDLE_DEAD] = { .name = "idle:dead", @@ -2723,6 +3086,7 @@ void __init boot_cpu_hotplug_init(void) { #ifdef CONFIG_SMP cpumask_set_cpu(smp_processor_id(), &cpus_booted_once_mask); + atomic_set(this_cpu_ptr(&cpuhp_state.ap_sync_state), SYNC_STATE_ONLINE); #endif this_cpu_write(cpuhp_state.state, CPUHP_ONLINE); this_cpu_write(cpuhp_state.target, CPUHP_ONLINE); diff --git a/kernel/dma/Kconfig b/kernel/dma/Kconfig index 6677d0e64d27..11d077003205 100644 --- a/kernel/dma/Kconfig +++ b/kernel/dma/Kconfig @@ -24,6 +24,9 @@ config DMA_OPS_BYPASS config ARCH_HAS_DMA_MAP_DIRECT bool +config NEED_SG_DMA_FLAGS + bool + config NEED_SG_DMA_LENGTH bool @@ -39,7 +42,7 @@ config ARCH_HAS_DMA_SET_MASK # # Select this option if the architecture needs special handling for # DMA_ATTR_WRITE_COMBINE. Normally the "uncached" mapping should be what -# people thing of when saying write combine, so very few platforms should +# people think of when saying write combine, so very few platforms should # need to enable this. 
# config ARCH_HAS_DMA_WRITE_COMBINE @@ -87,6 +90,10 @@ config SWIOTLB bool select NEED_DMA_MAP_STATE +config DMA_BOUNCE_UNALIGNED_KMALLOC + bool + depends on SWIOTLB + config DMA_RESTRICTED_POOL bool "DMA Restricted Pool" depends on OF && OF_RESERVED_MEM && SWIOTLB diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c index 5595d1d5cdcc..d29cade048db 100644 --- a/kernel/dma/direct.c +++ b/kernel/dma/direct.c @@ -463,7 +463,7 @@ void dma_direct_unmap_sg(struct device *dev, struct scatterlist *sgl, int i; for_each_sg(sgl, sg, nents, i) { - if (sg_is_dma_bus_address(sg)) + if (sg_dma_is_bus_address(sg)) sg_dma_unmark_bus_address(sg); else dma_direct_unmap_page(dev, sg->dma_address, diff --git a/kernel/dma/direct.h b/kernel/dma/direct.h index e38ffc5e6bdd..97ec892ea0b5 100644 --- a/kernel/dma/direct.h +++ b/kernel/dma/direct.h @@ -94,7 +94,8 @@ static inline dma_addr_t dma_direct_map_page(struct device *dev, return swiotlb_map(dev, phys, size, dir, attrs); } - if (unlikely(!dma_capable(dev, dma_addr, size, true))) { + if (unlikely(!dma_capable(dev, dma_addr, size, true)) || + dma_kmalloc_needs_bounce(dev, size, dir)) { if (is_pci_p2pdma_page(page)) return DMA_MAPPING_ERROR; if (is_swiotlb_active(dev)) diff --git a/kernel/dma/remap.c b/kernel/dma/remap.c index b4526668072e..27596f3b4aef 100644 --- a/kernel/dma/remap.c +++ b/kernel/dma/remap.c @@ -43,13 +43,13 @@ void *dma_common_contiguous_remap(struct page *page, size_t size, void *vaddr; int i; - pages = kmalloc_array(count, sizeof(struct page *), GFP_KERNEL); + pages = kvmalloc_array(count, sizeof(struct page *), GFP_KERNEL); if (!pages) return NULL; for (i = 0; i < count; i++) pages[i] = nth_page(page, i); vaddr = vmap(pages, count, VM_DMA_COHERENT, prot); - kfree(pages); + kvfree(pages); return vaddr; } diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c index af2e304c672c..775f7bb10ab1 100644 --- a/kernel/dma/swiotlb.c +++ b/kernel/dma/swiotlb.c @@ -717,6 +717,15 @@ static int swiotlb_find_slots(struct device *dev, phys_addr_t orig_addr, return -1; } +#ifdef CONFIG_DEBUG_FS + +static unsigned long mem_used(struct io_tlb_mem *mem) +{ + return atomic_long_read(&mem->total_used); +} + +#else /* !CONFIG_DEBUG_FS */ + static unsigned long mem_used(struct io_tlb_mem *mem) { int i; @@ -727,6 +736,8 @@ static unsigned long mem_used(struct io_tlb_mem *mem) return used; } +#endif /* CONFIG_DEBUG_FS */ + phys_addr_t swiotlb_tbl_map_single(struct device *dev, phys_addr_t orig_addr, size_t mapping_size, size_t alloc_size, unsigned int alloc_align_mask, enum dma_data_direction dir, diff --git a/kernel/events/core.c b/kernel/events/core.c index 68baa8194d9f..3060427f6c9e 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -6647,7 +6647,7 @@ static void perf_sigtrap(struct perf_event *event) return; send_sig_perf((void __user *)event->pending_addr, - event->attr.type, event->attr.sig_data); + event->orig_type, event->attr.sig_data); } /* @@ -7490,6 +7490,7 @@ static u64 perf_get_pgtable_size(struct mm_struct *mm, unsigned long addr) return pud_leaf_size(pud); pmdp = pmd_offset_lockless(pudp, pud, addr); +again: pmd = pmdp_get_lockless(pmdp); if (!pmd_present(pmd)) return 0; @@ -7498,6 +7499,9 @@ static u64 perf_get_pgtable_size(struct mm_struct *mm, unsigned long addr) return pmd_leaf_size(pmd); ptep = pte_offset_map(&pmd, addr); + if (!ptep) + goto again; + pte = ptep_get_lockless(ptep); if (pte_present(pte)) size = pte_leaf_size(pte); @@ -9951,6 +9955,9 @@ static void sw_perf_event_destroy(struct perf_event *event) 
swevent_hlist_put(); } +static struct pmu perf_cpu_clock; /* fwd declaration */ +static struct pmu perf_task_clock; + static int perf_swevent_init(struct perf_event *event) { u64 event_id = event->attr.config; @@ -9966,7 +9973,10 @@ static int perf_swevent_init(struct perf_event *event) switch (event_id) { case PERF_COUNT_SW_CPU_CLOCK: + event->attr.type = perf_cpu_clock.type; + return -ENOENT; case PERF_COUNT_SW_TASK_CLOCK: + event->attr.type = perf_task_clock.type; return -ENOENT; default: @@ -10150,8 +10160,20 @@ void perf_tp_event(u16 event_type, u64 count, void *record, int entry_size, perf_trace_buf_update(record, event_type); hlist_for_each_entry_rcu(event, head, hlist_entry) { - if (perf_tp_event_match(event, &data, regs)) + if (perf_tp_event_match(event, &data, regs)) { perf_swevent_event(event, count, &data, regs); + + /* + * Here use the same on-stack perf_sample_data, + * some members in data are event-specific and + * need to be re-computed for different sweveents. + * Re-initialize data->sample_flags safely to avoid + * the problem that next event skips preparing data + * because data->sample_flags is set. + */ + perf_sample_data_init(&data, 0, 0); + perf_sample_save_raw_data(&data, &raw); + } } /* @@ -11086,7 +11108,7 @@ static void cpu_clock_event_read(struct perf_event *event) static int cpu_clock_event_init(struct perf_event *event) { - if (event->attr.type != PERF_TYPE_SOFTWARE) + if (event->attr.type != perf_cpu_clock.type) return -ENOENT; if (event->attr.config != PERF_COUNT_SW_CPU_CLOCK) @@ -11107,6 +11129,7 @@ static struct pmu perf_cpu_clock = { .task_ctx_nr = perf_sw_context, .capabilities = PERF_PMU_CAP_NO_NMI, + .dev = PMU_NULL_DEV, .event_init = cpu_clock_event_init, .add = cpu_clock_event_add, @@ -11167,7 +11190,7 @@ static void task_clock_event_read(struct perf_event *event) static int task_clock_event_init(struct perf_event *event) { - if (event->attr.type != PERF_TYPE_SOFTWARE) + if (event->attr.type != perf_task_clock.type) return -ENOENT; if (event->attr.config != PERF_COUNT_SW_TASK_CLOCK) @@ -11188,6 +11211,7 @@ static struct pmu perf_task_clock = { .task_ctx_nr = perf_sw_context, .capabilities = PERF_PMU_CAP_NO_NMI, + .dev = PMU_NULL_DEV, .event_init = task_clock_event_init, .add = task_clock_event_add, @@ -11415,31 +11439,31 @@ int perf_pmu_register(struct pmu *pmu, const char *name, int type) goto unlock; pmu->type = -1; - if (!name) - goto skip_type; + if (WARN_ONCE(!name, "Can not register anonymous pmu.\n")) { + ret = -EINVAL; + goto free_pdc; + } + pmu->name = name; - if (type != PERF_TYPE_SOFTWARE) { - if (type >= 0) - max = type; + if (type >= 0) + max = type; - ret = idr_alloc(&pmu_idr, pmu, max, 0, GFP_KERNEL); - if (ret < 0) - goto free_pdc; + ret = idr_alloc(&pmu_idr, pmu, max, 0, GFP_KERNEL); + if (ret < 0) + goto free_pdc; - WARN_ON(type >= 0 && ret != type); + WARN_ON(type >= 0 && ret != type); - type = ret; - } + type = ret; pmu->type = type; - if (pmu_bus_running) { + if (pmu_bus_running && !pmu->dev) { ret = pmu_dev_alloc(pmu); if (ret) goto free_idr; } -skip_type: ret = -ENOMEM; pmu->cpu_pmu_context = alloc_percpu(struct perf_cpu_pmu_context); if (!pmu->cpu_pmu_context) @@ -11481,16 +11505,7 @@ skip_type: if (!pmu->event_idx) pmu->event_idx = perf_event_idx_default; - /* - * Ensure the TYPE_SOFTWARE PMUs are at the head of the list, - * since these cannot be in the IDR. This way the linear search - * is fast, provided a valid software event is provided. 
- */ - if (type == PERF_TYPE_SOFTWARE || !name) - list_add_rcu(&pmu->entry, &pmus); - else - list_add_tail_rcu(&pmu->entry, &pmus); - + list_add_rcu(&pmu->entry, &pmus); atomic_set(&pmu->exclusive_cnt, 0); ret = 0; unlock: @@ -11499,12 +11514,13 @@ unlock: return ret; free_dev: - device_del(pmu->dev); - put_device(pmu->dev); + if (pmu->dev && pmu->dev != PMU_NULL_DEV) { + device_del(pmu->dev); + put_device(pmu->dev); + } free_idr: - if (pmu->type != PERF_TYPE_SOFTWARE) - idr_remove(&pmu_idr, pmu->type); + idr_remove(&pmu_idr, pmu->type); free_pdc: free_percpu(pmu->pmu_disable_count); @@ -11525,9 +11541,8 @@ void perf_pmu_unregister(struct pmu *pmu) synchronize_rcu(); free_percpu(pmu->pmu_disable_count); - if (pmu->type != PERF_TYPE_SOFTWARE) - idr_remove(&pmu_idr, pmu->type); - if (pmu_bus_running) { + idr_remove(&pmu_idr, pmu->type); + if (pmu_bus_running && pmu->dev && pmu->dev != PMU_NULL_DEV) { if (pmu->nr_addr_filters) device_remove_file(pmu->dev, &dev_attr_nr_addr_filters); device_del(pmu->dev); @@ -11601,6 +11616,12 @@ static struct pmu *perf_init_event(struct perf_event *event) idx = srcu_read_lock(&pmus_srcu); + /* + * Save original type before calling pmu->event_init() since certain + * pmus overwrites event->attr.type to forward event to another pmu. + */ + event->orig_type = event->attr.type; + /* Try parent's PMU first: */ if (event->parent && event->parent->pmu) { pmu = event->parent->pmu; @@ -13640,8 +13661,8 @@ void __init perf_event_init(void) perf_event_init_all_cpus(); init_srcu_struct(&pmus_srcu); perf_pmu_register(&perf_swevent, "software", PERF_TYPE_SOFTWARE); - perf_pmu_register(&perf_cpu_clock, NULL, -1); - perf_pmu_register(&perf_task_clock, NULL, -1); + perf_pmu_register(&perf_cpu_clock, "cpu_clock", -1); + perf_pmu_register(&perf_task_clock, "task_clock", -1); perf_tp_register(); perf_event_init_cpu(smp_processor_id()); register_reboot_notifier(&perf_reboot_notifier); @@ -13684,7 +13705,7 @@ static int __init perf_event_sysfs_init(void) goto unlock; list_for_each_entry(pmu, &pmus, entry) { - if (!pmu->name || pmu->type < 0) + if (pmu->dev) continue; ret = pmu_dev_alloc(pmu); diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 59887c69d54c..f0ac5b874919 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -192,7 +192,7 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr, inc_mm_counter(mm, MM_ANONPAGES); } - flush_cache_page(vma, addr, pte_pfn(*pvmw.pte)); + flush_cache_page(vma, addr, pte_pfn(ptep_get(pvmw.pte))); ptep_clear_flush_notify(vma, addr, pvmw.pte); if (new_page) set_pte_at_notify(mm, addr, pvmw.pte, @@ -365,7 +365,6 @@ __update_ref_ctr(struct mm_struct *mm, unsigned long vaddr, short d) { void *kaddr; struct page *page; - struct vm_area_struct *vma; int ret; short *ptr; @@ -373,7 +372,7 @@ __update_ref_ctr(struct mm_struct *mm, unsigned long vaddr, short d) return -EINVAL; ret = get_user_pages_remote(mm, vaddr, 1, - FOLL_WRITE, &page, &vma, NULL); + FOLL_WRITE, &page, NULL); if (unlikely(ret <= 0)) { /* * We are asking for 1 page. If get_user_pages_remote() fails, @@ -474,10 +473,9 @@ retry: if (is_register) gup_flags |= FOLL_SPLIT_PMD; /* Read the page with vaddr into memory */ - ret = get_user_pages_remote(mm, vaddr, 1, gup_flags, - &old_page, &vma, NULL); - if (ret <= 0) - return ret; + old_page = get_user_page_vma_remote(mm, vaddr, gup_flags, &vma); + if (IS_ERR_OR_NULL(old_page)) + return old_page ? 
PTR_ERR(old_page) : 0; ret = verify_opcode(old_page, vaddr, &opcode); if (ret <= 0) @@ -2027,8 +2025,7 @@ static int is_trap_at_addr(struct mm_struct *mm, unsigned long vaddr) * but we treat this as a 'remote' access since it is * essentially a kernel access to the memory. */ - result = get_user_pages_remote(mm, vaddr, 1, FOLL_FORCE, &page, - NULL, NULL); + result = get_user_pages_remote(mm, vaddr, 1, FOLL_FORCE, &page, NULL); if (result < 0) return result; diff --git a/kernel/exit.c b/kernel/exit.c index 34b90e2e7cf7..edb50b4c9972 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -411,7 +411,10 @@ static void coredump_task_exit(struct task_struct *tsk) tsk->flags |= PF_POSTCOREDUMP; core_state = tsk->signal->core_state; spin_unlock_irq(&tsk->sighand->siglock); - if (core_state) { + + /* The vhost_worker does not particpate in coredumps */ + if (core_state && + ((tsk->flags & (PF_IO_WORKER | PF_USER_WORKER)) != PF_USER_WORKER)) { struct core_thread self; self.task = current; diff --git a/kernel/fork.c b/kernel/fork.c index ed4e01daccaa..b85814e614a5 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -252,23 +252,19 @@ static int memcg_charge_kernel_stack(struct vm_struct *vm) { int i; int ret; + int nr_charged = 0; - BUILD_BUG_ON(IS_ENABLED(CONFIG_VMAP_STACK) && PAGE_SIZE % 1024 != 0); BUG_ON(vm->nr_pages != THREAD_SIZE / PAGE_SIZE); for (i = 0; i < THREAD_SIZE / PAGE_SIZE; i++) { ret = memcg_kmem_charge_page(vm->pages[i], GFP_KERNEL, 0); if (ret) goto err; + nr_charged++; } return 0; err: - /* - * If memcg_kmem_charge_page() fails, page's memory cgroup pointer is - * NULL, and memcg_kmem_uncharge_page() in free_thread_stack() will - * ignore this page. - */ - for (i = 0; i < THREAD_SIZE / PAGE_SIZE; i++) + for (i = 0; i < nr_charged; i++) memcg_kmem_uncharge_page(vm->pages[i], 0); return ret; } @@ -627,6 +623,7 @@ void free_task(struct task_struct *tsk) arch_release_task_struct(tsk); if (tsk->flags & PF_KTHREAD) free_kthread_struct(tsk); + bpf_task_storage_free(tsk); free_task_struct(tsk); } EXPORT_SYMBOL(free_task); @@ -979,7 +976,6 @@ void __put_task_struct(struct task_struct *tsk) cgroup_free(tsk); task_numa_free(tsk, true); security_task_free(tsk); - bpf_task_storage_free(tsk); exit_creds(tsk); delayacct_tsk_free(tsk); put_signal_struct(tsk->signal); @@ -2336,16 +2332,16 @@ __latent_entropy struct task_struct *copy_process( p->flags &= ~PF_KTHREAD; if (args->kthread) p->flags |= PF_KTHREAD; - if (args->user_worker) - p->flags |= PF_USER_WORKER; - if (args->io_thread) { + if (args->user_worker) { /* - * Mark us an IO worker, and block any signal that isn't + * Mark us a user worker, and block any signal that isn't * fatal or STOP */ - p->flags |= PF_IO_WORKER; + p->flags |= PF_USER_WORKER; siginitsetinv(&p->blocked, sigmask(SIGKILL)|sigmask(SIGSTOP)); } + if (args->io_thread) + p->flags |= PF_IO_WORKER; if (args->name) strscpy_pad(p->comm, args->name, sizeof(p->comm)); @@ -2517,9 +2513,6 @@ __latent_entropy struct task_struct *copy_process( if (retval) goto bad_fork_cleanup_io; - if (args->ignore_signals) - ignore_signals(p); - stackleak_task_init(p); if (pid != &init_struct_pid) { diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index 49e7bc871fec..ee8c0acf39df 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -306,6 +306,7 @@ static void __irq_disable(struct irq_desc *desc, bool mask); void irq_shutdown(struct irq_desc *desc) { if (irqd_is_started(&desc->irq_data)) { + clear_irq_resend(desc); desc->depth = 1; if (desc->irq_data.chip->irq_shutdown) { 
desc->irq_data.chip->irq_shutdown(&desc->irq_data); @@ -692,8 +693,16 @@ void handle_fasteoi_irq(struct irq_desc *desc) raw_spin_lock(&desc->lock); - if (!irq_may_run(desc)) + /* + * When an affinity change races with IRQ handling, the next interrupt + * can arrive on the new CPU before the original CPU has completed + * handling the previous one - it may need to be resent. + */ + if (!irq_may_run(desc)) { + if (irqd_needs_resend_when_in_progress(&desc->irq_data)) + desc->istate |= IRQS_PENDING; goto out; + } desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING); @@ -715,6 +724,12 @@ void handle_fasteoi_irq(struct irq_desc *desc) cond_unmask_eoi_irq(desc, chip); + /* + * When the race described above happens this will resend the interrupt. + */ + if (unlikely(desc->istate & IRQS_PENDING)) + check_irq_resend(desc, false); + raw_spin_unlock(&desc->lock); return; out: diff --git a/kernel/irq/debugfs.c b/kernel/irq/debugfs.c index bbcaac64038e..5971a66be034 100644 --- a/kernel/irq/debugfs.c +++ b/kernel/irq/debugfs.c @@ -133,6 +133,8 @@ static const struct irq_bit_descr irqdata_states[] = { BIT_MASK_DESCR(IRQD_HANDLE_ENFORCE_IRQCTX), BIT_MASK_DESCR(IRQD_IRQ_ENABLED_ON_SUSPEND), + + BIT_MASK_DESCR(IRQD_RESEND_WHEN_IN_PROGRESS), }; static const struct irq_bit_descr irqdesc_states[] = { diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h index 5fdc0b557579..bdd35bb9c735 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h @@ -12,9 +12,9 @@ #include <linux/sched/clock.h> #ifdef CONFIG_SPARSE_IRQ -# define IRQ_BITMAP_BITS (NR_IRQS + 8196) +# define MAX_SPARSE_IRQS INT_MAX #else -# define IRQ_BITMAP_BITS NR_IRQS +# define MAX_SPARSE_IRQS NR_IRQS #endif #define istate core_internal_state__do_not_mess_with_it @@ -47,9 +47,12 @@ enum { * detection * IRQS_POLL_INPROGRESS - polling in progress * IRQS_ONESHOT - irq is not unmasked in primary handler - * IRQS_REPLAY - irq is replayed + * IRQS_REPLAY - irq has been resent and will not be resent + * again until the handler has run and cleared + * this flag. * IRQS_WAITING - irq is waiting - * IRQS_PENDING - irq is pending and replayed later + * IRQS_PENDING - irq needs to be resent and should be resent + * at the next available opportunity. 
* IRQS_SUSPENDED - irq is suspended * IRQS_NMI - irq line is used to deliver NMIs * IRQS_SYSFS - descriptor has been added to sysfs @@ -113,6 +116,8 @@ irqreturn_t handle_irq_event(struct irq_desc *desc); /* Resending of interrupts :*/ int check_irq_resend(struct irq_desc *desc, bool inject); +void clear_irq_resend(struct irq_desc *desc); +void irq_resend_init(struct irq_desc *desc); bool irq_wait_for_poll(struct irq_desc *desc); void __irq_wake_thread(struct irq_desc *desc, struct irqaction *action); diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c index 240e145e969f..27ca1c866f29 100644 --- a/kernel/irq/irqdesc.c +++ b/kernel/irq/irqdesc.c @@ -12,8 +12,7 @@ #include <linux/export.h> #include <linux/interrupt.h> #include <linux/kernel_stat.h> -#include <linux/radix-tree.h> -#include <linux/bitmap.h> +#include <linux/maple_tree.h> #include <linux/irqdomain.h> #include <linux/sysfs.h> @@ -131,7 +130,40 @@ int nr_irqs = NR_IRQS; EXPORT_SYMBOL_GPL(nr_irqs); static DEFINE_MUTEX(sparse_irq_lock); -static DECLARE_BITMAP(allocated_irqs, IRQ_BITMAP_BITS); +static struct maple_tree sparse_irqs = MTREE_INIT_EXT(sparse_irqs, + MT_FLAGS_ALLOC_RANGE | + MT_FLAGS_LOCK_EXTERN | + MT_FLAGS_USE_RCU, + sparse_irq_lock); + +static int irq_find_free_area(unsigned int from, unsigned int cnt) +{ + MA_STATE(mas, &sparse_irqs, 0, 0); + + if (mas_empty_area(&mas, from, MAX_SPARSE_IRQS, cnt)) + return -ENOSPC; + return mas.index; +} + +static unsigned int irq_find_at_or_after(unsigned int offset) +{ + unsigned long index = offset; + struct irq_desc *desc = mt_find(&sparse_irqs, &index, nr_irqs); + + return desc ? irq_desc_get_irq(desc) : nr_irqs; +} + +static void irq_insert_desc(unsigned int irq, struct irq_desc *desc) +{ + MA_STATE(mas, &sparse_irqs, irq, irq); + WARN_ON(mas_store_gfp(&mas, desc, GFP_KERNEL) != 0); +} + +static void delete_irq_desc(unsigned int irq) +{ + MA_STATE(mas, &sparse_irqs, irq, irq); + mas_erase(&mas); +} #ifdef CONFIG_SPARSE_IRQ @@ -344,26 +376,14 @@ static void irq_sysfs_del(struct irq_desc *desc) {} #endif /* CONFIG_SYSFS */ -static RADIX_TREE(irq_desc_tree, GFP_KERNEL); - -static void irq_insert_desc(unsigned int irq, struct irq_desc *desc) -{ - radix_tree_insert(&irq_desc_tree, irq, desc); -} - struct irq_desc *irq_to_desc(unsigned int irq) { - return radix_tree_lookup(&irq_desc_tree, irq); + return mtree_load(&sparse_irqs, irq); } #ifdef CONFIG_KVM_BOOK3S_64_HV_MODULE EXPORT_SYMBOL_GPL(irq_to_desc); #endif -static void delete_irq_desc(unsigned int irq) -{ - radix_tree_delete(&irq_desc_tree, irq); -} - #ifdef CONFIG_SMP static void free_masks(struct irq_desc *desc) { @@ -415,6 +435,7 @@ static struct irq_desc *alloc_desc(int irq, int node, unsigned int flags, desc_set_defaults(irq, desc, node, affinity, owner); irqd_set(&desc->irq_data, flags); kobject_init(&desc->kobj, &irq_kobj_type); + irq_resend_init(desc); return desc; @@ -505,7 +526,6 @@ static int alloc_descs(unsigned int start, unsigned int cnt, int node, irq_sysfs_add(start + i, desc); irq_add_debugfs_entry(start + i, desc); } - bitmap_set(allocated_irqs, start, cnt); return start; err: @@ -516,7 +536,7 @@ err: static int irq_expand_nr_irqs(unsigned int nr) { - if (nr > IRQ_BITMAP_BITS) + if (nr > MAX_SPARSE_IRQS) return -ENOMEM; nr_irqs = nr; return 0; @@ -534,18 +554,17 @@ int __init early_irq_init(void) printk(KERN_INFO "NR_IRQS: %d, nr_irqs: %d, preallocated irqs: %d\n", NR_IRQS, nr_irqs, initcnt); - if (WARN_ON(nr_irqs > IRQ_BITMAP_BITS)) - nr_irqs = IRQ_BITMAP_BITS; + if (WARN_ON(nr_irqs > MAX_SPARSE_IRQS)) + 
nr_irqs = MAX_SPARSE_IRQS; - if (WARN_ON(initcnt > IRQ_BITMAP_BITS)) - initcnt = IRQ_BITMAP_BITS; + if (WARN_ON(initcnt > MAX_SPARSE_IRQS)) + initcnt = MAX_SPARSE_IRQS; if (initcnt > nr_irqs) nr_irqs = initcnt; for (i = 0; i < initcnt; i++) { desc = alloc_desc(i, node, 0, NULL, NULL); - set_bit(i, allocated_irqs); irq_insert_desc(i, desc); } return arch_early_irq_init(); @@ -581,6 +600,7 @@ int __init early_irq_init(void) mutex_init(&desc[i].request_mutex); init_waitqueue_head(&desc[i].wait_for_threads); desc_set_defaults(i, &desc[i], node, NULL, NULL); + irq_resend_init(desc); } return arch_early_irq_init(); } @@ -599,6 +619,7 @@ static void free_desc(unsigned int irq) raw_spin_lock_irqsave(&desc->lock, flags); desc_set_defaults(irq, desc, irq_desc_get_node(desc), NULL, NULL); raw_spin_unlock_irqrestore(&desc->lock, flags); + delete_irq_desc(irq); } static inline int alloc_descs(unsigned int start, unsigned int cnt, int node, @@ -611,8 +632,8 @@ static inline int alloc_descs(unsigned int start, unsigned int cnt, int node, struct irq_desc *desc = irq_to_desc(start + i); desc->owner = owner; + irq_insert_desc(start + i, desc); } - bitmap_set(allocated_irqs, start, cnt); return start; } @@ -624,7 +645,7 @@ static int irq_expand_nr_irqs(unsigned int nr) void irq_mark_irq(unsigned int irq) { mutex_lock(&sparse_irq_lock); - bitmap_set(allocated_irqs, irq, 1); + irq_insert_desc(irq, irq_desc + irq); mutex_unlock(&sparse_irq_lock); } @@ -768,7 +789,6 @@ void irq_free_descs(unsigned int from, unsigned int cnt) for (i = 0; i < cnt; i++) free_desc(from + i); - bitmap_clear(allocated_irqs, from, cnt); mutex_unlock(&sparse_irq_lock); } EXPORT_SYMBOL_GPL(irq_free_descs); @@ -810,8 +830,7 @@ __irq_alloc_descs(int irq, unsigned int from, unsigned int cnt, int node, mutex_lock(&sparse_irq_lock); - start = bitmap_find_next_zero_area(allocated_irqs, IRQ_BITMAP_BITS, - from, cnt, 0); + start = irq_find_free_area(from, cnt); ret = -EEXIST; if (irq >=0 && start != irq) goto unlock; @@ -836,7 +855,7 @@ EXPORT_SYMBOL_GPL(__irq_alloc_descs); */ unsigned int irq_get_next_irq(unsigned int offset) { - return find_next_bit(allocated_irqs, nr_irqs, offset); + return irq_find_at_or_after(offset); } struct irq_desc * diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index f34760a1e222..5bd01624e447 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -1915,6 +1915,8 @@ static void irq_domain_check_hierarchy(struct irq_domain *domain) #endif /* CONFIG_IRQ_DOMAIN_HIERARCHY */ #ifdef CONFIG_GENERIC_IRQ_DEBUGFS +#include "internals.h" + static struct dentry *domain_dir; static void diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c index 7a97bcb086bf..b4c31a5c1147 100644 --- a/kernel/irq/msi.c +++ b/kernel/irq/msi.c @@ -542,7 +542,7 @@ fail: return ret; } -#ifdef CONFIG_PCI_MSI_ARCH_FALLBACKS +#if defined(CONFIG_PCI_MSI_ARCH_FALLBACKS) || defined(CONFIG_PCI_XEN) /** * msi_device_populate_sysfs - Populate msi_irqs sysfs entries for a device * @dev: The device (PCI, platform etc) which will get sysfs entries @@ -574,7 +574,7 @@ void msi_device_destroy_sysfs(struct device *dev) msi_for_each_desc(desc, dev, MSI_DESC_ALL) msi_sysfs_remove_desc(dev, desc); } -#endif /* CONFIG_PCI_MSI_ARCH_FALLBACK */ +#endif /* CONFIG_PCI_MSI_ARCH_FALLBACK || CONFIG_PCI_XEN */ #else /* CONFIG_SYSFS */ static inline int msi_sysfs_create_group(struct device *dev) { return 0; } static inline int msi_sysfs_populate_desc(struct device *dev, struct msi_desc *desc) { return 0; } diff --git a/kernel/irq/resend.c 
b/kernel/irq/resend.c index 0c46e9fe3a89..edec335c0a7a 100644 --- a/kernel/irq/resend.c +++ b/kernel/irq/resend.c @@ -21,8 +21,9 @@ #ifdef CONFIG_HARDIRQS_SW_RESEND -/* Bitmap to handle software resend of interrupts: */ -static DECLARE_BITMAP(irqs_resend, IRQ_BITMAP_BITS); +/* hlist_head to handle software resend of interrupts: */ +static HLIST_HEAD(irq_resend_list); +static DEFINE_RAW_SPINLOCK(irq_resend_lock); /* * Run software resends of IRQ's @@ -30,18 +31,17 @@ static DECLARE_BITMAP(irqs_resend, IRQ_BITMAP_BITS); static void resend_irqs(struct tasklet_struct *unused) { struct irq_desc *desc; - int irq; - - while (!bitmap_empty(irqs_resend, nr_irqs)) { - irq = find_first_bit(irqs_resend, nr_irqs); - clear_bit(irq, irqs_resend); - desc = irq_to_desc(irq); - if (!desc) - continue; - local_irq_disable(); + + raw_spin_lock_irq(&irq_resend_lock); + while (!hlist_empty(&irq_resend_list)) { + desc = hlist_entry(irq_resend_list.first, struct irq_desc, + resend_node); + hlist_del_init(&desc->resend_node); + raw_spin_unlock(&irq_resend_lock); desc->handle_irq(desc); - local_irq_enable(); + raw_spin_lock(&irq_resend_lock); } + raw_spin_unlock_irq(&irq_resend_lock); } /* Tasklet to handle resend: */ @@ -49,8 +49,6 @@ static DECLARE_TASKLET(resend_tasklet, resend_irqs); static int irq_sw_resend(struct irq_desc *desc) { - unsigned int irq = irq_desc_get_irq(desc); - /* * Validate whether this interrupt can be safely injected from * non interrupt context @@ -70,16 +68,31 @@ static int irq_sw_resend(struct irq_desc *desc) */ if (!desc->parent_irq) return -EINVAL; - irq = desc->parent_irq; } - /* Set it pending and activate the softirq: */ - set_bit(irq, irqs_resend); + /* Add to resend_list and activate the softirq: */ + raw_spin_lock(&irq_resend_lock); + hlist_add_head(&desc->resend_node, &irq_resend_list); + raw_spin_unlock(&irq_resend_lock); tasklet_schedule(&resend_tasklet); return 0; } +void clear_irq_resend(struct irq_desc *desc) +{ + raw_spin_lock(&irq_resend_lock); + hlist_del_init(&desc->resend_node); + raw_spin_unlock(&irq_resend_lock); +} + +void irq_resend_init(struct irq_desc *desc) +{ + INIT_HLIST_NODE(&desc->resend_node); +} #else +void clear_irq_resend(struct irq_desc *desc) {} +void irq_resend_init(struct irq_desc *desc) {} + static int irq_sw_resend(struct irq_desc *desc) { return -EINVAL; diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c index 77747391f49b..7982cc9d497c 100644 --- a/kernel/kallsyms.c +++ b/kernel/kallsyms.c @@ -484,34 +484,6 @@ found: return 0; } -int lookup_symbol_attrs(unsigned long addr, unsigned long *size, - unsigned long *offset, char *modname, char *name) -{ - int res; - - name[0] = '\0'; - name[KSYM_NAME_LEN - 1] = '\0'; - - if (is_ksym_addr(addr)) { - unsigned long pos; - - pos = get_symbol_pos(addr, size, offset); - /* Grab name */ - kallsyms_expand_symbol(get_symbol_offset(pos), - name, KSYM_NAME_LEN); - modname[0] = '\0'; - goto found; - } - /* See if it's in a module. */ - res = lookup_module_symbol_attrs(addr, size, offset, modname, name); - if (res) - return res; - -found: - cleanup_symbol_name(name); - return 0; -} - /* Look up a kernel symbol and return it in a text buffer. */ static int __sprint_symbol(char *buffer, unsigned long address, int symbol_offset, int add_offset, int add_buildid) @@ -646,7 +618,6 @@ int sprint_backtrace_build_id(char *buffer, unsigned long address) /* To avoid using get_symbol_offset for every symbol, we carry prefix along. 
*/ struct kallsym_iter { loff_t pos; - loff_t pos_arch_end; loff_t pos_mod_end; loff_t pos_ftrace_mod_end; loff_t pos_bpf_end; @@ -659,29 +630,9 @@ struct kallsym_iter { int show_value; }; -int __weak arch_get_kallsym(unsigned int symnum, unsigned long *value, - char *type, char *name) -{ - return -EINVAL; -} - -static int get_ksymbol_arch(struct kallsym_iter *iter) -{ - int ret = arch_get_kallsym(iter->pos - kallsyms_num_syms, - &iter->value, &iter->type, - iter->name); - - if (ret < 0) { - iter->pos_arch_end = iter->pos; - return 0; - } - - return 1; -} - static int get_ksymbol_mod(struct kallsym_iter *iter) { - int ret = module_get_kallsym(iter->pos - iter->pos_arch_end, + int ret = module_get_kallsym(iter->pos - kallsyms_num_syms, &iter->value, &iter->type, iter->name, iter->module_name, &iter->exported); @@ -716,7 +667,7 @@ static int get_ksymbol_bpf(struct kallsym_iter *iter) { int ret; - strlcpy(iter->module_name, "bpf", MODULE_NAME_LEN); + strscpy(iter->module_name, "bpf", MODULE_NAME_LEN); iter->exported = 0; ret = bpf_get_kallsym(iter->pos - iter->pos_ftrace_mod_end, &iter->value, &iter->type, @@ -736,7 +687,7 @@ static int get_ksymbol_bpf(struct kallsym_iter *iter) */ static int get_ksymbol_kprobe(struct kallsym_iter *iter) { - strlcpy(iter->module_name, "__builtin__kprobes", MODULE_NAME_LEN); + strscpy(iter->module_name, "__builtin__kprobes", MODULE_NAME_LEN); iter->exported = 0; return kprobe_get_kallsym(iter->pos - iter->pos_bpf_end, &iter->value, &iter->type, @@ -764,7 +715,6 @@ static void reset_iter(struct kallsym_iter *iter, loff_t new_pos) iter->nameoff = get_symbol_offset(new_pos); iter->pos = new_pos; if (new_pos == 0) { - iter->pos_arch_end = 0; iter->pos_mod_end = 0; iter->pos_ftrace_mod_end = 0; iter->pos_bpf_end = 0; @@ -780,10 +730,6 @@ static int update_iter_mod(struct kallsym_iter *iter, loff_t pos) { iter->pos = pos; - if ((!iter->pos_arch_end || iter->pos_arch_end > pos) && - get_ksymbol_arch(iter)) - return 1; - if ((!iter->pos_mod_end || iter->pos_mod_end > pos) && get_ksymbol_mod(iter)) return 1; @@ -961,41 +907,6 @@ late_initcall(bpf_ksym_iter_register); #endif /* CONFIG_BPF_SYSCALL */ -static inline int kallsyms_for_perf(void) -{ -#ifdef CONFIG_PERF_EVENTS - extern int sysctl_perf_event_paranoid; - if (sysctl_perf_event_paranoid <= 1) - return 1; -#endif - return 0; -} - -/* - * We show kallsyms information even to normal users if we've enabled - * kernel profiling and are explicitly not paranoid (so kptr_restrict - * is clear, and sysctl_perf_event_paranoid isn't set). - * - * Otherwise, require CAP_SYSLOG (assuming kptr_restrict isn't set to - * block even that). 
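The policy described in the comment above belongs to kallsyms_show_value(), which this series moves verbatim into the new kernel/ksyms_common.c later in the diff so that it is available even without KALLSYMS. For context, a minimal sketch of the kind of caller it serves; the seq_file show routine is invented, only kallsyms_show_value() itself and the %px/%p printing conventions are taken as given:

#include <linux/kallsyms.h>
#include <linux/seq_file.h>

static int example_show(struct seq_file *m, void *v)
{
        unsigned long addr = (unsigned long)example_show;

        /* Decide with the opener's credentials, not current's. */
        if (kallsyms_show_value(m->file->f_cred))
                seq_printf(m, "addr: %px\n", (void *)addr);     /* raw address */
        else
                seq_printf(m, "addr: %p\n", (void *)addr);      /* hashed */
        return 0;
}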
- */ -bool kallsyms_show_value(const struct cred *cred) -{ - switch (kptr_restrict) { - case 0: - if (kallsyms_for_perf()) - return true; - fallthrough; - case 1: - if (security_capable(cred, &init_user_ns, CAP_SYSLOG, - CAP_OPT_NOAUDIT) == 0) - return true; - fallthrough; - default: - return false; - } -} - static int kallsyms_open(struct inode *inode, struct file *file) { /* diff --git a/kernel/kcov.c b/kernel/kcov.c index 84c717337df0..f9ac2e9e460f 100644 --- a/kernel/kcov.c +++ b/kernel/kcov.c @@ -279,7 +279,7 @@ void notrace __sanitizer_cov_trace_cmp4(u32 arg1, u32 arg2) } EXPORT_SYMBOL(__sanitizer_cov_trace_cmp4); -void notrace __sanitizer_cov_trace_cmp8(u64 arg1, u64 arg2) +void notrace __sanitizer_cov_trace_cmp8(kcov_u64 arg1, kcov_u64 arg2) { write_comp_data(KCOV_CMP_SIZE(3), arg1, arg2, _RET_IP_); } @@ -306,16 +306,17 @@ void notrace __sanitizer_cov_trace_const_cmp4(u32 arg1, u32 arg2) } EXPORT_SYMBOL(__sanitizer_cov_trace_const_cmp4); -void notrace __sanitizer_cov_trace_const_cmp8(u64 arg1, u64 arg2) +void notrace __sanitizer_cov_trace_const_cmp8(kcov_u64 arg1, kcov_u64 arg2) { write_comp_data(KCOV_CMP_SIZE(3) | KCOV_CMP_CONST, arg1, arg2, _RET_IP_); } EXPORT_SYMBOL(__sanitizer_cov_trace_const_cmp8); -void notrace __sanitizer_cov_trace_switch(u64 val, u64 *cases) +void notrace __sanitizer_cov_trace_switch(kcov_u64 val, void *arg) { u64 i; + u64 *cases = arg; u64 count = cases[0]; u64 size = cases[1]; u64 type = KCOV_CMP_CONST; diff --git a/kernel/kcsan/core.c b/kernel/kcsan/core.c index 5a60cc52adc0..8a7baf4e332e 100644 --- a/kernel/kcsan/core.c +++ b/kernel/kcsan/core.c @@ -1270,7 +1270,9 @@ static __always_inline void kcsan_atomic_builtin_memorder(int memorder) DEFINE_TSAN_ATOMIC_OPS(8); DEFINE_TSAN_ATOMIC_OPS(16); DEFINE_TSAN_ATOMIC_OPS(32); +#ifdef CONFIG_64BIT DEFINE_TSAN_ATOMIC_OPS(64); +#endif void __tsan_atomic_thread_fence(int memorder); void __tsan_atomic_thread_fence(int memorder) diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c index 3d578c6fefee..e2f2574d8b74 100644 --- a/kernel/kexec_core.c +++ b/kernel/kexec_core.c @@ -1091,6 +1091,11 @@ __bpf_kfunc void crash_kexec(struct pt_regs *regs) } } +static inline resource_size_t crash_resource_size(const struct resource *res) +{ + return !res->end ? 
0 : resource_size(res); +} + ssize_t crash_get_memory_size(void) { ssize_t size = 0; @@ -1098,19 +1103,45 @@ ssize_t crash_get_memory_size(void) if (!kexec_trylock()) return -EBUSY; - if (crashk_res.end != crashk_res.start) - size = resource_size(&crashk_res); + size += crash_resource_size(&crashk_res); + size += crash_resource_size(&crashk_low_res); kexec_unlock(); return size; } +static int __crash_shrink_memory(struct resource *old_res, + unsigned long new_size) +{ + struct resource *ram_res; + + ram_res = kzalloc(sizeof(*ram_res), GFP_KERNEL); + if (!ram_res) + return -ENOMEM; + + ram_res->start = old_res->start + new_size; + ram_res->end = old_res->end; + ram_res->flags = IORESOURCE_BUSY | IORESOURCE_SYSTEM_RAM; + ram_res->name = "System RAM"; + + if (!new_size) { + release_resource(old_res); + old_res->start = 0; + old_res->end = 0; + } else { + crashk_res.end = ram_res->start - 1; + } + + crash_free_reserved_phys_range(ram_res->start, ram_res->end); + insert_resource(&iomem_resource, ram_res); + + return 0; +} + int crash_shrink_memory(unsigned long new_size) { int ret = 0; - unsigned long start, end; - unsigned long old_size; - struct resource *ram_res; + unsigned long old_size, low_size; if (!kexec_trylock()) return -EBUSY; @@ -1119,36 +1150,42 @@ int crash_shrink_memory(unsigned long new_size) ret = -ENOENT; goto unlock; } - start = crashk_res.start; - end = crashk_res.end; - old_size = (end == 0) ? 0 : end - start + 1; + + low_size = crash_resource_size(&crashk_low_res); + old_size = crash_resource_size(&crashk_res) + low_size; + new_size = roundup(new_size, KEXEC_CRASH_MEM_ALIGN); if (new_size >= old_size) { ret = (new_size == old_size) ? 0 : -EINVAL; goto unlock; } - ram_res = kzalloc(sizeof(*ram_res), GFP_KERNEL); - if (!ram_res) { - ret = -ENOMEM; - goto unlock; - } - - start = roundup(start, KEXEC_CRASH_MEM_ALIGN); - end = roundup(start + new_size, KEXEC_CRASH_MEM_ALIGN); - - crash_free_reserved_phys_range(end, crashk_res.end); - - if ((start == end) && (crashk_res.parent != NULL)) - release_resource(&crashk_res); - - ram_res->start = end; - ram_res->end = crashk_res.end; - ram_res->flags = IORESOURCE_BUSY | IORESOURCE_SYSTEM_RAM; - ram_res->name = "System RAM"; + /* + * (low_size > new_size) implies that low_size is greater than zero. + * This also means that if low_size is zero, the else branch is taken. + * + * If low_size is greater than 0, (low_size > new_size) indicates that + * crashk_low_res also needs to be shrunken. Otherwise, only crashk_res + * needs to be shrunken. 
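To make the case analysis in the comment above concrete (sizes invented for illustration): with crashk_res = 512M, crashk_low_res = 256M and a request of new_size = 128M, low_size (256M) is greater than new_size, so crashk_res is released entirely and crashk_low_res is trimmed to 128M. With new_size = 384M instead, the else branch trims only crashk_res, to 384M - 256M = 128M, and crashk_low_res is left untouched.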
+ */ + if (low_size > new_size) { + ret = __crash_shrink_memory(&crashk_res, 0); + if (ret) + goto unlock; - crashk_res.end = end - 1; + ret = __crash_shrink_memory(&crashk_low_res, new_size); + } else { + ret = __crash_shrink_memory(&crashk_res, new_size - low_size); + } - insert_resource(&iomem_resource, ram_res); + /* Swap crashk_res and crashk_low_res if needed */ + if (!crashk_res.end && crashk_low_res.end) { + crashk_res.start = crashk_low_res.start; + crashk_res.end = crashk_low_res.end; + release_resource(&crashk_low_res); + crashk_low_res.start = 0; + crashk_low_res.end = 0; + insert_resource(&iomem_resource, &crashk_res); + } unlock: kexec_unlock(); diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c index f989f5f1933b..881ba0d1714c 100644 --- a/kernel/kexec_file.c +++ b/kernel/kexec_file.c @@ -867,6 +867,7 @@ static int kexec_purgatory_setup_sechdrs(struct purgatory_info *pi, { unsigned long bss_addr; unsigned long offset; + size_t sechdrs_size; Elf_Shdr *sechdrs; int i; @@ -874,11 +875,11 @@ static int kexec_purgatory_setup_sechdrs(struct purgatory_info *pi, * The section headers in kexec_purgatory are read-only. In order to * have them modifiable make a temporary copy. */ - sechdrs = vzalloc(array_size(sizeof(Elf_Shdr), pi->ehdr->e_shnum)); + sechdrs_size = array_size(sizeof(Elf_Shdr), pi->ehdr->e_shnum); + sechdrs = vzalloc(sechdrs_size); if (!sechdrs) return -ENOMEM; - memcpy(sechdrs, (void *)pi->ehdr + pi->ehdr->e_shoff, - pi->ehdr->e_shnum * sizeof(Elf_Shdr)); + memcpy(sechdrs, (void *)pi->ehdr + pi->ehdr->e_shoff, sechdrs_size); pi->sechdrs = sechdrs; offset = 0; @@ -901,10 +902,22 @@ static int kexec_purgatory_setup_sechdrs(struct purgatory_info *pi, } offset = ALIGN(offset, align); + + /* + * Check if the segment contains the entry point, if so, + * calculate the value of image->start based on it. + * If the compiler has produced more than one .text section + * (Eg: .text.hot), they are generally after the main .text + * section, and they shall not be used to calculate + * image->start. So do not re-calculate image->start if it + * is not set to the initial value, and warn the user so they + * have a chance to fix their purgatory's linker script. + */ if (sechdrs[i].sh_flags & SHF_EXECINSTR && pi->ehdr->e_entry >= sechdrs[i].sh_addr && pi->ehdr->e_entry < (sechdrs[i].sh_addr - + sechdrs[i].sh_size)) { + + sechdrs[i].sh_size) && + !WARN_ON(kbuf->image->start != pi->ehdr->e_entry)) { kbuf->image->start -= sechdrs[i].sh_addr; kbuf->image->start += kbuf->mem + offset; } diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 00e177de91cc..ce13f1a35251 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -2127,6 +2127,7 @@ static int pre_handler_kretprobe(struct kprobe *p, struct pt_regs *regs) NOKPROBE_SYMBOL(pre_handler_kretprobe); static void kretprobe_rethook_handler(struct rethook_node *rh, void *data, + unsigned long ret_addr, struct pt_regs *regs) { struct kretprobe *rp = (struct kretprobe *)data; diff --git a/kernel/ksyms_common.c b/kernel/ksyms_common.c new file mode 100644 index 000000000000..cf1a73cbf2f6 --- /dev/null +++ b/kernel/ksyms_common.c @@ -0,0 +1,43 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * ksyms_common.c: A split of kernel/kallsyms.c + * Contains a few generic function definations independent of config KALLSYMS. 
+ */ +#include <linux/kallsyms.h> +#include <linux/security.h> + +static inline int kallsyms_for_perf(void) +{ +#ifdef CONFIG_PERF_EVENTS + extern int sysctl_perf_event_paranoid; + + if (sysctl_perf_event_paranoid <= 1) + return 1; +#endif + return 0; +} + +/* + * We show kallsyms information even to normal users if we've enabled + * kernel profiling and are explicitly not paranoid (so kptr_restrict + * is clear, and sysctl_perf_event_paranoid isn't set). + * + * Otherwise, require CAP_SYSLOG (assuming kptr_restrict isn't set to + * block even that). + */ +bool kallsyms_show_value(const struct cred *cred) +{ + switch (kptr_restrict) { + case 0: + if (kallsyms_for_perf()) + return true; + fallthrough; + case 1: + if (security_capable(cred, &init_user_ns, CAP_SYSLOG, + CAP_OPT_NOAUDIT) == 0) + return true; + fallthrough; + default: + return false; + } +} diff --git a/kernel/kthread.c b/kernel/kthread.c index 490792b1066e..4fff7df17a68 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -182,6 +182,16 @@ bool kthread_should_park(void) } EXPORT_SYMBOL_GPL(kthread_should_park); +bool kthread_should_stop_or_park(void) +{ + struct kthread *kthread = __to_kthread(current); + + if (!kthread) + return false; + + return kthread->flags & (BIT(KTHREAD_SHOULD_STOP) | BIT(KTHREAD_SHOULD_PARK)); +} + /** * kthread_freezable_should_stop - should this freezable kthread return now? * @was_frozen: optional out parameter, indicates whether %current was frozen @@ -312,10 +322,10 @@ void __noreturn kthread_exit(long result) * @comp: Completion to complete * @code: The integer value to return to kthread_stop(). * - * If present complete @comp and the reuturn code to kthread_stop(). + * If present, complete @comp and then return code to kthread_stop(). * * A kernel thread whose module may be removed after the completion of - * @comp can use this function exit safely. + * @comp can use this function to exit safely. * * Does not return. 
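As a point of reference for the new kthread_should_stop_or_park() helper added in the kthread.c hunk above, here is a minimal, invented worker loop that wants to leave its inner work loop on either a stop or a park request while still letting kthread_parkme() do the actual parking:

#include <linux/kthread.h>
#include <linux/sched.h>

static int example_worker_fn(void *data)
{
        while (!kthread_should_stop()) {
                /* Park here if somebody asked us to. */
                kthread_parkme();

                /* Work until either a stop or a park is requested. */
                while (!kthread_should_stop_or_park()) {
                        /* ... one unit of work ... */
                        cond_resched();
                }
        }
        return 0;
}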
*/ diff --git a/kernel/locking/lock_events.h b/kernel/locking/lock_events.h index 8c7e7d25f09c..a6016b91803d 100644 --- a/kernel/locking/lock_events.h +++ b/kernel/locking/lock_events.h @@ -57,4 +57,8 @@ static inline void __lockevent_add(enum lock_events event, int inc) #define lockevent_cond_inc(ev, c) #endif /* CONFIG_LOCK_EVENT_COUNTS */ + +ssize_t lockevent_read(struct file *file, char __user *user_buf, + size_t count, loff_t *ppos); + #endif /* __LOCKING_LOCK_EVENTS_H */ diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index dcd1d5bfc1e0..111607d91489 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -709,7 +709,7 @@ void get_usage_chars(struct lock_class *class, char usage[LOCK_USAGE_CHARS]) usage[i] = '\0'; } -static void __print_lock_name(struct lock_class *class) +static void __print_lock_name(struct held_lock *hlock, struct lock_class *class) { char str[KSYM_NAME_LEN]; const char *name; @@ -724,17 +724,19 @@ static void __print_lock_name(struct lock_class *class) printk(KERN_CONT "#%d", class->name_version); if (class->subclass) printk(KERN_CONT "/%d", class->subclass); + if (hlock && class->print_fn) + class->print_fn(hlock->instance); } } -static void print_lock_name(struct lock_class *class) +static void print_lock_name(struct held_lock *hlock, struct lock_class *class) { char usage[LOCK_USAGE_CHARS]; get_usage_chars(class, usage); printk(KERN_CONT " ("); - __print_lock_name(class); + __print_lock_name(hlock, class); printk(KERN_CONT "){%s}-{%d:%d}", usage, class->wait_type_outer ?: class->wait_type_inner, class->wait_type_inner); @@ -772,7 +774,7 @@ static void print_lock(struct held_lock *hlock) } printk(KERN_CONT "%px", hlock->instance); - print_lock_name(lock); + print_lock_name(hlock, lock); printk(KERN_CONT ", at: %pS\n", (void *)hlock->acquire_ip); } @@ -1868,7 +1870,7 @@ print_circular_bug_entry(struct lock_list *target, int depth) if (debug_locks_silent) return; printk("\n-> #%u", depth); - print_lock_name(target->class); + print_lock_name(NULL, target->class); printk(KERN_CONT ":\n"); print_lock_trace(target->trace, 6); } @@ -1899,11 +1901,11 @@ print_circular_lock_scenario(struct held_lock *src, */ if (parent != source) { printk("Chain exists of:\n "); - __print_lock_name(source); + __print_lock_name(src, source); printk(KERN_CONT " --> "); - __print_lock_name(parent); + __print_lock_name(NULL, parent); printk(KERN_CONT " --> "); - __print_lock_name(target); + __print_lock_name(tgt, target); printk(KERN_CONT "\n\n"); } @@ -1914,13 +1916,13 @@ print_circular_lock_scenario(struct held_lock *src, printk(" rlock("); else printk(" lock("); - __print_lock_name(target); + __print_lock_name(tgt, target); printk(KERN_CONT ");\n"); printk(" lock("); - __print_lock_name(parent); + __print_lock_name(NULL, parent); printk(KERN_CONT ");\n"); printk(" lock("); - __print_lock_name(target); + __print_lock_name(tgt, target); printk(KERN_CONT ");\n"); if (src_read != 0) printk(" rlock("); @@ -1928,7 +1930,7 @@ print_circular_lock_scenario(struct held_lock *src, printk(" sync("); else printk(" lock("); - __print_lock_name(source); + __print_lock_name(src, source); printk(KERN_CONT ");\n"); printk("\n *** DEADLOCK ***\n\n"); } @@ -2154,6 +2156,8 @@ check_path(struct held_lock *target, struct lock_list *src_entry, return ret; } +static void print_deadlock_bug(struct task_struct *, struct held_lock *, struct held_lock *); + /* * Prove that the dependency graph starting at <src> can not * lead to <target>. 
If it can, there is a circle when adding @@ -2185,7 +2189,10 @@ check_noncircular(struct held_lock *src, struct held_lock *target, *trace = save_trace(); } - print_circular_bug(&src_entry, target_entry, src, target); + if (src->class_idx == target->class_idx) + print_deadlock_bug(current, src, target); + else + print_circular_bug(&src_entry, target_entry, src, target); } return ret; @@ -2263,6 +2270,9 @@ static inline bool usage_match(struct lock_list *entry, void *mask) static inline bool usage_skip(struct lock_list *entry, void *mask) { + if (entry->class->lock_type == LD_LOCK_NORMAL) + return false; + /* * Skip local_lock() for irq inversion detection. * @@ -2289,14 +2299,16 @@ static inline bool usage_skip(struct lock_list *entry, void *mask) * As a result, we will skip local_lock(), when we search for irq * inversion bugs. */ - if (entry->class->lock_type == LD_LOCK_PERCPU) { - if (DEBUG_LOCKS_WARN_ON(entry->class->wait_type_inner < LD_WAIT_CONFIG)) - return false; + if (entry->class->lock_type == LD_LOCK_PERCPU && + DEBUG_LOCKS_WARN_ON(entry->class->wait_type_inner < LD_WAIT_CONFIG)) + return false; - return true; - } + /* + * Skip WAIT_OVERRIDE for irq inversion detection -- it's not actually + * a lock and only used to override the wait_type. + */ - return false; + return true; } /* @@ -2341,7 +2353,7 @@ static void print_lock_class_header(struct lock_class *class, int depth) int bit; printk("%*s->", depth, ""); - print_lock_name(class); + print_lock_name(NULL, class); #ifdef CONFIG_DEBUG_LOCKDEP printk(KERN_CONT " ops: %lu", debug_class_ops_read(class)); #endif @@ -2523,11 +2535,11 @@ print_irq_lock_scenario(struct lock_list *safe_entry, */ if (middle_class != unsafe_class) { printk("Chain exists of:\n "); - __print_lock_name(safe_class); + __print_lock_name(NULL, safe_class); printk(KERN_CONT " --> "); - __print_lock_name(middle_class); + __print_lock_name(NULL, middle_class); printk(KERN_CONT " --> "); - __print_lock_name(unsafe_class); + __print_lock_name(NULL, unsafe_class); printk(KERN_CONT "\n\n"); } @@ -2535,18 +2547,18 @@ print_irq_lock_scenario(struct lock_list *safe_entry, printk(" CPU0 CPU1\n"); printk(" ---- ----\n"); printk(" lock("); - __print_lock_name(unsafe_class); + __print_lock_name(NULL, unsafe_class); printk(KERN_CONT ");\n"); printk(" local_irq_disable();\n"); printk(" lock("); - __print_lock_name(safe_class); + __print_lock_name(NULL, safe_class); printk(KERN_CONT ");\n"); printk(" lock("); - __print_lock_name(middle_class); + __print_lock_name(NULL, middle_class); printk(KERN_CONT ");\n"); printk(" <Interrupt>\n"); printk(" lock("); - __print_lock_name(safe_class); + __print_lock_name(NULL, safe_class); printk(KERN_CONT ");\n"); printk("\n *** DEADLOCK ***\n\n"); } @@ -2583,20 +2595,20 @@ print_bad_irq_dependency(struct task_struct *curr, pr_warn("\nand this task is already holding:\n"); print_lock(prev); pr_warn("which would create a new lock dependency:\n"); - print_lock_name(hlock_class(prev)); + print_lock_name(prev, hlock_class(prev)); pr_cont(" ->"); - print_lock_name(hlock_class(next)); + print_lock_name(next, hlock_class(next)); pr_cont("\n"); pr_warn("\nbut this new dependency connects a %s-irq-safe lock:\n", irqclass); - print_lock_name(backwards_entry->class); + print_lock_name(NULL, backwards_entry->class); pr_warn("\n... 
which became %s-irq-safe at:\n", irqclass); print_lock_trace(backwards_entry->class->usage_traces[bit1], 1); pr_warn("\nto a %s-irq-unsafe lock:\n", irqclass); - print_lock_name(forwards_entry->class); + print_lock_name(NULL, forwards_entry->class); pr_warn("\n... which became %s-irq-unsafe at:\n", irqclass); pr_warn("..."); @@ -2966,10 +2978,10 @@ print_deadlock_scenario(struct held_lock *nxt, struct held_lock *prv) printk(" CPU0\n"); printk(" ----\n"); printk(" lock("); - __print_lock_name(prev); + __print_lock_name(prv, prev); printk(KERN_CONT ");\n"); printk(" lock("); - __print_lock_name(next); + __print_lock_name(nxt, next); printk(KERN_CONT ");\n"); printk("\n *** DEADLOCK ***\n\n"); printk(" May be due to missing lock nesting notation\n\n"); @@ -2979,6 +2991,8 @@ static void print_deadlock_bug(struct task_struct *curr, struct held_lock *prev, struct held_lock *next) { + struct lock_class *class = hlock_class(prev); + if (!debug_locks_off_graph_unlock() || debug_locks_silent) return; @@ -2993,6 +3007,11 @@ print_deadlock_bug(struct task_struct *curr, struct held_lock *prev, pr_warn("\nbut task is already holding lock:\n"); print_lock(prev); + if (class->cmp_fn) { + pr_warn("and the lock comparison function returns %i:\n", + class->cmp_fn(prev->instance, next->instance)); + } + pr_warn("\nother info that might help us debug this:\n"); print_deadlock_scenario(next, prev); lockdep_print_held_locks(curr); @@ -3014,6 +3033,7 @@ print_deadlock_bug(struct task_struct *curr, struct held_lock *prev, static int check_deadlock(struct task_struct *curr, struct held_lock *next) { + struct lock_class *class; struct held_lock *prev; struct held_lock *nest = NULL; int i; @@ -3034,6 +3054,12 @@ check_deadlock(struct task_struct *curr, struct held_lock *next) if ((next->read == 2) && prev->read) continue; + class = hlock_class(prev); + + if (class->cmp_fn && + class->cmp_fn(prev->instance, next->instance) < 0) + continue; + /* * We're holding the nest_lock, which serializes this lock's * nesting behaviour. @@ -3095,6 +3121,14 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev, return 2; } + if (prev->class_idx == next->class_idx) { + struct lock_class *class = hlock_class(prev); + + if (class->cmp_fn && + class->cmp_fn(prev->instance, next->instance) < 0) + return 2; + } + /* * Prove that the new <prev> -> <next> dependency would not * create a circular dependency in the graph. 
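To illustrate the cmp_fn/print_fn hooks consulted in check_deadlock() and check_prev_add() above, a sketch of a same-class nesting annotation. The structure and its ordering rule are invented; what is assumed from the hunks is only that the callbacks receive the held locks' lockdep_map instances and that a negative cmp_fn(prev, next) return means "this nesting order is valid":

struct example_node {
        u64             pos;    /* the only legal lock order: ascending pos */
        struct mutex    lock;
};

static int example_node_cmp_fn(const struct lockdep_map *a,
                               const struct lockdep_map *b)
{
        const struct example_node *na = container_of(a, struct example_node, lock.dep_map);
        const struct example_node *nb = container_of(b, struct example_node, lock.dep_map);

        if (na->pos == nb->pos)
                return 0;
        return na->pos < nb->pos ? -1 : 1;
}

static void example_node_print_fn(const struct lockdep_map *map)
{
        const struct example_node *n = container_of(map, struct example_node, lock.dep_map);

        printk(KERN_CONT " pos=%llu", n->pos);
}

The pair would be registered once per lock through the lockdep_set_lock_cmp_fn() helper introduced further down in this lockdep.c diff; whether callers hand it the lock itself or its embedded dep_map depends on the header-side wrapper, which is outside these hunks.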
(We do this by @@ -3571,7 +3605,7 @@ static void print_chain_keys_chain(struct lock_chain *chain) hlock_id = chain_hlocks[chain->base + i]; chain_key = print_chain_key_iteration(hlock_id, chain_key); - print_lock_name(lock_classes + chain_hlock_class_idx(hlock_id)); + print_lock_name(NULL, lock_classes + chain_hlock_class_idx(hlock_id)); printk("\n"); } } @@ -3928,11 +3962,11 @@ static void print_usage_bug_scenario(struct held_lock *lock) printk(" CPU0\n"); printk(" ----\n"); printk(" lock("); - __print_lock_name(class); + __print_lock_name(lock, class); printk(KERN_CONT ");\n"); printk(" <Interrupt>\n"); printk(" lock("); - __print_lock_name(class); + __print_lock_name(lock, class); printk(KERN_CONT ");\n"); printk("\n *** DEADLOCK ***\n\n"); } @@ -4018,7 +4052,7 @@ print_irq_inversion_bug(struct task_struct *curr, pr_warn("but this lock took another, %s-unsafe lock in the past:\n", irqclass); else pr_warn("but this lock was taken by another, %s-safe lock in the past:\n", irqclass); - print_lock_name(other->class); + print_lock_name(NULL, other->class); pr_warn("\n\nand interrupts could create inverse lock ordering between them.\n\n"); pr_warn("\nother info that might help us debug this:\n"); @@ -4768,7 +4802,8 @@ static int check_wait_context(struct task_struct *curr, struct held_lock *next) for (; depth < curr->lockdep_depth; depth++) { struct held_lock *prev = curr->held_locks + depth; - u8 prev_inner = hlock_class(prev)->wait_type_inner; + struct lock_class *class = hlock_class(prev); + u8 prev_inner = class->wait_type_inner; if (prev_inner) { /* @@ -4778,6 +4813,14 @@ static int check_wait_context(struct task_struct *curr, struct held_lock *next) * Also due to trylocks. */ curr_inner = min(curr_inner, prev_inner); + + /* + * Allow override for annotations -- this is typically + * only valid/needed for code that only exists when + * CONFIG_PREEMPT_RT=n. + */ + if (unlikely(class->lock_type == LD_LOCK_WAIT_OVERRIDE)) + curr_inner = prev_inner; } } @@ -4882,6 +4925,33 @@ EXPORT_SYMBOL_GPL(lockdep_init_map_type); struct lock_class_key __lockdep_no_validate__; EXPORT_SYMBOL_GPL(__lockdep_no_validate__); +#ifdef CONFIG_PROVE_LOCKING +void lockdep_set_lock_cmp_fn(struct lockdep_map *lock, lock_cmp_fn cmp_fn, + lock_print_fn print_fn) +{ + struct lock_class *class = lock->class_cache[0]; + unsigned long flags; + + raw_local_irq_save(flags); + lockdep_recursion_inc(); + + if (!class) + class = register_lock_class(lock, 0, 0); + + if (class) { + WARN_ON(class->cmp_fn && class->cmp_fn != cmp_fn); + WARN_ON(class->print_fn && class->print_fn != print_fn); + + class->cmp_fn = cmp_fn; + class->print_fn = print_fn; + } + + lockdep_recursion_finish(); + raw_local_irq_restore(flags); +} +EXPORT_SYMBOL_GPL(lockdep_set_lock_cmp_fn); +#endif + static void print_lock_nested_lock_not_held(struct task_struct *curr, struct held_lock *hlock) diff --git a/kernel/locking/locktorture.c b/kernel/locking/locktorture.c index 153ddc4c47ef..949d3deae506 100644 --- a/kernel/locking/locktorture.c +++ b/kernel/locking/locktorture.c @@ -33,24 +33,19 @@ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Paul E. 
McKenney <paulmck@linux.ibm.com>"); -torture_param(int, nwriters_stress, -1, - "Number of write-locking stress-test threads"); -torture_param(int, nreaders_stress, -1, - "Number of read-locking stress-test threads"); +torture_param(int, nwriters_stress, -1, "Number of write-locking stress-test threads"); +torture_param(int, nreaders_stress, -1, "Number of read-locking stress-test threads"); +torture_param(int, long_hold, 100, "Do occasional long hold of lock (ms), 0=disable"); torture_param(int, onoff_holdoff, 0, "Time after boot before CPU hotplugs (s)"); -torture_param(int, onoff_interval, 0, - "Time between CPU hotplugs (s), 0=disable"); -torture_param(int, shuffle_interval, 3, - "Number of jiffies between shuffles, 0=disable"); +torture_param(int, onoff_interval, 0, "Time between CPU hotplugs (s), 0=disable"); +torture_param(int, shuffle_interval, 3, "Number of jiffies between shuffles, 0=disable"); torture_param(int, shutdown_secs, 0, "Shutdown time (j), <= zero to disable."); -torture_param(int, stat_interval, 60, - "Number of seconds between stats printk()s"); +torture_param(int, stat_interval, 60, "Number of seconds between stats printk()s"); torture_param(int, stutter, 5, "Number of jiffies to run/halt test, 0=disable"); torture_param(int, rt_boost, 2, - "Do periodic rt-boost. 0=Disable, 1=Only for rt_mutex, 2=For all lock types."); + "Do periodic rt-boost. 0=Disable, 1=Only for rt_mutex, 2=For all lock types."); torture_param(int, rt_boost_factor, 50, "A factor determining how often rt-boost happens."); -torture_param(int, verbose, 1, - "Enable verbose debugging printk()s"); +torture_param(int, verbose, 1, "Enable verbose debugging printk()s"); torture_param(int, nested_locks, 0, "Number of nested locks (max = 8)"); /* Going much higher trips "BUG: MAX_LOCKDEP_CHAIN_HLOCKS too low!" errors */ #define MAX_NESTED_LOCKS 8 @@ -120,7 +115,7 @@ static int torture_lock_busted_write_lock(int tid __maybe_unused) static void torture_lock_busted_write_delay(struct torture_random_state *trsp) { - const unsigned long longdelay_ms = 100; + const unsigned long longdelay_ms = long_hold ? long_hold : ULONG_MAX; /* We want a long delay occasionally to force massive contention. */ if (!(torture_random(trsp) % @@ -198,16 +193,18 @@ __acquires(torture_spinlock) static void torture_spin_lock_write_delay(struct torture_random_state *trsp) { const unsigned long shortdelay_us = 2; - const unsigned long longdelay_ms = 100; + const unsigned long longdelay_ms = long_hold ? long_hold : ULONG_MAX; + unsigned long j; /* We want a short delay mostly to emulate likely code, and * we want a long delay occasionally to force massive contention. */ - if (!(torture_random(trsp) % - (cxt.nrealwriters_stress * 2000 * longdelay_ms))) + if (!(torture_random(trsp) % (cxt.nrealwriters_stress * 2000 * longdelay_ms))) { + j = jiffies; mdelay(longdelay_ms); - if (!(torture_random(trsp) % - (cxt.nrealwriters_stress * 2 * shortdelay_us))) + pr_alert("%s: delay = %lu jiffies.\n", __func__, jiffies - j); + } + if (!(torture_random(trsp) % (cxt.nrealwriters_stress * 200 * shortdelay_us))) udelay(shortdelay_us); if (!(torture_random(trsp) % (cxt.nrealwriters_stress * 20000))) torture_preempt_schedule(); /* Allow test to be preempted. */ @@ -322,7 +319,7 @@ __acquires(torture_rwlock) static void torture_rwlock_write_delay(struct torture_random_state *trsp) { const unsigned long shortdelay_us = 2; - const unsigned long longdelay_ms = 100; + const unsigned long longdelay_ms = long_hold ? 
long_hold : ULONG_MAX; /* We want a short delay mostly to emulate likely code, and * we want a long delay occasionally to force massive contention. @@ -455,14 +452,12 @@ __acquires(torture_mutex) static void torture_mutex_delay(struct torture_random_state *trsp) { - const unsigned long longdelay_ms = 100; + const unsigned long longdelay_ms = long_hold ? long_hold : ULONG_MAX; /* We want a long delay occasionally to force massive contention. */ if (!(torture_random(trsp) % (cxt.nrealwriters_stress * 2000 * longdelay_ms))) mdelay(longdelay_ms * 5); - else - mdelay(longdelay_ms / 5); if (!(torture_random(trsp) % (cxt.nrealwriters_stress * 20000))) torture_preempt_schedule(); /* Allow test to be preempted. */ } @@ -630,7 +625,7 @@ __acquires(torture_rtmutex) static void torture_rtmutex_delay(struct torture_random_state *trsp) { const unsigned long shortdelay_us = 2; - const unsigned long longdelay_ms = 100; + const unsigned long longdelay_ms = long_hold ? long_hold : ULONG_MAX; /* * We want a short delay mostly to emulate likely code, and @@ -640,7 +635,7 @@ static void torture_rtmutex_delay(struct torture_random_state *trsp) (cxt.nrealwriters_stress * 2000 * longdelay_ms))) mdelay(longdelay_ms); if (!(torture_random(trsp) % - (cxt.nrealwriters_stress * 2 * shortdelay_us))) + (cxt.nrealwriters_stress * 200 * shortdelay_us))) udelay(shortdelay_us); if (!(torture_random(trsp) % (cxt.nrealwriters_stress * 20000))) torture_preempt_schedule(); /* Allow test to be preempted. */ @@ -695,14 +690,12 @@ __acquires(torture_rwsem) static void torture_rwsem_write_delay(struct torture_random_state *trsp) { - const unsigned long longdelay_ms = 100; + const unsigned long longdelay_ms = long_hold ? long_hold : ULONG_MAX; /* We want a long delay occasionally to force massive contention. */ if (!(torture_random(trsp) % (cxt.nrealwriters_stress * 2000 * longdelay_ms))) mdelay(longdelay_ms * 10); - else - mdelay(longdelay_ms / 10); if (!(torture_random(trsp) % (cxt.nrealwriters_stress * 20000))) torture_preempt_schedule(); /* Allow test to be preempted. 
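The new long_hold module parameter replaces the hard-coded occasional 100 ms hold in the *_delay() routines above and below. As a usage example (the invocation is illustrative; the parameter names come from the hunks), loading the module with torture_type=mutex_lock long_hold=0 disables the long-hold path, while long_hold=1000 stretches the occasional hold to a full second.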
*/ } @@ -848,8 +841,8 @@ static int lock_torture_writer(void *arg) lwsp->n_lock_acquired++; } - cxt.cur_ops->write_delay(&rand); if (!skip_main_lock) { + cxt.cur_ops->write_delay(&rand); lock_is_write_held = false; WRITE_ONCE(last_lock_release, jiffies); cxt.cur_ops->writeunlock(tid); diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c index acb5a50309a1..9eabd585ce7a 100644 --- a/kernel/locking/rwsem.c +++ b/kernel/locking/rwsem.c @@ -1240,7 +1240,7 @@ static struct rw_semaphore *rwsem_downgrade_wake(struct rw_semaphore *sem) /* * lock for reading */ -static inline int __down_read_common(struct rw_semaphore *sem, int state) +static __always_inline int __down_read_common(struct rw_semaphore *sem, int state) { int ret = 0; long count; @@ -1258,17 +1258,17 @@ out: return ret; } -static inline void __down_read(struct rw_semaphore *sem) +static __always_inline void __down_read(struct rw_semaphore *sem) { __down_read_common(sem, TASK_UNINTERRUPTIBLE); } -static inline int __down_read_interruptible(struct rw_semaphore *sem) +static __always_inline int __down_read_interruptible(struct rw_semaphore *sem) { return __down_read_common(sem, TASK_INTERRUPTIBLE); } -static inline int __down_read_killable(struct rw_semaphore *sem) +static __always_inline int __down_read_killable(struct rw_semaphore *sem) { return __down_read_common(sem, TASK_KILLABLE); } diff --git a/kernel/module/decompress.c b/kernel/module/decompress.c index e97232b125eb..8a5d6d63b06c 100644 --- a/kernel/module/decompress.c +++ b/kernel/module/decompress.c @@ -257,7 +257,7 @@ static ssize_t module_zstd_decompress(struct load_info *info, do { struct page *page = module_get_next_page(info); - if (!IS_ERR(page)) { + if (IS_ERR(page)) { retval = PTR_ERR(page); goto out; } diff --git a/kernel/module/kallsyms.c b/kernel/module/kallsyms.c index c550d7d45f2f..ef73ae7c8909 100644 --- a/kernel/module/kallsyms.c +++ b/kernel/module/kallsyms.c @@ -381,34 +381,6 @@ out: return -ERANGE; } -int lookup_module_symbol_attrs(unsigned long addr, unsigned long *size, - unsigned long *offset, char *modname, char *name) -{ - struct module *mod; - - preempt_disable(); - list_for_each_entry_rcu(mod, &modules, list) { - if (mod->state == MODULE_STATE_UNFORMED) - continue; - if (within_module(addr, mod)) { - const char *sym; - - sym = find_kallsyms_symbol(mod, addr, size, offset); - if (!sym) - goto out; - if (modname) - strscpy(modname, mod->name, MODULE_NAME_LEN); - if (name) - strscpy(name, sym, KSYM_NAME_LEN); - preempt_enable(); - return 0; - } - } -out: - preempt_enable(); - return -ERANGE; -} - int module_get_kallsym(unsigned int symnum, unsigned long *value, char *type, char *name, char *module_name, int *exported) { diff --git a/kernel/module/main.c b/kernel/module/main.c index 044aa2c9e3cb..834de86ebe35 100644 --- a/kernel/module/main.c +++ b/kernel/module/main.c @@ -820,10 +820,8 @@ static struct module_attribute modinfo_refcnt = void __module_get(struct module *module) { if (module) { - preempt_disable(); atomic_inc(&module->refcnt); trace_module_get(module, _RET_IP_); - preempt_enable(); } } EXPORT_SYMBOL(__module_get); @@ -833,15 +831,12 @@ bool try_module_get(struct module *module) bool ret = true; if (module) { - preempt_disable(); /* Note: here, we can fail to get a reference */ if (likely(module_is_live(module) && atomic_inc_not_zero(&module->refcnt) != 0)) trace_module_get(module, _RET_IP_); else ret = false; - - preempt_enable(); } return ret; } @@ -852,11 +847,9 @@ void module_put(struct module *module) int ret; if (module) { - 
preempt_disable(); ret = atomic_dec_if_positive(&module->refcnt); WARN_ON(ret < 0); /* Failed to put refcount */ trace_module_put(module, _RET_IP_); - preempt_enable(); } } EXPORT_SYMBOL(module_put); @@ -1521,14 +1514,14 @@ static void __layout_sections(struct module *mod, struct load_info *info, bool i MOD_RODATA, MOD_RO_AFTER_INIT, MOD_DATA, - MOD_INVALID, /* This is needed to match the masks array */ + MOD_DATA, }; static const int init_m_to_mem_type[] = { MOD_INIT_TEXT, MOD_INIT_RODATA, MOD_INVALID, MOD_INIT_DATA, - MOD_INVALID, /* This is needed to match the masks array */ + MOD_INIT_DATA, }; for (m = 0; m < ARRAY_SIZE(masks); ++m) { @@ -3057,26 +3050,83 @@ SYSCALL_DEFINE3(init_module, void __user *, umod, return load_module(&info, uargs, 0); } -SYSCALL_DEFINE3(finit_module, int, fd, const char __user *, uargs, int, flags) +struct idempotent { + const void *cookie; + struct hlist_node entry; + struct completion complete; + int ret; +}; + +#define IDEM_HASH_BITS 8 +static struct hlist_head idem_hash[1 << IDEM_HASH_BITS]; +static DEFINE_SPINLOCK(idem_lock); + +static bool idempotent(struct idempotent *u, const void *cookie) { + int hash = hash_ptr(cookie, IDEM_HASH_BITS); + struct hlist_head *head = idem_hash + hash; + struct idempotent *existing; + bool first; + + u->ret = 0; + u->cookie = cookie; + init_completion(&u->complete); + + spin_lock(&idem_lock); + first = true; + hlist_for_each_entry(existing, head, entry) { + if (existing->cookie != cookie) + continue; + first = false; + break; + } + hlist_add_head(&u->entry, idem_hash + hash); + spin_unlock(&idem_lock); + + return !first; +} + +/* + * We were the first one with 'cookie' on the list, and we ended + * up completing the operation. We now need to walk the list, + * remove everybody - which includes ourselves - fill in the return + * value, and then complete the operation. 
+ */ +static void idempotent_complete(struct idempotent *u, int ret) +{ + const void *cookie = u->cookie; + int hash = hash_ptr(cookie, IDEM_HASH_BITS); + struct hlist_head *head = idem_hash + hash; + struct hlist_node *next; + struct idempotent *pos; + + spin_lock(&idem_lock); + hlist_for_each_entry_safe(pos, next, head, entry) { + if (pos->cookie != cookie) + continue; + hlist_del(&pos->entry); + pos->ret = ret; + complete(&pos->complete); + } + spin_unlock(&idem_lock); +} + +static int init_module_from_file(struct file *f, const char __user * uargs, int flags) +{ + struct idempotent idem; struct load_info info = { }; void *buf = NULL; - int len; - int err; + int len, ret; - err = may_init_module(); - if (err) - return err; - - pr_debug("finit_module: fd=%d, uargs=%p, flags=%i\n", fd, uargs, flags); + if (!f || !(f->f_mode & FMODE_READ)) + return -EBADF; - if (flags & ~(MODULE_INIT_IGNORE_MODVERSIONS - |MODULE_INIT_IGNORE_VERMAGIC - |MODULE_INIT_COMPRESSED_FILE)) - return -EINVAL; + if (idempotent(&idem, file_inode(f))) { + wait_for_completion(&idem.complete); + return idem.ret; + } - len = kernel_read_file_from_fd(fd, 0, &buf, INT_MAX, NULL, - READING_MODULE); + len = kernel_read_file(f, 0, &buf, INT_MAX, NULL, READING_MODULE); if (len < 0) { mod_stat_inc(&failed_kreads); mod_stat_add_long(len, &invalid_kread_bytes); @@ -3084,7 +3134,7 @@ SYSCALL_DEFINE3(finit_module, int, fd, const char __user *, uargs, int, flags) } if (flags & MODULE_INIT_COMPRESSED_FILE) { - err = module_decompress(&info, buf, len); + int err = module_decompress(&info, buf, len); vfree(buf); /* compressed data is no longer needed */ if (err) { mod_stat_inc(&failed_decompress); @@ -3096,7 +3146,31 @@ SYSCALL_DEFINE3(finit_module, int, fd, const char __user *, uargs, int, flags) info.len = len; } - return load_module(&info, uargs, flags); + ret = load_module(&info, uargs, flags); + idempotent_complete(&idem, ret); + return ret; +} + +SYSCALL_DEFINE3(finit_module, int, fd, const char __user *, uargs, int, flags) +{ + int err; + struct fd f; + + err = may_init_module(); + if (err) + return err; + + pr_debug("finit_module: fd=%d, uargs=%p, flags=%i\n", fd, uargs, flags); + + if (flags & ~(MODULE_INIT_IGNORE_MODVERSIONS + |MODULE_INIT_IGNORE_VERMAGIC + |MODULE_INIT_COMPRESSED_FILE)) + return -EINVAL; + + f = fdget(fd); + err = init_module_from_file(f.file, uargs, flags); + fdput(f); + return err; } /* Keep in sync with MODULE_FLAGS_BUF_SIZE !!! 
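The struct idempotent machinery above is what lets concurrent finit_module() calls for the same file share a single load. A condensed sketch of the calling pattern (the do_load callback is invented; idempotent(), idempotent_complete() and the cookie/completion semantics are those of the hunk):

static int load_once(const void *cookie,
                     int (*do_load)(const void *cookie))
{
        struct idempotent idem;
        int ret;

        /* Duplicate caller: wait for the first one and reuse its result. */
        if (idempotent(&idem, cookie)) {
                wait_for_completion(&idem.complete);
                return idem.ret;
        }

        /* First caller: do the work, then wake every waiter (and ourselves). */
        ret = do_load(cookie);
        idempotent_complete(&idem, ret);
        return ret;
}

In init_module_from_file() the cookie is file_inode(f), so two tasks racing to load the same module file perform the expensive read/decompress/load only once and both observe the same return value.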
*/ diff --git a/kernel/module/stats.c b/kernel/module/stats.c index ad7b6ada29f2..6ab2c94d6bc3 100644 --- a/kernel/module/stats.c +++ b/kernel/module/stats.c @@ -276,6 +276,7 @@ static ssize_t read_file_mod_stats(struct file *file, char __user *user_buf, struct mod_fail_load *mod_fail; unsigned int len, size, count_failed = 0; char *buf; + int ret; u32 live_mod_count, fkreads, fdecompress, fbecoming, floads; unsigned long total_size, text_size, ikread_bytes, ibecoming_bytes, idecompress_bytes, imod_bytes, total_virtual_lost; @@ -390,8 +391,9 @@ static ssize_t read_file_mod_stats(struct file *file, char __user *user_buf, out_unlock: mutex_unlock(&module_mutex); out: + ret = simple_read_from_buffer(user_buf, count, ppos, buf, len); kfree(buf); - return simple_read_from_buffer(user_buf, count, ppos, buf, len); + return ret; } #undef MAX_PREAMBLE #undef MAX_FAILED_MOD_PRINT diff --git a/kernel/panic.c b/kernel/panic.c index 886d2ebd0a0d..10effe40a3fa 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -684,6 +684,7 @@ void __warn(const char *file, int line, void *caller, unsigned taint, add_taint(taint, LOCKDEP_STILL_OK); } +#ifdef CONFIG_BUG #ifndef __WARN_FLAGS void warn_slowpath_fmt(const char *file, int line, unsigned taint, const char *fmt, ...) @@ -722,8 +723,6 @@ void __warn_printk(const char *fmt, ...) EXPORT_SYMBOL(__warn_printk); #endif -#ifdef CONFIG_BUG - /* Support resetting WARN*_ONCE state */ static int clear_warn_once_set(void *data, u64 val) diff --git a/kernel/params.c b/kernel/params.c index 6a7548979aa9..07d01f6ce9a2 100644 --- a/kernel/params.c +++ b/kernel/params.c @@ -847,7 +847,7 @@ static void __init param_sysfs_builtin(void) name_len = 0; } else { name_len = dot - kp->name + 1; - strlcpy(modname, kp->name, name_len); + strscpy(modname, kp->name, name_len); } kernel_add_sysfs_param(modname, kp, name_len); } diff --git a/kernel/pid.c b/kernel/pid.c index f93954a0384d..8bce3aebc949 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -656,8 +656,11 @@ void __init pid_idr_init(void) idr_init(&init_pid_ns.idr); - init_pid_ns.pid_cachep = KMEM_CACHE(pid, - SLAB_HWCACHE_ALIGN | SLAB_PANIC | SLAB_ACCOUNT); + init_pid_ns.pid_cachep = kmem_cache_create("pid", + struct_size((struct pid *)NULL, numbers, 1), + __alignof__(struct pid), + SLAB_HWCACHE_ALIGN | SLAB_PANIC | SLAB_ACCOUNT, + NULL); } static struct file *__pidfd_fget(struct task_struct *task, int fd) diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c index b43eee07b00c..70a929784a5d 100644 --- a/kernel/pid_namespace.c +++ b/kernel/pid_namespace.c @@ -48,7 +48,7 @@ static struct kmem_cache *create_pid_cachep(unsigned int level) return kc; snprintf(name, sizeof(name), "pid_%u", level + 1); - len = sizeof(struct pid) + level * sizeof(struct upid); + len = struct_size((struct pid *)NULL, numbers, level + 1); mutex_lock(&pid_caches_mutex); /* Name collision forces to do allocation under mutex. 
*/ if (!*pkc) diff --git a/kernel/pid_sysctl.h b/kernel/pid_sysctl.h index d67a4d45bb42..b26e027fc9cd 100644 --- a/kernel/pid_sysctl.h +++ b/kernel/pid_sysctl.h @@ -52,7 +52,6 @@ static inline void register_pid_ns_sysctl_table_vm(void) } #else static inline void initialize_memfd_noexec_scope(struct pid_namespace *ns) {} -static inline void set_memfd_noexec_scope(struct pid_namespace *ns) {} static inline void register_pid_ns_sysctl_table_vm(void) {} #endif diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index 30d1274f03f6..f62e89d0d906 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c @@ -11,6 +11,7 @@ #define pr_fmt(fmt) "PM: hibernation: " fmt +#include <linux/blkdev.h> #include <linux/export.h> #include <linux/suspend.h> #include <linux/reboot.h> @@ -64,7 +65,6 @@ enum { static int hibernation_mode = HIBERNATION_SHUTDOWN; bool freezer_test_done; -bool snapshot_test; static const struct platform_hibernation_ops *hibernation_ops; @@ -684,26 +684,22 @@ static void power_down(void) cpu_relax(); } -static int load_image_and_restore(void) +static int load_image_and_restore(bool snapshot_test) { int error; unsigned int flags; - fmode_t mode = FMODE_READ; - - if (snapshot_test) - mode |= FMODE_EXCL; pm_pr_dbg("Loading hibernation image.\n"); lock_device_hotplug(); error = create_basic_memory_bitmaps(); if (error) { - swsusp_close(mode); + swsusp_close(snapshot_test); goto Unlock; } error = swsusp_read(&flags); - swsusp_close(mode); + swsusp_close(snapshot_test); if (!error) error = hibernation_restore(flags & SF_PLATFORM_MODE); @@ -721,6 +717,7 @@ static int load_image_and_restore(void) */ int hibernate(void) { + bool snapshot_test = false; unsigned int sleep_flags; int error; @@ -748,9 +745,6 @@ int hibernate(void) if (error) goto Exit; - /* protected by system_transition_mutex */ - snapshot_test = false; - lock_device_hotplug(); /* Allocate memory management structures */ error = create_basic_memory_bitmaps(); @@ -792,9 +786,9 @@ int hibernate(void) unlock_device_hotplug(); if (snapshot_test) { pm_pr_dbg("Checking hibernation image\n"); - error = swsusp_check(); + error = swsusp_check(snapshot_test); if (!error) - error = load_image_and_restore(); + error = load_image_and_restore(snapshot_test); } thaw_processes(); @@ -910,52 +904,10 @@ unlock: } EXPORT_SYMBOL_GPL(hibernate_quiet_exec); -/** - * software_resume - Resume from a saved hibernation image. - * - * This routine is called as a late initcall, when all devices have been - * discovered and initialized already. - * - * The image reading code is called to see if there is a hibernation image - * available for reading. If that is the case, devices are quiesced and the - * contents of memory is restored from the saved image. - * - * If this is successful, control reappears in the restored target kernel in - * hibernation_snapshot() which returns to hibernate(). Otherwise, the routine - * attempts to recover gracefully and make the kernel return to the normal mode - * of operation. - */ -static int software_resume(void) +static int __init find_resume_device(void) { - int error; - - /* - * If the user said "noresume".. bail out early. - */ - if (noresume || !hibernation_available()) - return 0; - - /* - * name_to_dev_t() below takes a sysfs buffer mutex when sysfs - * is configured into the kernel. 
Since the regular hibernate - * trigger path is via sysfs which takes a buffer mutex before - * calling hibernate functions (which take system_transition_mutex) - * this can cause lockdep to complain about a possible ABBA deadlock - * which cannot happen since we're in the boot code here and - * sysfs can't be invoked yet. Therefore, we use a subclass - * here to avoid lockdep complaining. - */ - mutex_lock_nested(&system_transition_mutex, SINGLE_DEPTH_NESTING); - - snapshot_test = false; - - if (swsusp_resume_device) - goto Check_image; - - if (!strlen(resume_file)) { - error = -ENOENT; - goto Unlock; - } + if (!strlen(resume_file)) + return -ENOENT; pm_pr_dbg("Checking hibernation image partition %s\n", resume_file); @@ -966,40 +918,41 @@ static int software_resume(void) } /* Check if the device is there */ - swsusp_resume_device = name_to_dev_t(resume_file); - if (!swsusp_resume_device) { - /* - * Some device discovery might still be in progress; we need - * to wait for this to finish. - */ - wait_for_device_probe(); - - if (resume_wait) { - while ((swsusp_resume_device = name_to_dev_t(resume_file)) == 0) - msleep(10); - async_synchronize_full(); - } + if (!early_lookup_bdev(resume_file, &swsusp_resume_device)) + return 0; - swsusp_resume_device = name_to_dev_t(resume_file); - if (!swsusp_resume_device) { - error = -ENODEV; - goto Unlock; - } + /* + * Some device discovery might still be in progress; we need to wait for + * this to finish. + */ + wait_for_device_probe(); + if (resume_wait) { + while (early_lookup_bdev(resume_file, &swsusp_resume_device)) + msleep(10); + async_synchronize_full(); } - Check_image: + return early_lookup_bdev(resume_file, &swsusp_resume_device); +} + +static int software_resume(void) +{ + int error; + pm_pr_dbg("Hibernation image partition %d:%d present\n", MAJOR(swsusp_resume_device), MINOR(swsusp_resume_device)); pm_pr_dbg("Looking for hibernation image.\n"); - error = swsusp_check(); + + mutex_lock(&system_transition_mutex); + error = swsusp_check(false); if (error) goto Unlock; /* The snapshot device should not be opened while we're running */ if (!hibernate_acquire()) { error = -EBUSY; - swsusp_close(FMODE_READ | FMODE_EXCL); + swsusp_close(false); goto Unlock; } @@ -1020,7 +973,7 @@ static int software_resume(void) goto Close_Finish; } - error = load_image_and_restore(); + error = load_image_and_restore(false); thaw_processes(); Finish: pm_notifier_call_chain(PM_POST_RESTORE); @@ -1034,11 +987,43 @@ static int software_resume(void) pm_pr_dbg("Hibernation image not present or could not be loaded.\n"); return error; Close_Finish: - swsusp_close(FMODE_READ | FMODE_EXCL); + swsusp_close(false); goto Finish; } -late_initcall_sync(software_resume); +/** + * software_resume_initcall - Resume from a saved hibernation image. + * + * This routine is called as a late initcall, when all devices have been + * discovered and initialized already. + * + * The image reading code is called to see if there is a hibernation image + * available for reading. If that is the case, devices are quiesced and the + * contents of memory is restored from the saved image. + * + * If this is successful, control reappears in the restored target kernel in + * hibernation_snapshot() which returns to hibernate(). Otherwise, the routine + * attempts to recover gracefully and make the kernel return to the normal mode + * of operation. + */ +static int __init software_resume_initcall(void) +{ + /* + * If the user said "noresume".. bail out early. 
+ */ + if (noresume || !hibernation_available()) + return 0; + + if (!swsusp_resume_device) { + int error = find_resume_device(); + + if (error) + return error; + } + + return software_resume(); +} +late_initcall_sync(software_resume_initcall); static const char * const hibernation_modes[] = { @@ -1177,7 +1162,11 @@ static ssize_t resume_store(struct kobject *kobj, struct kobj_attribute *attr, unsigned int sleep_flags; int len = n; char *name; - dev_t res; + dev_t dev; + int error; + + if (!hibernation_available()) + return 0; if (len && buf[len-1] == '\n') len--; @@ -1185,13 +1174,29 @@ static ssize_t resume_store(struct kobject *kobj, struct kobj_attribute *attr, if (!name) return -ENOMEM; - res = name_to_dev_t(name); + error = lookup_bdev(name, &dev); + if (error) { + unsigned maj, min, offset; + char *p, dummy; + + if (sscanf(name, "%u:%u%c", &maj, &min, &dummy) == 2 || + sscanf(name, "%u:%u:%u:%c", &maj, &min, &offset, + &dummy) == 3) { + dev = MKDEV(maj, min); + if (maj != MAJOR(dev) || min != MINOR(dev)) + error = -EINVAL; + } else { + dev = new_decode_dev(simple_strtoul(name, &p, 16)); + if (*p) + error = -EINVAL; + } + } kfree(name); - if (!res) - return -EINVAL; + if (error) + return error; sleep_flags = lock_system_sleep(); - swsusp_resume_device = res; + swsusp_resume_device = dev; unlock_system_sleep(sleep_flags); pm_pr_dbg("Configured hibernation resume from disk to %u\n", diff --git a/kernel/power/main.c b/kernel/power/main.c index 3113ec2f1db4..f6425ae3e8b0 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c @@ -21,6 +21,33 @@ #include "power.h" #ifdef CONFIG_PM_SLEEP +/* + * The following functions are used by the suspend/hibernate code to temporarily + * change gfp_allowed_mask in order to avoid using I/O during memory allocations + * while devices are suspended. To avoid races with the suspend/hibernate code, + * they should always be called with system_transition_mutex held + * (gfp_allowed_mask also should only be modified with system_transition_mutex + * held, unless the suspend/hibernate code is guaranteed not to run in parallel + * with that modification). 
+ */ +static gfp_t saved_gfp_mask; + +void pm_restore_gfp_mask(void) +{ + WARN_ON(!mutex_is_locked(&system_transition_mutex)); + if (saved_gfp_mask) { + gfp_allowed_mask = saved_gfp_mask; + saved_gfp_mask = 0; + } +} + +void pm_restrict_gfp_mask(void) +{ + WARN_ON(!mutex_is_locked(&system_transition_mutex)); + WARN_ON(saved_gfp_mask); + saved_gfp_mask = gfp_allowed_mask; + gfp_allowed_mask &= ~(__GFP_IO | __GFP_FS); +} unsigned int lock_system_sleep(void) { @@ -556,6 +583,12 @@ power_attr_ro(pm_wakeup_irq); bool pm_debug_messages_on __read_mostly; +bool pm_debug_messages_should_print(void) +{ + return pm_debug_messages_on && pm_suspend_target_state != PM_SUSPEND_ON; +} +EXPORT_SYMBOL_GPL(pm_debug_messages_should_print); + static ssize_t pm_debug_messages_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { diff --git a/kernel/power/power.h b/kernel/power/power.h index b83c8d5e188d..46eb14dc50c3 100644 --- a/kernel/power/power.h +++ b/kernel/power/power.h @@ -26,9 +26,6 @@ extern void __init hibernate_image_size_init(void); /* Maximum size of architecture specific data in a hibernation header */ #define MAX_ARCH_HEADER_SIZE (sizeof(struct new_utsname) + 4) -extern int arch_hibernation_header_save(void *addr, unsigned int max_size); -extern int arch_hibernation_header_restore(void *addr); - static inline int init_header_complete(struct swsusp_info *info) { return arch_hibernation_header_save(info, MAX_ARCH_HEADER_SIZE); @@ -41,8 +38,6 @@ static inline const char *check_image_kernel(struct swsusp_info *info) } #endif /* CONFIG_ARCH_HIBERNATION_HEADER */ -extern int hibernate_resume_nonboot_cpu_disable(void); - /* * Keep some memory free so that I/O operations can succeed without paging * [Might this be more than 4 MB?] @@ -59,7 +54,6 @@ asmlinkage int swsusp_save(void); /* kernel/power/hibernate.c */ extern bool freezer_test_done; -extern bool snapshot_test; extern int hibernation_snapshot(int platform_mode); extern int hibernation_restore(int platform_mode); @@ -174,11 +168,11 @@ extern int swsusp_swap_in_use(void); #define SF_HW_SIG 8 /* kernel/power/hibernate.c */ -extern int swsusp_check(void); +int swsusp_check(bool snapshot_test); extern void swsusp_free(void); extern int swsusp_read(unsigned int *flags_p); extern int swsusp_write(unsigned int flags); -extern void swsusp_close(fmode_t); +void swsusp_close(bool snapshot_test); #ifdef CONFIG_SUSPEND extern int swsusp_unmark(void); #endif @@ -216,6 +210,11 @@ static inline void suspend_test_finish(const char *label) {} /* kernel/power/main.c */ extern int pm_notifier_call_chain_robust(unsigned long val_up, unsigned long val_down); extern int pm_notifier_call_chain(unsigned long val); +void pm_restrict_gfp_mask(void); +void pm_restore_gfp_mask(void); +#else +static inline void pm_restrict_gfp_mask(void) {} +static inline void pm_restore_gfp_mask(void) {} #endif #ifdef CONFIG_HIGHMEM diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index cd8b7b35f1e8..0415d5ecb977 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -398,7 +398,7 @@ struct mem_zone_bm_rtree { unsigned int blocks; /* Number of Bitmap Blocks */ }; -/* strcut bm_position is used for browsing memory bitmaps */ +/* struct bm_position is used for browsing memory bitmaps */ struct bm_position { struct mem_zone_bm_rtree *zone; @@ -1228,6 +1228,58 @@ unsigned int snapshot_additional_pages(struct zone *zone) return 2 * rtree; } +/* + * Touch the watchdog for every WD_PAGE_COUNT pages. 
+ */ +#define WD_PAGE_COUNT (128*1024) + +static void mark_free_pages(struct zone *zone) +{ + unsigned long pfn, max_zone_pfn, page_count = WD_PAGE_COUNT; + unsigned long flags; + unsigned int order, t; + struct page *page; + + if (zone_is_empty(zone)) + return; + + spin_lock_irqsave(&zone->lock, flags); + + max_zone_pfn = zone_end_pfn(zone); + for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) + if (pfn_valid(pfn)) { + page = pfn_to_page(pfn); + + if (!--page_count) { + touch_nmi_watchdog(); + page_count = WD_PAGE_COUNT; + } + + if (page_zone(page) != zone) + continue; + + if (!swsusp_page_is_forbidden(page)) + swsusp_unset_page_free(page); + } + + for_each_migratetype_order(order, t) { + list_for_each_entry(page, + &zone->free_area[order].free_list[t], buddy_list) { + unsigned long i; + + pfn = page_to_pfn(page); + for (i = 0; i < (1UL << order); i++) { + if (!--page_count) { + touch_nmi_watchdog(); + page_count = WD_PAGE_COUNT; + } + swsusp_set_page_free(pfn_to_page(pfn + i)); + } + } + } + spin_unlock_irqrestore(&zone->lock, flags); +} + #ifdef CONFIG_HIGHMEM /** * count_free_highmem_pages - Compute the total number of free highmem pages. diff --git a/kernel/power/swap.c b/kernel/power/swap.c index 92e41ed292ad..f6ebcd00c410 100644 --- a/kernel/power/swap.c +++ b/kernel/power/swap.c @@ -356,14 +356,14 @@ static int swsusp_swap_check(void) return res; root_swap = res; - hib_resume_bdev = blkdev_get_by_dev(swsusp_resume_device, FMODE_WRITE, - NULL); + hib_resume_bdev = blkdev_get_by_dev(swsusp_resume_device, + BLK_OPEN_WRITE, NULL, NULL); if (IS_ERR(hib_resume_bdev)) return PTR_ERR(hib_resume_bdev); res = set_blocksize(hib_resume_bdev, PAGE_SIZE); if (res < 0) - blkdev_put(hib_resume_bdev, FMODE_WRITE); + blkdev_put(hib_resume_bdev, NULL); return res; } @@ -443,7 +443,7 @@ static int get_swap_writer(struct swap_map_handle *handle) err_rel: release_swap_writer(handle); err_close: - swsusp_close(FMODE_WRITE); + swsusp_close(false); return ret; } @@ -508,7 +508,7 @@ static int swap_writer_finish(struct swap_map_handle *handle, if (error) free_all_swap_pages(root_swap); release_swap_writer(handle); - swsusp_close(FMODE_WRITE); + swsusp_close(false); return error; } @@ -1510,21 +1510,19 @@ end: return error; } +static void *swsusp_holder; + /** * swsusp_check - Check for swsusp signature in the resume device */ -int swsusp_check(void) +int swsusp_check(bool snapshot_test) { + void *holder = snapshot_test ? &swsusp_holder : NULL; int error; - void *holder; - fmode_t mode = FMODE_READ; - if (snapshot_test) - mode |= FMODE_EXCL; - - hib_resume_bdev = blkdev_get_by_dev(swsusp_resume_device, - mode, &holder); + hib_resume_bdev = blkdev_get_by_dev(swsusp_resume_device, BLK_OPEN_READ, + holder, NULL); if (!IS_ERR(hib_resume_bdev)) { set_blocksize(hib_resume_bdev, PAGE_SIZE); clear_page(swsusp_header); @@ -1551,7 +1549,7 @@ int swsusp_check(void) put: if (error) - blkdev_put(hib_resume_bdev, mode); + blkdev_put(hib_resume_bdev, holder); else pr_debug("Image signature found, resuming\n"); } else { @@ -1568,14 +1566,14 @@ put: * swsusp_close - close swap device. */ -void swsusp_close(fmode_t mode) +void swsusp_close(bool snapshot_test) { if (IS_ERR(hib_resume_bdev)) { pr_debug("Image device not initialised\n"); return; } - blkdev_put(hib_resume_bdev, mode); + blkdev_put(hib_resume_bdev, snapshot_test ? 
&swsusp_holder : NULL); } /** diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 6a333adce3b3..357a4d18f638 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -528,7 +528,7 @@ static u64 latched_seq_read_nolock(struct latched_seq *ls) seq = raw_read_seqcount_latch(&ls->latch); idx = seq & 0x1; val = ls->val[idx]; - } while (read_seqcount_latch_retry(&ls->latch, seq)); + } while (raw_read_seqcount_latch_retry(&ls->latch, seq)); return val; } diff --git a/kernel/rcu/Kconfig b/kernel/rcu/Kconfig index 9071182b1284..bdd7eadb33d8 100644 --- a/kernel/rcu/Kconfig +++ b/kernel/rcu/Kconfig @@ -314,4 +314,22 @@ config RCU_LAZY To save power, batch RCU callbacks and flush after delay, memory pressure, or callback list growing too big. +config RCU_DOUBLE_CHECK_CB_TIME + bool "RCU callback-batch backup time check" + depends on RCU_EXPERT + default n + help + Use this option to provide more precise enforcement of the + rcutree.rcu_resched_ns module parameter in situations where + a single RCU callback might run for hundreds of microseconds, + thus defeating the 32-callback batching used to amortize the + cost of the fine-grained but expensive local_clock() function. + + This option rounds rcutree.rcu_resched_ns up to the next + jiffy, and overrides the 32-callback batching if this limit + is exceeded. + + Say Y here if you need tighter callback-limit enforcement. + Say N here if you are unsure. + endmenu # "RCU Subsystem" diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h index 4a1b9622598b..98c1544cf572 100644 --- a/kernel/rcu/rcu.h +++ b/kernel/rcu/rcu.h @@ -642,4 +642,10 @@ void show_rcu_tasks_trace_gp_kthread(void); static inline void show_rcu_tasks_trace_gp_kthread(void) {} #endif +#ifdef CONFIG_TINY_RCU +static inline bool rcu_cpu_beenfullyonline(int cpu) { return true; } +#else +bool rcu_cpu_beenfullyonline(int cpu); +#endif + #endif /* __LINUX_RCU_H */ diff --git a/kernel/rcu/rcuscale.c b/kernel/rcu/rcuscale.c index e82ec9f9a5d8..d1221731c7cf 100644 --- a/kernel/rcu/rcuscale.c +++ b/kernel/rcu/rcuscale.c @@ -522,89 +522,6 @@ rcu_scale_print_module_parms(struct rcu_scale_ops *cur_ops, const char *tag) scale_type, tag, nrealreaders, nrealwriters, verbose, shutdown); } -static void -rcu_scale_cleanup(void) -{ - int i; - int j; - int ngps = 0; - u64 *wdp; - u64 *wdpp; - - /* - * Would like warning at start, but everything is expedited - * during the mid-boot phase, so have to wait till the end. 
- */ - if (rcu_gp_is_expedited() && !rcu_gp_is_normal() && !gp_exp) - SCALEOUT_ERRSTRING("All grace periods expedited, no normal ones to measure!"); - if (rcu_gp_is_normal() && gp_exp) - SCALEOUT_ERRSTRING("All grace periods normal, no expedited ones to measure!"); - if (gp_exp && gp_async) - SCALEOUT_ERRSTRING("No expedited async GPs, so went with async!"); - - if (torture_cleanup_begin()) - return; - if (!cur_ops) { - torture_cleanup_end(); - return; - } - - if (reader_tasks) { - for (i = 0; i < nrealreaders; i++) - torture_stop_kthread(rcu_scale_reader, - reader_tasks[i]); - kfree(reader_tasks); - } - - if (writer_tasks) { - for (i = 0; i < nrealwriters; i++) { - torture_stop_kthread(rcu_scale_writer, - writer_tasks[i]); - if (!writer_n_durations) - continue; - j = writer_n_durations[i]; - pr_alert("%s%s writer %d gps: %d\n", - scale_type, SCALE_FLAG, i, j); - ngps += j; - } - pr_alert("%s%s start: %llu end: %llu duration: %llu gps: %d batches: %ld\n", - scale_type, SCALE_FLAG, - t_rcu_scale_writer_started, t_rcu_scale_writer_finished, - t_rcu_scale_writer_finished - - t_rcu_scale_writer_started, - ngps, - rcuscale_seq_diff(b_rcu_gp_test_finished, - b_rcu_gp_test_started)); - for (i = 0; i < nrealwriters; i++) { - if (!writer_durations) - break; - if (!writer_n_durations) - continue; - wdpp = writer_durations[i]; - if (!wdpp) - continue; - for (j = 0; j < writer_n_durations[i]; j++) { - wdp = &wdpp[j]; - pr_alert("%s%s %4d writer-duration: %5d %llu\n", - scale_type, SCALE_FLAG, - i, j, *wdp); - if (j % 100 == 0) - schedule_timeout_uninterruptible(1); - } - kfree(writer_durations[i]); - } - kfree(writer_tasks); - kfree(writer_durations); - kfree(writer_n_durations); - } - - /* Do torture-type-specific cleanup operations. */ - if (cur_ops->cleanup != NULL) - cur_ops->cleanup(); - - torture_cleanup_end(); -} - /* * Return the number if non-negative. If -1, the number of CPUs. * If less than -1, that much less than the number of CPUs, but @@ -625,20 +542,6 @@ static int compute_real(int n) } /* - * RCU scalability shutdown kthread. Just waits to be awakened, then shuts - * down system. - */ -static int -rcu_scale_shutdown(void *arg) -{ - wait_event_idle(shutdown_wq, atomic_read(&n_rcu_scale_writer_finished) >= nrealwriters); - smp_mb(); /* Wake before output. */ - rcu_scale_cleanup(); - kernel_power_off(); - return -EINVAL; -} - -/* * kfree_rcu() scalability tests: Start a kfree_rcu() loop on all CPUs for number * of iterations and measure total time and number of GP for all iterations to complete. */ @@ -874,6 +777,108 @@ unwind: return firsterr; } +static void +rcu_scale_cleanup(void) +{ + int i; + int j; + int ngps = 0; + u64 *wdp; + u64 *wdpp; + + /* + * Would like warning at start, but everything is expedited + * during the mid-boot phase, so have to wait till the end. 
+ */ + if (rcu_gp_is_expedited() && !rcu_gp_is_normal() && !gp_exp) + SCALEOUT_ERRSTRING("All grace periods expedited, no normal ones to measure!"); + if (rcu_gp_is_normal() && gp_exp) + SCALEOUT_ERRSTRING("All grace periods normal, no expedited ones to measure!"); + if (gp_exp && gp_async) + SCALEOUT_ERRSTRING("No expedited async GPs, so went with async!"); + + if (kfree_rcu_test) { + kfree_scale_cleanup(); + return; + } + + if (torture_cleanup_begin()) + return; + if (!cur_ops) { + torture_cleanup_end(); + return; + } + + if (reader_tasks) { + for (i = 0; i < nrealreaders; i++) + torture_stop_kthread(rcu_scale_reader, + reader_tasks[i]); + kfree(reader_tasks); + } + + if (writer_tasks) { + for (i = 0; i < nrealwriters; i++) { + torture_stop_kthread(rcu_scale_writer, + writer_tasks[i]); + if (!writer_n_durations) + continue; + j = writer_n_durations[i]; + pr_alert("%s%s writer %d gps: %d\n", + scale_type, SCALE_FLAG, i, j); + ngps += j; + } + pr_alert("%s%s start: %llu end: %llu duration: %llu gps: %d batches: %ld\n", + scale_type, SCALE_FLAG, + t_rcu_scale_writer_started, t_rcu_scale_writer_finished, + t_rcu_scale_writer_finished - + t_rcu_scale_writer_started, + ngps, + rcuscale_seq_diff(b_rcu_gp_test_finished, + b_rcu_gp_test_started)); + for (i = 0; i < nrealwriters; i++) { + if (!writer_durations) + break; + if (!writer_n_durations) + continue; + wdpp = writer_durations[i]; + if (!wdpp) + continue; + for (j = 0; j < writer_n_durations[i]; j++) { + wdp = &wdpp[j]; + pr_alert("%s%s %4d writer-duration: %5d %llu\n", + scale_type, SCALE_FLAG, + i, j, *wdp); + if (j % 100 == 0) + schedule_timeout_uninterruptible(1); + } + kfree(writer_durations[i]); + } + kfree(writer_tasks); + kfree(writer_durations); + kfree(writer_n_durations); + } + + /* Do torture-type-specific cleanup operations. */ + if (cur_ops->cleanup != NULL) + cur_ops->cleanup(); + + torture_cleanup_end(); +} + +/* + * RCU scalability shutdown kthread. Just waits to be awakened, then shuts + * down system. + */ +static int +rcu_scale_shutdown(void *arg) +{ + wait_event_idle(shutdown_wq, atomic_read(&n_rcu_scale_writer_finished) >= nrealwriters); + smp_mb(); /* Wake before output. */ + rcu_scale_cleanup(); + kernel_power_off(); + return -EINVAL; +} + static int __init rcu_scale_init(void) { diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h index 5f4fc8184dd0..b770add3f843 100644 --- a/kernel/rcu/tasks.h +++ b/kernel/rcu/tasks.h @@ -241,7 +241,6 @@ static void cblist_init_generic(struct rcu_tasks *rtp) if (rcu_task_enqueue_lim < 0) { rcu_task_enqueue_lim = 1; rcu_task_cb_adjust = true; - pr_info("%s: Setting adjustable number of callback queues.\n", __func__); } else if (rcu_task_enqueue_lim == 0) { rcu_task_enqueue_lim = 1; } @@ -272,7 +271,9 @@ static void cblist_init_generic(struct rcu_tasks *rtp) raw_spin_unlock_rcu_node(rtpcp); // irqs remain disabled. } raw_spin_unlock_irqrestore(&rtp->cbs_gbl_lock, flags); - pr_info("%s: Setting shift to %d and lim to %d.\n", __func__, data_race(rtp->percpu_enqueue_shift), data_race(rtp->percpu_enqueue_lim)); + + pr_info("%s: Setting shift to %d and lim to %d rcu_task_cb_adjust=%d.\n", rtp->name, + data_race(rtp->percpu_enqueue_shift), data_race(rtp->percpu_enqueue_lim), rcu_task_cb_adjust); } // IRQ-work handler that does deferred wakeup for call_rcu_tasks_generic(). 
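[Editor's sketch] The next hunk teaches rcu_tasks_invoke_cbs() to fall back to an unbound workqueue whenever the target CPU has not yet been fully onlined, using the rcu_cpu_beenfullyonline() helper this series adds to kernel/rcu/rcu.h. A minimal sketch of that fallback pattern follows; the wrapper name is hypothetical and only illustrates the idiom, it is not part of the patch:

	#include <linux/workqueue.h>

	/* Declared in kernel/rcu/rcu.h by this series. */
	bool rcu_cpu_beenfullyonline(int cpu);

	/*
	 * Queue @work on @cpu only if that CPU has been fully brought
	 * online; otherwise let the workqueue core pick any CPU so the
	 * work item is not stranded behind an incomplete hotplug bring-up.
	 */
	static void queue_work_on_cpu_if_online(int cpu, struct work_struct *work)
	{
		int target = rcu_cpu_beenfullyonline(cpu) ? cpu : WORK_CPU_UNBOUND;

		queue_work_on(target, system_wq, work);
	}

Targeting WORK_CPU_UNBOUND trades per-CPU locality for guaranteed progress during early boot, when a CPU can already appear below percpu_dequeue_lim while still being unable to run bound work items.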
@@ -463,6 +464,7 @@ static void rcu_tasks_invoke_cbs(struct rcu_tasks *rtp, struct rcu_tasks_percpu { int cpu; int cpunext; + int cpuwq; unsigned long flags; int len; struct rcu_head *rhp; @@ -473,11 +475,13 @@ static void rcu_tasks_invoke_cbs(struct rcu_tasks *rtp, struct rcu_tasks_percpu cpunext = cpu * 2 + 1; if (cpunext < smp_load_acquire(&rtp->percpu_dequeue_lim)) { rtpcp_next = per_cpu_ptr(rtp->rtpcpu, cpunext); - queue_work_on(cpunext, system_wq, &rtpcp_next->rtp_work); + cpuwq = rcu_cpu_beenfullyonline(cpunext) ? cpunext : WORK_CPU_UNBOUND; + queue_work_on(cpuwq, system_wq, &rtpcp_next->rtp_work); cpunext++; if (cpunext < smp_load_acquire(&rtp->percpu_dequeue_lim)) { rtpcp_next = per_cpu_ptr(rtp->rtpcpu, cpunext); - queue_work_on(cpunext, system_wq, &rtpcp_next->rtp_work); + cpuwq = rcu_cpu_beenfullyonline(cpunext) ? cpunext : WORK_CPU_UNBOUND; + queue_work_on(cpuwq, system_wq, &rtpcp_next->rtp_work); } } diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index f52ff7241041..1449cb69a0e0 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -2046,19 +2046,35 @@ rcu_check_quiescent_state(struct rcu_data *rdp) rcu_report_qs_rdp(rdp); } +/* Return true if callback-invocation time limit exceeded. */ +static bool rcu_do_batch_check_time(long count, long tlimit, + bool jlimit_check, unsigned long jlimit) +{ + // Invoke local_clock() only once per 32 consecutive callbacks. + return unlikely(tlimit) && + (!likely(count & 31) || + (IS_ENABLED(CONFIG_RCU_DOUBLE_CHECK_CB_TIME) && + jlimit_check && time_after(jiffies, jlimit))) && + local_clock() >= tlimit; +} + /* * Invoke any RCU callbacks that have made it to the end of their grace * period. Throttle as specified by rdp->blimit. */ static void rcu_do_batch(struct rcu_data *rdp) { + long bl; + long count = 0; int div; bool __maybe_unused empty; unsigned long flags; - struct rcu_head *rhp; + unsigned long jlimit; + bool jlimit_check = false; + long pending; struct rcu_cblist rcl = RCU_CBLIST_INITIALIZER(rcl); - long bl, count = 0; - long pending, tlimit = 0; + struct rcu_head *rhp; + long tlimit = 0; /* If no callbacks are ready, just return. */ if (!rcu_segcblist_ready_cbs(&rdp->cblist)) { @@ -2082,11 +2098,15 @@ static void rcu_do_batch(struct rcu_data *rdp) div = READ_ONCE(rcu_divisor); div = div < 0 ? 7 : div > sizeof(long) * 8 - 2 ? sizeof(long) * 8 - 2 : div; bl = max(rdp->blimit, pending >> div); - if (in_serving_softirq() && unlikely(bl > 100)) { + if ((in_serving_softirq() || rdp->rcu_cpu_kthread_status == RCU_KTHREAD_RUNNING) && + (IS_ENABLED(CONFIG_RCU_DOUBLE_CHECK_CB_TIME) || unlikely(bl > 100))) { + const long npj = NSEC_PER_SEC / HZ; long rrn = READ_ONCE(rcu_resched_ns); rrn = rrn < NSEC_PER_MSEC ? NSEC_PER_MSEC : rrn > NSEC_PER_SEC ? NSEC_PER_SEC : rrn; tlimit = local_clock() + rrn; + jlimit = jiffies + (rrn + npj + 1) / npj; + jlimit_check = true; } trace_rcu_batch_start(rcu_state.name, rcu_segcblist_n_cbs(&rdp->cblist), bl); @@ -2126,21 +2146,23 @@ static void rcu_do_batch(struct rcu_data *rdp) * Make sure we don't spend too much time here and deprive other * softirq vectors of CPU cycles. */ - if (unlikely(tlimit)) { - /* only call local_clock() every 32 callbacks */ - if (likely((count & 31) || local_clock() < tlimit)) - continue; - /* Exceeded the time limit, so leave. */ + if (rcu_do_batch_check_time(count, tlimit, jlimit_check, jlimit)) break; - } } else { - // In rcuoc context, so no worries about depriving - // other softirq vectors of CPU cycles. 
+ // In rcuc/rcuoc context, so no worries about + // depriving other softirq vectors of CPU cycles. local_bh_enable(); lockdep_assert_irqs_enabled(); cond_resched_tasks_rcu_qs(); lockdep_assert_irqs_enabled(); local_bh_disable(); + // But rcuc kthreads can delay quiescent-state + // reporting, so check time limits for them. + if (rdp->rcu_cpu_kthread_status == RCU_KTHREAD_RUNNING && + rcu_do_batch_check_time(count, tlimit, jlimit_check, jlimit)) { + rdp->rcu_cpu_has_work = 1; + break; + } } } @@ -2459,12 +2481,12 @@ static void rcu_cpu_kthread(unsigned int cpu) *statusp = RCU_KTHREAD_RUNNING; local_irq_disable(); work = *workp; - *workp = 0; + WRITE_ONCE(*workp, 0); local_irq_enable(); if (work) rcu_core(); local_bh_enable(); - if (*workp == 0) { + if (!READ_ONCE(*workp)) { trace_rcu_utilization(TPS("End CPU kthread@rcu_wait")); *statusp = RCU_KTHREAD_WAITING; return; @@ -2756,7 +2778,7 @@ EXPORT_SYMBOL_GPL(call_rcu); */ struct kvfree_rcu_bulk_data { struct list_head list; - unsigned long gp_snap; + struct rcu_gp_oldstate gp_snap; unsigned long nr_records; void *records[]; }; @@ -2773,6 +2795,7 @@ struct kvfree_rcu_bulk_data { * struct kfree_rcu_cpu_work - single batch of kfree_rcu() requests * @rcu_work: Let queue_rcu_work() invoke workqueue handler after grace period * @head_free: List of kfree_rcu() objects waiting for a grace period + * @head_free_gp_snap: Grace-period snapshot to check for attempted premature frees. * @bulk_head_free: Bulk-List of kvfree_rcu() objects waiting for a grace period * @krcp: Pointer to @kfree_rcu_cpu structure */ @@ -2780,6 +2803,7 @@ struct kvfree_rcu_bulk_data { struct kfree_rcu_cpu_work { struct rcu_work rcu_work; struct rcu_head *head_free; + struct rcu_gp_oldstate head_free_gp_snap; struct list_head bulk_head_free[FREE_N_CHANNELS]; struct kfree_rcu_cpu *krcp; }; @@ -2900,6 +2924,9 @@ drain_page_cache(struct kfree_rcu_cpu *krcp) struct llist_node *page_list, *pos, *n; int freed = 0; + if (!rcu_min_cached_objs) + return 0; + raw_spin_lock_irqsave(&krcp->lock, flags); page_list = llist_del_all(&krcp->bkvcache); WRITE_ONCE(krcp->nr_bkv_objs, 0); @@ -2920,24 +2947,25 @@ kvfree_rcu_bulk(struct kfree_rcu_cpu *krcp, unsigned long flags; int i; - debug_rcu_bhead_unqueue(bnode); - - rcu_lock_acquire(&rcu_callback_map); - if (idx == 0) { // kmalloc() / kfree(). - trace_rcu_invoke_kfree_bulk_callback( - rcu_state.name, bnode->nr_records, - bnode->records); - - kfree_bulk(bnode->nr_records, bnode->records); - } else { // vmalloc() / vfree(). - for (i = 0; i < bnode->nr_records; i++) { - trace_rcu_invoke_kvfree_callback( - rcu_state.name, bnode->records[i], 0); - - vfree(bnode->records[i]); + if (!WARN_ON_ONCE(!poll_state_synchronize_rcu_full(&bnode->gp_snap))) { + debug_rcu_bhead_unqueue(bnode); + rcu_lock_acquire(&rcu_callback_map); + if (idx == 0) { // kmalloc() / kfree(). + trace_rcu_invoke_kfree_bulk_callback( + rcu_state.name, bnode->nr_records, + bnode->records); + + kfree_bulk(bnode->nr_records, bnode->records); + } else { // vmalloc() / vfree(). 
+ for (i = 0; i < bnode->nr_records; i++) { + trace_rcu_invoke_kvfree_callback( + rcu_state.name, bnode->records[i], 0); + + vfree(bnode->records[i]); + } } + rcu_lock_release(&rcu_callback_map); } - rcu_lock_release(&rcu_callback_map); raw_spin_lock_irqsave(&krcp->lock, flags); if (put_cached_bnode(krcp, bnode)) @@ -2984,6 +3012,7 @@ static void kfree_rcu_work(struct work_struct *work) struct rcu_head *head; struct kfree_rcu_cpu *krcp; struct kfree_rcu_cpu_work *krwp; + struct rcu_gp_oldstate head_gp_snap; int i; krwp = container_of(to_rcu_work(work), @@ -2998,6 +3027,7 @@ static void kfree_rcu_work(struct work_struct *work) // Channel 3. head = krwp->head_free; krwp->head_free = NULL; + head_gp_snap = krwp->head_free_gp_snap; raw_spin_unlock_irqrestore(&krcp->lock, flags); // Handle the first two channels. @@ -3014,7 +3044,8 @@ static void kfree_rcu_work(struct work_struct *work) * queued on a linked list through their rcu_head structures. * This list is named "Channel 3". */ - kvfree_rcu_list(head); + if (head && !WARN_ON_ONCE(!poll_state_synchronize_rcu_full(&head_gp_snap))) + kvfree_rcu_list(head); } static bool @@ -3081,7 +3112,7 @@ kvfree_rcu_drain_ready(struct kfree_rcu_cpu *krcp) INIT_LIST_HEAD(&bulk_ready[i]); list_for_each_entry_safe_reverse(bnode, n, &krcp->bulk_head[i], list) { - if (!poll_state_synchronize_rcu(bnode->gp_snap)) + if (!poll_state_synchronize_rcu_full(&bnode->gp_snap)) break; atomic_sub(bnode->nr_records, &krcp->bulk_count[i]); @@ -3146,6 +3177,7 @@ static void kfree_rcu_monitor(struct work_struct *work) // objects queued on the linked list. if (!krwp->head_free) { krwp->head_free = krcp->head; + get_state_synchronize_rcu_full(&krwp->head_free_gp_snap); atomic_set(&krcp->head_count, 0); WRITE_ONCE(krcp->head, NULL); } @@ -3194,7 +3226,7 @@ static void fill_page_cache_func(struct work_struct *work) nr_pages = atomic_read(&krcp->backoff_page_cache_fill) ? 1 : rcu_min_cached_objs; - for (i = 0; i < nr_pages; i++) { + for (i = READ_ONCE(krcp->nr_bkv_objs); i < nr_pages; i++) { bnode = (struct kvfree_rcu_bulk_data *) __get_free_page(GFP_KERNEL | __GFP_NORETRY | __GFP_NOMEMALLOC | __GFP_NOWARN); @@ -3218,6 +3250,10 @@ static void fill_page_cache_func(struct work_struct *work) static void run_page_cache_worker(struct kfree_rcu_cpu *krcp) { + // If cache disabled, bail out. + if (!rcu_min_cached_objs) + return; + if (rcu_scheduler_active == RCU_SCHEDULER_RUNNING && !atomic_xchg(&krcp->work_in_progress, 1)) { if (atomic_read(&krcp->backoff_page_cache_fill)) { @@ -3272,7 +3308,7 @@ add_ptr_to_bulk_krc_lock(struct kfree_rcu_cpu **krcp, // scenarios. bnode = (struct kvfree_rcu_bulk_data *) __get_free_page(GFP_KERNEL | __GFP_NORETRY | __GFP_NOMEMALLOC | __GFP_NOWARN); - *krcp = krc_this_cpu_lock(flags); + raw_spin_lock_irqsave(&(*krcp)->lock, *flags); } if (!bnode) @@ -3285,7 +3321,7 @@ add_ptr_to_bulk_krc_lock(struct kfree_rcu_cpu **krcp, // Finally insert and update the GP for this page. bnode->records[bnode->nr_records++] = ptr; - bnode->gp_snap = get_state_synchronize_rcu(); + get_state_synchronize_rcu_full(&bnode->gp_snap); atomic_inc(&(*krcp)->bulk_count[idx]); return true; @@ -4283,7 +4319,6 @@ int rcutree_prepare_cpu(unsigned int cpu) */ rnp = rdp->mynode; raw_spin_lock_rcu_node(rnp); /* irqs already disabled. */ - rdp->beenonline = true; /* We have now been online. 
*/ rdp->gp_seq = READ_ONCE(rnp->gp_seq); rdp->gp_seq_needed = rdp->gp_seq; rdp->cpu_no_qs.b.norm = true; @@ -4311,6 +4346,16 @@ static void rcutree_affinity_setting(unsigned int cpu, int outgoing) } /* + * Has the specified (known valid) CPU ever been fully online? + */ +bool rcu_cpu_beenfullyonline(int cpu) +{ + struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu); + + return smp_load_acquire(&rdp->beenonline); +} + +/* * Near the end of the CPU-online process. Pretty much all services * enabled, and the CPU is now very much alive. */ @@ -4368,15 +4413,16 @@ int rcutree_offline_cpu(unsigned int cpu) * Note that this function is special in that it is invoked directly * from the incoming CPU rather than from the cpuhp_step mechanism. * This is because this function must be invoked at a precise location. + * This incoming CPU must not have enabled interrupts yet. */ void rcu_cpu_starting(unsigned int cpu) { - unsigned long flags; unsigned long mask; struct rcu_data *rdp; struct rcu_node *rnp; bool newcpu; + lockdep_assert_irqs_disabled(); rdp = per_cpu_ptr(&rcu_data, cpu); if (rdp->cpu_started) return; @@ -4384,7 +4430,6 @@ void rcu_cpu_starting(unsigned int cpu) rnp = rdp->mynode; mask = rdp->grpmask; - local_irq_save(flags); arch_spin_lock(&rcu_state.ofl_lock); rcu_dynticks_eqs_online(); raw_spin_lock(&rcu_state.barrier_lock); @@ -4403,17 +4448,17 @@ void rcu_cpu_starting(unsigned int cpu) /* An incoming CPU should never be blocking a grace period. */ if (WARN_ON_ONCE(rnp->qsmask & mask)) { /* RCU waiting on incoming CPU? */ /* rcu_report_qs_rnp() *really* wants some flags to restore */ - unsigned long flags2; + unsigned long flags; - local_irq_save(flags2); + local_irq_save(flags); rcu_disable_urgency_upon_qs(rdp); /* Report QS -after- changing ->qsmaskinitnext! */ - rcu_report_qs_rnp(mask, rnp, rnp->gp_seq, flags2); + rcu_report_qs_rnp(mask, rnp, rnp->gp_seq, flags); } else { raw_spin_unlock_rcu_node(rnp); } arch_spin_unlock(&rcu_state.ofl_lock); - local_irq_restore(flags); + smp_store_release(&rdp->beenonline, true); smp_mb(); /* Ensure RCU read-side usage follows above initialization. */ } diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h index 3b7abb58157d..8239b39d945b 100644 --- a/kernel/rcu/tree_exp.h +++ b/kernel/rcu/tree_exp.h @@ -643,7 +643,7 @@ static void synchronize_rcu_expedited_wait(void) "O."[!!cpu_online(cpu)], "o."[!!(rdp->grpmask & rnp->expmaskinit)], "N."[!!(rdp->grpmask & rnp->expmaskinitnext)], - "D."[!!(rdp->cpu_no_qs.b.exp)]); + "D."[!!data_race(rdp->cpu_no_qs.b.exp)]); } } pr_cont(" } %lu jiffies s: %lu root: %#lx/%c\n", diff --git a/kernel/rcu/tree_nocb.h b/kernel/rcu/tree_nocb.h index f2280616f9d5..43229d2b0c44 100644 --- a/kernel/rcu/tree_nocb.h +++ b/kernel/rcu/tree_nocb.h @@ -1319,13 +1319,22 @@ lazy_rcu_shrink_count(struct shrinker *shrink, struct shrink_control *sc) int cpu; unsigned long count = 0; + if (WARN_ON_ONCE(!cpumask_available(rcu_nocb_mask))) + return 0; + + /* Protect rcu_nocb_mask against concurrent (de-)offloading. */ + if (!mutex_trylock(&rcu_state.barrier_mutex)) + return 0; + /* Snapshot count of all CPUs */ - for_each_possible_cpu(cpu) { + for_each_cpu(cpu, rcu_nocb_mask) { struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu); count += READ_ONCE(rdp->lazy_len); } + mutex_unlock(&rcu_state.barrier_mutex); + return count ? 
count : SHRINK_EMPTY; } @@ -1336,15 +1345,45 @@ lazy_rcu_shrink_scan(struct shrinker *shrink, struct shrink_control *sc) unsigned long flags; unsigned long count = 0; + if (WARN_ON_ONCE(!cpumask_available(rcu_nocb_mask))) + return 0; + /* + * Protect against concurrent (de-)offloading. Otherwise nocb locking + * may be ignored or imbalanced. + */ + if (!mutex_trylock(&rcu_state.barrier_mutex)) { + /* + * But really don't insist if barrier_mutex is contended since we + * can't guarantee that it will never engage in a dependency + * chain involving memory allocation. The lock is seldom contended + * anyway. + */ + return 0; + } + /* Snapshot count of all CPUs */ - for_each_possible_cpu(cpu) { + for_each_cpu(cpu, rcu_nocb_mask) { struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu); - int _count = READ_ONCE(rdp->lazy_len); + int _count; + + if (WARN_ON_ONCE(!rcu_rdp_is_offloaded(rdp))) + continue; - if (_count == 0) + if (!READ_ONCE(rdp->lazy_len)) continue; + rcu_nocb_lock_irqsave(rdp, flags); - WRITE_ONCE(rdp->lazy_len, 0); + /* + * Recheck under the nocb lock. Since we are not holding the bypass + * lock we may still race with increments from the enqueuer but still + * we know for sure if there is at least one lazy callback. + */ + _count = READ_ONCE(rdp->lazy_len); + if (!_count) { + rcu_nocb_unlock_irqrestore(rdp, flags); + continue; + } + WARN_ON_ONCE(!rcu_nocb_flush_bypass(rdp, NULL, jiffies, false)); rcu_nocb_unlock_irqrestore(rdp, flags); wake_nocb_gp(rdp, false); sc->nr_to_scan -= _count; @@ -1352,6 +1391,9 @@ lazy_rcu_shrink_scan(struct shrinker *shrink, struct shrink_control *sc) if (sc->nr_to_scan <= 0) break; } + + mutex_unlock(&rcu_state.barrier_mutex); + return count ? count : SHRINK_STOP; } diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 7b0fe741a088..41021080ad25 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -257,6 +257,8 @@ static void rcu_preempt_ctxt_queue(struct rcu_node *rnp, struct rcu_data *rdp) * GP should not be able to end until we report, so there should be * no need to check for a subsequent expedited GP. (Though we are * still in a quiescent state in any case.) + * + * Interrupts are disabled, so ->cpu_no_qs.b.exp cannot change. 
*/ if (blkd_state & RCU_EXP_BLKD && rdp->cpu_no_qs.b.exp) rcu_report_exp_rdp(rdp); @@ -941,7 +943,7 @@ notrace void rcu_preempt_deferred_qs(struct task_struct *t) { struct rcu_data *rdp = this_cpu_ptr(&rcu_data); - if (rdp->cpu_no_qs.b.exp) + if (READ_ONCE(rdp->cpu_no_qs.b.exp)) rcu_report_exp_rdp(rdp); } diff --git a/kernel/sched/clock.c b/kernel/sched/clock.c index b5cc2b53464d..3c6193de9cde 100644 --- a/kernel/sched/clock.c +++ b/kernel/sched/clock.c @@ -266,7 +266,7 @@ static __always_inline u64 sched_clock_local(struct sched_clock_data *scd) s64 delta; again: - now = sched_clock(); + now = sched_clock_noinstr(); delta = now - scd->tick_raw; if (unlikely(delta < 0)) delta = 0; @@ -287,28 +287,35 @@ again: clock = wrap_max(clock, min_clock); clock = wrap_min(clock, max_clock); - if (!arch_try_cmpxchg64(&scd->clock, &old_clock, clock)) + if (!raw_try_cmpxchg64(&scd->clock, &old_clock, clock)) goto again; return clock; } -noinstr u64 local_clock(void) +noinstr u64 local_clock_noinstr(void) { u64 clock; if (static_branch_likely(&__sched_clock_stable)) - return sched_clock() + __sched_clock_offset; + return sched_clock_noinstr() + __sched_clock_offset; if (!static_branch_likely(&sched_clock_running)) - return sched_clock(); + return sched_clock_noinstr(); - preempt_disable_notrace(); clock = sched_clock_local(this_scd()); - preempt_enable_notrace(); return clock; } + +u64 local_clock(void) +{ + u64 now; + preempt_disable_notrace(); + now = local_clock_noinstr(); + preempt_enable_notrace(); + return now; +} EXPORT_SYMBOL_GPL(local_clock); static notrace u64 sched_clock_remote(struct sched_clock_data *scd) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 944c3ae39861..c52c2eba7c73 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2213,6 +2213,154 @@ void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags) rq_clock_skip_update(rq); } +static __always_inline +int __task_state_match(struct task_struct *p, unsigned int state) +{ + if (READ_ONCE(p->__state) & state) + return 1; + +#ifdef CONFIG_PREEMPT_RT + if (READ_ONCE(p->saved_state) & state) + return -1; +#endif + return 0; +} + +static __always_inline +int task_state_match(struct task_struct *p, unsigned int state) +{ +#ifdef CONFIG_PREEMPT_RT + int match; + + /* + * Serialize against current_save_and_set_rtlock_wait_state() and + * current_restore_rtlock_saved_state(). + */ + raw_spin_lock_irq(&p->pi_lock); + match = __task_state_match(p, state); + raw_spin_unlock_irq(&p->pi_lock); + + return match; +#else + return __task_state_match(p, state); +#endif +} + +/* + * wait_task_inactive - wait for a thread to unschedule. + * + * Wait for the thread to block in any of the states set in @match_state. + * If it changes, i.e. @p might have woken up, then return zero. When we + * succeed in waiting for @p to be off its CPU, we return a positive number + * (its total switch count). If a second call a short while later returns the + * same number, the caller can be sure that @p has remained unscheduled the + * whole time. + * + * The caller must ensure that the task *will* unschedule sometime soon, + * else this function might spin for a *long* time. This function can't + * be called with interrupts off, or it may introduce deadlock with + * smp_call_function() if an IPI is sent by the same process we are + * waiting to become inactive. 
+ */ +unsigned long wait_task_inactive(struct task_struct *p, unsigned int match_state) +{ + int running, queued, match; + struct rq_flags rf; + unsigned long ncsw; + struct rq *rq; + + for (;;) { + /* + * We do the initial early heuristics without holding + * any task-queue locks at all. We'll only try to get + * the runqueue lock when things look like they will + * work out! + */ + rq = task_rq(p); + + /* + * If the task is actively running on another CPU + * still, just relax and busy-wait without holding + * any locks. + * + * NOTE! Since we don't hold any locks, it's not + * even sure that "rq" stays as the right runqueue! + * But we don't care, since "task_on_cpu()" will + * return false if the runqueue has changed and p + * is actually now running somewhere else! + */ + while (task_on_cpu(rq, p)) { + if (!task_state_match(p, match_state)) + return 0; + cpu_relax(); + } + + /* + * Ok, time to look more closely! We need the rq + * lock now, to be *sure*. If we're wrong, we'll + * just go back and repeat. + */ + rq = task_rq_lock(p, &rf); + trace_sched_wait_task(p); + running = task_on_cpu(rq, p); + queued = task_on_rq_queued(p); + ncsw = 0; + if ((match = __task_state_match(p, match_state))) { + /* + * When matching on p->saved_state, consider this task + * still queued so it will wait. + */ + if (match < 0) + queued = 1; + ncsw = p->nvcsw | LONG_MIN; /* sets MSB */ + } + task_rq_unlock(rq, p, &rf); + + /* + * If it changed from the expected state, bail out now. + */ + if (unlikely(!ncsw)) + break; + + /* + * Was it really running after all now that we + * checked with the proper locks actually held? + * + * Oops. Go back and try again.. + */ + if (unlikely(running)) { + cpu_relax(); + continue; + } + + /* + * It's not enough that it's not actively running, + * it must be off the runqueue _entirely_, and not + * preempted! + * + * So if it was still runnable (but just not actively + * running right now), it's preempted, and we should + * yield - it could be a while. + */ + if (unlikely(queued)) { + ktime_t to = NSEC_PER_SEC / HZ; + + set_current_state(TASK_UNINTERRUPTIBLE); + schedule_hrtimeout(&to, HRTIMER_MODE_REL_HARD); + continue; + } + + /* + * Ahh, all good. It wasn't running, and it wasn't + * runnable, which means that it will never become + * running in the future either. We're all done! + */ + break; + } + + return ncsw; +} + #ifdef CONFIG_SMP static void @@ -2398,7 +2546,6 @@ static struct rq *__migrate_task(struct rq *rq, struct rq_flags *rf, if (!is_cpu_allowed(p, dest_cpu)) return rq; - update_rq_clock(rq); rq = move_queued_task(rq, rf, p, dest_cpu); return rq; @@ -2456,10 +2603,12 @@ static int migration_cpu_stop(void *data) goto out; } - if (task_on_rq_queued(p)) + if (task_on_rq_queued(p)) { + update_rq_clock(rq); rq = __migrate_task(rq, &rf, p, arg->dest_cpu); - else + } else { p->wake_cpu = arg->dest_cpu; + } /* * XXX __migrate_task() can fail, at which point we might end @@ -3341,114 +3490,6 @@ out: } #endif /* CONFIG_NUMA_BALANCING */ -/* - * wait_task_inactive - wait for a thread to unschedule. - * - * Wait for the thread to block in any of the states set in @match_state. - * If it changes, i.e. @p might have woken up, then return zero. When we - * succeed in waiting for @p to be off its CPU, we return a positive number - * (its total switch count). If a second call a short while later returns the - * same number, the caller can be sure that @p has remained unscheduled the - * whole time. 
- * - * The caller must ensure that the task *will* unschedule sometime soon, - * else this function might spin for a *long* time. This function can't - * be called with interrupts off, or it may introduce deadlock with - * smp_call_function() if an IPI is sent by the same process we are - * waiting to become inactive. - */ -unsigned long wait_task_inactive(struct task_struct *p, unsigned int match_state) -{ - int running, queued; - struct rq_flags rf; - unsigned long ncsw; - struct rq *rq; - - for (;;) { - /* - * We do the initial early heuristics without holding - * any task-queue locks at all. We'll only try to get - * the runqueue lock when things look like they will - * work out! - */ - rq = task_rq(p); - - /* - * If the task is actively running on another CPU - * still, just relax and busy-wait without holding - * any locks. - * - * NOTE! Since we don't hold any locks, it's not - * even sure that "rq" stays as the right runqueue! - * But we don't care, since "task_on_cpu()" will - * return false if the runqueue has changed and p - * is actually now running somewhere else! - */ - while (task_on_cpu(rq, p)) { - if (!(READ_ONCE(p->__state) & match_state)) - return 0; - cpu_relax(); - } - - /* - * Ok, time to look more closely! We need the rq - * lock now, to be *sure*. If we're wrong, we'll - * just go back and repeat. - */ - rq = task_rq_lock(p, &rf); - trace_sched_wait_task(p); - running = task_on_cpu(rq, p); - queued = task_on_rq_queued(p); - ncsw = 0; - if (READ_ONCE(p->__state) & match_state) - ncsw = p->nvcsw | LONG_MIN; /* sets MSB */ - task_rq_unlock(rq, p, &rf); - - /* - * If it changed from the expected state, bail out now. - */ - if (unlikely(!ncsw)) - break; - - /* - * Was it really running after all now that we - * checked with the proper locks actually held? - * - * Oops. Go back and try again.. - */ - if (unlikely(running)) { - cpu_relax(); - continue; - } - - /* - * It's not enough that it's not actively running, - * it must be off the runqueue _entirely_, and not - * preempted! - * - * So if it was still runnable (but just not actively - * running right now), it's preempted, and we should - * yield - it could be a while. - */ - if (unlikely(queued)) { - ktime_t to = NSEC_PER_SEC / HZ; - - set_current_state(TASK_UNINTERRUPTIBLE); - schedule_hrtimeout(&to, HRTIMER_MODE_REL_HARD); - continue; - } - - /* - * Ahh, all good. It wasn't running, and it wasn't - * runnable, which means that it will never become - * running in the future either. We're all done! - */ - break; - } - - return ncsw; -} - /*** * kick_process - kick a running thread to enter/exit the kernel * @p: the to-be-kicked thread @@ -4003,15 +4044,14 @@ static void ttwu_queue(struct task_struct *p, int cpu, int wake_flags) static __always_inline bool ttwu_state_match(struct task_struct *p, unsigned int state, int *success) { + int match; + if (IS_ENABLED(CONFIG_DEBUG_PREEMPT)) { WARN_ON_ONCE((state & TASK_RTLOCK_WAIT) && state != TASK_RTLOCK_WAIT); } - if (READ_ONCE(p->__state) & state) { - *success = 1; - return true; - } + *success = !!(match = __task_state_match(p, state)); #ifdef CONFIG_PREEMPT_RT /* @@ -4027,12 +4067,10 @@ bool ttwu_state_match(struct task_struct *p, unsigned int state, int *success) * p::saved_state to TASK_RUNNING so any further tests will * not result in false positives vs. 
@success */ - if (p->saved_state & state) { + if (match < 0) p->saved_state = TASK_RUNNING; - *success = 1; - } #endif - return false; + return match > 0; } /* @@ -5632,6 +5670,9 @@ void scheduler_tick(void) perf_event_task_tick(); + if (curr->flags & PF_WQ_WORKER) + wq_worker_tick(curr); + #ifdef CONFIG_SMP rq->idle_balance = idle_cpu(cpu); trigger_load_balance(rq); @@ -7590,6 +7631,7 @@ static int __sched_setscheduler(struct task_struct *p, int reset_on_fork; int queue_flags = DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK; struct rq *rq; + bool cpuset_locked = false; /* The pi code expects interrupts enabled */ BUG_ON(pi && in_interrupt()); @@ -7639,8 +7681,14 @@ recheck: return retval; } - if (pi) - cpuset_read_lock(); + /* + * SCHED_DEADLINE bandwidth accounting relies on stable cpusets + * information. + */ + if (dl_policy(policy) || dl_policy(p->policy)) { + cpuset_locked = true; + cpuset_lock(); + } /* * Make sure no PI-waiters arrive (or leave) while we are @@ -7716,8 +7764,8 @@ change: if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) { policy = oldpolicy = -1; task_rq_unlock(rq, p, &rf); - if (pi) - cpuset_read_unlock(); + if (cpuset_locked) + cpuset_unlock(); goto recheck; } @@ -7784,7 +7832,8 @@ change: task_rq_unlock(rq, p, &rf); if (pi) { - cpuset_read_unlock(); + if (cpuset_locked) + cpuset_unlock(); rt_mutex_adjust_pi(p); } @@ -7796,8 +7845,8 @@ change: unlock: task_rq_unlock(rq, p, &rf); - if (pi) - cpuset_read_unlock(); + if (cpuset_locked) + cpuset_unlock(); return retval; } @@ -9286,8 +9335,7 @@ int cpuset_cpumask_can_shrink(const struct cpumask *cur, return ret; } -int task_can_attach(struct task_struct *p, - const struct cpumask *cs_effective_cpus) +int task_can_attach(struct task_struct *p) { int ret = 0; @@ -9300,21 +9348,9 @@ int task_can_attach(struct task_struct *p, * success of set_cpus_allowed_ptr() on all attached tasks * before cpus_mask may be changed. */ - if (p->flags & PF_NO_SETAFFINITY) { + if (p->flags & PF_NO_SETAFFINITY) ret = -EINVAL; - goto out; - } - - if (dl_task(p) && !cpumask_intersects(task_rq(p)->rd->span, - cs_effective_cpus)) { - int cpu = cpumask_any_and(cpu_active_mask, cs_effective_cpus); - - if (unlikely(cpu >= nr_cpu_ids)) - return -EINVAL; - ret = dl_cpu_busy(cpu, p); - } -out: return ret; } @@ -9548,6 +9584,7 @@ void set_rq_offline(struct rq *rq) if (rq->online) { const struct sched_class *class; + update_rq_clock(rq); for_each_class(class) { if (class->rq_offline) class->rq_offline(rq); @@ -9596,7 +9633,7 @@ static void cpuset_cpu_active(void) static int cpuset_cpu_inactive(unsigned int cpu) { if (!cpuhp_tasks_frozen) { - int ret = dl_cpu_busy(cpu, NULL); + int ret = dl_bw_check_overflow(cpu); if (ret) return ret; @@ -9689,7 +9726,6 @@ int sched_cpu_deactivate(unsigned int cpu) rq_lock_irqsave(rq, &rf); if (rq->rd) { - update_rq_clock(rq); BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); set_rq_offline(rq); } @@ -11492,7 +11528,7 @@ void call_trace_sched_update_nr_running(struct rq *rq, int count) #ifdef CONFIG_SCHED_MM_CID -/** +/* * @cid_lock: Guarantee forward-progress of cid allocation. * * Concurrency ID allocation within a bitmap is mostly lock-free. The cid_lock @@ -11501,7 +11537,7 @@ void call_trace_sched_update_nr_running(struct rq *rq, int count) */ DEFINE_RAW_SPINLOCK(cid_lock); -/** +/* * @use_cid_lock: Select cid allocation behavior: lock-free vs spinlock. * * When @use_cid_lock is 0, the cid allocation is lock-free. 
When contention is diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index e3211455b203..4492608b7d7f 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -155,10 +155,11 @@ static unsigned int get_next_freq(struct sugov_policy *sg_policy, static void sugov_get_util(struct sugov_cpu *sg_cpu) { + unsigned long util = cpu_util_cfs_boost(sg_cpu->cpu); struct rq *rq = cpu_rq(sg_cpu->cpu); sg_cpu->bw_dl = cpu_bw_dl(rq); - sg_cpu->util = effective_cpu_util(sg_cpu->cpu, cpu_util_cfs(sg_cpu->cpu), + sg_cpu->util = effective_cpu_util(sg_cpu->cpu, util, FREQUENCY_UTIL, NULL); } diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 5a9a4b81c972..58b542bf2893 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -16,6 +16,8 @@ * Fabio Checconi <fchecconi@gmail.com> */ +#include <linux/cpuset.h> + /* * Default limits for DL period; on the top end we guard against small util * tasks still getting ridiculously long effective runtimes, on the bottom end we @@ -489,13 +491,6 @@ static inline int is_leftmost(struct task_struct *p, struct dl_rq *dl_rq) static void init_dl_rq_bw_ratio(struct dl_rq *dl_rq); -void init_dl_bandwidth(struct dl_bandwidth *dl_b, u64 period, u64 runtime) -{ - raw_spin_lock_init(&dl_b->dl_runtime_lock); - dl_b->dl_period = period; - dl_b->dl_runtime = runtime; -} - void init_dl_bw(struct dl_bw *dl_b) { raw_spin_lock_init(&dl_b->lock); @@ -1260,43 +1255,39 @@ int dl_runtime_exceeded(struct sched_dl_entity *dl_se) } /* - * This function implements the GRUB accounting rule: - * according to the GRUB reclaiming algorithm, the runtime is - * not decreased as "dq = -dt", but as - * "dq = -max{u / Umax, (1 - Uinact - Uextra)} dt", + * This function implements the GRUB accounting rule. According to the + * GRUB reclaiming algorithm, the runtime is not decreased as "dq = -dt", + * but as "dq = -(max{u, (Umax - Uinact - Uextra)} / Umax) dt", * where u is the utilization of the task, Umax is the maximum reclaimable * utilization, Uinact is the (per-runqueue) inactive utilization, computed * as the difference between the "total runqueue utilization" and the - * runqueue active utilization, and Uextra is the (per runqueue) extra + * "runqueue active utilization", and Uextra is the (per runqueue) extra * reclaimable utilization. - * Since rq->dl.running_bw and rq->dl.this_bw contain utilizations - * multiplied by 2^BW_SHIFT, the result has to be shifted right by - * BW_SHIFT. - * Since rq->dl.bw_ratio contains 1 / Umax multiplied by 2^RATIO_SHIFT, - * dl_bw is multiped by rq->dl.bw_ratio and shifted right by RATIO_SHIFT. - * Since delta is a 64 bit variable, to have an overflow its value - * should be larger than 2^(64 - 20 - 8), which is more than 64 seconds. - * So, overflow is not an issue here. + * Since rq->dl.running_bw and rq->dl.this_bw contain utilizations multiplied + * by 2^BW_SHIFT, the result has to be shifted right by BW_SHIFT. + * Since rq->dl.bw_ratio contains 1 / Umax multiplied by 2^RATIO_SHIFT, dl_bw + * is multiped by rq->dl.bw_ratio and shifted right by RATIO_SHIFT. + * Since delta is a 64 bit variable, to have an overflow its value should be + * larger than 2^(64 - 20 - 8), which is more than 64 seconds. So, overflow is + * not an issue here. 
*/ static u64 grub_reclaim(u64 delta, struct rq *rq, struct sched_dl_entity *dl_se) { - u64 u_inact = rq->dl.this_bw - rq->dl.running_bw; /* Utot - Uact */ u64 u_act; - u64 u_act_min = (dl_se->dl_bw * rq->dl.bw_ratio) >> RATIO_SHIFT; + u64 u_inact = rq->dl.this_bw - rq->dl.running_bw; /* Utot - Uact */ /* - * Instead of computing max{u * bw_ratio, (1 - u_inact - u_extra)}, - * we compare u_inact + rq->dl.extra_bw with - * 1 - (u * rq->dl.bw_ratio >> RATIO_SHIFT), because - * u_inact + rq->dl.extra_bw can be larger than - * 1 * (so, 1 - u_inact - rq->dl.extra_bw would be negative - * leading to wrong results) + * Instead of computing max{u, (u_max - u_inact - u_extra)}, we + * compare u_inact + u_extra with u_max - u, because u_inact + u_extra + * can be larger than u_max. So, u_max - u_inact - u_extra would be + * negative leading to wrong results. */ - if (u_inact + rq->dl.extra_bw > BW_UNIT - u_act_min) - u_act = u_act_min; + if (u_inact + rq->dl.extra_bw > rq->dl.max_bw - dl_se->dl_bw) + u_act = dl_se->dl_bw; else - u_act = BW_UNIT - u_inact - rq->dl.extra_bw; + u_act = rq->dl.max_bw - u_inact - rq->dl.extra_bw; + u_act = (u_act * rq->dl.bw_ratio) >> RATIO_SHIFT; return (delta * u_act) >> BW_SHIFT; } @@ -2596,6 +2587,12 @@ static void switched_from_dl(struct rq *rq, struct task_struct *p) if (task_on_rq_queued(p) && p->dl.dl_runtime) task_non_contending(p); + /* + * In case a task is setscheduled out from SCHED_DEADLINE we need to + * keep track of that on its cpuset (for correct bandwidth tracking). + */ + dec_dl_tasks_cs(p); + if (!task_on_rq_queued(p)) { /* * Inactive timer is armed. However, p is leaving DEADLINE and @@ -2636,6 +2633,12 @@ static void switched_to_dl(struct rq *rq, struct task_struct *p) if (hrtimer_try_to_cancel(&p->dl.inactive_timer) == 1) put_task_struct(p); + /* + * In case a task is setscheduled to SCHED_DEADLINE we need to keep + * track of that on its cpuset (for correct bandwidth tracking). + */ + inc_dl_tasks_cs(p); + /* If p is not queued we will update its parameters at next wakeup. */ if (!task_on_rq_queued(p)) { add_rq_bw(&p->dl, &rq->dl); @@ -2795,12 +2798,12 @@ static void init_dl_rq_bw_ratio(struct dl_rq *dl_rq) { if (global_rt_runtime() == RUNTIME_INF) { dl_rq->bw_ratio = 1 << RATIO_SHIFT; - dl_rq->extra_bw = 1 << BW_SHIFT; + dl_rq->max_bw = dl_rq->extra_bw = 1 << BW_SHIFT; } else { dl_rq->bw_ratio = to_ratio(global_rt_runtime(), global_rt_period()) >> (BW_SHIFT - RATIO_SHIFT); - dl_rq->extra_bw = to_ratio(global_rt_period(), - global_rt_runtime()); + dl_rq->max_bw = dl_rq->extra_bw = + to_ratio(global_rt_period(), global_rt_runtime()); } } @@ -3044,26 +3047,38 @@ int dl_cpuset_cpumask_can_shrink(const struct cpumask *cur, return ret; } -int dl_cpu_busy(int cpu, struct task_struct *p) +enum dl_bw_request { + dl_bw_req_check_overflow = 0, + dl_bw_req_alloc, + dl_bw_req_free +}; + +static int dl_bw_manage(enum dl_bw_request req, int cpu, u64 dl_bw) { - unsigned long flags, cap; + unsigned long flags; struct dl_bw *dl_b; - bool overflow; + bool overflow = 0; rcu_read_lock_sched(); dl_b = dl_bw_of(cpu); raw_spin_lock_irqsave(&dl_b->lock, flags); - cap = dl_bw_capacity(cpu); - overflow = __dl_overflow(dl_b, cap, 0, p ? p->dl.dl_bw : 0); - if (!overflow && p) { - /* - * We reserve space for this task in the destination - * root_domain, as we can't fail after this point. - * We will free resources in the source root_domain - * later on (see set_cpus_allowed_dl()). 
- */ - __dl_add(dl_b, p->dl.dl_bw, dl_bw_cpus(cpu)); + if (req == dl_bw_req_free) { + __dl_sub(dl_b, dl_bw, dl_bw_cpus(cpu)); + } else { + unsigned long cap = dl_bw_capacity(cpu); + + overflow = __dl_overflow(dl_b, cap, 0, dl_bw); + + if (req == dl_bw_req_alloc && !overflow) { + /* + * We reserve space in the destination + * root_domain, as we can't fail after this point. + * We will free resources in the source root_domain + * later on (see set_cpus_allowed_dl()). + */ + __dl_add(dl_b, dl_bw, dl_bw_cpus(cpu)); + } } raw_spin_unlock_irqrestore(&dl_b->lock, flags); @@ -3071,6 +3086,21 @@ int dl_cpu_busy(int cpu, struct task_struct *p) return overflow ? -EBUSY : 0; } + +int dl_bw_check_overflow(int cpu) +{ + return dl_bw_manage(dl_bw_req_check_overflow, cpu, 0); +} + +int dl_bw_alloc(int cpu, u64 dl_bw) +{ + return dl_bw_manage(dl_bw_req_alloc, cpu, dl_bw); +} + +void dl_bw_free(int cpu, u64 dl_bw) +{ + dl_bw_manage(dl_bw_req_free, cpu, dl_bw); +} #endif #ifdef CONFIG_SCHED_DEBUG diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 0b2340a79b65..066ff1c8ae4e 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -777,7 +777,7 @@ static void print_cpu(struct seq_file *m, int cpu) #define P(x) \ do { \ if (sizeof(rq->x) == 4) \ - SEQ_printf(m, " .%-30s: %ld\n", #x, (long)(rq->x)); \ + SEQ_printf(m, " .%-30s: %d\n", #x, (int)(rq->x)); \ else \ SEQ_printf(m, " .%-30s: %Ld\n", #x, (long long)(rq->x));\ } while (0) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 373ff5f55884..a80a73909dc2 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1064,6 +1064,23 @@ update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se) * Scheduling class queueing methods: */ +static inline bool is_core_idle(int cpu) +{ +#ifdef CONFIG_SCHED_SMT + int sibling; + + for_each_cpu(sibling, cpu_smt_mask(cpu)) { + if (cpu == sibling) + continue; + + if (!idle_cpu(sibling)) + return false; + } +#endif + + return true; +} + #ifdef CONFIG_NUMA #define NUMA_IMBALANCE_MIN 2 @@ -1700,23 +1717,6 @@ struct numa_stats { int idle_cpu; }; -static inline bool is_core_idle(int cpu) -{ -#ifdef CONFIG_SCHED_SMT - int sibling; - - for_each_cpu(sibling, cpu_smt_mask(cpu)) { - if (cpu == sibling) - continue; - - if (!idle_cpu(sibling)) - return false; - } -#endif - - return true; -} - struct task_numa_env { struct task_struct *p; @@ -5577,6 +5577,14 @@ static void __cfsb_csd_unthrottle(void *arg) rq_lock(rq, &rf); /* + * Iterating over the list can trigger several call to + * update_rq_clock() in unthrottle_cfs_rq(). + * Do it once and skip the potential next ones. + */ + update_rq_clock(rq); + rq_clock_start_loop_update(rq); + + /* * Since we hold rq lock we're safe from concurrent manipulation of * the CSD list. However, this RCU critical section annotates the * fact that we pair with sched_free_group_rcu(), so that we cannot @@ -5595,6 +5603,7 @@ static void __cfsb_csd_unthrottle(void *arg) rcu_read_unlock(); + rq_clock_stop_loop_update(rq); rq_unlock(rq, &rf); } @@ -6115,6 +6124,13 @@ static void __maybe_unused unthrottle_offline_cfs_rqs(struct rq *rq) lockdep_assert_rq_held(rq); + /* + * The rq clock has already been updated in the + * set_rq_offline(), so we should skip updating + * the rq clock again in unthrottle_cfs_rq(). 
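/*
 * A hedged sketch of how a caller inside kernel/sched/ (for example a cpuset
 * attach path) might use the new dl_bw_alloc()/dl_bw_free() pair: reserve a
 * DEADLINE task's bandwidth on the destination CPU up front and return it if
 * a later step fails. demo_do_the_move() is purely hypothetical; this is not
 * code from the patch set above, only the intended usage shape of the API.
 */
#include <linux/sched.h>

static int demo_move_dl_task(struct task_struct *p, int dst_cpu)
{
	u64 dl_bw = p->dl.dl_bw;
	int ret;

	/* Reserve the bandwidth in the destination root domain ... */
	ret = dl_bw_alloc(dst_cpu, dl_bw);
	if (ret)
		return ret;			/* -EBUSY: would overflow */

	ret = demo_do_the_move(p, dst_cpu);	/* hypothetical helper */
	if (ret)
		dl_bw_free(dst_cpu, dl_bw);	/* ... and undo on failure */

	return ret;
}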
+ */ + rq_clock_start_loop_update(rq); + rcu_read_lock(); list_for_each_entry_rcu(tg, &task_groups, list) { struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)]; @@ -6137,6 +6153,8 @@ static void __maybe_unused unthrottle_offline_cfs_rqs(struct rq *rq) unthrottle_cfs_rq(cfs_rq); } rcu_read_unlock(); + + rq_clock_stop_loop_update(rq); } #else /* CONFIG_CFS_BANDWIDTH */ @@ -7202,14 +7220,58 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target) return target; } -/* - * Predicts what cpu_util(@cpu) would return if @p was removed from @cpu - * (@dst_cpu = -1) or migrated to @dst_cpu. - */ -static unsigned long cpu_util_next(int cpu, struct task_struct *p, int dst_cpu) +/** + * cpu_util() - Estimates the amount of CPU capacity used by CFS tasks. + * @cpu: the CPU to get the utilization for + * @p: task for which the CPU utilization should be predicted or NULL + * @dst_cpu: CPU @p migrates to, -1 if @p moves from @cpu or @p == NULL + * @boost: 1 to enable boosting, otherwise 0 + * + * The unit of the return value must be the same as the one of CPU capacity + * so that CPU utilization can be compared with CPU capacity. + * + * CPU utilization is the sum of running time of runnable tasks plus the + * recent utilization of currently non-runnable tasks on that CPU. + * It represents the amount of CPU capacity currently used by CFS tasks in + * the range [0..max CPU capacity] with max CPU capacity being the CPU + * capacity at f_max. + * + * The estimated CPU utilization is defined as the maximum between CPU + * utilization and sum of the estimated utilization of the currently + * runnable tasks on that CPU. It preserves a utilization "snapshot" of + * previously-executed tasks, which helps better deduce how busy a CPU will + * be when a long-sleeping task wakes up. The contribution to CPU utilization + * of such a task would be significantly decayed at this point of time. + * + * Boosted CPU utilization is defined as max(CPU runnable, CPU utilization). + * CPU contention for CFS tasks can be detected by CPU runnable > CPU + * utilization. Boosting is implemented in cpu_util() so that internal + * users (e.g. EAS) can use it next to external users (e.g. schedutil), + * latter via cpu_util_cfs_boost(). + * + * CPU utilization can be higher than the current CPU capacity + * (f_curr/f_max * max CPU capacity) or even the max CPU capacity because + * of rounding errors as well as task migrations or wakeups of new tasks. + * CPU utilization has to be capped to fit into the [0..max CPU capacity] + * range. Otherwise a group of CPUs (CPU0 util = 121% + CPU1 util = 80%) + * could be seen as over-utilized even though CPU1 has 20% of spare CPU + * capacity. CPU utilization is allowed to overshoot current CPU capacity + * though since this is useful for predicting the CPU capacity required + * after task migrations (scheduler-driven DVFS). + * + * Return: (Boosted) (estimated) utilization for the specified CPU. + */ +static unsigned long +cpu_util(int cpu, struct task_struct *p, int dst_cpu, int boost) { struct cfs_rq *cfs_rq = &cpu_rq(cpu)->cfs; unsigned long util = READ_ONCE(cfs_rq->avg.util_avg); + unsigned long runnable; + + if (boost) { + runnable = READ_ONCE(cfs_rq->avg.runnable_avg); + util = max(util, runnable); + } /* * If @dst_cpu is -1 or @p migrates from @cpu to @dst_cpu remove its @@ -7217,9 +7279,9 @@ static unsigned long cpu_util_next(int cpu, struct task_struct *p, int dst_cpu) * contribution. 
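/*
 * A worked example (plain arithmetic with made-up numbers, mirroring the
 * p == NULL case of cpu_util() above) of what the @boost argument changes.
 * With util_avg = 300, runnable_avg = 450 and util_est.enqueued = 350 on a
 * 1024-capacity CPU, the unboosted estimate is max(300, 350) = 350, while
 * the boosted one is max(max(300, 450), max(350, 450)) = 450: runnable
 * exceeding util signals CFS contention, so schedutil/EAS see a larger value.
 */
static unsigned long demo_cpu_util(unsigned long util_avg,
				   unsigned long runnable_avg,
				   unsigned long util_est,
				   unsigned long capacity, int boost)
{
	unsigned long util = util_avg;

	if (boost) {
		util = util > runnable_avg ? util : runnable_avg;
		util_est = util_est > runnable_avg ? util_est : runnable_avg;
	}
	util = util > util_est ? util : util_est;

	return util < capacity ? util : capacity;	/* clamp to capacity */
}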
In all the other cases @cpu is not impacted by the * migration so its util_avg is already correct. */ - if (task_cpu(p) == cpu && dst_cpu != cpu) + if (p && task_cpu(p) == cpu && dst_cpu != cpu) lsub_positive(&util, task_util(p)); - else if (task_cpu(p) != cpu && dst_cpu == cpu) + else if (p && task_cpu(p) != cpu && dst_cpu == cpu) util += task_util(p); if (sched_feat(UTIL_EST)) { @@ -7227,6 +7289,9 @@ static unsigned long cpu_util_next(int cpu, struct task_struct *p, int dst_cpu) util_est = READ_ONCE(cfs_rq->avg.util_est.enqueued); + if (boost) + util_est = max(util_est, runnable); + /* * During wake-up @p isn't enqueued yet and doesn't contribute * to any cpu_rq(cpu)->cfs.avg.util_est.enqueued. @@ -7255,7 +7320,7 @@ static unsigned long cpu_util_next(int cpu, struct task_struct *p, int dst_cpu) */ if (dst_cpu == cpu) util_est += _task_util_est(p); - else if (unlikely(task_on_rq_queued(p) || current == p)) + else if (p && unlikely(task_on_rq_queued(p) || current == p)) lsub_positive(&util_est, _task_util_est(p)); util = max(util, util_est); @@ -7264,6 +7329,16 @@ static unsigned long cpu_util_next(int cpu, struct task_struct *p, int dst_cpu) return min(util, capacity_orig_of(cpu)); } +unsigned long cpu_util_cfs(int cpu) +{ + return cpu_util(cpu, NULL, -1, 0); +} + +unsigned long cpu_util_cfs_boost(int cpu) +{ + return cpu_util(cpu, NULL, -1, 1); +} + /* * cpu_util_without: compute cpu utilization without any contributions from *p * @cpu: the CPU which utilization is requested @@ -7281,9 +7356,9 @@ static unsigned long cpu_util_without(int cpu, struct task_struct *p) { /* Task has no contribution or is new */ if (cpu != task_cpu(p) || !READ_ONCE(p->se.avg.last_update_time)) - return cpu_util_cfs(cpu); + p = NULL; - return cpu_util_next(cpu, p, -1); + return cpu_util(cpu, p, -1, 0); } /* @@ -7330,7 +7405,7 @@ static inline void eenv_task_busy_time(struct energy_env *eenv, * cpu_capacity. * * The contribution of the task @p for which we want to estimate the - * energy cost is removed (by cpu_util_next()) and must be calculated + * energy cost is removed (by cpu_util()) and must be calculated * separately (see eenv_task_busy_time). This ensures: * * - A stable PD utilization, no matter which CPU of that PD we want to place @@ -7351,7 +7426,7 @@ static inline void eenv_pd_busy_time(struct energy_env *eenv, int cpu; for_each_cpu(cpu, pd_cpus) { - unsigned long util = cpu_util_next(cpu, p, -1); + unsigned long util = cpu_util(cpu, p, -1, 0); busy_time += effective_cpu_util(cpu, util, ENERGY_UTIL, NULL); } @@ -7375,8 +7450,8 @@ eenv_pd_max_util(struct energy_env *eenv, struct cpumask *pd_cpus, for_each_cpu(cpu, pd_cpus) { struct task_struct *tsk = (cpu == dst_cpu) ? p : NULL; - unsigned long util = cpu_util_next(cpu, p, dst_cpu); - unsigned long cpu_util; + unsigned long util = cpu_util(cpu, p, dst_cpu, 1); + unsigned long eff_util; /* * Performance domain frequency: utilization clamping @@ -7385,8 +7460,8 @@ eenv_pd_max_util(struct energy_env *eenv, struct cpumask *pd_cpus, * NOTE: in case RT tasks are running, by default the * FREQUENCY_UTIL's utilization can be max OPP. 
*/ - cpu_util = effective_cpu_util(cpu, util, FREQUENCY_UTIL, tsk); - max_util = max(max_util, cpu_util); + eff_util = effective_cpu_util(cpu, util, FREQUENCY_UTIL, tsk); + max_util = max(max_util, eff_util); } return min(max_util, eenv->cpu_cap); @@ -7521,7 +7596,7 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu) if (!cpumask_test_cpu(cpu, p->cpus_ptr)) continue; - util = cpu_util_next(cpu, p, cpu); + util = cpu_util(cpu, p, cpu, 0); cpu_cap = capacity_of(cpu); /* @@ -9331,96 +9406,61 @@ group_type group_classify(unsigned int imbalance_pct, } /** - * asym_smt_can_pull_tasks - Check whether the load balancing CPU can pull tasks - * @dst_cpu: Destination CPU of the load balancing + * sched_use_asym_prio - Check whether asym_packing priority must be used + * @sd: The scheduling domain of the load balancing + * @cpu: A CPU + * + * Always use CPU priority when balancing load between SMT siblings. When + * balancing load between cores, it is not sufficient that @cpu is idle. Only + * use CPU priority if the whole core is idle. + * + * Returns: True if the priority of @cpu must be followed. False otherwise. + */ +static bool sched_use_asym_prio(struct sched_domain *sd, int cpu) +{ + if (!sched_smt_active()) + return true; + + return sd->flags & SD_SHARE_CPUCAPACITY || is_core_idle(cpu); +} + +/** + * sched_asym - Check if the destination CPU can do asym_packing load balance + * @env: The load balancing environment * @sds: Load-balancing data with statistics of the local group * @sgs: Load-balancing statistics of the candidate busiest group - * @sg: The candidate busiest group + * @group: The candidate busiest group * - * Check the state of the SMT siblings of both @sds::local and @sg and decide - * if @dst_cpu can pull tasks. + * @env::dst_cpu can do asym_packing if it has higher priority than the + * preferred CPU of @group. * - * If @dst_cpu does not have SMT siblings, it can pull tasks if two or more of - * the SMT siblings of @sg are busy. If only one CPU in @sg is busy, pull tasks - * only if @dst_cpu has higher priority. + * SMT is a special case. If we are balancing load between cores, @env::dst_cpu + * can do asym_packing balance only if all its SMT siblings are idle. Also, it + * can only do it if @group is an SMT group and has exactly on busy CPU. Larger + * imbalances in the number of CPUS are dealt with in find_busiest_group(). * - * If both @dst_cpu and @sg have SMT siblings, and @sg has exactly one more - * busy CPU than @sds::local, let @dst_cpu pull tasks if it has higher priority. - * Bigger imbalances in the number of busy CPUs will be dealt with in - * update_sd_pick_busiest(). + * If we are balancing load within an SMT core, or at DIE domain level, always + * proceed. * - * If @sg does not have SMT siblings, only pull tasks if all of the SMT siblings - * of @dst_cpu are idle and @sg has lower priority. - * - * Return: true if @dst_cpu can pull tasks, false otherwise. + * Return: true if @env::dst_cpu can do with asym_packing load balance. False + * otherwise. 
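/*
 * A compressed restatement (hypothetical helper, boolean inputs only) of the
 * asym_packing policy described above: within an SMT core priorities always
 * apply; between cores they apply only if the whole destination core is
 * idle; and an SMT busiest group is only picked on priority when exactly one
 * of its siblings is busy, larger imbalances being handled elsewhere.
 */
static bool demo_can_use_asym_prio(bool smt_active, bool sd_is_smt_level,
				   bool dst_core_fully_idle,
				   bool group_is_smt, int group_busy_cpus)
{
	if (smt_active && !sd_is_smt_level && !dst_core_fully_idle)
		return false;		/* sched_use_asym_prio() says no */

	if (group_is_smt && group_busy_cpus != 1)
		return false;		/* priorities make no sense here */

	return true;			/* fall through to the priority compare */
}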
*/ -static bool asym_smt_can_pull_tasks(int dst_cpu, struct sd_lb_stats *sds, - struct sg_lb_stats *sgs, - struct sched_group *sg) +static inline bool +sched_asym(struct lb_env *env, struct sd_lb_stats *sds, struct sg_lb_stats *sgs, + struct sched_group *group) { -#ifdef CONFIG_SCHED_SMT - bool local_is_smt, sg_is_smt; - int sg_busy_cpus; - - local_is_smt = sds->local->flags & SD_SHARE_CPUCAPACITY; - sg_is_smt = sg->flags & SD_SHARE_CPUCAPACITY; - - sg_busy_cpus = sgs->group_weight - sgs->idle_cpus; - - if (!local_is_smt) { - /* - * If we are here, @dst_cpu is idle and does not have SMT - * siblings. Pull tasks if candidate group has two or more - * busy CPUs. - */ - if (sg_busy_cpus >= 2) /* implies sg_is_smt */ - return true; - - /* - * @dst_cpu does not have SMT siblings. @sg may have SMT - * siblings and only one is busy. In such case, @dst_cpu - * can help if it has higher priority and is idle (i.e., - * it has no running tasks). - */ - return sched_asym_prefer(dst_cpu, sg->asym_prefer_cpu); - } - - /* @dst_cpu has SMT siblings. */ - - if (sg_is_smt) { - int local_busy_cpus = sds->local->group_weight - - sds->local_stat.idle_cpus; - int busy_cpus_delta = sg_busy_cpus - local_busy_cpus; - - if (busy_cpus_delta == 1) - return sched_asym_prefer(dst_cpu, sg->asym_prefer_cpu); - + /* Ensure that the whole local core is idle, if applicable. */ + if (!sched_use_asym_prio(env->sd, env->dst_cpu)) return false; - } /* - * @sg does not have SMT siblings. Ensure that @sds::local does not end - * up with more than one busy SMT sibling and only pull tasks if there - * are not busy CPUs (i.e., no CPU has running tasks). + * CPU priorities does not make sense for SMT cores with more than one + * busy sibling. */ - if (!sds->local_stat.sum_nr_running) - return sched_asym_prefer(dst_cpu, sg->asym_prefer_cpu); - - return false; -#else - /* Always return false so that callers deal with non-SMT cases. */ - return false; -#endif -} - -static inline bool -sched_asym(struct lb_env *env, struct sd_lb_stats *sds, struct sg_lb_stats *sgs, - struct sched_group *group) -{ - /* Only do SMT checks if either local or candidate have SMT siblings */ - if ((sds->local->flags & SD_SHARE_CPUCAPACITY) || - (group->flags & SD_SHARE_CPUCAPACITY)) - return asym_smt_can_pull_tasks(env->dst_cpu, sds, sgs, group); + if (group->flags & SD_SHARE_CPUCAPACITY) { + if (sgs->group_weight - sgs->idle_cpus != 1) + return false; + } return sched_asym_prefer(env->dst_cpu, group->asym_prefer_cpu); } @@ -9610,10 +9650,22 @@ static bool update_sd_pick_busiest(struct lb_env *env, * contention when accessing shared HW resources. * * XXX for now avg_load is not computed and always 0 so we - * select the 1st one. + * select the 1st one, except if @sg is composed of SMT + * siblings. */ - if (sgs->avg_load <= busiest->avg_load) + + if (sgs->avg_load < busiest->avg_load) return false; + + if (sgs->avg_load == busiest->avg_load) { + /* + * SMT sched groups need more help than non-SMT groups. + * If @sg happens to also be SMT, either choice is good. 
+ */ + if (sds->busiest->flags & SD_SHARE_CPUCAPACITY) + return false; + } + break; case group_has_spare: @@ -10088,7 +10140,6 @@ static void update_idle_cpu_scan(struct lb_env *env, static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sds) { - struct sched_domain *child = env->sd->child; struct sched_group *sg = env->sd->groups; struct sg_lb_stats *local = &sds->local_stat; struct sg_lb_stats tmp_sgs; @@ -10129,8 +10180,13 @@ next_group: sg = sg->next; } while (sg != env->sd->groups); - /* Tag domain that child domain prefers tasks go to siblings first */ - sds->prefer_sibling = child && child->flags & SD_PREFER_SIBLING; + /* + * Indicate that the child domain of the busiest group prefers tasks + * go to a child's sibling domains first. NB the flags of a sched group + * are those of the child domain. + */ + if (sds->busiest) + sds->prefer_sibling = !!(sds->busiest->flags & SD_PREFER_SIBLING); if (env->sd->flags & SD_NUMA) @@ -10440,7 +10496,10 @@ static struct sched_group *find_busiest_group(struct lb_env *env) goto out_balanced; } - /* Try to move all excess tasks to child's sibling domain */ + /* + * Try to move all excess tasks to a sibling domain of the busiest + * group's child domain. + */ if (sds.prefer_sibling && local->group_type == group_has_spare && busiest->sum_nr_running > local->sum_nr_running + 1) goto force_balance; @@ -10542,8 +10601,15 @@ static struct rq *find_busiest_queue(struct lb_env *env, nr_running == 1) continue; - /* Make sure we only pull tasks from a CPU of lower priority */ + /* + * Make sure we only pull tasks from a CPU of lower priority + * when balancing between SMT siblings. + * + * If balancing between cores, let lower priority CPUs help + * SMT cores with more than one busy sibling. + */ if ((env->sd->flags & SD_ASYM_PACKING) && + sched_use_asym_prio(env->sd, i) && sched_asym_prefer(i, env->dst_cpu) && nr_running == 1) continue; @@ -10581,7 +10647,7 @@ static struct rq *find_busiest_queue(struct lb_env *env, break; case migrate_util: - util = cpu_util_cfs(i); + util = cpu_util_cfs_boost(i); /* * Don't try to pull utilization from a CPU with one @@ -10632,12 +10698,19 @@ static inline bool asym_active_balance(struct lb_env *env) { /* - * ASYM_PACKING needs to force migrate tasks from busy but - * lower priority CPUs in order to pack all tasks in the - * highest priority CPUs. + * ASYM_PACKING needs to force migrate tasks from busy but lower + * priority CPUs in order to pack all tasks in the highest priority + * CPUs. When done between cores, do it only if the whole core if the + * whole core is idle. + * + * If @env::src_cpu is an SMT core with busy siblings, let + * the lower priority @env::dst_cpu help it. Do not follow + * CPU priority. */ return env->idle != CPU_NOT_IDLE && (env->sd->flags & SD_ASYM_PACKING) && - sched_asym_prefer(env->dst_cpu, env->src_cpu); + sched_use_asym_prio(env->sd, env->dst_cpu) && + (sched_asym_prefer(env->dst_cpu, env->src_cpu) || + !sched_use_asym_prio(env->sd, env->src_cpu)); } static inline bool @@ -10744,7 +10817,7 @@ static int load_balance(int this_cpu, struct rq *this_rq, .sd = sd, .dst_cpu = this_cpu, .dst_rq = this_rq, - .dst_grpmask = sched_group_span(sd->groups), + .dst_grpmask = group_balance_mask(sd->groups), .idle = idle, .loop_break = SCHED_NR_MIGRATE_BREAK, .cpus = cpus, @@ -11371,9 +11444,13 @@ static void nohz_balancer_kick(struct rq *rq) * When ASYM_PACKING; see if there's a more preferred CPU * currently idle; in which case, kick the ILB to move tasks * around. 
+ * + * When balancing betwen cores, all the SMT siblings of the + * preferred CPU must be idle. */ for_each_cpu_and(i, sched_domain_span(sd), nohz.idle_cpus_mask) { - if (sched_asym_prefer(i, cpu)) { + if (sched_use_asym_prio(sd, i) && + sched_asym_prefer(i, cpu)) { flags = NOHZ_STATS_KICK | NOHZ_BALANCE_KICK; goto unlock; } diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c index e072f6b31bf3..81fca77397f6 100644 --- a/kernel/sched/psi.c +++ b/kernel/sched/psi.c @@ -160,7 +160,6 @@ __setup("psi=", setup_psi); #define EXP_300s 2034 /* 1/exp(2s/300s) */ /* PSI trigger definitions */ -#define WINDOW_MIN_US 500000 /* Min window size is 500ms */ #define WINDOW_MAX_US 10000000 /* Max window size is 10s */ #define UPDATES_PER_WINDOW 10 /* 10 updates per window */ @@ -1305,8 +1304,7 @@ struct psi_trigger *psi_trigger_create(struct psi_group *group, if (state >= PSI_NONIDLE) return ERR_PTR(-EINVAL); - if (window_us < WINDOW_MIN_US || - window_us > WINDOW_MAX_US) + if (window_us == 0 || window_us > WINDOW_MAX_US) return ERR_PTR(-EINVAL); /* @@ -1409,11 +1407,16 @@ void psi_trigger_destroy(struct psi_trigger *t) group->rtpoll_nr_triggers[t->state]--; if (!group->rtpoll_nr_triggers[t->state]) group->rtpoll_states &= ~(1 << t->state); - /* reset min update period for the remaining triggers */ - list_for_each_entry(tmp, &group->rtpoll_triggers, node) - period = min(period, div_u64(tmp->win.size, - UPDATES_PER_WINDOW)); - group->rtpoll_min_period = period; + /* + * Reset min update period for the remaining triggers + * iff the destroying trigger had the min window size. + */ + if (group->rtpoll_min_period == div_u64(t->win.size, UPDATES_PER_WINDOW)) { + list_for_each_entry(tmp, &group->rtpoll_triggers, node) + period = min(period, div_u64(tmp->win.size, + UPDATES_PER_WINDOW)); + group->rtpoll_min_period = period; + } /* Destroy rtpoll_task when the last trigger is destroyed */ if (group->rtpoll_states == 0) { group->rtpoll_until = 0; diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index ec7b3e0a2b20..e93e006a942b 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -286,12 +286,6 @@ struct rt_bandwidth { void __dl_clear_params(struct task_struct *p); -struct dl_bandwidth { - raw_spinlock_t dl_runtime_lock; - u64 dl_runtime; - u64 dl_period; -}; - static inline int dl_bandwidth_enabled(void) { return sysctl_sched_rt_runtime >= 0; @@ -330,7 +324,7 @@ extern void __getparam_dl(struct task_struct *p, struct sched_attr *attr); extern bool __checkparam_dl(const struct sched_attr *attr); extern bool dl_param_changed(struct task_struct *p, const struct sched_attr *attr); extern int dl_cpuset_cpumask_can_shrink(const struct cpumask *cur, const struct cpumask *trial); -extern int dl_cpu_busy(int cpu, struct task_struct *p); +extern int dl_bw_check_overflow(int cpu); #ifdef CONFIG_CGROUP_SCHED @@ -754,6 +748,12 @@ struct dl_rq { u64 extra_bw; /* + * Maximum available bandwidth for reclaiming by SCHED_FLAG_RECLAIM + * tasks of this rq. Used in calculation of reclaimable bandwidth(GRUB). + */ + u64 max_bw; + + /* * Inverse of the fraction of CPU utilization that can be reclaimed * by the GRUB algorithm. 
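/*
 * A user-space example of the standard PSI trigger ABI (write a
 * "<some|full> <stall us> <window us>" string, then poll for POLLPRI).
 * With the 500ms minimum window removed above, a sub-500ms window such as
 * the 300ms one below should now be accepted; any window from just above 0
 * up to 10s is valid, and the polling period is still window/UPDATES_PER_WINDOW.
 */
#include <fcntl.h>
#include <poll.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	/* 100ms of "some" memory stall within a 300ms window. */
	const char trig[] = "some 100000 300000";
	struct pollfd pfd;
	int fd;

	fd = open("/proc/pressure/memory", O_RDWR | O_NONBLOCK);
	if (fd < 0 || write(fd, trig, strlen(trig) + 1) < 0) {
		perror("psi trigger");
		return 1;
	}

	pfd.fd = fd;
	pfd.events = POLLPRI;			/* trigger fires as POLLPRI */
	if (poll(&pfd, 1, -1) > 0 && (pfd.revents & POLLPRI))
		printf("memory pressure threshold crossed\n");

	close(fd);
	return 0;
}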
*/ @@ -1546,6 +1546,28 @@ static inline void rq_clock_cancel_skipupdate(struct rq *rq) rq->clock_update_flags &= ~RQCF_REQ_SKIP; } +/* + * During cpu offlining and rq wide unthrottling, we can trigger + * an update_rq_clock() for several cfs and rt runqueues (Typically + * when using list_for_each_entry_*) + * rq_clock_start_loop_update() can be called after updating the clock + * once and before iterating over the list to prevent multiple update. + * After the iterative traversal, we need to call rq_clock_stop_loop_update() + * to clear RQCF_ACT_SKIP of rq->clock_update_flags. + */ +static inline void rq_clock_start_loop_update(struct rq *rq) +{ + lockdep_assert_rq_held(rq); + SCHED_WARN_ON(rq->clock_update_flags & RQCF_ACT_SKIP); + rq->clock_update_flags |= RQCF_ACT_SKIP; +} + +static inline void rq_clock_stop_loop_update(struct rq *rq) +{ + lockdep_assert_rq_held(rq); + rq->clock_update_flags &= ~RQCF_ACT_SKIP; +} + struct rq_flags { unsigned long flags; struct pin_cookie cookie; @@ -1772,6 +1794,13 @@ queue_balance_callback(struct rq *rq, for (__sd = rcu_dereference_check_sched_domain(cpu_rq(cpu)->sd); \ __sd; __sd = __sd->parent) +/* A mask of all the SD flags that have the SDF_SHARED_CHILD metaflag */ +#define SD_FLAG(name, mflags) (name * !!((mflags) & SDF_SHARED_CHILD)) | +static const unsigned int SD_SHARED_CHILD_MASK = +#include <linux/sched/sd_flags.h> +0; +#undef SD_FLAG + /** * highest_flag_domain - Return highest sched_domain containing flag. * @cpu: The CPU whose highest level of sched domain is to @@ -1779,16 +1808,25 @@ queue_balance_callback(struct rq *rq, * @flag: The flag to check for the highest sched_domain * for the given CPU. * - * Returns the highest sched_domain of a CPU which contains the given flag. + * Returns the highest sched_domain of a CPU which contains @flag. If @flag has + * the SDF_SHARED_CHILD metaflag, all the children domains also have @flag. */ static inline struct sched_domain *highest_flag_domain(int cpu, int flag) { struct sched_domain *sd, *hsd = NULL; for_each_domain(cpu, sd) { - if (!(sd->flags & flag)) + if (sd->flags & flag) { + hsd = sd; + continue; + } + + /* + * Stop the search if @flag is known to be shared at lower + * levels. It will not be found further up. + */ + if (flag & SD_SHARED_CHILD_MASK) break; - hsd = sd; } return hsd; @@ -2378,7 +2416,6 @@ extern struct rt_bandwidth def_rt_bandwidth; extern void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime); extern bool sched_rt_bandwidth_account(struct rt_rq *rt_rq); -extern void init_dl_bandwidth(struct dl_bandwidth *dl_b, u64 period, u64 runtime); extern void init_dl_task_timer(struct sched_dl_entity *dl_se); extern void init_dl_inactive_task_timer(struct sched_dl_entity *dl_se); @@ -2946,53 +2983,9 @@ static inline unsigned long cpu_util_dl(struct rq *rq) return READ_ONCE(rq->avg_dl.util_avg); } -/** - * cpu_util_cfs() - Estimates the amount of CPU capacity used by CFS tasks. - * @cpu: the CPU to get the utilization for. - * - * The unit of the return value must be the same as the one of CPU capacity - * so that CPU utilization can be compared with CPU capacity. - * - * CPU utilization is the sum of running time of runnable tasks plus the - * recent utilization of currently non-runnable tasks on that CPU. - * It represents the amount of CPU capacity currently used by CFS tasks in - * the range [0..max CPU capacity] with max CPU capacity being the CPU - * capacity at f_max. 
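/*
 * A self-contained toy (hypothetical flag values, not <linux/sched/sd_flags.h>)
 * of the X-macro trick used above to build SD_SHARED_CHILD_MASK at compile
 * time: the same flag list is expanded with a definition of SD_FLAG() that
 * keeps only the entries carrying the SDF_SHARED_CHILD metaflag and ORs them
 * together. The real code gets the list by including the sd_flags.h header.
 */
#include <stdio.h>

#define SDF_SHARED_CHILD 0x1			/* toy metaflag */

#define TOY_SD_FLAGS			\
	SD_FLAG(0x01, SDF_SHARED_CHILD)	\
	SD_FLAG(0x02, 0)		\
	SD_FLAG(0x04, SDF_SHARED_CHILD)

#define SD_FLAG(name, mflags) ((name) * !!((mflags) & SDF_SHARED_CHILD)) |
static const unsigned int TOY_SHARED_CHILD_MASK = TOY_SD_FLAGS 0;
#undef SD_FLAG

int main(void)
{
	printf("mask = 0x%x\n", TOY_SHARED_CHILD_MASK);	/* prints 0x5 */
	return 0;
}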
- * - * The estimated CPU utilization is defined as the maximum between CPU - * utilization and sum of the estimated utilization of the currently - * runnable tasks on that CPU. It preserves a utilization "snapshot" of - * previously-executed tasks, which helps better deduce how busy a CPU will - * be when a long-sleeping task wakes up. The contribution to CPU utilization - * of such a task would be significantly decayed at this point of time. - * - * CPU utilization can be higher than the current CPU capacity - * (f_curr/f_max * max CPU capacity) or even the max CPU capacity because - * of rounding errors as well as task migrations or wakeups of new tasks. - * CPU utilization has to be capped to fit into the [0..max CPU capacity] - * range. Otherwise a group of CPUs (CPU0 util = 121% + CPU1 util = 80%) - * could be seen as over-utilized even though CPU1 has 20% of spare CPU - * capacity. CPU utilization is allowed to overshoot current CPU capacity - * though since this is useful for predicting the CPU capacity required - * after task migrations (scheduler-driven DVFS). - * - * Return: (Estimated) utilization for the specified CPU. - */ -static inline unsigned long cpu_util_cfs(int cpu) -{ - struct cfs_rq *cfs_rq; - unsigned long util; - - cfs_rq = &cpu_rq(cpu)->cfs; - util = READ_ONCE(cfs_rq->avg.util_avg); - - if (sched_feat(UTIL_EST)) { - util = max_t(unsigned long, util, - READ_ONCE(cfs_rq->avg.util_est.enqueued)); - } - return min(util, capacity_orig_of(cpu)); -} +extern unsigned long cpu_util_cfs(int cpu); +extern unsigned long cpu_util_cfs_boost(int cpu); static inline unsigned long cpu_util_rt(struct rq *rq) { diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index 6682535e37c8..d3a3b2646ec4 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -487,9 +487,9 @@ static void free_rootdomain(struct rcu_head *rcu) void rq_attach_root(struct rq *rq, struct root_domain *rd) { struct root_domain *old_rd = NULL; - unsigned long flags; + struct rq_flags rf; - raw_spin_rq_lock_irqsave(rq, flags); + rq_lock_irqsave(rq, &rf); if (rq->rd) { old_rd = rq->rd; @@ -515,7 +515,7 @@ void rq_attach_root(struct rq *rq, struct root_domain *rd) if (cpumask_test_cpu(rq->cpu, cpu_active_mask)) set_rq_online(rq); - raw_spin_rq_unlock_irqrestore(rq, flags); + rq_unlock_irqrestore(rq, &rf); if (old_rd) call_rcu(&old_rd->rcu, free_rootdomain); @@ -719,8 +719,13 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu) if (sd_parent_degenerate(tmp, parent)) { tmp->parent = parent->parent; - if (parent->parent) + + if (parent->parent) { parent->parent->child = tmp; + if (tmp->flags & SD_SHARE_CPUCAPACITY) + parent->parent->groups->flags |= SD_SHARE_CPUCAPACITY; + } + /* * Transfer SD_PREFER_SIBLING down in case of a * degenerate parent; the spans match for this @@ -1676,7 +1681,7 @@ static struct sched_domain_topology_level *sched_domain_topology_saved; #define for_each_sd_topology(tl) \ for (tl = sched_domain_topology; tl->mask; tl++) -void set_sched_topology(struct sched_domain_topology_level *tl) +void __init set_sched_topology(struct sched_domain_topology_level *tl) { if (WARN_ON_ONCE(sched_smp_initialized)) return; diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c index 133b74730738..48c53e4739ea 100644 --- a/kernel/sched/wait.c +++ b/kernel/sched/wait.c @@ -425,11 +425,6 @@ int autoremove_wake_function(struct wait_queue_entry *wq_entry, unsigned mode, i } EXPORT_SYMBOL(autoremove_wake_function); -static inline bool is_kthread_should_stop(void) -{ - 
return (current->flags & PF_KTHREAD) && kthread_should_stop(); -} - /* * DEFINE_WAIT_FUNC(wait, woken_wake_func); * @@ -459,7 +454,7 @@ long wait_woken(struct wait_queue_entry *wq_entry, unsigned mode, long timeout) * or woken_wake_function() sees our store to current->state. */ set_current_state(mode); /* A */ - if (!(wq_entry->flags & WQ_FLAG_WOKEN) && !is_kthread_should_stop()) + if (!(wq_entry->flags & WQ_FLAG_WOKEN) && !kthread_should_stop_or_park()) timeout = schedule_timeout(timeout); __set_current_state(TASK_RUNNING); diff --git a/kernel/signal.c b/kernel/signal.c index 8f6330f0e9ca..b5370fe5c198 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -45,6 +45,7 @@ #include <linux/posix-timers.h> #include <linux/cgroup.h> #include <linux/audit.h> +#include <linux/sysctl.h> #define CREATE_TRACE_POINTS #include <trace/events/signal.h> @@ -1368,7 +1369,9 @@ int zap_other_threads(struct task_struct *p) while_each_thread(p, t) { task_clear_jobctl_pending(t, JOBCTL_PENDING_MASK); - count++; + /* Don't require de_thread to wait for the vhost_worker */ + if ((t->flags & (PF_IO_WORKER | PF_USER_WORKER)) != PF_USER_WORKER) + count++; /* Don't bother with already dead threads */ if (t->exit_state) @@ -2861,11 +2864,11 @@ relock: } /* - * PF_IO_WORKER threads will catch and exit on fatal signals + * PF_USER_WORKER threads will catch and exit on fatal signals * themselves. They have cleanup that must be performed, so * we cannot call do_exit() on their behalf. */ - if (current->flags & PF_IO_WORKER) + if (current->flags & PF_USER_WORKER) goto out; /* @@ -4771,6 +4774,28 @@ static inline void siginfo_buildtime_checks(void) #endif } +#if defined(CONFIG_SYSCTL) +static struct ctl_table signal_debug_table[] = { +#ifdef CONFIG_SYSCTL_EXCEPTION_TRACE + { + .procname = "exception-trace", + .data = &show_unhandled_signals, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec + }, +#endif + { } +}; + +static int __init init_signal_sysctls(void) +{ + register_sysctl_init("debug", signal_debug_table); + return 0; +} +early_initcall(init_signal_sysctls); +#endif /* CONFIG_SYSCTL */ + void __init signals_init(void) { siginfo_buildtime_checks(); diff --git a/kernel/smp.c b/kernel/smp.c index ab3e5dad6cfe..385179dae360 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -27,6 +27,9 @@ #include <linux/jump_label.h> #include <trace/events/ipi.h> +#define CREATE_TRACE_POINTS +#include <trace/events/csd.h> +#undef CREATE_TRACE_POINTS #include "smpboot.h" #include "sched/smp.h" @@ -121,6 +124,14 @@ send_call_function_ipi_mask(struct cpumask *mask) arch_send_call_function_ipi_mask(mask); } +static __always_inline void +csd_do_func(smp_call_func_t func, void *info, struct __call_single_data *csd) +{ + trace_csd_function_entry(func, csd); + func(info); + trace_csd_function_exit(func, csd); +} + #ifdef CONFIG_CSD_LOCK_WAIT_DEBUG static DEFINE_STATIC_KEY_MAYBE(CONFIG_CSD_LOCK_WAIT_DEBUG_DEFAULT, csdlock_debug_enabled); @@ -329,7 +340,7 @@ void __smp_call_single_queue(int cpu, struct llist_node *node) * even if we haven't sent the smp_call IPI yet (e.g. the stopper * executes migration_cpu_stop() on the remote CPU). */ - if (trace_ipi_send_cpu_enabled()) { + if (trace_csd_queue_cpu_enabled()) { call_single_data_t *csd; smp_call_func_t func; @@ -337,7 +348,7 @@ void __smp_call_single_queue(int cpu, struct llist_node *node) func = CSD_TYPE(csd) == CSD_TYPE_TTWU ? 
sched_ttwu_pending : csd->func; - trace_ipi_send_cpu(cpu, _RET_IP_, func); + trace_csd_queue_cpu(cpu, _RET_IP_, func, csd); } /* @@ -375,7 +386,7 @@ static int generic_exec_single(int cpu, struct __call_single_data *csd) csd_lock_record(csd); csd_unlock(csd); local_irq_save(flags); - func(info); + csd_do_func(func, info, NULL); csd_lock_record(NULL); local_irq_restore(flags); return 0; @@ -477,7 +488,7 @@ static void __flush_smp_call_function_queue(bool warn_cpu_offline) } csd_lock_record(csd); - func(info); + csd_do_func(func, info, csd); csd_unlock(csd); csd_lock_record(NULL); } else { @@ -508,7 +519,7 @@ static void __flush_smp_call_function_queue(bool warn_cpu_offline) csd_lock_record(csd); csd_unlock(csd); - func(info); + csd_do_func(func, info, csd); csd_lock_record(NULL); } else if (type == CSD_TYPE_IRQ_WORK) { irq_work_single(csd); @@ -522,8 +533,10 @@ static void __flush_smp_call_function_queue(bool warn_cpu_offline) /* * Third; only CSD_TYPE_TTWU is left, issue those. */ - if (entry) - sched_ttwu_pending(entry); + if (entry) { + csd = llist_entry(entry, typeof(*csd), node.llist); + csd_do_func(sched_ttwu_pending, entry, csd); + } } @@ -728,7 +741,7 @@ static void smp_call_function_many_cond(const struct cpumask *mask, int cpu, last_cpu, this_cpu = smp_processor_id(); struct call_function_data *cfd; bool wait = scf_flags & SCF_WAIT; - int nr_cpus = 0, nr_queued = 0; + int nr_cpus = 0; bool run_remote = false; bool run_local = false; @@ -786,22 +799,16 @@ static void smp_call_function_many_cond(const struct cpumask *mask, csd->node.src = smp_processor_id(); csd->node.dst = cpu; #endif + trace_csd_queue_cpu(cpu, _RET_IP_, func, csd); + if (llist_add(&csd->node.llist, &per_cpu(call_single_queue, cpu))) { __cpumask_set_cpu(cpu, cfd->cpumask_ipi); nr_cpus++; last_cpu = cpu; } - nr_queued++; } /* - * Trace each smp_function_call_*() as an IPI, actual IPIs - * will be traced with func==generic_smp_call_function_single_ipi(). - */ - if (nr_queued) - trace_ipi_send_cpumask(cfd->cpumask, _RET_IP_, func); - - /* * Choose the most efficient way to send an IPI. Note that the * number of CPUs might be zero due to concurrent changes to the * provided mask. @@ -816,7 +823,7 @@ static void smp_call_function_many_cond(const struct cpumask *mask, unsigned long flags; local_irq_save(flags); - func(info); + csd_do_func(func, info, NULL); local_irq_restore(flags); } @@ -892,7 +899,7 @@ EXPORT_SYMBOL(setup_max_cpus); * SMP mode to <NUM>. */ -void __weak arch_disable_smp_support(void) { } +void __weak __init arch_disable_smp_support(void) { } static int __init nosmp(char *str) { diff --git a/kernel/smpboot.c b/kernel/smpboot.c index 2c7396da470c..f47d8f375946 100644 --- a/kernel/smpboot.c +++ b/kernel/smpboot.c @@ -325,166 +325,3 @@ void smpboot_unregister_percpu_thread(struct smp_hotplug_thread *plug_thread) cpus_read_unlock(); } EXPORT_SYMBOL_GPL(smpboot_unregister_percpu_thread); - -static DEFINE_PER_CPU(atomic_t, cpu_hotplug_state) = ATOMIC_INIT(CPU_POST_DEAD); - -/* - * Called to poll specified CPU's state, for example, when waiting for - * a CPU to come online. - */ -int cpu_report_state(int cpu) -{ - return atomic_read(&per_cpu(cpu_hotplug_state, cpu)); -} - -/* - * If CPU has died properly, set its state to CPU_UP_PREPARE and - * return success. Otherwise, return -EBUSY if the CPU died after - * cpu_wait_death() timed out. And yet otherwise again, return -EAGAIN - * if cpu_wait_death() timed out and the CPU still hasn't gotten around - * to dying. 
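/*
 * A kernel-context sketch (hypothetical module code, not from the patch
 * above) showing that existing cross-CPU calls need no changes to become
 * observable: the callback is now invoked through csd_do_func(), so queueing
 * and execution show up in the new csd trace events (csd_queue_cpu,
 * csd_function_entry/exit) when those events are enabled.
 */
#include <linux/printk.h>
#include <linux/smp.h>

static void demo_remote_fn(void *info)
{
	/* Runs on the target CPU, bracketed by csd_function_entry/exit. */
	pr_info("running on CPU %d\n", smp_processor_id());
}

static void demo_kick_cpu(int cpu)
{
	/* Queueing the CSD is what trace_csd_queue_cpu() records. */
	smp_call_function_single(cpu, demo_remote_fn, NULL, 1);
}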
In the latter two cases, the CPU might not be set up - * properly, but it is up to the arch-specific code to decide. - * Finally, -EIO indicates an unanticipated problem. - * - * Note that it is permissible to omit this call entirely, as is - * done in architectures that do no CPU-hotplug error checking. - */ -int cpu_check_up_prepare(int cpu) -{ - if (!IS_ENABLED(CONFIG_HOTPLUG_CPU)) { - atomic_set(&per_cpu(cpu_hotplug_state, cpu), CPU_UP_PREPARE); - return 0; - } - - switch (atomic_read(&per_cpu(cpu_hotplug_state, cpu))) { - - case CPU_POST_DEAD: - - /* The CPU died properly, so just start it up again. */ - atomic_set(&per_cpu(cpu_hotplug_state, cpu), CPU_UP_PREPARE); - return 0; - - case CPU_DEAD_FROZEN: - - /* - * Timeout during CPU death, so let caller know. - * The outgoing CPU completed its processing, but after - * cpu_wait_death() timed out and reported the error. The - * caller is free to proceed, in which case the state - * will be reset properly by cpu_set_state_online(). - * Proceeding despite this -EBUSY return makes sense - * for systems where the outgoing CPUs take themselves - * offline, with no post-death manipulation required from - * a surviving CPU. - */ - return -EBUSY; - - case CPU_BROKEN: - - /* - * The most likely reason we got here is that there was - * a timeout during CPU death, and the outgoing CPU never - * did complete its processing. This could happen on - * a virtualized system if the outgoing VCPU gets preempted - * for more than five seconds, and the user attempts to - * immediately online that same CPU. Trying again later - * might return -EBUSY above, hence -EAGAIN. - */ - return -EAGAIN; - - case CPU_UP_PREPARE: - /* - * Timeout while waiting for the CPU to show up. Allow to try - * again later. - */ - return 0; - - default: - - /* Should not happen. Famous last words. */ - return -EIO; - } -} - -/* - * Mark the specified CPU online. - * - * Note that it is permissible to omit this call entirely, as is - * done in architectures that do no CPU-hotplug error checking. - */ -void cpu_set_state_online(int cpu) -{ - (void)atomic_xchg(&per_cpu(cpu_hotplug_state, cpu), CPU_ONLINE); -} - -#ifdef CONFIG_HOTPLUG_CPU - -/* - * Wait for the specified CPU to exit the idle loop and die. - */ -bool cpu_wait_death(unsigned int cpu, int seconds) -{ - int jf_left = seconds * HZ; - int oldstate; - bool ret = true; - int sleep_jf = 1; - - might_sleep(); - - /* The outgoing CPU will normally get done quite quickly. */ - if (atomic_read(&per_cpu(cpu_hotplug_state, cpu)) == CPU_DEAD) - goto update_state_early; - udelay(5); - - /* But if the outgoing CPU dawdles, wait increasingly long times. */ - while (atomic_read(&per_cpu(cpu_hotplug_state, cpu)) != CPU_DEAD) { - schedule_timeout_uninterruptible(sleep_jf); - jf_left -= sleep_jf; - if (jf_left <= 0) - break; - sleep_jf = DIV_ROUND_UP(sleep_jf * 11, 10); - } -update_state_early: - oldstate = atomic_read(&per_cpu(cpu_hotplug_state, cpu)); -update_state: - if (oldstate == CPU_DEAD) { - /* Outgoing CPU died normally, update state. */ - smp_mb(); /* atomic_read() before update. */ - atomic_set(&per_cpu(cpu_hotplug_state, cpu), CPU_POST_DEAD); - } else { - /* Outgoing CPU still hasn't died, set state accordingly. */ - if (!atomic_try_cmpxchg(&per_cpu(cpu_hotplug_state, cpu), - &oldstate, CPU_BROKEN)) - goto update_state; - ret = false; - } - return ret; -} - -/* - * Called by the outgoing CPU to report its successful death. Return - * false if this report follows the surviving CPU's timing out. 
- * - * A separate "CPU_DEAD_FROZEN" is used when the surviving CPU - * timed out. This approach allows architectures to omit calls to - * cpu_check_up_prepare() and cpu_set_state_online() without defeating - * the next cpu_wait_death()'s polling loop. - */ -bool cpu_report_death(void) -{ - int oldstate; - int newstate; - int cpu = smp_processor_id(); - - oldstate = atomic_read(&per_cpu(cpu_hotplug_state, cpu)); - do { - if (oldstate != CPU_BROKEN) - newstate = CPU_DEAD; - else - newstate = CPU_DEAD_FROZEN; - } while (!atomic_try_cmpxchg(&per_cpu(cpu_hotplug_state, cpu), - &oldstate, newstate)); - return newstate == CPU_DEAD; -} - -#endif /* #ifdef CONFIG_HOTPLUG_CPU */ diff --git a/kernel/softirq.c b/kernel/softirq.c index 1b725510dd0f..807b34ccd797 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -80,21 +80,6 @@ static void wakeup_softirqd(void) wake_up_process(tsk); } -/* - * If ksoftirqd is scheduled, we do not want to process pending softirqs - * right now. Let ksoftirqd handle this at its own rate, to get fairness, - * unless we're doing some of the synchronous softirqs. - */ -#define SOFTIRQ_NOW_MASK ((1 << HI_SOFTIRQ) | (1 << TASKLET_SOFTIRQ)) -static bool ksoftirqd_running(unsigned long pending) -{ - struct task_struct *tsk = __this_cpu_read(ksoftirqd); - - if (pending & SOFTIRQ_NOW_MASK) - return false; - return tsk && task_is_running(tsk) && !__kthread_should_park(tsk); -} - #ifdef CONFIG_TRACE_IRQFLAGS DEFINE_PER_CPU(int, hardirqs_enabled); DEFINE_PER_CPU(int, hardirq_context); @@ -236,7 +221,7 @@ void __local_bh_enable_ip(unsigned long ip, unsigned int cnt) goto out; pending = local_softirq_pending(); - if (!pending || ksoftirqd_running(pending)) + if (!pending) goto out; /* @@ -432,9 +417,6 @@ static inline bool should_wake_ksoftirqd(void) static inline void invoke_softirq(void) { - if (ksoftirqd_running(local_softirq_pending())) - return; - if (!force_irqthreads() || !__this_cpu_read(ksoftirqd)) { #ifdef CONFIG_HAVE_IRQ_EXIT_ON_IRQ_STACK /* @@ -468,7 +450,7 @@ asmlinkage __visible void do_softirq(void) pending = local_softirq_pending(); - if (pending && !ksoftirqd_running(pending)) + if (pending) do_softirq_own_stack(); local_irq_restore(flags); diff --git a/kernel/sys.c b/kernel/sys.c index 339fee3eff6a..05f838929e72 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -140,6 +140,12 @@ #ifndef GET_TAGGED_ADDR_CTRL # define GET_TAGGED_ADDR_CTRL() (-EINVAL) #endif +#ifndef RISCV_V_SET_CONTROL +# define RISCV_V_SET_CONTROL(a) (-EINVAL) +#endif +#ifndef RISCV_V_GET_CONTROL +# define RISCV_V_GET_CONTROL() (-EINVAL) +#endif /* * this is where the system-wide overflow UID and GID are defined, for @@ -2708,6 +2714,12 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, error = !!test_bit(MMF_VM_MERGE_ANY, &me->mm->flags); break; #endif + case PR_RISCV_V_SET_CONTROL: + error = RISCV_V_SET_CONTROL(arg2); + break; + case PR_RISCV_V_GET_CONTROL: + error = RISCV_V_GET_CONTROL(); + break; default: error = -EINVAL; break; diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index 860b2dcf3ac4..04bfb1e4d377 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c @@ -299,6 +299,7 @@ COND_SYSCALL(set_mempolicy); COND_SYSCALL(migrate_pages); COND_SYSCALL(move_pages); COND_SYSCALL(set_mempolicy_home_node); +COND_SYSCALL(cachestat); COND_SYSCALL(perf_event_open); COND_SYSCALL(accept4); diff --git a/kernel/sysctl.c b/kernel/sysctl.c index bfe53e835524..354a2d294f52 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -1783,11 +1783,6 @@ static struct ctl_table kern_table[] = 
{ .proc_handler = sysctl_max_threads, }, { - .procname = "usermodehelper", - .mode = 0555, - .child = usermodehelper_table, - }, - { .procname = "overflowuid", .data = &overflowuid, .maxlen = sizeof(int), @@ -1962,13 +1957,6 @@ static struct ctl_table kern_table[] = { .proc_handler = proc_dointvec, }, #endif -#ifdef CONFIG_KEYS - { - .procname = "keys", - .mode = 0555, - .child = key_sysctls, - }, -#endif #ifdef CONFIG_PERF_EVENTS /* * User-space scripts rely on the existence of this file @@ -2120,13 +2108,6 @@ static struct ctl_table vm_table[] = { }, #endif { - .procname = "lowmem_reserve_ratio", - .data = &sysctl_lowmem_reserve_ratio, - .maxlen = sizeof(sysctl_lowmem_reserve_ratio), - .mode = 0644, - .proc_handler = lowmem_reserve_ratio_sysctl_handler, - }, - { .procname = "drop_caches", .data = &sysctl_drop_caches, .maxlen = sizeof(int), @@ -2136,39 +2117,6 @@ static struct ctl_table vm_table[] = { .extra2 = SYSCTL_FOUR, }, { - .procname = "min_free_kbytes", - .data = &min_free_kbytes, - .maxlen = sizeof(min_free_kbytes), - .mode = 0644, - .proc_handler = min_free_kbytes_sysctl_handler, - .extra1 = SYSCTL_ZERO, - }, - { - .procname = "watermark_boost_factor", - .data = &watermark_boost_factor, - .maxlen = sizeof(watermark_boost_factor), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ZERO, - }, - { - .procname = "watermark_scale_factor", - .data = &watermark_scale_factor, - .maxlen = sizeof(watermark_scale_factor), - .mode = 0644, - .proc_handler = watermark_scale_factor_sysctl_handler, - .extra1 = SYSCTL_ONE, - .extra2 = SYSCTL_THREE_THOUSAND, - }, - { - .procname = "percpu_pagelist_high_fraction", - .data = &percpu_pagelist_high_fraction, - .maxlen = sizeof(percpu_pagelist_high_fraction), - .mode = 0644, - .proc_handler = percpu_pagelist_high_fraction_sysctl_handler, - .extra1 = SYSCTL_ZERO, - }, - { .procname = "page_lock_unfairness", .data = &sysctl_page_lock_unfairness, .maxlen = sizeof(sysctl_page_lock_unfairness), @@ -2223,24 +2171,6 @@ static struct ctl_table vm_table[] = { .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ZERO, }, - { - .procname = "min_unmapped_ratio", - .data = &sysctl_min_unmapped_ratio, - .maxlen = sizeof(sysctl_min_unmapped_ratio), - .mode = 0644, - .proc_handler = sysctl_min_unmapped_ratio_sysctl_handler, - .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_ONE_HUNDRED, - }, - { - .procname = "min_slab_ratio", - .data = &sysctl_min_slab_ratio, - .maxlen = sizeof(sysctl_min_slab_ratio), - .mode = 0644, - .proc_handler = sysctl_min_slab_ratio_sysctl_handler, - .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_ONE_HUNDRED, - }, #endif #ifdef CONFIG_SMP { @@ -2267,15 +2197,6 @@ static struct ctl_table vm_table[] = { .proc_handler = mmap_min_addr_handler, }, #endif -#ifdef CONFIG_NUMA - { - .procname = "numa_zonelist_order", - .data = &numa_zonelist_order, - .maxlen = NUMA_ZONELIST_ORDER_LEN, - .mode = 0644, - .proc_handler = numa_zonelist_order_handler, - }, -#endif #if (defined(CONFIG_X86_32) && !defined(CONFIG_UML))|| \ (defined(CONFIG_SUPERH) && defined(CONFIG_VSYSCALL)) { @@ -2331,34 +2252,10 @@ static struct ctl_table vm_table[] = { { } }; -static struct ctl_table debug_table[] = { -#ifdef CONFIG_SYSCTL_EXCEPTION_TRACE - { - .procname = "exception-trace", - .data = &show_unhandled_signals, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec - }, -#endif - { } -}; - -static struct ctl_table dev_table[] = { - { } -}; - -DECLARE_SYSCTL_BASE(kernel, kern_table); -DECLARE_SYSCTL_BASE(vm, vm_table); 
-DECLARE_SYSCTL_BASE(debug, debug_table); -DECLARE_SYSCTL_BASE(dev, dev_table); - int __init sysctl_init_bases(void) { - register_sysctl_base(kernel); - register_sysctl_base(vm); - register_sysctl_base(debug); - register_sysctl_base(dev); + register_sysctl_init("kernel", kern_table); + register_sysctl_init("vm", vm_table); return 0; } diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c index 82b28ab0f328..8d9f13d847f0 100644 --- a/kernel/time/alarmtimer.c +++ b/kernel/time/alarmtimer.c @@ -751,7 +751,7 @@ static int alarm_timer_create(struct k_itimer *new_timer) static enum alarmtimer_restart alarmtimer_nsleep_wakeup(struct alarm *alarm, ktime_t now) { - struct task_struct *task = (struct task_struct *)alarm->data; + struct task_struct *task = alarm->data; alarm->data = NULL; if (task) @@ -847,7 +847,7 @@ static int alarm_timer_nsleep(const clockid_t which_clock, int flags, struct restart_block *restart = ¤t->restart_block; struct alarm alarm; ktime_t exp; - int ret = 0; + int ret; if (!alarmtimer_get_rtcdev()) return -EOPNOTSUPP; diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index 91836b727cef..88cbc1181b23 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -1480,7 +1480,7 @@ static int __init boot_override_clocksource(char* str) { mutex_lock(&clocksource_mutex); if (str) - strlcpy(override_name, str, sizeof(override_name)); + strscpy(override_name, str, sizeof(override_name)); mutex_unlock(&clocksource_mutex); return 1; } diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index e8c08292defc..238262e4aba7 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -164,6 +164,7 @@ static inline bool is_migration_base(struct hrtimer_clock_base *base) static struct hrtimer_clock_base *lock_hrtimer_base(const struct hrtimer *timer, unsigned long *flags) + __acquires(&timer->base->lock) { struct hrtimer_clock_base *base; @@ -280,6 +281,7 @@ static inline bool is_migration_base(struct hrtimer_clock_base *base) static inline struct hrtimer_clock_base * lock_hrtimer_base(const struct hrtimer *timer, unsigned long *flags) + __acquires(&timer->base->cpu_base->lock) { struct hrtimer_clock_base *base = timer->base; @@ -1013,6 +1015,7 @@ void hrtimers_resume_local(void) */ static inline void unlock_hrtimer_base(const struct hrtimer *timer, unsigned long *flags) + __releases(&timer->base->cpu_base->lock) { raw_spin_unlock_irqrestore(&timer->base->cpu_base->lock, *flags); } diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c index 808a247205a9..b924f0f096fa 100644 --- a/kernel/time/posix-timers.c +++ b/kernel/time/posix-timers.c @@ -35,20 +35,17 @@ #include "timekeeping.h" #include "posix-timers.h" -/* - * Management arrays for POSIX timers. Timers are now kept in static hash table - * with 512 entries. - * Timer ids are allocated by local routine, which selects proper hash head by - * key, constructed from current->signal address and per signal struct counter. - * This keeps timer ids unique per process, but now they can intersect between - * processes. - */ +static struct kmem_cache *posix_timers_cache; /* - * Lets keep our timers in a slab cache :-) + * Timers are managed in a hash table for lockless lookup. The hash key is + * constructed from current::signal and the timer ID and the timer is + * matched against current::signal and the timer ID when walking the hash + * bucket list. + * + * This allows checkpoint/restore to reconstruct the exact timer IDs for + * a process. 
*/ -static struct kmem_cache *posix_timers_cache; - static DEFINE_HASHTABLE(posix_timers_hashtable, 9); static DEFINE_SPINLOCK(hash_lock); @@ -56,52 +53,12 @@ static const struct k_clock * const posix_clocks[]; static const struct k_clock *clockid_to_kclock(const clockid_t id); static const struct k_clock clock_realtime, clock_monotonic; -/* - * we assume that the new SIGEV_THREAD_ID shares no bits with the other - * SIGEV values. Here we put out an error if this assumption fails. - */ +/* SIGEV_THREAD_ID cannot share a bit with the other SIGEV values. */ #if SIGEV_THREAD_ID != (SIGEV_THREAD_ID & \ - ~(SIGEV_SIGNAL | SIGEV_NONE | SIGEV_THREAD)) + ~(SIGEV_SIGNAL | SIGEV_NONE | SIGEV_THREAD)) #error "SIGEV_THREAD_ID must not share bit with other SIGEV values!" #endif -/* - * The timer ID is turned into a timer address by idr_find(). - * Verifying a valid ID consists of: - * - * a) checking that idr_find() returns other than -1. - * b) checking that the timer id matches the one in the timer itself. - * c) that the timer owner is in the callers thread group. - */ - -/* - * CLOCKs: The POSIX standard calls for a couple of clocks and allows us - * to implement others. This structure defines the various - * clocks. - * - * RESOLUTION: Clock resolution is used to round up timer and interval - * times, NOT to report clock times, which are reported with as - * much resolution as the system can muster. In some cases this - * resolution may depend on the underlying clock hardware and - * may not be quantifiable until run time, and only then is the - * necessary code is written. The standard says we should say - * something about this issue in the documentation... - * - * FUNCTIONS: The CLOCKs structure defines possible functions to - * handle various clock functions. - * - * The standard POSIX timer management code assumes the - * following: 1.) The k_itimer struct (sched.h) is used for - * the timer. 2.) The list, it_lock, it_clock, it_id and - * it_pid fields are not modified by timer code. - * - * Permissions: It is assumed that the clock_settime() function defined - * for each clock will take care of permission checks. Some - * clocks may be set able by any user (i.e. local process - * clocks) others not. Currently the only set able clock we - * have is CLOCK_REALTIME and its high res counter part, both of - * which we beg off on and pass to do_sys_settimeofday(). - */ static struct k_itimer *__lock_timer(timer_t timer_id, unsigned long *flags); #define lock_timer(tid, flags) \ @@ -121,9 +78,9 @@ static struct k_itimer *__posix_timers_find(struct hlist_head *head, { struct k_itimer *timer; - hlist_for_each_entry_rcu(timer, head, t_hash, - lockdep_is_held(&hash_lock)) { - if ((timer->it_signal == sig) && (timer->it_id == id)) + hlist_for_each_entry_rcu(timer, head, t_hash, lockdep_is_held(&hash_lock)) { + /* timer->it_signal can be set concurrently */ + if ((READ_ONCE(timer->it_signal) == sig) && (timer->it_id == id)) return timer; } return NULL; @@ -140,25 +97,30 @@ static struct k_itimer *posix_timer_by_id(timer_t id) static int posix_timer_add(struct k_itimer *timer) { struct signal_struct *sig = current->signal; - int first_free_id = sig->posix_timer_id; struct hlist_head *head; - int ret = -ENOENT; + unsigned int cnt, id; - do { + /* + * FIXME: Replace this by a per signal struct xarray once there is + * a plan to handle the resulting CRIU regression gracefully. 
+ */ + for (cnt = 0; cnt <= INT_MAX; cnt++) { spin_lock(&hash_lock); - head = &posix_timers_hashtable[hash(sig, sig->posix_timer_id)]; - if (!__posix_timers_find(head, sig, sig->posix_timer_id)) { + id = sig->next_posix_timer_id; + + /* Write the next ID back. Clamp it to the positive space */ + sig->next_posix_timer_id = (id + 1) & INT_MAX; + + head = &posix_timers_hashtable[hash(sig, id)]; + if (!__posix_timers_find(head, sig, id)) { hlist_add_head_rcu(&timer->t_hash, head); - ret = sig->posix_timer_id; + spin_unlock(&hash_lock); + return id; } - if (++sig->posix_timer_id < 0) - sig->posix_timer_id = 0; - if ((sig->posix_timer_id == first_free_id) && (ret == -ENOENT)) - /* Loop over all possible ids completed */ - ret = -EAGAIN; spin_unlock(&hash_lock); - } while (ret == -ENOENT); - return ret; + } + /* POSIX return code when no timer ID could be allocated */ + return -EAGAIN; } static inline void unlock_timer(struct k_itimer *timr, unsigned long flags) @@ -166,7 +128,6 @@ static inline void unlock_timer(struct k_itimer *timr, unsigned long flags) spin_unlock_irqrestore(&timr->it_lock, flags); } -/* Get clock_realtime */ static int posix_get_realtime_timespec(clockid_t which_clock, struct timespec64 *tp) { ktime_get_real_ts64(tp); @@ -178,7 +139,6 @@ static ktime_t posix_get_realtime_ktime(clockid_t which_clock) return ktime_get_real(); } -/* Set clock_realtime */ static int posix_clock_realtime_set(const clockid_t which_clock, const struct timespec64 *tp) { @@ -191,9 +151,6 @@ static int posix_clock_realtime_adj(const clockid_t which_clock, return do_adjtimex(t); } -/* - * Get monotonic time for posix timers - */ static int posix_get_monotonic_timespec(clockid_t which_clock, struct timespec64 *tp) { ktime_get_ts64(tp); @@ -206,9 +163,6 @@ static ktime_t posix_get_monotonic_ktime(clockid_t which_clock) return ktime_get(); } -/* - * Get monotonic-raw time for posix timers - */ static int posix_get_monotonic_raw(clockid_t which_clock, struct timespec64 *tp) { ktime_get_raw_ts64(tp); @@ -216,7 +170,6 @@ static int posix_get_monotonic_raw(clockid_t which_clock, struct timespec64 *tp) return 0; } - static int posix_get_realtime_coarse(clockid_t which_clock, struct timespec64 *tp) { ktime_get_coarse_real_ts64(tp); @@ -267,9 +220,6 @@ static int posix_get_hrtimer_res(clockid_t which_clock, struct timespec64 *tp) return 0; } -/* - * Initialize everything, well, just everything in Posix clocks/timers ;) - */ static __init int init_posix_timers(void) { posix_timers_cache = kmem_cache_create("posix_timers_cache", @@ -300,15 +250,9 @@ static void common_hrtimer_rearm(struct k_itimer *timr) } /* - * This function is exported for use by the signal deliver code. It is - * called just prior to the info block being released and passes that - * block to us. It's function is to update the overrun entry AND to - * restart the timer. It should only be called if the timer is to be - * restarted (i.e. we have flagged this in the sys_private entry of the - * info block). - * - * To protect against the timer going away while the interrupt is queued, - * we require that the it_requeue_pending flag be set. + * This function is called from the signal delivery code if + * info->si_sys_private is not zero, which indicates that the timer has to + * be rearmed. Restart the timer and update info::si_overrun. */ void posixtimer_rearm(struct kernel_siginfo *info) { @@ -357,18 +301,18 @@ int posix_timer_event(struct k_itimer *timr, int si_private) } /* - * This function gets called when a POSIX.1b interval timer expires. 
It - * is used as a callback from the kernel internal timer. The - * run_timer_list code ALWAYS calls with interrupts on. - - * This code is for CLOCK_REALTIME* and CLOCK_MONOTONIC* timers. + * This function gets called when a POSIX.1b interval timer expires from + * the HRTIMER interrupt (soft interrupt on RT kernels). + * + * Handles CLOCK_REALTIME, CLOCK_MONOTONIC, CLOCK_BOOTTIME and CLOCK_TAI + * based timers. */ static enum hrtimer_restart posix_timer_fn(struct hrtimer *timer) { + enum hrtimer_restart ret = HRTIMER_NORESTART; struct k_itimer *timr; unsigned long flags; int si_private = 0; - enum hrtimer_restart ret = HRTIMER_NORESTART; timr = container_of(timer, struct k_itimer, it.real.timer); spin_lock_irqsave(&timr->it_lock, flags); @@ -379,9 +323,10 @@ static enum hrtimer_restart posix_timer_fn(struct hrtimer *timer) if (posix_timer_event(timr, si_private)) { /* - * signal was not sent because of sig_ignor - * we will not get a call back to restart it AND - * it should be restarted. + * The signal was not queued due to SIG_IGN. As a + * consequence the timer is not going to be rearmed from + * the signal delivery path. But as a real signal handler + * can be installed later the timer must be rearmed here. */ if (timr->it_interval != 0) { ktime_t now = hrtimer_cb_get_time(timer); @@ -390,34 +335,35 @@ static enum hrtimer_restart posix_timer_fn(struct hrtimer *timer) * FIXME: What we really want, is to stop this * timer completely and restart it in case the * SIG_IGN is removed. This is a non trivial - * change which involves sighand locking - * (sigh !), which we don't want to do late in - * the release cycle. + * change to the signal handling code. + * + * For now let timers with an interval less than a + * jiffie expire every jiffie and recheck for a + * valid signal handler. + * + * This avoids interrupt starvation in case of a + * very small interval, which would expire the + * timer immediately again. + * + * Moving now ahead of time by one jiffie tricks + * hrtimer_forward() to expire the timer later, + * while it still maintains the overrun accuracy + * for the price of a slight inconsistency in the + * timer_gettime() case. This is at least better + * than a timer storm. * - * For now we just let timers with an interval - * less than a jiffie expire every jiffie to - * avoid softirq starvation in case of SIG_IGN - * and a very small interval, which would put - * the timer right back on the softirq pending - * list. By moving now ahead of time we trick - * hrtimer_forward() to expire the timer - * later, while we still maintain the overrun - * accuracy, but have some inconsistency in - * the timer_gettime() case. This is at least - * better than a starved softirq. A more - * complex fix which solves also another related - * inconsistency is already in the pipeline. + * Only required when high resolution timers are + * enabled as the periodic tick based timers are + * automatically aligned to the next tick. 
*/ -#ifdef CONFIG_HIGH_RES_TIMERS - { - ktime_t kj = NSEC_PER_SEC / HZ; + if (IS_ENABLED(CONFIG_HIGH_RES_TIMERS)) { + ktime_t kj = TICK_NSEC; if (timr->it_interval < kj) now = ktime_add(now, kj); } -#endif - timr->it_overrun += hrtimer_forward(timer, now, - timr->it_interval); + + timr->it_overrun += hrtimer_forward(timer, now, timr->it_interval); ret = HRTIMER_RESTART; ++timr->it_requeue_pending; timr->it_active = 1; @@ -454,8 +400,8 @@ static struct pid *good_sigevent(sigevent_t * event) static struct k_itimer * alloc_posix_timer(void) { - struct k_itimer *tmr; - tmr = kmem_cache_zalloc(posix_timers_cache, GFP_KERNEL); + struct k_itimer *tmr = kmem_cache_zalloc(posix_timers_cache, GFP_KERNEL); + if (!tmr) return tmr; if (unlikely(!(tmr->sigq = sigqueue_alloc()))) { @@ -473,21 +419,21 @@ static void k_itimer_rcu_free(struct rcu_head *head) kmem_cache_free(posix_timers_cache, tmr); } -#define IT_ID_SET 1 -#define IT_ID_NOT_SET 0 -static void release_posix_timer(struct k_itimer *tmr, int it_id_set) +static void posix_timer_free(struct k_itimer *tmr) { - if (it_id_set) { - unsigned long flags; - spin_lock_irqsave(&hash_lock, flags); - hlist_del_rcu(&tmr->t_hash); - spin_unlock_irqrestore(&hash_lock, flags); - } put_pid(tmr->it_pid); sigqueue_free(tmr->sigq); call_rcu(&tmr->rcu, k_itimer_rcu_free); } +static void posix_timer_unhash_and_free(struct k_itimer *tmr) +{ + spin_lock(&hash_lock); + hlist_del_rcu(&tmr->t_hash); + spin_unlock(&hash_lock); + posix_timer_free(tmr); +} + static int common_timer_create(struct k_itimer *new_timer) { hrtimer_init(&new_timer->it.real.timer, new_timer->it_clock, 0); @@ -501,7 +447,6 @@ static int do_timer_create(clockid_t which_clock, struct sigevent *event, const struct k_clock *kc = clockid_to_kclock(which_clock); struct k_itimer *new_timer; int error, new_timer_id; - int it_id_set = IT_ID_NOT_SET; if (!kc) return -EINVAL; @@ -513,13 +458,18 @@ static int do_timer_create(clockid_t which_clock, struct sigevent *event, return -EAGAIN; spin_lock_init(&new_timer->it_lock); + + /* + * Add the timer to the hash table. The timer is not yet valid + * because new_timer::it_signal is still NULL. The timer id is also + * not yet visible to user space. + */ new_timer_id = posix_timer_add(new_timer); if (new_timer_id < 0) { - error = new_timer_id; - goto out; + posix_timer_free(new_timer); + return new_timer_id; } - it_id_set = IT_ID_SET; new_timer->it_id = (timer_t) new_timer_id; new_timer->it_clock = which_clock; new_timer->kclock = kc; @@ -547,30 +497,33 @@ static int do_timer_create(clockid_t which_clock, struct sigevent *event, new_timer->sigq->info.si_tid = new_timer->it_id; new_timer->sigq->info.si_code = SI_TIMER; - if (copy_to_user(created_timer_id, - &new_timer_id, sizeof (new_timer_id))) { + if (copy_to_user(created_timer_id, &new_timer_id, sizeof (new_timer_id))) { error = -EFAULT; goto out; } - + /* + * After succesful copy out, the timer ID is visible to user space + * now but not yet valid because new_timer::signal is still NULL. + * + * Complete the initialization with the clock specific create + * callback. 
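[Editorial aside, not part of the patch: the creation path below publishes new_timer::it_signal only after everything else is set up, and __lock_timer() later revalidates it under it_lock after the lockless hash walk. A minimal user-space analogue of that publish-then-revalidate pattern, with entirely hypothetical names:

#include <pthread.h>
#include <stdatomic.h>
#include <stddef.h>

struct item {
	pthread_mutex_t lock;
	_Atomic(void *) owner;	/* NULL until fully set up, NULL again on teardown */
};

/* Lockless pre-check, then revalidate under the per-item lock. On success the
 * caller holds it->lock; a teardown path which clears ->owner under the same
 * lock cannot race past this point. */
static struct item *lookup_validated(struct item *it, void *me)
{
	if (atomic_load_explicit(&it->owner, memory_order_acquire) != me)
		return NULL;

	pthread_mutex_lock(&it->lock);
	if (atomic_load_explicit(&it->owner, memory_order_relaxed) == me)
		return it;
	pthread_mutex_unlock(&it->lock);
	return NULL;
}
]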
+ */ error = kc->timer_create(new_timer); if (error) goto out; spin_lock_irq(¤t->sighand->siglock); - new_timer->it_signal = current->signal; + /* This makes the timer valid in the hash table */ + WRITE_ONCE(new_timer->it_signal, current->signal); list_add(&new_timer->list, ¤t->signal->posix_timers); spin_unlock_irq(¤t->sighand->siglock); - - return 0; /* - * In the case of the timer belonging to another task, after - * the task is unlocked, the timer is owned by the other task - * and may cease to exist at any time. Don't use or modify - * new_timer after the unlock call. + * After unlocking sighand::siglock @new_timer is subject to + * concurrent removal and cannot be touched anymore */ + return 0; out: - release_posix_timer(new_timer, it_id_set); + posix_timer_unhash_and_free(new_timer); return error; } @@ -604,13 +557,6 @@ COMPAT_SYSCALL_DEFINE3(timer_create, clockid_t, which_clock, } #endif -/* - * Locking issues: We need to protect the result of the id look up until - * we get the timer locked down so it is not deleted under us. The - * removal is done under the idr spinlock so we use that here to bridge - * the find to the timer lock. To avoid a dead lock, the timer id MUST - * be release with out holding the timer lock. - */ static struct k_itimer *__lock_timer(timer_t timer_id, unsigned long *flags) { struct k_itimer *timr; @@ -622,10 +568,35 @@ static struct k_itimer *__lock_timer(timer_t timer_id, unsigned long *flags) if ((unsigned long long)timer_id > INT_MAX) return NULL; + /* + * The hash lookup and the timers are RCU protected. + * + * Timers are added to the hash in invalid state where + * timr::it_signal == NULL. timer::it_signal is only set after the + * rest of the initialization succeeded. + * + * Timer destruction happens in steps: + * 1) Set timr::it_signal to NULL with timr::it_lock held + * 2) Release timr::it_lock + * 3) Remove from the hash under hash_lock + * 4) Call RCU for removal after the grace period + * + * Holding rcu_read_lock() accross the lookup ensures that + * the timer cannot be freed. + * + * The lookup validates locklessly that timr::it_signal == + * current::it_signal and timr::it_id == @timer_id. timr::it_id + * can't change, but timr::it_signal becomes NULL during + * destruction. + */ rcu_read_lock(); timr = posix_timer_by_id(timer_id); if (timr) { spin_lock_irqsave(&timr->it_lock, *flags); + /* + * Validate under timr::it_lock that timr::it_signal is + * still valid. Pairs with #1 above. + */ if (timr->it_signal == current->signal) { rcu_read_unlock(); return timr; @@ -652,20 +623,16 @@ static s64 common_hrtimer_forward(struct k_itimer *timr, ktime_t now) } /* - * Get the time remaining on a POSIX.1b interval timer. This function - * is ALWAYS called with spin_lock_irq on the timer, thus it must not - * mess with irq. + * Get the time remaining on a POSIX.1b interval timer. * - * We have a couple of messes to clean up here. First there is the case - * of a timer that has a requeue pending. These timers should appear to - * be in the timer list with an expiry as if we were to requeue them - * now. + * Two issues to handle here: * - * The second issue is the SIGEV_NONE timer which may be active but is - * not really ever put in the timer list (to save system resources). - * This timer may be expired, and if so, we will do it here. Otherwise - * it is the same as a requeue pending timer WRT to what we should - * report. + * 1) The timer has a requeue pending. The return value must appear as + * if the timer has been requeued right now. 
+ * + * 2) The timer is a SIGEV_NONE timer. These timers are never enqueued + * into the hrtimer queue and therefore never expired. Emulate expiry + * here taking #1 into account. */ void common_timer_get(struct k_itimer *timr, struct itimerspec64 *cur_setting) { @@ -681,8 +648,12 @@ void common_timer_get(struct k_itimer *timr, struct itimerspec64 *cur_setting) cur_setting->it_interval = ktime_to_timespec64(iv); } else if (!timr->it_active) { /* - * SIGEV_NONE oneshot timers are never queued. Check them - * below. + * SIGEV_NONE oneshot timers are never queued and therefore + * timr->it_active is always false. The check below + * vs. remaining time will handle this case. + * + * For all other timers there is nothing to update here, so + * return. */ if (!sig_none) return; @@ -691,18 +662,29 @@ void common_timer_get(struct k_itimer *timr, struct itimerspec64 *cur_setting) now = kc->clock_get_ktime(timr->it_clock); /* - * When a requeue is pending or this is a SIGEV_NONE timer move the - * expiry time forward by intervals, so expiry is > now. + * If this is an interval timer and either has requeue pending or + * is a SIGEV_NONE timer move the expiry time forward by intervals, + * so expiry is > now. */ if (iv && (timr->it_requeue_pending & REQUEUE_PENDING || sig_none)) timr->it_overrun += kc->timer_forward(timr, now); remaining = kc->timer_remaining(timr, now); - /* Return 0 only, when the timer is expired and not pending */ + /* + * As @now is retrieved before a possible timer_forward() and + * cannot be reevaluated by the compiler @remaining is based on the + * same @now value. Therefore @remaining is consistent vs. @now. + * + * Consequently all interval timers, i.e. @iv > 0, cannot have a + * remaining time <= 0 because timer_forward() guarantees to move + * them forward so that the next timer expiry is > @now. + */ if (remaining <= 0) { /* - * A single shot SIGEV_NONE timer must return 0, when - * it is expired ! + * A single shot SIGEV_NONE timer must return 0, when it is + * expired! Timers which have a real signal delivery mode + * must return a remaining time greater than 0 because the + * signal has not yet been delivered. */ if (!sig_none) cur_setting->it_value.tv_nsec = 1; @@ -711,11 +693,10 @@ void common_timer_get(struct k_itimer *timr, struct itimerspec64 *cur_setting) } } -/* Get the time remaining on a POSIX.1b interval timer. */ static int do_timer_gettime(timer_t timer_id, struct itimerspec64 *setting) { - struct k_itimer *timr; const struct k_clock *kc; + struct k_itimer *timr; unsigned long flags; int ret = 0; @@ -765,20 +746,29 @@ SYSCALL_DEFINE2(timer_gettime32, timer_t, timer_id, #endif -/* - * Get the number of overruns of a POSIX.1b interval timer. This is to - * be the overrun of the timer last delivered. At the same time we are - * accumulating overruns on the next timer. The overrun is frozen when - * the signal is delivered, either at the notify time (if the info block - * is not queued) or at the actual delivery time (as we are informed by - * the call back to posixtimer_rearm(). So all we need to do is - * to pick up the frozen overrun. +/** + * sys_timer_getoverrun - Get the number of overruns of a POSIX.1b interval timer + * @timer_id: The timer ID which identifies the timer + * + * The "overrun count" of a timer is one plus the number of expiration + * intervals which have elapsed between the first expiry, which queues the + * signal and the actual signal delivery. 
On signal delivery the "overrun + * count" is calculated and cached, so it can be returned directly here. + * + * As this is relative to the last queued signal the returned overrun count + * is meaningless outside of the signal delivery path and even there it + * does not accurately reflect the current state when user space evaluates + * it. + * + * Returns: + * -EINVAL @timer_id is invalid + * 1..INT_MAX The number of overruns related to the last delivered signal */ SYSCALL_DEFINE1(timer_getoverrun, timer_t, timer_id) { struct k_itimer *timr; - int overrun; unsigned long flags; + int overrun; timr = lock_timer(timer_id, &flags); if (!timr) @@ -831,10 +821,18 @@ static void common_timer_wait_running(struct k_itimer *timer) } /* - * On PREEMPT_RT this prevent priority inversion against softirq kthread in - * case it gets preempted while executing a timer callback. See comments in - * hrtimer_cancel_wait_running. For PREEMPT_RT=n this just results in a - * cpu_relax(). + * On PREEMPT_RT this prevents priority inversion and a potential livelock + * against the ksoftirqd thread in case that ksoftirqd gets preempted while + * executing a hrtimer callback. + * + * See the comments in hrtimer_cancel_wait_running(). For PREEMPT_RT=n this + * just results in a cpu_relax(). + * + * For POSIX CPU timers with CONFIG_POSIX_CPU_TIMERS_TASK_WORK=n this is + * just a cpu_relax(). With CONFIG_POSIX_CPU_TIMERS_TASK_WORK=y this + * prevents spinning on an eventually scheduled out task and a livelock + * when the task which tries to delete or disarm the timer has preempted + * the task which runs the expiry in task work context. */ static struct k_itimer *timer_wait_running(struct k_itimer *timer, unsigned long *flags) @@ -943,8 +941,7 @@ SYSCALL_DEFINE4(timer_settime, timer_t, timer_id, int, flags, const struct __kernel_itimerspec __user *, new_setting, struct __kernel_itimerspec __user *, old_setting) { - struct itimerspec64 new_spec, old_spec; - struct itimerspec64 *rtn = old_setting ? &old_spec : NULL; + struct itimerspec64 new_spec, old_spec, *rtn; int error = 0; if (!new_setting) @@ -953,6 +950,7 @@ SYSCALL_DEFINE4(timer_settime, timer_t, timer_id, int, flags, if (get_itimerspec64(&new_spec, new_setting)) return -EFAULT; + rtn = old_setting ? &old_spec : NULL; error = do_timer_settime(timer_id, flags, &new_spec, rtn); if (!error && old_setting) { if (put_itimerspec64(&old_spec, old_setting)) @@ -1026,38 +1024,71 @@ retry_delete: list_del(&timer->list); spin_unlock(¤t->sighand->siglock); /* - * This keeps any tasks waiting on the spin lock from thinking - * they got something (see the lock code above). + * A concurrent lookup could check timer::it_signal lockless. It + * will reevaluate with timer::it_lock held and observe the NULL. */ - timer->it_signal = NULL; + WRITE_ONCE(timer->it_signal, NULL); unlock_timer(timer, flags); - release_posix_timer(timer, IT_ID_SET); + posix_timer_unhash_and_free(timer); return 0; } /* - * return timer owned by the process, used by exit_itimers + * Delete a timer if it is armed, remove it from the hash and schedule it + * for RCU freeing. */ static void itimer_delete(struct k_itimer *timer) { -retry_delete: - spin_lock_irq(&timer->it_lock); + unsigned long flags; + /* + * irqsave is required to make timer_wait_running() work. + */ + spin_lock_irqsave(&timer->it_lock, flags); + +retry_delete: + /* + * Even if the timer is not longer accessible from other tasks + * it still might be armed and queued in the underlying timer + * mechanism. 
Worse, that timer mechanism might run the expiry + * function concurrently. + */ if (timer_delete_hook(timer) == TIMER_RETRY) { - spin_unlock_irq(&timer->it_lock); + /* + * Timer is expired concurrently, prevent livelocks + * and pointless spinning on RT. + * + * timer_wait_running() drops timer::it_lock, which opens + * the possibility for another task to delete the timer. + * + * That's not possible here because this is invoked from + * do_exit() only for the last thread of the thread group. + * So no other task can access and delete that timer. + */ + if (WARN_ON_ONCE(timer_wait_running(timer, &flags) != timer)) + return; + goto retry_delete; } list_del(&timer->list); - spin_unlock_irq(&timer->it_lock); - release_posix_timer(timer, IT_ID_SET); + /* + * Setting timer::it_signal to NULL is technically not required + * here as nothing can access the timer anymore legitimately via + * the hash table. Set it to NULL nevertheless so that all deletion + * paths are consistent. + */ + WRITE_ONCE(timer->it_signal, NULL); + + spin_unlock_irqrestore(&timer->it_lock, flags); + posix_timer_unhash_and_free(timer); } /* - * This is called by do_exit or de_thread, only when nobody else can - * modify the signal->posix_timers list. Yet we need sighand->siglock - * to prevent the race with /proc/pid/timers. + * Invoked from do_exit() when the last thread of a thread group exits. + * At that point no other task can access the timers of the dying + * task anymore. */ void exit_itimers(struct task_struct *tsk) { @@ -1067,10 +1098,12 @@ void exit_itimers(struct task_struct *tsk) if (list_empty(&tsk->signal->posix_timers)) return; + /* Protect against concurrent read via /proc/$PID/timers */ spin_lock_irq(&tsk->sighand->siglock); list_replace_init(&tsk->signal->posix_timers, &timers); spin_unlock_irq(&tsk->sighand->siglock); + /* The timers are not longer accessible via tsk::signal */ while (!list_empty(&timers)) { tmr = list_first_entry(&timers, struct k_itimer, list); itimer_delete(tmr); @@ -1089,6 +1122,10 @@ SYSCALL_DEFINE2(clock_settime, const clockid_t, which_clock, if (get_timespec64(&new_tp, tp)) return -EFAULT; + /* + * Permission checks have to be done inside the clock specific + * setter callback. + */ return kc->clock_set(which_clock, &new_tp); } @@ -1139,6 +1176,79 @@ SYSCALL_DEFINE2(clock_adjtime, const clockid_t, which_clock, return err; } +/** + * sys_clock_getres - Get the resolution of a clock + * @which_clock: The clock to get the resolution for + * @tp: Pointer to a a user space timespec64 for storage + * + * POSIX defines: + * + * "The clock_getres() function shall return the resolution of any + * clock. Clock resolutions are implementation-defined and cannot be set by + * a process. If the argument res is not NULL, the resolution of the + * specified clock shall be stored in the location pointed to by res. If + * res is NULL, the clock resolution is not returned. If the time argument + * of clock_settime() is not a multiple of res, then the value is truncated + * to a multiple of res." + * + * Due to the various hardware constraints the real resolution can vary + * wildly and even change during runtime when the underlying devices are + * replaced. The kernel also can use hardware devices with different + * resolutions for reading the time and for arming timers. 
+ * + * The kernel therefore deviates from the POSIX spec in various aspects: + * + * 1) The resolution returned to user space + * + * For CLOCK_REALTIME, CLOCK_MONOTONIC, CLOCK_BOOTTIME, CLOCK_TAI, + * CLOCK_REALTIME_ALARM, CLOCK_BOOTTIME_ALAREM and CLOCK_MONOTONIC_RAW + * the kernel differentiates only two cases: + * + * I) Low resolution mode: + * + * When high resolution timers are disabled at compile or runtime + * the resolution returned is nanoseconds per tick, which represents + * the precision at which timers expire. + * + * II) High resolution mode: + * + * When high resolution timers are enabled the resolution returned + * is always one nanosecond independent of the actual resolution of + * the underlying hardware devices. + * + * For CLOCK_*_ALARM the actual resolution depends on system + * state. When system is running the resolution is the same as the + * resolution of the other clocks. During suspend the actual + * resolution is the resolution of the underlying RTC device which + * might be way less precise than the clockevent device used during + * running state. + * + * For CLOCK_REALTIME_COARSE and CLOCK_MONOTONIC_COARSE the resolution + * returned is always nanoseconds per tick. + * + * For CLOCK_PROCESS_CPUTIME and CLOCK_THREAD_CPUTIME the resolution + * returned is always one nanosecond under the assumption that the + * underlying scheduler clock has a better resolution than nanoseconds + * per tick. + * + * For dynamic POSIX clocks (PTP devices) the resolution returned is + * always one nanosecond. + * + * 2) Affect on sys_clock_settime() + * + * The kernel does not truncate the time which is handed in to + * sys_clock_settime(). The kernel internal timekeeping is always using + * nanoseconds precision independent of the clocksource device which is + * used to read the time from. The resolution of that device only + * affects the presicion of the time returned by sys_clock_gettime(). + * + * Returns: + * 0 Success. @tp contains the resolution + * -EINVAL @which_clock is not a valid clock ID + * -EFAULT Copying the resolution to @tp faulted + * -ENODEV Dynamic POSIX clock is not backed by a device + * -EOPNOTSUPP Dynamic POSIX clock does not support getres() + */ SYSCALL_DEFINE2(clock_getres, const clockid_t, which_clock, struct __kernel_timespec __user *, tp) { @@ -1230,7 +1340,7 @@ SYSCALL_DEFINE2(clock_getres_time32, clockid_t, which_clock, #endif /* - * nanosleep for monotonic and realtime clocks + * sys_clock_nanosleep() for CLOCK_REALTIME and CLOCK_TAI */ static int common_nsleep(const clockid_t which_clock, int flags, const struct timespec64 *rqtp) @@ -1242,8 +1352,13 @@ static int common_nsleep(const clockid_t which_clock, int flags, which_clock); } +/* + * sys_clock_nanosleep() for CLOCK_MONOTONIC and CLOCK_BOOTTIME + * + * Absolute nanosleeps for these clocks are time-namespace adjusted. 
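[Editorial aside, not part of the patch: the behaviour documented in the sys_clock_getres() comment above can be observed from user space. On a kernel with high resolution timers enabled, the sketch below is expected to report 1 ns for CLOCK_MONOTONIC and nanoseconds-per-tick for the coarse clock:

#define _GNU_SOURCE
#include <stdio.h>
#include <time.h>

static void show(const char *name, clockid_t id)
{
	struct timespec res;

	if (clock_getres(id, &res) == 0)
		printf("%-24s %ld.%09ld s\n", name,
		       (long)res.tv_sec, (long)res.tv_nsec);
}

int main(void)
{
	show("CLOCK_MONOTONIC", CLOCK_MONOTONIC);		/* 1 ns in high-res mode */
	show("CLOCK_MONOTONIC_COARSE", CLOCK_MONOTONIC_COARSE);	/* nanoseconds per tick */
	show("CLOCK_REALTIME", CLOCK_REALTIME);
	return 0;
}
]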
+ */ static int common_nsleep_timens(const clockid_t which_clock, int flags, - const struct timespec64 *rqtp) + const struct timespec64 *rqtp) { ktime_t texp = timespec64_to_ktime(*rqtp); diff --git a/kernel/time/sched_clock.c b/kernel/time/sched_clock.c index 8464c5acc913..68d6c1190ac7 100644 --- a/kernel/time/sched_clock.c +++ b/kernel/time/sched_clock.c @@ -64,7 +64,7 @@ static struct clock_data cd ____cacheline_aligned = { .actual_read_sched_clock = jiffy_sched_clock_read, }; -static inline u64 notrace cyc_to_ns(u64 cyc, u32 mult, u32 shift) +static __always_inline u64 cyc_to_ns(u64 cyc, u32 mult, u32 shift) { return (cyc * mult) >> shift; } @@ -77,26 +77,36 @@ notrace struct clock_read_data *sched_clock_read_begin(unsigned int *seq) notrace int sched_clock_read_retry(unsigned int seq) { - return read_seqcount_latch_retry(&cd.seq, seq); + return raw_read_seqcount_latch_retry(&cd.seq, seq); } -unsigned long long notrace sched_clock(void) +unsigned long long noinstr sched_clock_noinstr(void) { - u64 cyc, res; - unsigned int seq; struct clock_read_data *rd; + unsigned int seq; + u64 cyc, res; do { - rd = sched_clock_read_begin(&seq); + seq = raw_read_seqcount_latch(&cd.seq); + rd = cd.read_data + (seq & 1); cyc = (rd->read_sched_clock() - rd->epoch_cyc) & rd->sched_clock_mask; res = rd->epoch_ns + cyc_to_ns(cyc, rd->mult, rd->shift); - } while (sched_clock_read_retry(seq)); + } while (raw_read_seqcount_latch_retry(&cd.seq, seq)); return res; } +unsigned long long notrace sched_clock(void) +{ + unsigned long long ns; + preempt_disable_notrace(); + ns = sched_clock_noinstr(); + preempt_enable_notrace(); + return ns; +} + /* * Updating the data required to read the clock. * diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c index 93bf2b4e47e5..771d1e040303 100644 --- a/kernel/time/tick-broadcast.c +++ b/kernel/time/tick-broadcast.c @@ -35,14 +35,15 @@ static __cacheline_aligned_in_smp DEFINE_RAW_SPINLOCK(tick_broadcast_lock); #ifdef CONFIG_TICK_ONESHOT static DEFINE_PER_CPU(struct clock_event_device *, tick_oneshot_wakeup_device); -static void tick_broadcast_setup_oneshot(struct clock_event_device *bc); +static void tick_broadcast_setup_oneshot(struct clock_event_device *bc, bool from_periodic); static void tick_broadcast_clear_oneshot(int cpu); static void tick_resume_broadcast_oneshot(struct clock_event_device *bc); # ifdef CONFIG_HOTPLUG_CPU static void tick_broadcast_oneshot_offline(unsigned int cpu); # endif #else -static inline void tick_broadcast_setup_oneshot(struct clock_event_device *bc) { BUG(); } +static inline void +tick_broadcast_setup_oneshot(struct clock_event_device *bc, bool from_periodic) { BUG(); } static inline void tick_broadcast_clear_oneshot(int cpu) { } static inline void tick_resume_broadcast_oneshot(struct clock_event_device *bc) { } # ifdef CONFIG_HOTPLUG_CPU @@ -264,7 +265,7 @@ int tick_device_uses_broadcast(struct clock_event_device *dev, int cpu) if (tick_broadcast_device.mode == TICKDEV_MODE_PERIODIC) tick_broadcast_start_periodic(bc); else - tick_broadcast_setup_oneshot(bc); + tick_broadcast_setup_oneshot(bc, false); ret = 1; } else { /* @@ -500,7 +501,7 @@ void tick_broadcast_control(enum tick_broadcast_mode mode) if (tick_broadcast_device.mode == TICKDEV_MODE_PERIODIC) tick_broadcast_start_periodic(bc); else - tick_broadcast_setup_oneshot(bc); + tick_broadcast_setup_oneshot(bc, false); } } out: @@ -1020,48 +1021,101 @@ static inline ktime_t tick_get_next_period(void) /** * tick_broadcast_setup_oneshot - setup the broadcast device */ 
-static void tick_broadcast_setup_oneshot(struct clock_event_device *bc) +static void tick_broadcast_setup_oneshot(struct clock_event_device *bc, + bool from_periodic) { int cpu = smp_processor_id(); + ktime_t nexttick = 0; if (!bc) return; - /* Set it up only once ! */ - if (bc->event_handler != tick_handle_oneshot_broadcast) { - int was_periodic = clockevent_state_periodic(bc); - - bc->event_handler = tick_handle_oneshot_broadcast; - + /* + * When the broadcast device was switched to oneshot by the first + * CPU handling the NOHZ change, the other CPUs will reach this + * code via hrtimer_run_queues() -> tick_check_oneshot_change() + * too. Set up the broadcast device only once! + */ + if (bc->event_handler == tick_handle_oneshot_broadcast) { /* - * We must be careful here. There might be other CPUs - * waiting for periodic broadcast. We need to set the - * oneshot_mask bits for those and program the - * broadcast device to fire. + * The CPU which switched from periodic to oneshot mode + * set the broadcast oneshot bit for all other CPUs which + * are in the general (periodic) broadcast mask to ensure + * that CPUs which wait for the periodic broadcast are + * woken up. + * + * Clear the bit for the local CPU as the set bit would + * prevent the first tick_broadcast_enter() after this CPU + * switched to oneshot state to program the broadcast + * device. + * + * This code can also be reached via tick_broadcast_control(), + * but this cannot avoid the tick_broadcast_clear_oneshot() + * as that would break the periodic to oneshot transition of + * secondary CPUs. But that's harmless as the below only + * clears already cleared bits. */ + tick_broadcast_clear_oneshot(cpu); + return; + } + + + bc->event_handler = tick_handle_oneshot_broadcast; + bc->next_event = KTIME_MAX; + + /* + * When the tick mode is switched from periodic to oneshot it must + * be ensured that CPUs which are waiting for periodic broadcast + * get their wake-up at the next tick. This is achieved by ORing + * tick_broadcast_mask into tick_broadcast_oneshot_mask. + * + * For other callers, e.g. broadcast device replacement, + * tick_broadcast_oneshot_mask must not be touched as this would + * set bits for CPUs which are already NOHZ, but not idle. Their + * next tick_broadcast_enter() would observe the bit set and fail + * to update the expiry time and the broadcast event device. + */ + if (from_periodic) { cpumask_copy(tmpmask, tick_broadcast_mask); + /* Remove the local CPU as it is obviously not idle */ cpumask_clear_cpu(cpu, tmpmask); - cpumask_or(tick_broadcast_oneshot_mask, - tick_broadcast_oneshot_mask, tmpmask); + cpumask_or(tick_broadcast_oneshot_mask, tick_broadcast_oneshot_mask, tmpmask); - if (was_periodic && !cpumask_empty(tmpmask)) { - ktime_t nextevt = tick_get_next_period(); + /* + * Ensure that the oneshot broadcast handler will wake the + * CPUs which are still waiting for periodic broadcast. + */ + nexttick = tick_get_next_period(); + tick_broadcast_init_next_event(tmpmask, nexttick); - clockevents_switch_state(bc, CLOCK_EVT_STATE_ONESHOT); - tick_broadcast_init_next_event(tmpmask, nextevt); - tick_broadcast_set_event(bc, cpu, nextevt); - } else - bc->next_event = KTIME_MAX; - } else { /* - * The first cpu which switches to oneshot mode sets - * the bit for all other cpus which are in the general - * (periodic) broadcast mask. So the bit is set and - * would prevent the first broadcast enter after this - * to program the bc device. 
+ * If the underlying broadcast clock event device is + * already in oneshot state, then there is nothing to do. + * The device was already armed for the next tick + * in tick_handle_broadcast_periodic() */ - tick_broadcast_clear_oneshot(cpu); + if (clockevent_state_oneshot(bc)) + return; } + + /* + * When switching from periodic to oneshot mode arm the broadcast + * device for the next tick. + * + * If the broadcast device has been replaced in oneshot mode and + * the oneshot broadcast mask is not empty, then arm it to expire + * immediately in order to reevaluate the next expiring timer. + * @nexttick is 0 and therefore in the past which will cause the + * clockevent code to force an event. + * + * For both cases the programming can be avoided when the oneshot + * broadcast mask is empty. + * + * tick_broadcast_set_event() implicitly switches the broadcast + * device to oneshot state. + */ + if (!cpumask_empty(tick_broadcast_oneshot_mask)) + tick_broadcast_set_event(bc, cpu, nexttick); } /* @@ -1070,14 +1124,16 @@ static void tick_broadcast_setup_oneshot(struct clock_event_device *bc) void tick_broadcast_switch_to_oneshot(void) { struct clock_event_device *bc; + enum tick_device_mode oldmode; unsigned long flags; raw_spin_lock_irqsave(&tick_broadcast_lock, flags); + oldmode = tick_broadcast_device.mode; tick_broadcast_device.mode = TICKDEV_MODE_ONESHOT; bc = tick_broadcast_device.evtdev; if (bc) - tick_broadcast_setup_oneshot(bc); + tick_broadcast_setup_oneshot(bc, oldmode == TICKDEV_MODE_PERIODIC); raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags); } diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c index 65b8658da829..e9138cd7a0f5 100644 --- a/kernel/time/tick-common.c +++ b/kernel/time/tick-common.c @@ -218,19 +218,8 @@ static void tick_setup_device(struct tick_device *td, * this cpu: */ if (tick_do_timer_cpu == TICK_DO_TIMER_BOOT) { - ktime_t next_p; - u32 rem; - tick_do_timer_cpu = cpu; - - next_p = ktime_get(); - div_u64_rem(next_p, TICK_NSEC, &rem); - if (rem) { - next_p -= rem; - next_p += TICK_NSEC; - } - - tick_next_period = next_p; + tick_next_period = ktime_get(); #ifdef CONFIG_NO_HZ_FULL /* * The boot CPU may be nohz_full, in which case set diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 52254679ec48..4df14db4da49 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -161,8 +161,19 @@ static ktime_t tick_init_jiffy_update(void) raw_spin_lock(&jiffies_lock); write_seqcount_begin(&jiffies_seq); /* Did we start the jiffies update yet ? */ - if (last_jiffies_update == 0) + if (last_jiffies_update == 0) { + u32 rem; + + /* + * Ensure that the tick is aligned to a multiple of + * TICK_NSEC. 
+ */ + div_u64_rem(tick_next_period, TICK_NSEC, &rem); + if (rem) + tick_next_period += TICK_NSEC - rem; + last_jiffies_update = tick_next_period; + } period = last_jiffies_update; write_seqcount_end(&jiffies_seq); raw_spin_unlock(&jiffies_lock); @@ -1030,7 +1041,7 @@ static bool report_idle_softirq(void) return false; } - if (ratelimit < 10) + if (ratelimit >= 10) return false; /* On RT, softirqs handling may be waiting on some lock */ diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 09d594900ee0..266d02809dbb 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -450,7 +450,7 @@ static __always_inline u64 __ktime_get_fast_ns(struct tk_fast *tkf) tkr = tkf->base + (seq & 0x01); now = ktime_to_ns(tkr->base); now += fast_tk_get_delta_ns(tkr); - } while (read_seqcount_latch_retry(&tkf->seq, seq)); + } while (raw_read_seqcount_latch_retry(&tkf->seq, seq)); return now; } @@ -566,7 +566,7 @@ static __always_inline u64 __ktime_get_real_fast(struct tk_fast *tkf, u64 *mono) basem = ktime_to_ns(tkr->base); baser = ktime_to_ns(tkr->base_real); delta = fast_tk_get_delta_ns(tkr); - } while (read_seqcount_latch_retry(&tkf->seq, seq)); + } while (raw_read_seqcount_latch_retry(&tkf->seq, seq)); if (mono) *mono = basem + delta; diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 8cf97fa4a4b3..61c541c36596 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -31,6 +31,9 @@ config HAVE_FUNCTION_GRAPH_TRACER help See Documentation/trace/ftrace-design.rst +config HAVE_FUNCTION_GRAPH_RETVAL + bool + config HAVE_DYNAMIC_FTRACE bool help @@ -227,6 +230,18 @@ config FUNCTION_GRAPH_TRACER the return value. This is done by setting the current return address on the current task structure into a stack of calls. +config FUNCTION_GRAPH_RETVAL + bool "Kernel Function Graph Return Value" + depends on HAVE_FUNCTION_GRAPH_RETVAL + depends on FUNCTION_GRAPH_TRACER + default n + help + Support recording and printing the function return value when + using function graph tracer. It can be helpful to locate functions + that return errors. This feature is off by default, and you can + enable it via the trace option funcgraph-retval. + See Documentation/trace/ftrace.rst + config DYNAMIC_FTRACE bool "enable/disable function tracing dynamically" depends on FUNCTION_TRACER @@ -650,6 +665,32 @@ config BLK_DEV_IO_TRACE If unsure, say N. +config FPROBE_EVENTS + depends on FPROBE + depends on HAVE_REGS_AND_STACK_ACCESS_API + bool "Enable fprobe-based dynamic events" + select TRACING + select PROBE_EVENTS + select DYNAMIC_EVENTS + default y + help + This allows user to add tracing events on the function entry and + exit via ftrace interface. The syntax is same as the kprobe events + and the kprobe events on function entry and exit will be + transparently converted to this fprobe events. + +config PROBE_EVENTS_BTF_ARGS + depends on HAVE_FUNCTION_ARG_ACCESS_API + depends on FPROBE_EVENTS || KPROBE_EVENTS + depends on DEBUG_INFO_BTF && BPF_SYSCALL + bool "Support BTF function arguments for probe events" + default y + help + The user can specify the arguments of the probe event using the names + of the arguments of the probed function, when the probe location is a + kernel function entry or a tracepoint. + This is available only if BTF (BPF Type Format) support is enabled. 
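[Editorial aside, not part of the patch: with FPROBE_EVENTS enabled, a probe is defined by writing to the tracefs dynamic_events file, using the "f:" syntax documented in the Kconfig help and README text in this series. The group/event names below are made up, and the bare argument name requires PROBE_EVENTS_BTF_ARGS; a hedged user-space sketch:

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	/* "f:" creates a function-entry fprobe event; "count" names a
	 * vfs_read() argument, resolved via BTF when available. Adjust the
	 * tracefs mount point if it differs on your system. */
	const char *def = "f:myprobes/vfs_read_entry vfs_read count\n";
	int fd = open("/sys/kernel/tracing/dynamic_events",
		      O_WRONLY | O_APPEND);

	if (fd < 0) {
		perror("open dynamic_events");
		return 1;
	}
	if (write(fd, def, strlen(def)) < 0)
		perror("write");
	close(fd);
	return 0;
}
]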
+ config KPROBE_EVENTS depends on KPROBES depends on HAVE_REGS_AND_STACK_ACCESS_API diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index c6651e16b557..64b61f67a403 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile @@ -104,6 +104,7 @@ obj-$(CONFIG_BOOTTIME_TRACING) += trace_boot.o obj-$(CONFIG_FTRACE_RECORD_RECURSION) += trace_recursion_record.o obj-$(CONFIG_FPROBE) += fprobe.o obj-$(CONFIG_RETHOOK) += rethook.o +obj-$(CONFIG_FPROBE_EVENTS) += trace_fprobe.o obj-$(CONFIG_TRACEPOINT_BENCHMARK) += trace_benchmark.o obj-$(CONFIG_RV) += rv/ diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 9a050e36dc6c..5f2dcabad202 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -900,13 +900,23 @@ static const struct bpf_func_proto bpf_send_signal_thread_proto = { BPF_CALL_3(bpf_d_path, struct path *, path, char *, buf, u32, sz) { + struct path copy; long len; char *p; if (!sz) return 0; - p = d_path(path, buf, sz); + /* + * The path pointer is verified as trusted and safe to use, + * but let's double check it's valid anyway to workaround + * potentially broken verifier. + */ + len = copy_from_kernel_nofault(©, path, sizeof(*path)); + if (len < 0) + return len; + + p = d_path(©, buf, sz); if (IS_ERR(p)) { len = PTR_ERR(p); } else { @@ -1349,9 +1359,9 @@ __bpf_kfunc int bpf_verify_pkcs7_signature(struct bpf_dynptr_kern *data_ptr, } return verify_pkcs7_signature(data_ptr->data, - bpf_dynptr_get_size(data_ptr), + __bpf_dynptr_size(data_ptr), sig_ptr->data, - bpf_dynptr_get_size(sig_ptr), + __bpf_dynptr_size(sig_ptr), trusted_keyring->key, VERIFYING_UNSPECIFIED_SIGNATURE, NULL, NULL); @@ -2642,7 +2652,8 @@ kprobe_multi_link_prog_run(struct bpf_kprobe_multi_link *link, static int kprobe_multi_link_handler(struct fprobe *fp, unsigned long fentry_ip, - struct pt_regs *regs, void *data) + unsigned long ret_ip, struct pt_regs *regs, + void *data) { struct bpf_kprobe_multi_link *link; @@ -2653,7 +2664,8 @@ kprobe_multi_link_handler(struct fprobe *fp, unsigned long fentry_ip, static void kprobe_multi_link_exit_handler(struct fprobe *fp, unsigned long fentry_ip, - struct pt_regs *regs, void *data) + unsigned long ret_ip, struct pt_regs *regs, + void *data) { struct bpf_kprobe_multi_link *link; diff --git a/kernel/trace/fgraph.c b/kernel/trace/fgraph.c index 218cd95bf8e4..cd2c35b1dd8f 100644 --- a/kernel/trace/fgraph.c +++ b/kernel/trace/fgraph.c @@ -236,16 +236,23 @@ static struct notifier_block ftrace_suspend_notifier = { .notifier_call = ftrace_suspend_notifier_call, }; +/* fgraph_ret_regs is not defined without CONFIG_FUNCTION_GRAPH_RETVAL */ +struct fgraph_ret_regs; + /* * Send the trace to the ring-buffer. * @return the original return address. */ -unsigned long ftrace_return_to_handler(unsigned long frame_pointer) +static unsigned long __ftrace_return_to_handler(struct fgraph_ret_regs *ret_regs, + unsigned long frame_pointer) { struct ftrace_graph_ret trace; unsigned long ret; ftrace_pop_return_trace(&trace, &ret, frame_pointer); +#ifdef CONFIG_FUNCTION_GRAPH_RETVAL + trace.retval = fgraph_ret_regs_return_value(ret_regs); +#endif trace.rettime = trace_clock_local(); ftrace_graph_return(&trace); /* @@ -266,6 +273,23 @@ unsigned long ftrace_return_to_handler(unsigned long frame_pointer) return ret; } +/* + * After all architecures have selected HAVE_FUNCTION_GRAPH_RETVAL, we can + * leave only ftrace_return_to_handler(ret_regs). 
+ */ +#ifdef CONFIG_HAVE_FUNCTION_GRAPH_RETVAL +unsigned long ftrace_return_to_handler(struct fgraph_ret_regs *ret_regs) +{ + return __ftrace_return_to_handler(ret_regs, + fgraph_ret_regs_frame_pointer(ret_regs)); +} +#else +unsigned long ftrace_return_to_handler(unsigned long frame_pointer) +{ + return __ftrace_return_to_handler(NULL, frame_pointer); +} +#endif + /** * ftrace_graph_get_ret_stack - return the entry of the shadow stack * @task: The task to read the shadow stack from diff --git a/kernel/trace/fprobe.c b/kernel/trace/fprobe.c index 9abb3905bc8e..e4704ec26df7 100644 --- a/kernel/trace/fprobe.c +++ b/kernel/trace/fprobe.c @@ -17,42 +17,36 @@ struct fprobe_rethook_node { struct rethook_node node; unsigned long entry_ip; + unsigned long entry_parent_ip; char data[]; }; -static void fprobe_handler(unsigned long ip, unsigned long parent_ip, - struct ftrace_ops *ops, struct ftrace_regs *fregs) +static inline void __fprobe_handler(unsigned long ip, unsigned long parent_ip, + struct ftrace_ops *ops, struct ftrace_regs *fregs) { struct fprobe_rethook_node *fpr; struct rethook_node *rh = NULL; struct fprobe *fp; void *entry_data = NULL; - int bit, ret; + int ret = 0; fp = container_of(ops, struct fprobe, ops); - if (fprobe_disabled(fp)) - return; - - bit = ftrace_test_recursion_trylock(ip, parent_ip); - if (bit < 0) { - fp->nmissed++; - return; - } if (fp->exit_handler) { rh = rethook_try_get(fp->rethook); if (!rh) { fp->nmissed++; - goto out; + return; } fpr = container_of(rh, struct fprobe_rethook_node, node); fpr->entry_ip = ip; + fpr->entry_parent_ip = parent_ip; if (fp->entry_data_size) entry_data = fpr->data; } if (fp->entry_handler) - ret = fp->entry_handler(fp, ip, ftrace_get_regs(fregs), entry_data); + ret = fp->entry_handler(fp, ip, parent_ip, ftrace_get_regs(fregs), entry_data); /* If entry_handler returns !0, nmissed is not counted. 
*/ if (rh) { @@ -61,38 +55,87 @@ static void fprobe_handler(unsigned long ip, unsigned long parent_ip, else rethook_hook(rh, ftrace_get_regs(fregs), true); } -out: +} + +static void fprobe_handler(unsigned long ip, unsigned long parent_ip, + struct ftrace_ops *ops, struct ftrace_regs *fregs) +{ + struct fprobe *fp; + int bit; + + fp = container_of(ops, struct fprobe, ops); + if (fprobe_disabled(fp)) + return; + + /* recursion detection has to go before any traceable function and + * all functions before this point should be marked as notrace + */ + bit = ftrace_test_recursion_trylock(ip, parent_ip); + if (bit < 0) { + fp->nmissed++; + return; + } + __fprobe_handler(ip, parent_ip, ops, fregs); ftrace_test_recursion_unlock(bit); + } NOKPROBE_SYMBOL(fprobe_handler); static void fprobe_kprobe_handler(unsigned long ip, unsigned long parent_ip, struct ftrace_ops *ops, struct ftrace_regs *fregs) { - struct fprobe *fp = container_of(ops, struct fprobe, ops); + struct fprobe *fp; + int bit; + + fp = container_of(ops, struct fprobe, ops); + if (fprobe_disabled(fp)) + return; + + /* recursion detection has to go before any traceable function and + * all functions called before this point should be marked as notrace + */ + bit = ftrace_test_recursion_trylock(ip, parent_ip); + if (bit < 0) { + fp->nmissed++; + return; + } if (unlikely(kprobe_running())) { fp->nmissed++; return; } + kprobe_busy_begin(); - fprobe_handler(ip, parent_ip, ops, fregs); + __fprobe_handler(ip, parent_ip, ops, fregs); kprobe_busy_end(); + ftrace_test_recursion_unlock(bit); } static void fprobe_exit_handler(struct rethook_node *rh, void *data, - struct pt_regs *regs) + unsigned long ret_ip, struct pt_regs *regs) { struct fprobe *fp = (struct fprobe *)data; struct fprobe_rethook_node *fpr; + int bit; if (!fp || fprobe_disabled(fp)) return; fpr = container_of(rh, struct fprobe_rethook_node, node); - fp->exit_handler(fp, fpr->entry_ip, regs, + /* + * we need to assure no calls to traceable functions in-between the + * end of fprobe_handler and the beginning of fprobe_exit_handler. + */ + bit = ftrace_test_recursion_trylock(fpr->entry_ip, fpr->entry_parent_ip); + if (bit < 0) { + fp->nmissed++; + return; + } + + fp->exit_handler(fp, fpr->entry_ip, ret_ip, regs, fp->entry_data_size ? (void *)fpr->data : NULL); + ftrace_test_recursion_unlock(bit); } NOKPROBE_SYMBOL(fprobe_exit_handler); @@ -305,6 +348,14 @@ int register_fprobe_syms(struct fprobe *fp, const char **syms, int num) } EXPORT_SYMBOL_GPL(register_fprobe_syms); +bool fprobe_is_registered(struct fprobe *fp) +{ + if (!fp || (fp->ops.saved_func != fprobe_handler && + fp->ops.saved_func != fprobe_kprobe_handler)) + return false; + return true; +} + /** * unregister_fprobe() - Unregister fprobe from ftrace * @fp: A fprobe data structure to be unregistered. 
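[Editorial aside, not part of the patch: with the extra unsigned long argument added to the fprobe callbacks in this series, a user of the API would define handlers along the lines below. Only the fprobe structure and handler signatures are taken from the diff; the rest is a hypothetical sketch, not a complete module:

#include <linux/fprobe.h>
#include <linux/ptrace.h>

/* Entry handler: the extra unsigned long carries the return/parent address.
 * A nonzero return value recycles the rethook, so the exit handler is not
 * invoked and nmissed is not counted. */
static int my_entry(struct fprobe *fp, unsigned long ip, unsigned long ret_ip,
		    struct pt_regs *regs, void *data)
{
	return 0;
}

/* Exit handler: ret_ip is the address the probed function returns to. */
static void my_exit(struct fprobe *fp, unsigned long ip, unsigned long ret_ip,
		    struct pt_regs *regs, void *data)
{
}

static struct fprobe my_fprobe = {
	.entry_handler = my_entry,
	.exit_handler  = my_exit,
};

/* Registered e.g. with register_fprobe_syms(&my_fprobe, syms, 1) and torn
 * down with unregister_fprobe(&my_fprobe), which now relies on
 * fprobe_is_registered() for its sanity check. */
]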
@@ -317,8 +368,7 @@ int unregister_fprobe(struct fprobe *fp) { int ret; - if (!fp || (fp->ops.saved_func != fprobe_handler && - fp->ops.saved_func != fprobe_kprobe_handler)) + if (!fprobe_is_registered(fp)) return -EINVAL; /* diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 764668467155..3740aca79fe7 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -3861,6 +3861,9 @@ static int t_show(struct seq_file *m, void *v) if (!rec) return 0; + if (iter->flags & FTRACE_ITER_ADDRS) + seq_printf(m, "%lx ", rec->ip); + if (print_rec(m, rec->ip)) { /* This should only happen when a rec is disabled */ WARN_ON_ONCE(!(rec->flags & FTRACE_FL_DISABLED)); @@ -3996,6 +3999,30 @@ ftrace_touched_open(struct inode *inode, struct file *file) return 0; } +static int +ftrace_avail_addrs_open(struct inode *inode, struct file *file) +{ + struct ftrace_iterator *iter; + int ret; + + ret = security_locked_down(LOCKDOWN_TRACEFS); + if (ret) + return ret; + + if (unlikely(ftrace_disabled)) + return -ENODEV; + + iter = __seq_open_private(file, &show_ftrace_seq_ops, sizeof(*iter)); + if (!iter) + return -ENOMEM; + + iter->pg = ftrace_pages_start; + iter->flags = FTRACE_ITER_ADDRS; + iter->ops = &global_ops; + + return 0; +} + /** * ftrace_regex_open - initialize function tracer filter files * @ops: The ftrace_ops that hold the hash filters @@ -5743,7 +5770,7 @@ bool ftrace_filter_param __initdata; static int __init set_ftrace_notrace(char *str) { ftrace_filter_param = true; - strlcpy(ftrace_notrace_buf, str, FTRACE_FILTER_SIZE); + strscpy(ftrace_notrace_buf, str, FTRACE_FILTER_SIZE); return 1; } __setup("ftrace_notrace=", set_ftrace_notrace); @@ -5751,7 +5778,7 @@ __setup("ftrace_notrace=", set_ftrace_notrace); static int __init set_ftrace_filter(char *str) { ftrace_filter_param = true; - strlcpy(ftrace_filter_buf, str, FTRACE_FILTER_SIZE); + strscpy(ftrace_filter_buf, str, FTRACE_FILTER_SIZE); return 1; } __setup("ftrace_filter=", set_ftrace_filter); @@ -5763,14 +5790,14 @@ static int ftrace_graph_set_hash(struct ftrace_hash *hash, char *buffer); static int __init set_graph_function(char *str) { - strlcpy(ftrace_graph_buf, str, FTRACE_FILTER_SIZE); + strscpy(ftrace_graph_buf, str, FTRACE_FILTER_SIZE); return 1; } __setup("ftrace_graph_filter=", set_graph_function); static int __init set_graph_notrace_function(char *str) { - strlcpy(ftrace_graph_notrace_buf, str, FTRACE_FILTER_SIZE); + strscpy(ftrace_graph_notrace_buf, str, FTRACE_FILTER_SIZE); return 1; } __setup("ftrace_graph_notrace=", set_graph_notrace_function); @@ -5916,6 +5943,13 @@ static const struct file_operations ftrace_touched_fops = { .release = seq_release_private, }; +static const struct file_operations ftrace_avail_addrs_fops = { + .open = ftrace_avail_addrs_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release_private, +}; + static const struct file_operations ftrace_filter_fops = { .open = ftrace_filter_open, .read = seq_read, @@ -6377,6 +6411,9 @@ static __init int ftrace_init_dyn_tracefs(struct dentry *d_tracer) trace_create_file("available_filter_functions", TRACE_MODE_READ, d_tracer, NULL, &ftrace_avail_fops); + trace_create_file("available_filter_functions_addrs", TRACE_MODE_READ, + d_tracer, NULL, &ftrace_avail_addrs_fops); + trace_create_file("enabled_functions", TRACE_MODE_READ, d_tracer, NULL, &ftrace_enabled_fops); @@ -6569,8 +6606,8 @@ static int ftrace_get_trampoline_kallsym(unsigned int symnum, continue; *value = op->trampoline; *type = 't'; - strlcpy(name, FTRACE_TRAMPOLINE_SYM, 
KSYM_NAME_LEN); - strlcpy(module_name, FTRACE_TRAMPOLINE_MOD, MODULE_NAME_LEN); + strscpy(name, FTRACE_TRAMPOLINE_SYM, KSYM_NAME_LEN); + strscpy(module_name, FTRACE_TRAMPOLINE_MOD, MODULE_NAME_LEN); *exported = 0; return 0; } @@ -6933,7 +6970,7 @@ ftrace_func_address_lookup(struct ftrace_mod_map *mod_map, if (off) *off = addr - found_func->ip; if (sym) - strlcpy(sym, found_func->name, KSYM_NAME_LEN); + strscpy(sym, found_func->name, KSYM_NAME_LEN); return found_func->name; } @@ -6987,8 +7024,8 @@ int ftrace_mod_get_kallsym(unsigned int symnum, unsigned long *value, *value = mod_func->ip; *type = 'T'; - strlcpy(name, mod_func->name, KSYM_NAME_LEN); - strlcpy(module_name, mod_map->mod->name, MODULE_NAME_LEN); + strscpy(name, mod_func->name, KSYM_NAME_LEN); + strscpy(module_name, mod_map->mod->name, MODULE_NAME_LEN); *exported = 1; preempt_enable(); return 0; diff --git a/kernel/trace/rethook.c b/kernel/trace/rethook.c index 32c3dfdb4d6a..f32ee484391a 100644 --- a/kernel/trace/rethook.c +++ b/kernel/trace/rethook.c @@ -288,7 +288,7 @@ unsigned long rethook_trampoline_handler(struct pt_regs *regs, * These loops must be protected from rethook_free_rcu() because those * are accessing 'rhn->rethook'. */ - preempt_disable(); + preempt_disable_notrace(); /* * Run the handler on the shadow stack. Do not unlink the list here because @@ -301,7 +301,8 @@ unsigned long rethook_trampoline_handler(struct pt_regs *regs, break; handler = READ_ONCE(rhn->rethook->handler); if (handler) - handler(rhn, rhn->rethook->data, regs); + handler(rhn, rhn->rethook->data, + correct_ret_addr, regs); if (first == node) break; @@ -321,7 +322,7 @@ unsigned long rethook_trampoline_handler(struct pt_regs *regs, first = first->next; rethook_recycle(rhn); } - preempt_enable(); + preempt_enable_notrace(); return correct_ret_addr; } diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index ebc59781456a..b04f52e7cd28 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -60,6 +60,7 @@ */ bool ring_buffer_expanded; +#ifdef CONFIG_FTRACE_STARTUP_TEST /* * We need to change this state when a selftest is running. 
* A selftest will lurk into the ring-buffer to count the @@ -75,7 +76,6 @@ static bool __read_mostly tracing_selftest_running; */ bool __read_mostly tracing_selftest_disabled; -#ifdef CONFIG_FTRACE_STARTUP_TEST void __init disable_tracing_selftest(const char *reason) { if (!tracing_selftest_disabled) { @@ -83,6 +83,9 @@ void __init disable_tracing_selftest(const char *reason) pr_info("Ftrace startup test is disabled due to %s\n", reason); } } +#else +#define tracing_selftest_running 0 +#define tracing_selftest_disabled 0 #endif /* Pipe tracepoints to printk */ @@ -196,7 +199,7 @@ static int boot_snapshot_index; static int __init set_cmdline_ftrace(char *str) { - strlcpy(bootup_tracer_buf, str, MAX_TRACER_SIZE); + strscpy(bootup_tracer_buf, str, MAX_TRACER_SIZE); default_bootup_tracer = bootup_tracer_buf; /* We are using ftrace early, expand it */ ring_buffer_expanded = true; @@ -281,7 +284,7 @@ static char trace_boot_options_buf[MAX_TRACER_SIZE] __initdata; static int __init set_trace_boot_options(char *str) { - strlcpy(trace_boot_options_buf, str, MAX_TRACER_SIZE); + strscpy(trace_boot_options_buf, str, MAX_TRACER_SIZE); return 1; } __setup("trace_options=", set_trace_boot_options); @@ -291,7 +294,7 @@ static char *trace_boot_clock __initdata; static int __init set_trace_boot_clock(char *str) { - strlcpy(trace_boot_clock_buf, str, MAX_TRACER_SIZE); + strscpy(trace_boot_clock_buf, str, MAX_TRACER_SIZE); trace_boot_clock = trace_boot_clock_buf; return 1; } @@ -1051,7 +1054,10 @@ int __trace_array_puts(struct trace_array *tr, unsigned long ip, if (!(tr->trace_flags & TRACE_ITER_PRINTK)) return 0; - if (unlikely(tracing_selftest_running || tracing_disabled)) + if (unlikely(tracing_selftest_running && tr == &global_trace)) + return 0; + + if (unlikely(tracing_disabled)) return 0; alloc = sizeof(*entry) + size + 2; /* possible \n added */ @@ -2041,6 +2047,24 @@ static int run_tracer_selftest(struct tracer *type) return 0; } +static int do_run_tracer_selftest(struct tracer *type) +{ + int ret; + + /* + * Tests can take a long time, especially if they are run one after the + * other, as does happen during bootup when all the tracers are + * registered. This could cause the soft lockup watchdog to trigger. 
+ */ + cond_resched(); + + tracing_selftest_running = true; + ret = run_tracer_selftest(type); + tracing_selftest_running = false; + + return ret; +} + static __init int init_trace_selftests(void) { struct trace_selftests *p, *n; @@ -2092,6 +2116,10 @@ static inline int run_tracer_selftest(struct tracer *type) { return 0; } +static inline int do_run_tracer_selftest(struct tracer *type) +{ + return 0; +} #endif /* CONFIG_FTRACE_STARTUP_TEST */ static void add_tracer_options(struct trace_array *tr, struct tracer *t); @@ -2127,8 +2155,6 @@ int __init register_tracer(struct tracer *type) mutex_lock(&trace_types_lock); - tracing_selftest_running = true; - for (t = trace_types; t; t = t->next) { if (strcmp(type->name, t->name) == 0) { /* already found */ @@ -2157,7 +2183,7 @@ int __init register_tracer(struct tracer *type) /* store the tracer for __set_tracer_option */ type->flags->trace = type; - ret = run_tracer_selftest(type); + ret = do_run_tracer_selftest(type); if (ret < 0) goto out; @@ -2166,7 +2192,6 @@ int __init register_tracer(struct tracer *type) add_tracer_options(&global_trace, type); out: - tracing_selftest_running = false; mutex_unlock(&trace_types_lock); if (ret || !default_bootup_tracer) @@ -2521,7 +2546,7 @@ static void __trace_find_cmdline(int pid, char comm[]) if (map != NO_CMDLINE_MAP) { tpid = savedcmd->map_cmdline_to_pid[map]; if (tpid == pid) { - strlcpy(comm, get_saved_cmdlines(map), TASK_COMM_LEN); + strscpy(comm, get_saved_cmdlines(map), TASK_COMM_LEN); return; } } @@ -3490,7 +3515,7 @@ __trace_array_vprintk(struct trace_buffer *buffer, unsigned int trace_ctx; char *tbuffer; - if (tracing_disabled || tracing_selftest_running) + if (tracing_disabled) return 0; /* Don't pollute graph traces with trace_vprintk internals */ @@ -3538,6 +3563,9 @@ __printf(3, 0) int trace_array_vprintk(struct trace_array *tr, unsigned long ip, const char *fmt, va_list args) { + if (tracing_selftest_running && tr == &global_trace) + return 0; + return __trace_array_vprintk(tr->array_buffer.buffer, ip, fmt, args); } @@ -5171,7 +5199,7 @@ static const struct file_operations tracing_fops = { .open = tracing_open, .read = seq_read, .read_iter = seq_read_iter, - .splice_read = generic_file_splice_read, + .splice_read = copy_splice_read, .write = tracing_write_stub, .llseek = tracing_lseek, .release = tracing_release, @@ -5644,10 +5672,17 @@ static const char readme_msg[] = " uprobe_events\t\t- Create/append/remove/show the userspace dynamic events\n" "\t\t\t Write into this file to define/undefine new trace events.\n" #endif -#if defined(CONFIG_KPROBE_EVENTS) || defined(CONFIG_UPROBE_EVENTS) +#if defined(CONFIG_KPROBE_EVENTS) || defined(CONFIG_UPROBE_EVENTS) || \ + defined(CONFIG_FPROBE_EVENTS) "\t accepts: event-definitions (one definition per line)\n" +#if defined(CONFIG_KPROBE_EVENTS) || defined(CONFIG_UPROBE_EVENTS) "\t Format: p[:[<group>/][<event>]] <place> [<args>]\n" "\t r[maxactive][:[<group>/][<event>]] <place> [<args>]\n" +#endif +#ifdef CONFIG_FPROBE_EVENTS + "\t f[:[<group>/][<event>]] <func-name>[%return] [<args>]\n" + "\t t[:[<group>/][<event>]] <tracepoint> [<args>]\n" +#endif #ifdef CONFIG_HIST_TRIGGERS "\t s:[synthetic/]<event> <field> [<field>]\n" #endif @@ -5663,7 +5698,11 @@ static const char readme_msg[] = "\t args: <name>=fetcharg[:type]\n" "\t fetcharg: (%<register>|$<efield>), @<address>, @<symbol>[+|-<offset>],\n" #ifdef CONFIG_HAVE_FUNCTION_ARG_ACCESS_API +#ifdef CONFIG_PROBE_EVENTS_BTF_ARGS + "\t $stack<index>, $stack, $retval, $comm, $arg<N>, <argname>\n" +#else "\t 
$stack<index>, $stack, $retval, $comm, $arg<N>,\n" +#endif #else "\t $stack<index>, $stack, $retval, $comm,\n" #endif @@ -5752,7 +5791,7 @@ static const char readme_msg[] = "\t table using the key(s) and value(s) named, and the value of a\n" "\t sum called 'hitcount' is incremented. Keys and values\n" "\t correspond to fields in the event's format description. Keys\n" - "\t can be any field, or the special string 'stacktrace'.\n" + "\t can be any field, or the special string 'common_stacktrace'.\n" "\t Compound keys consisting of up to two fields can be specified\n" "\t by the 'keys' keyword. Values must correspond to numeric\n" "\t fields. Sort keys consisting of up to two fields can be\n" diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 79bdefe9261b..ed7906b13f09 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -148,6 +148,17 @@ struct kretprobe_trace_entry_head { unsigned long ret_ip; }; +struct fentry_trace_entry_head { + struct trace_entry ent; + unsigned long ip; +}; + +struct fexit_trace_entry_head { + struct trace_entry ent; + unsigned long func; + unsigned long ret_ip; +}; + #define TRACE_BUF_SIZE 1024 struct trace_array; @@ -832,6 +843,8 @@ static __always_inline bool ftrace_hash_empty(struct ftrace_hash *hash) #define TRACE_GRAPH_PRINT_TAIL 0x100 #define TRACE_GRAPH_SLEEP_TIME 0x200 #define TRACE_GRAPH_GRAPH_TIME 0x400 +#define TRACE_GRAPH_PRINT_RETVAL 0x800 +#define TRACE_GRAPH_PRINT_RETVAL_HEX 0x1000 #define TRACE_GRAPH_PRINT_FILL_SHIFT 28 #define TRACE_GRAPH_PRINT_FILL_MASK (0x3 << TRACE_GRAPH_PRINT_FILL_SHIFT) diff --git a/kernel/trace/trace_boot.c b/kernel/trace/trace_boot.c index 778200dd8ede..5fe525f1b8cc 100644 --- a/kernel/trace/trace_boot.c +++ b/kernel/trace/trace_boot.c @@ -31,7 +31,7 @@ trace_boot_set_instance_options(struct trace_array *tr, struct xbc_node *node) /* Common ftrace options */ xbc_node_for_each_array_value(node, "options", anode, p) { - if (strlcpy(buf, p, ARRAY_SIZE(buf)) >= ARRAY_SIZE(buf)) { + if (strscpy(buf, p, ARRAY_SIZE(buf)) == -E2BIG) { pr_err("String is too long: %s\n", p); continue; } @@ -87,7 +87,7 @@ trace_boot_enable_events(struct trace_array *tr, struct xbc_node *node) const char *p; xbc_node_for_each_array_value(node, "events", anode, p) { - if (strlcpy(buf, p, ARRAY_SIZE(buf)) >= ARRAY_SIZE(buf)) { + if (strscpy(buf, p, ARRAY_SIZE(buf)) == -E2BIG) { pr_err("String is too long: %s\n", p); continue; } @@ -486,7 +486,7 @@ trace_boot_init_one_event(struct trace_array *tr, struct xbc_node *gnode, p = xbc_node_find_value(enode, "filter", NULL); if (p && *p != '\0') { - if (strlcpy(buf, p, ARRAY_SIZE(buf)) >= ARRAY_SIZE(buf)) + if (strscpy(buf, p, ARRAY_SIZE(buf)) == -E2BIG) pr_err("filter string is too long: %s\n", p); else if (apply_event_filter(file, buf) < 0) pr_err("Failed to apply filter: %s\n", buf); @@ -494,7 +494,7 @@ trace_boot_init_one_event(struct trace_array *tr, struct xbc_node *gnode, if (IS_ENABLED(CONFIG_HIST_TRIGGERS)) { xbc_node_for_each_array_value(enode, "actions", anode, p) { - if (strlcpy(buf, p, ARRAY_SIZE(buf)) >= ARRAY_SIZE(buf)) + if (strscpy(buf, p, ARRAY_SIZE(buf)) == -E2BIG) pr_err("action string is too long: %s\n", p); else if (trigger_process_regex(file, buf) < 0) pr_err("Failed to apply an action: %s\n", p); diff --git a/kernel/trace/trace_entries.h b/kernel/trace/trace_entries.h index cd41e863b51c..340b2fa98218 100644 --- a/kernel/trace/trace_entries.h +++ b/kernel/trace/trace_entries.h @@ -86,6 +86,30 @@ FTRACE_ENTRY_PACKED(funcgraph_entry, ftrace_graph_ent_entry, ); /* Function 
return entry */ +#ifdef CONFIG_FUNCTION_GRAPH_RETVAL + +FTRACE_ENTRY_PACKED(funcgraph_exit, ftrace_graph_ret_entry, + + TRACE_GRAPH_RET, + + F_STRUCT( + __field_struct( struct ftrace_graph_ret, ret ) + __field_packed( unsigned long, ret, func ) + __field_packed( unsigned long, ret, retval ) + __field_packed( int, ret, depth ) + __field_packed( unsigned int, ret, overrun ) + __field_packed( unsigned long long, ret, calltime) + __field_packed( unsigned long long, ret, rettime ) + ), + + F_printk("<-- %ps (%d) (start: %llx end: %llx) over: %d retval: %lx", + (void *)__entry->func, __entry->depth, + __entry->calltime, __entry->rettime, + __entry->depth, __entry->retval) +); + +#else + FTRACE_ENTRY_PACKED(funcgraph_exit, ftrace_graph_ret_entry, TRACE_GRAPH_RET, @@ -105,6 +129,8 @@ FTRACE_ENTRY_PACKED(funcgraph_exit, ftrace_graph_ret_entry, __entry->depth) ); +#endif + /* * Context switch trace entry - which task (and prio) we switched from/to: * diff --git a/kernel/trace/trace_eprobe.c b/kernel/trace/trace_eprobe.c index 67e854979d53..cb0077ba2b49 100644 --- a/kernel/trace/trace_eprobe.c +++ b/kernel/trace/trace_eprobe.c @@ -227,37 +227,6 @@ error: return ERR_PTR(ret); } -static int trace_eprobe_tp_arg_update(struct trace_eprobe *ep, int i) -{ - struct probe_arg *parg = &ep->tp.args[i]; - struct ftrace_event_field *field; - struct list_head *head; - int ret = -ENOENT; - - head = trace_get_fields(ep->event); - list_for_each_entry(field, head, link) { - if (!strcmp(parg->code->data, field->name)) { - kfree(parg->code->data); - parg->code->data = field; - return 0; - } - } - - /* - * Argument not found on event. But allow for comm and COMM - * to be used to get the current->comm. - */ - if (strcmp(parg->code->data, "COMM") == 0 || - strcmp(parg->code->data, "comm") == 0) { - parg->code->op = FETCH_OP_COMM; - ret = 0; - } - - kfree(parg->code->data); - parg->code->data = NULL; - return ret; -} - static int eprobe_event_define_fields(struct trace_event_call *event_call) { struct eprobe_trace_entry_head field; @@ -817,19 +786,16 @@ find_and_get_event(const char *system, const char *event_name) static int trace_eprobe_tp_update_arg(struct trace_eprobe *ep, const char *argv[], int i) { - unsigned int flags = TPARG_FL_KERNEL | TPARG_FL_TPOINT; + struct traceprobe_parse_context ctx = { + .event = ep->event, + .flags = TPARG_FL_KERNEL | TPARG_FL_TEVENT, + }; int ret; - ret = traceprobe_parse_probe_arg(&ep->tp, i, argv[i], flags); + ret = traceprobe_parse_probe_arg(&ep->tp, i, argv[i], &ctx); if (ret) return ret; - if (ep->tp.args[i].code->op == FETCH_OP_TP_ARG) { - ret = trace_eprobe_tp_arg_update(ep, i); - if (ret) - trace_probe_log_err(0, BAD_ATTACH_ARG); - } - /* Handle symbols "@" */ if (!ret) ret = traceprobe_update_arg(&ep->tp.args[i]); diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 654ffa40457a..5d6ae4eae510 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -194,6 +194,8 @@ static int trace_define_generic_fields(void) __generic_field(int, common_cpu, FILTER_CPU); __generic_field(char *, COMM, FILTER_COMM); __generic_field(char *, comm, FILTER_COMM); + __generic_field(char *, stacktrace, FILTER_STACKTRACE); + __generic_field(char *, STACKTRACE, FILTER_STACKTRACE); return ret; } @@ -2831,7 +2833,7 @@ static __init int setup_trace_triggers(char *str) char *buf; int i; - strlcpy(bootup_trigger_buf, str, COMMAND_LINE_SIZE); + strscpy(bootup_trigger_buf, str, COMMAND_LINE_SIZE); ring_buffer_expanded = true; disable_tracing_selftest("running event 
triggers"); @@ -3621,7 +3623,7 @@ static char bootup_event_buf[COMMAND_LINE_SIZE] __initdata; static __init int setup_trace_event(char *str) { - strlcpy(bootup_event_buf, str, COMMAND_LINE_SIZE); + strscpy(bootup_event_buf, str, COMMAND_LINE_SIZE); ring_buffer_expanded = true; disable_tracing_selftest("running event tracing"); diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index 486cca3c2b75..b97d3ad832f1 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -1364,7 +1364,7 @@ static const char *hist_field_name(struct hist_field *field, if (field->field) field_name = field->field->name; else - field_name = "stacktrace"; + field_name = "common_stacktrace"; } else if (field->flags & HIST_FIELD_FL_HITCOUNT) field_name = "hitcount"; @@ -2367,7 +2367,7 @@ parse_field(struct hist_trigger_data *hist_data, struct trace_event_file *file, hist_data->enable_timestamps = true; if (*flags & HIST_FIELD_FL_TIMESTAMP_USECS) hist_data->attrs->ts_in_usecs = true; - } else if (strcmp(field_name, "stacktrace") == 0) { + } else if (strcmp(field_name, "common_stacktrace") == 0) { *flags |= HIST_FIELD_FL_STACKTRACE; } else if (strcmp(field_name, "common_cpu") == 0) *flags |= HIST_FIELD_FL_CPU; @@ -2378,11 +2378,15 @@ parse_field(struct hist_trigger_data *hist_data, struct trace_event_file *file, if (!field || !field->size) { /* * For backward compatibility, if field_name - * was "cpu", then we treat this the same as - * common_cpu. This also works for "CPU". + * was "cpu" or "stacktrace", then we treat this + * the same as common_cpu and common_stacktrace + * respectively. This also works for "CPU", and + * "STACKTRACE". */ if (field && field->filter_type == FILTER_CPU) { *flags |= HIST_FIELD_FL_CPU; + } else if (field && field->filter_type == FILTER_STACKTRACE) { + *flags |= HIST_FIELD_FL_STACKTRACE; } else { hist_err(tr, HIST_ERR_FIELD_NOT_FOUND, errpos(field_name)); @@ -4238,13 +4242,19 @@ static int __create_val_field(struct hist_trigger_data *hist_data, goto out; } - /* Some types cannot be a value */ - if (hist_field->flags & (HIST_FIELD_FL_GRAPH | HIST_FIELD_FL_PERCENT | - HIST_FIELD_FL_BUCKET | HIST_FIELD_FL_LOG2 | - HIST_FIELD_FL_SYM | HIST_FIELD_FL_SYM_OFFSET | - HIST_FIELD_FL_SYSCALL | HIST_FIELD_FL_STACKTRACE)) { - hist_err(file->tr, HIST_ERR_BAD_FIELD_MODIFIER, errpos(field_str)); - ret = -EINVAL; + /* values and variables should not have some modifiers */ + if (hist_field->flags & HIST_FIELD_FL_VAR) { + /* Variable */ + if (hist_field->flags & (HIST_FIELD_FL_GRAPH | HIST_FIELD_FL_PERCENT | + HIST_FIELD_FL_BUCKET | HIST_FIELD_FL_LOG2)) + goto err; + } else { + /* Value */ + if (hist_field->flags & (HIST_FIELD_FL_GRAPH | HIST_FIELD_FL_PERCENT | + HIST_FIELD_FL_BUCKET | HIST_FIELD_FL_LOG2 | + HIST_FIELD_FL_SYM | HIST_FIELD_FL_SYM_OFFSET | + HIST_FIELD_FL_SYSCALL | HIST_FIELD_FL_STACKTRACE)) + goto err; } hist_data->fields[val_idx] = hist_field; @@ -4256,6 +4266,9 @@ static int __create_val_field(struct hist_trigger_data *hist_data, ret = -EINVAL; out: return ret; + err: + hist_err(file->tr, HIST_ERR_BAD_FIELD_MODIFIER, errpos(field_str)); + return -EINVAL; } static int create_val_field(struct hist_trigger_data *hist_data, @@ -5385,7 +5398,7 @@ static void hist_trigger_print_key(struct seq_file *m, if (key_field->field) seq_printf(m, "%s.stacktrace", key_field->field->name); else - seq_puts(m, "stacktrace:\n"); + seq_puts(m, "common_stacktrace:\n"); hist_trigger_stacktrace_print(m, key + key_field->offset, HIST_STACKTRACE_DEPTH); @@ 
-5968,7 +5981,7 @@ static int event_hist_trigger_print(struct seq_file *m, if (field->field) seq_printf(m, "%s.stacktrace", field->field->name); else - seq_puts(m, "stacktrace"); + seq_puts(m, "common_stacktrace"); } else hist_field_print(m, field); } diff --git a/kernel/trace/trace_events_inject.c b/kernel/trace/trace_events_inject.c index d6b4935a78c0..abe805d471eb 100644 --- a/kernel/trace/trace_events_inject.c +++ b/kernel/trace/trace_events_inject.c @@ -217,7 +217,7 @@ static int parse_entry(char *str, struct trace_event_call *call, void **pentry) char *addr = (char *)(unsigned long) val; if (field->filter_type == FILTER_STATIC_STRING) { - strlcpy(entry + field->offset, addr, field->size); + strscpy(entry + field->offset, addr, field->size); } else if (field->filter_type == FILTER_DYN_STRING || field->filter_type == FILTER_RDYN_STRING) { int str_len = strlen(addr) + 1; @@ -232,7 +232,7 @@ static int parse_entry(char *str, struct trace_event_call *call, void **pentry) } entry = *pentry; - strlcpy(entry + (entry_size - str_len), addr, str_len); + strscpy(entry + (entry_size - str_len), addr, str_len); str_item = (u32 *)(entry + field->offset); if (field->filter_type == FILTER_RDYN_STRING) str_loc -= field->offset + field->size; diff --git a/kernel/trace/trace_events_user.c b/kernel/trace/trace_events_user.c index b1ecd7677642..4f5e74bbdab2 100644 --- a/kernel/trace/trace_events_user.c +++ b/kernel/trace/trace_events_user.c @@ -50,6 +50,18 @@ #define EVENT_STATUS_OTHER BIT(7) /* + * User register flags are not allowed yet, keep them here until we are + * ready to expose them out to the user ABI. + */ +enum user_reg_flag { + /* Event will not delete upon last reference closing */ + USER_EVENT_REG_PERSIST = 1U << 0, + + /* This value or above is currently non-ABI */ + USER_EVENT_REG_MAX = 1U << 1, +}; + +/* * Stores the system name, tables, and locks for a group of events. This * allows isolation for events by various means. */ @@ -85,8 +97,10 @@ struct user_event { struct hlist_node node; struct list_head fields; struct list_head validators; + struct work_struct put_work; refcount_t refcnt; int min_size; + int reg_flags; char status; }; @@ -96,12 +110,12 @@ struct user_event { * these to track enablement sites that are tied to an event. */ struct user_event_enabler { - struct list_head link; + struct list_head mm_enablers_link; struct user_event *event; unsigned long addr; /* Track enable bit, flags, etc. Aligned for bitops. 
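Aside (not part of the patch): the field change just below widens the enabler's values from unsigned int to unsigned long. A minimal sketch of why the atomic bitops want a full word; the struct and helpers here are hypothetical.

#include <linux/bitops.h>
#include <linux/types.h>

struct flags_holder {
	unsigned long values;	/* was unsigned int: too narrow for bitops */
};

static void mark_enabled(struct flags_holder *h, unsigned int bit)
{
	/*
	 * set_bit()/clear_bit() take an unsigned long *; pointing them at
	 * a smaller field would touch memory beyond that field on 64-bit
	 * kernels, which the unsigned long type avoids.
	 */
	set_bit(bit, &h->values);
}

static void mark_disabled(struct flags_holder *h, unsigned int bit)
{
	clear_bit(bit, &h->values);
}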
*/ - unsigned int values; + unsigned long values; }; /* Bits 0-5 are for the bit to update upon enable/disable (0-63 allowed) */ @@ -116,7 +130,9 @@ struct user_event_enabler { /* Only duplicate the bit value */ #define ENABLE_VAL_DUP_MASK ENABLE_VAL_BIT_MASK -#define ENABLE_BITOPS(e) ((unsigned long *)&(e)->values) +#define ENABLE_BITOPS(e) (&(e)->values) + +#define ENABLE_BIT(e) ((int)((e)->values & ENABLE_VAL_BIT_MASK)) /* Used for asynchronous faulting in of pages */ struct user_event_enabler_fault { @@ -153,7 +169,7 @@ struct user_event_file_info { #define VALIDATOR_REL (1 << 1) struct user_event_validator { - struct list_head link; + struct list_head user_event_link; int offset; int flags; }; @@ -163,76 +179,151 @@ typedef void (*user_event_func_t) (struct user_event *user, struct iov_iter *i, static int user_event_parse(struct user_event_group *group, char *name, char *args, char *flags, - struct user_event **newuser); + struct user_event **newuser, int reg_flags); static struct user_event_mm *user_event_mm_get(struct user_event_mm *mm); static struct user_event_mm *user_event_mm_get_all(struct user_event *user); static void user_event_mm_put(struct user_event_mm *mm); +static int destroy_user_event(struct user_event *user); static u32 user_event_key(char *name) { return jhash(name, strlen(name), 0); } -static void user_event_group_destroy(struct user_event_group *group) +static struct user_event *user_event_get(struct user_event *user) { - kfree(group->system_name); - kfree(group); + refcount_inc(&user->refcnt); + + return user; } -static char *user_event_group_system_name(struct user_namespace *user_ns) +static void delayed_destroy_user_event(struct work_struct *work) { - char *system_name; - int len = sizeof(USER_EVENTS_SYSTEM) + 1; + struct user_event *user = container_of( + work, struct user_event, put_work); + + mutex_lock(&event_mutex); - if (user_ns != &init_user_ns) { + if (!refcount_dec_and_test(&user->refcnt)) + goto out; + + if (destroy_user_event(user)) { /* - * Unexpected at this point: - * We only currently support init_user_ns. - * When we enable more, this will trigger a failure so log. + * The only reason this would fail here is if we cannot + * update the visibility of the event. In this case the + * event stays in the hashtable, waiting for someone to + * attempt to delete it later. */ - pr_warn("user_events: Namespace other than init_user_ns!\n"); - return NULL; + pr_warn("user_events: Unable to delete event\n"); + refcount_set(&user->refcnt, 1); } +out: + mutex_unlock(&event_mutex); +} - system_name = kmalloc(len, GFP_KERNEL); +static void user_event_put(struct user_event *user, bool locked) +{ + bool delete; - if (!system_name) - return NULL; + if (unlikely(!user)) + return; - snprintf(system_name, len, "%s", USER_EVENTS_SYSTEM); + /* + * When the event is not enabled for auto-delete there will always + * be at least 1 reference to the event. During the event creation + * we initially set the refcnt to 2 to achieve this. In those cases + * the caller must acquire event_mutex and after decrement check if + * the refcnt is 1, meaning this is the last reference. When auto + * delete is enabled, there will only be 1 ref, IE: refcnt will be + * only set to 1 during creation to allow the below checks to go + * through upon the last put. The last put must always be done with + * the event mutex held. 
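Aside (not part of the patch): a condensed sketch of the put-side locking pattern the comment above describes, with a hypothetical object and mutex standing in for user_event and event_mutex.

#include <linux/mutex.h>
#include <linux/refcount.h>

static DEFINE_MUTEX(obj_mutex);		/* stands in for event_mutex */

struct obj {				/* stands in for struct user_event */
	refcount_t refcnt;
};

static void obj_put(struct obj *o, bool locked)
{
	bool last;

	if (!locked)
		/* Takes obj_mutex only when this drop was the last reference. */
		last = refcount_dec_and_mutex_lock(&o->refcnt, &obj_mutex);
	else
		last = refcount_dec_and_test(&o->refcnt);

	if (!last)
		return;

	/*
	 * Tear down here: obj_mutex is held (by us or by the caller), so a
	 * lookup path that also takes obj_mutex cannot hand out a new
	 * reference while the object is being destroyed.
	 */

	if (!locked)
		mutex_unlock(&obj_mutex);
}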
+ */ + if (!locked) { + lockdep_assert_not_held(&event_mutex); + delete = refcount_dec_and_mutex_lock(&user->refcnt, &event_mutex); + } else { + lockdep_assert_held(&event_mutex); + delete = refcount_dec_and_test(&user->refcnt); + } - return system_name; + if (!delete) + return; + + /* + * We now have the event_mutex in all cases, which ensures that + * no new references will be taken until event_mutex is released. + * New references come through find_user_event(), which requires + * the event_mutex to be held. + */ + + if (user->reg_flags & USER_EVENT_REG_PERSIST) { + /* We should not get here when persist flag is set */ + pr_alert("BUG: Auto-delete engaged on persistent event\n"); + goto out; + } + + /* + * Unfortunately we have to attempt the actual destroy in a work + * queue. This is because not all cases handle a trace_event_call + * being removed within the class->reg() operation for unregister. + */ + INIT_WORK(&user->put_work, delayed_destroy_user_event); + + /* + * Since the event is still in the hashtable, we have to re-inc + * the ref count to 1. This count will be decremented and checked + * in the work queue to ensure it's still the last ref. This is + * needed because a user-process could register the same event in + * between the time of event_mutex release and the work queue + * running the delayed destroy. If we removed the item now from + * the hashtable, this would result in a timing window where a + * user process would fail a register because the trace_event_call + * register would fail in the tracing layers. + */ + refcount_set(&user->refcnt, 1); + + if (WARN_ON_ONCE(!schedule_work(&user->put_work))) { + /* + * If we fail we must wait for an admin to attempt delete or + * another register/close of the event, whichever is first. 
+ */ + pr_warn("user_events: Unable to queue delayed destroy\n"); + } +out: + /* Ensure if we didn't have event_mutex before we unlock it */ + if (!locked) + mutex_unlock(&event_mutex); } -static inline struct user_event_group -*user_event_group_from_user_ns(struct user_namespace *user_ns) +static void user_event_group_destroy(struct user_event_group *group) { - if (user_ns == &init_user_ns) - return init_group; - - return NULL; + kfree(group->system_name); + kfree(group); } -static struct user_event_group *current_user_event_group(void) +static char *user_event_group_system_name(void) { - struct user_namespace *user_ns = current_user_ns(); - struct user_event_group *group = NULL; + char *system_name; + int len = sizeof(USER_EVENTS_SYSTEM) + 1; - while (user_ns) { - group = user_event_group_from_user_ns(user_ns); + system_name = kmalloc(len, GFP_KERNEL); - if (group) - break; + if (!system_name) + return NULL; - user_ns = user_ns->parent; - } + snprintf(system_name, len, "%s", USER_EVENTS_SYSTEM); - return group; + return system_name; +} + +static struct user_event_group *current_user_event_group(void) +{ + return init_group; } -static struct user_event_group -*user_event_group_create(struct user_namespace *user_ns) +static struct user_event_group *user_event_group_create(void) { struct user_event_group *group; @@ -241,7 +332,7 @@ static struct user_event_group if (!group) return NULL; - group->system_name = user_event_group_system_name(user_ns); + group->system_name = user_event_group_system_name(); if (!group->system_name) goto error; @@ -257,12 +348,13 @@ error: return NULL; }; -static void user_event_enabler_destroy(struct user_event_enabler *enabler) +static void user_event_enabler_destroy(struct user_event_enabler *enabler, + bool locked) { - list_del_rcu(&enabler->link); + list_del_rcu(&enabler->mm_enablers_link); /* No longer tracking the event via the enabler */ - refcount_dec(&enabler->event->refcnt); + user_event_put(enabler->event, locked); kfree(enabler); } @@ -324,7 +416,7 @@ static void user_event_enabler_fault_fixup(struct work_struct *work) /* User asked for enabler to be removed during fault */ if (test_bit(ENABLE_VAL_FREEING_BIT, ENABLE_BITOPS(enabler))) { - user_event_enabler_destroy(enabler); + user_event_enabler_destroy(enabler, true); goto out; } @@ -406,7 +498,7 @@ static int user_event_enabler_write(struct user_event_mm *mm, return -EBUSY; ret = pin_user_pages_remote(mm->mm, uaddr, 1, FOLL_WRITE | FOLL_NOFAULT, - &page, NULL, NULL); + &page, NULL); if (unlikely(ret <= 0)) { if (!fixup_fault) @@ -423,9 +515,9 @@ static int user_event_enabler_write(struct user_event_mm *mm, /* Update bit atomically, user tracers must be atomic as well */ if (enabler->event && enabler->event->status) - set_bit(enabler->values & ENABLE_VAL_BIT_MASK, ptr); + set_bit(ENABLE_BIT(enabler), ptr); else - clear_bit(enabler->values & ENABLE_VAL_BIT_MASK, ptr); + clear_bit(ENABLE_BIT(enabler), ptr); kunmap_local(kaddr); unpin_user_pages_dirty_lock(&page, 1, true); @@ -437,11 +529,9 @@ static bool user_event_enabler_exists(struct user_event_mm *mm, unsigned long uaddr, unsigned char bit) { struct user_event_enabler *enabler; - struct user_event_enabler *next; - list_for_each_entry_safe(enabler, next, &mm->enablers, link) { - if (enabler->addr == uaddr && - (enabler->values & ENABLE_VAL_BIT_MASK) == bit) + list_for_each_entry(enabler, &mm->enablers, mm_enablers_link) { + if (enabler->addr == uaddr && ENABLE_BIT(enabler) == bit) return true; } @@ -451,23 +541,36 @@ static bool 
user_event_enabler_exists(struct user_event_mm *mm, static void user_event_enabler_update(struct user_event *user) { struct user_event_enabler *enabler; - struct user_event_mm *mm = user_event_mm_get_all(user); struct user_event_mm *next; + struct user_event_mm *mm; int attempt; + lockdep_assert_held(&event_mutex); + + /* + * We need to build a one-shot list of all the mms that have an + * enabler for the user_event passed in. This list is only valid + * while holding the event_mutex. The only reason for this is due + * to the global mm list being RCU protected and we use methods + * which can wait (mmap_read_lock and pin_user_pages_remote). + * + * NOTE: user_event_mm_get_all() increments the ref count of each + * mm that is added to the list to prevent removal timing windows. + * We must always put each mm after they are used, which may wait. + */ + mm = user_event_mm_get_all(user); + while (mm) { next = mm->next; mmap_read_lock(mm->mm); - rcu_read_lock(); - list_for_each_entry_rcu(enabler, &mm->enablers, link) { + list_for_each_entry(enabler, &mm->enablers, mm_enablers_link) { if (enabler->event == user) { attempt = 0; user_event_enabler_write(mm, enabler, true, &attempt); } } - rcu_read_unlock(); mmap_read_unlock(mm->mm); user_event_mm_put(mm); mm = next; @@ -488,14 +591,14 @@ static bool user_event_enabler_dup(struct user_event_enabler *orig, if (!enabler) return false; - enabler->event = orig->event; + enabler->event = user_event_get(orig->event); enabler->addr = orig->addr; /* Only dup part of value (ignore future flags, etc) */ enabler->values = orig->values & ENABLE_VAL_DUP_MASK; - refcount_inc(&enabler->event->refcnt); - list_add_rcu(&enabler->link, &mm->enablers); + /* Enablers not exposed yet, RCU not required */ + list_add(&enabler->mm_enablers_link, &mm->enablers); return true; } @@ -514,6 +617,14 @@ static struct user_event_mm *user_event_mm_get_all(struct user_event *user) struct user_event_mm *mm; /* + * We use the mm->next field to build a one-shot list from the global + * RCU protected list. To build this list the event_mutex must be held. + * This lets us build a list without requiring allocs that could fail + * when user based events are most wanted for diagnostics. + */ + lockdep_assert_held(&event_mutex); + + /* * We do not want to block fork/exec while enablements are being * updated, so we use RCU to walk the current tasks that have used * user_events ABI for 1 or more events. 
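Aside (not part of the patch): a minimal sketch of the snapshot idiom the comment above describes: walk an RCU-protected list without sleeping, pin each match with a reference, and chain the matches through a private next pointer so they can be processed (and put) later, outside the RCU read section. Names are hypothetical; as in the patch, a single snapshot builder is assumed (the real code asserts event_mutex is held).

#include <linux/rculist.h>
#include <linux/refcount.h>

struct snap_node {			/* stands in for struct user_event_mm */
	struct list_head link;		/* on the global RCU-protected list */
	struct snap_node *next;		/* private one-shot chain */
	refcount_t refcnt;
	int tag;
};

static LIST_HEAD(global_list);		/* entries added with list_add_rcu() */

static struct snap_node *snapshot_matching(int tag)
{
	struct snap_node *node, *found = NULL;

	rcu_read_lock();
	list_for_each_entry_rcu(node, &global_list, link) {
		if (node->tag != tag)
			continue;
		refcount_inc(&node->refcnt);	/* keep it alive past RCU */
		node->next = found;
		found = node;
	}
	rcu_read_unlock();

	return found;	/* caller walks ->next, then drops each reference */
}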
Each enabler found in each @@ -525,23 +636,24 @@ static struct user_event_mm *user_event_mm_get_all(struct user_event *user) */ rcu_read_lock(); - list_for_each_entry_rcu(mm, &user_event_mms, link) - list_for_each_entry_rcu(enabler, &mm->enablers, link) + list_for_each_entry_rcu(mm, &user_event_mms, mms_link) { + list_for_each_entry_rcu(enabler, &mm->enablers, mm_enablers_link) { if (enabler->event == user) { mm->next = found; found = user_event_mm_get(mm); break; } + } + } rcu_read_unlock(); return found; } -static struct user_event_mm *user_event_mm_create(struct task_struct *t) +static struct user_event_mm *user_event_mm_alloc(struct task_struct *t) { struct user_event_mm *user_mm; - unsigned long flags; user_mm = kzalloc(sizeof(*user_mm), GFP_KERNEL_ACCOUNT); @@ -553,12 +665,6 @@ static struct user_event_mm *user_event_mm_create(struct task_struct *t) refcount_set(&user_mm->refcnt, 1); refcount_set(&user_mm->tasks, 1); - spin_lock_irqsave(&user_event_mms_lock, flags); - list_add_rcu(&user_mm->link, &user_event_mms); - spin_unlock_irqrestore(&user_event_mms_lock, flags); - - t->user_event_mm = user_mm; - /* * The lifetime of the memory descriptor can slightly outlast * the task lifetime if a ref to the user_event_mm is taken @@ -572,6 +678,17 @@ static struct user_event_mm *user_event_mm_create(struct task_struct *t) return user_mm; } +static void user_event_mm_attach(struct user_event_mm *user_mm, struct task_struct *t) +{ + unsigned long flags; + + spin_lock_irqsave(&user_event_mms_lock, flags); + list_add_rcu(&user_mm->mms_link, &user_event_mms); + spin_unlock_irqrestore(&user_event_mms_lock, flags); + + t->user_event_mm = user_mm; +} + static struct user_event_mm *current_user_event_mm(void) { struct user_event_mm *user_mm = current->user_event_mm; @@ -579,10 +696,12 @@ static struct user_event_mm *current_user_event_mm(void) if (user_mm) goto inc; - user_mm = user_event_mm_create(current); + user_mm = user_event_mm_alloc(current); if (!user_mm) goto error; + + user_event_mm_attach(user_mm, current); inc: refcount_inc(&user_mm->refcnt); error: @@ -593,8 +712,8 @@ static void user_event_mm_destroy(struct user_event_mm *mm) { struct user_event_enabler *enabler, *next; - list_for_each_entry_safe(enabler, next, &mm->enablers, link) - user_event_enabler_destroy(enabler); + list_for_each_entry_safe(enabler, next, &mm->enablers, mm_enablers_link) + user_event_enabler_destroy(enabler, false); mmdrop(mm->mm); kfree(mm); @@ -630,7 +749,7 @@ void user_event_mm_remove(struct task_struct *t) /* Remove the mm from the list, so it can no longer be enabled */ spin_lock_irqsave(&user_event_mms_lock, flags); - list_del_rcu(&mm->link); + list_del_rcu(&mm->mms_link); spin_unlock_irqrestore(&user_event_mms_lock, flags); /* @@ -670,7 +789,7 @@ void user_event_mm_remove(struct task_struct *t) void user_event_mm_dup(struct task_struct *t, struct user_event_mm *old_mm) { - struct user_event_mm *mm = user_event_mm_create(t); + struct user_event_mm *mm = user_event_mm_alloc(t); struct user_event_enabler *enabler; if (!mm) @@ -678,16 +797,18 @@ void user_event_mm_dup(struct task_struct *t, struct user_event_mm *old_mm) rcu_read_lock(); - list_for_each_entry_rcu(enabler, &old_mm->enablers, link) + list_for_each_entry_rcu(enabler, &old_mm->enablers, mm_enablers_link) { if (!user_event_enabler_dup(enabler, mm)) goto error; + } rcu_read_unlock(); + user_event_mm_attach(mm, t); return; error: rcu_read_unlock(); - user_event_mm_remove(t); + user_event_mm_destroy(mm); } static bool 
current_user_event_enabler_exists(unsigned long uaddr, @@ -747,8 +868,8 @@ retry: * exit or run exec(), which includes forks and clones. */ if (!*write_result) { - refcount_inc(&enabler->event->refcnt); - list_add_rcu(&enabler->link, &user_mm->enablers); + user_event_get(user); + list_add_rcu(&enabler->mm_enablers_link, &user_mm->enablers); } mutex_unlock(&event_mutex); @@ -770,7 +891,12 @@ out: static __always_inline __must_check bool user_event_last_ref(struct user_event *user) { - return refcount_read(&user->refcnt) == 1; + int last = 0; + + if (user->reg_flags & USER_EVENT_REG_PERSIST) + last = 1; + + return refcount_read(&user->refcnt) == last; } static __always_inline __must_check @@ -809,7 +935,8 @@ static struct list_head *user_event_get_fields(struct trace_event_call *call) * Upon success user_event has its ref count increased by 1. */ static int user_event_parse_cmd(struct user_event_group *group, - char *raw_command, struct user_event **newuser) + char *raw_command, struct user_event **newuser, + int reg_flags) { char *name = raw_command; char *args = strpbrk(name, " "); @@ -823,7 +950,7 @@ static int user_event_parse_cmd(struct user_event_group *group, if (flags) *flags++ = '\0'; - return user_event_parse(group, name, args, flags, newuser); + return user_event_parse(group, name, args, flags, newuser, reg_flags); } static int user_field_array_size(const char *type) @@ -904,8 +1031,8 @@ static void user_event_destroy_validators(struct user_event *user) struct user_event_validator *validator, *next; struct list_head *head = &user->validators; - list_for_each_entry_safe(validator, next, head, link) { - list_del(&validator->link); + list_for_each_entry_safe(validator, next, head, user_event_link) { + list_del(&validator->user_event_link); kfree(validator); } } @@ -959,7 +1086,7 @@ add_validator: validator->offset = offset; /* Want sequential access when validating */ - list_add_tail(&validator->link, &user->validators); + list_add_tail(&validator->user_event_link, &user->validators); add_field: field->type = type; @@ -1334,10 +1461,8 @@ static struct user_event *find_user_event(struct user_event_group *group, *outkey = key; hash_for_each_possible(group->register_table, user, node, key) - if (!strcmp(EVENT_NAME(user), name)) { - refcount_inc(&user->refcnt); - return user; - } + if (!strcmp(EVENT_NAME(user), name)) + return user_event_get(user); return NULL; } @@ -1349,7 +1474,7 @@ static int user_event_validate(struct user_event *user, void *data, int len) void *pos, *end = data + len; u32 loc, offset, size; - list_for_each_entry(validator, head, link) { + list_for_each_entry(validator, head, user_event_link) { pos = data + validator->offset; /* Already done min_size check, no bounds check here */ @@ -1399,7 +1524,7 @@ static void user_event_ftrace(struct user_event *user, struct iov_iter *i, if (unlikely(!entry)) return; - if (unlikely(!copy_nofault(entry + 1, i->count, i))) + if (unlikely(i->count != 0 && !copy_nofault(entry + 1, i->count, i))) goto discard; if (!list_empty(&user->validators) && @@ -1440,7 +1565,7 @@ static void user_event_perf(struct user_event *user, struct iov_iter *i, perf_fetch_caller_regs(regs); - if (unlikely(!copy_nofault(perf_entry + 1, i->count, i))) + if (unlikely(i->count != 0 && !copy_nofault(perf_entry + 1, i->count, i))) goto discard; if (!list_empty(&user->validators) && @@ -1551,12 +1676,12 @@ static int user_event_reg(struct trace_event_call *call, return ret; inc: - refcount_inc(&user->refcnt); + user_event_get(user); update_enable_bit_for(user); 
return 0; dec: update_enable_bit_for(user); - refcount_dec(&user->refcnt); + user_event_put(user, true); return 0; } @@ -1587,10 +1712,11 @@ static int user_event_create(const char *raw_command) mutex_lock(&group->reg_mutex); - ret = user_event_parse_cmd(group, name, &user); + /* Dyn events persist, otherwise they would cleanup immediately */ + ret = user_event_parse_cmd(group, name, &user, USER_EVENT_REG_PERSIST); if (!ret) - refcount_dec(&user->refcnt); + user_event_put(user, false); mutex_unlock(&group->reg_mutex); @@ -1712,6 +1838,8 @@ static bool user_event_match(const char *system, const char *event, if (match && argc > 0) match = user_fields_match(user, argc, argv); + else if (match && argc == 0) + match = list_empty(&user->fields); return match; } @@ -1748,11 +1876,17 @@ static int user_event_trace_register(struct user_event *user) */ static int user_event_parse(struct user_event_group *group, char *name, char *args, char *flags, - struct user_event **newuser) + struct user_event **newuser, int reg_flags) { int ret; u32 key; struct user_event *user; + int argc = 0; + char **argv; + + /* User register flags are not ready yet */ + if (reg_flags != 0 || flags != NULL) + return -EINVAL; /* Prevent dyn_event from racing */ mutex_lock(&event_mutex); @@ -1760,13 +1894,35 @@ static int user_event_parse(struct user_event_group *group, char *name, mutex_unlock(&event_mutex); if (user) { - *newuser = user; - /* - * Name is allocated by caller, free it since it already exists. - * Caller only worries about failure cases for freeing. - */ - kfree(name); + if (args) { + argv = argv_split(GFP_KERNEL, args, &argc); + if (!argv) { + ret = -ENOMEM; + goto error; + } + + ret = user_fields_match(user, argc, (const char **)argv); + argv_free(argv); + + } else + ret = list_empty(&user->fields); + + if (ret) { + *newuser = user; + /* + * Name is allocated by caller, free it since it already exists. + * Caller only worries about failure cases for freeing. 
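Aside (not part of the patch): a condensed sketch of the compatibility check added above, where an existing event of the same name is only reused when the new argument list matches its fields; fields_match() is a hypothetical stand-in for the patch's user_fields_match().

#include <linux/errno.h>
#include <linux/slab.h>
#include <linux/string.h>

/* Hypothetical stand-in for the patch's user_fields_match(). */
static bool fields_match(void *existing, int argc, const char **argv)
{
	return true;
}

static int reuse_or_reject(void *existing, char *args)
{
	char **argv;
	int argc = 0;
	bool match;

	if (!args)
		return 0;	/* the patch also requires the existing event to have no fields here */

	argv = argv_split(GFP_KERNEL, args, &argc);
	if (!argv)
		return -ENOMEM;

	match = fields_match(existing, argc, (const char **)argv);
	argv_free(argv);

	/* Same event name but a different field layout is refused. */
	return match ? 0 : -EADDRINUSE;
}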
+ */ + kfree(name); + } else { + ret = -EADDRINUSE; + goto error; + } + return 0; +error: + user_event_put(user, false); + return ret; } user = kzalloc(sizeof(*user), GFP_KERNEL_ACCOUNT); @@ -1819,8 +1975,15 @@ static int user_event_parse(struct user_event_group *group, char *name, if (ret) goto put_user_lock; - /* Ensure we track self ref and caller ref (2) */ - refcount_set(&user->refcnt, 2); + user->reg_flags = reg_flags; + + if (user->reg_flags & USER_EVENT_REG_PERSIST) { + /* Ensure we track self ref and caller ref (2) */ + refcount_set(&user->refcnt, 2); + } else { + /* Ensure we track only caller ref (1) */ + refcount_set(&user->refcnt, 1); + } dyn_event_init(&user->devent, &user_event_dops); dyn_event_add(&user->devent, &user->call); @@ -1852,7 +2015,7 @@ static int delete_user_event(struct user_event_group *group, char *name) if (!user) return -ENOENT; - refcount_dec(&user->refcnt); + user_event_put(user, true); if (!user_event_last_ref(user)) return -EBUSY; @@ -1933,7 +2096,8 @@ static ssize_t user_events_write_core(struct file *file, struct iov_iter *i) if (unlikely(faulted)) return -EFAULT; - } + } else + return -EBADF; return ret; } @@ -2011,9 +2175,7 @@ static int user_events_ref_add(struct user_event_file_info *info, for (i = 0; i < count; ++i) new_refs->events[i] = refs->events[i]; - new_refs->events[i] = user; - - refcount_inc(&user->refcnt); + new_refs->events[i] = user_event_get(user); rcu_assign_pointer(info->refs, new_refs); @@ -2044,8 +2206,8 @@ static long user_reg_get(struct user_reg __user *ureg, struct user_reg *kreg) if (ret) return ret; - /* Ensure no flags, since we don't support any yet */ - if (kreg->flags != 0) + /* Ensure only valid flags */ + if (kreg->flags & ~(USER_EVENT_REG_MAX-1)) return -EINVAL; /* Ensure supported size */ @@ -2117,7 +2279,7 @@ static long user_events_ioctl_reg(struct user_event_file_info *info, return ret; } - ret = user_event_parse_cmd(info->group, name, &user); + ret = user_event_parse_cmd(info->group, name, &user, reg.flags); if (ret) { kfree(name); @@ -2127,7 +2289,7 @@ static long user_events_ioctl_reg(struct user_event_file_info *info, ret = user_events_ref_add(info, user); /* No longer need parse ref, ref_add either worked or not */ - refcount_dec(&user->refcnt); + user_event_put(user, false); /* Positive number is index and valid */ if (ret < 0) @@ -2270,17 +2432,18 @@ static long user_events_ioctl_unreg(unsigned long uarg) */ mutex_lock(&event_mutex); - list_for_each_entry_safe(enabler, next, &mm->enablers, link) + list_for_each_entry_safe(enabler, next, &mm->enablers, mm_enablers_link) { if (enabler->addr == reg.disable_addr && - (enabler->values & ENABLE_VAL_BIT_MASK) == reg.disable_bit) { + ENABLE_BIT(enabler) == reg.disable_bit) { set_bit(ENABLE_VAL_FREEING_BIT, ENABLE_BITOPS(enabler)); if (!test_bit(ENABLE_VAL_FAULTING_BIT, ENABLE_BITOPS(enabler))) - user_event_enabler_destroy(enabler); + user_event_enabler_destroy(enabler, true); /* Removed at least one */ ret = 0; } + } mutex_unlock(&event_mutex); @@ -2333,7 +2496,6 @@ static int user_events_release(struct inode *node, struct file *file) struct user_event_file_info *info = file->private_data; struct user_event_group *group; struct user_event_refs *refs; - struct user_event *user; int i; if (!info) @@ -2357,12 +2519,9 @@ static int user_events_release(struct inode *node, struct file *file) * The underlying user_events are ref counted, and cannot be freed. * After this decrement, the user_events may be freed elsewhere. 
*/ - for (i = 0; i < refs->count; ++i) { - user = refs->events[i]; + for (i = 0; i < refs->count; ++i) + user_event_put(refs->events[i], false); - if (user) - refcount_dec(&user->refcnt); - } out: file->private_data = NULL; @@ -2543,7 +2702,7 @@ static int __init trace_events_user_init(void) if (!fault_cache) return -ENOMEM; - init_group = user_event_group_create(&init_user_ns); + init_group = user_event_group_create(); if (!init_group) { kmem_cache_destroy(fault_cache); diff --git a/kernel/trace/trace_fprobe.c b/kernel/trace/trace_fprobe.c new file mode 100644 index 000000000000..dfe2e546acdc --- /dev/null +++ b/kernel/trace/trace_fprobe.c @@ -0,0 +1,1199 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Fprobe-based tracing events + * Copyright (C) 2022 Google LLC. + */ +#define pr_fmt(fmt) "trace_fprobe: " fmt + +#include <linux/fprobe.h> +#include <linux/module.h> +#include <linux/rculist.h> +#include <linux/security.h> +#include <linux/tracepoint.h> +#include <linux/uaccess.h> + +#include "trace_dynevent.h" +#include "trace_probe.h" +#include "trace_probe_kernel.h" +#include "trace_probe_tmpl.h" + +#define FPROBE_EVENT_SYSTEM "fprobes" +#define TRACEPOINT_EVENT_SYSTEM "tracepoints" +#define RETHOOK_MAXACTIVE_MAX 4096 + +static int trace_fprobe_create(const char *raw_command); +static int trace_fprobe_show(struct seq_file *m, struct dyn_event *ev); +static int trace_fprobe_release(struct dyn_event *ev); +static bool trace_fprobe_is_busy(struct dyn_event *ev); +static bool trace_fprobe_match(const char *system, const char *event, + int argc, const char **argv, struct dyn_event *ev); + +static struct dyn_event_operations trace_fprobe_ops = { + .create = trace_fprobe_create, + .show = trace_fprobe_show, + .is_busy = trace_fprobe_is_busy, + .free = trace_fprobe_release, + .match = trace_fprobe_match, +}; + +/* + * Fprobe event core functions + */ +struct trace_fprobe { + struct dyn_event devent; + struct fprobe fp; + const char *symbol; + struct tracepoint *tpoint; + struct module *mod; + struct trace_probe tp; +}; + +static bool is_trace_fprobe(struct dyn_event *ev) +{ + return ev->ops == &trace_fprobe_ops; +} + +static struct trace_fprobe *to_trace_fprobe(struct dyn_event *ev) +{ + return container_of(ev, struct trace_fprobe, devent); +} + +/** + * for_each_trace_fprobe - iterate over the trace_fprobe list + * @pos: the struct trace_fprobe * for each entry + * @dpos: the struct dyn_event * to use as a loop cursor + */ +#define for_each_trace_fprobe(pos, dpos) \ + for_each_dyn_event(dpos) \ + if (is_trace_fprobe(dpos) && (pos = to_trace_fprobe(dpos))) + +static bool trace_fprobe_is_return(struct trace_fprobe *tf) +{ + return tf->fp.exit_handler != NULL; +} + +static bool trace_fprobe_is_tracepoint(struct trace_fprobe *tf) +{ + return tf->tpoint != NULL; +} + +static const char *trace_fprobe_symbol(struct trace_fprobe *tf) +{ + return tf->symbol ? 
tf->symbol : "unknown"; +} + +static bool trace_fprobe_is_busy(struct dyn_event *ev) +{ + struct trace_fprobe *tf = to_trace_fprobe(ev); + + return trace_probe_is_enabled(&tf->tp); +} + +static bool trace_fprobe_match_command_head(struct trace_fprobe *tf, + int argc, const char **argv) +{ + char buf[MAX_ARGSTR_LEN + 1]; + + if (!argc) + return true; + + snprintf(buf, sizeof(buf), "%s", trace_fprobe_symbol(tf)); + if (strcmp(buf, argv[0])) + return false; + argc--; argv++; + + return trace_probe_match_command_args(&tf->tp, argc, argv); +} + +static bool trace_fprobe_match(const char *system, const char *event, + int argc, const char **argv, struct dyn_event *ev) +{ + struct trace_fprobe *tf = to_trace_fprobe(ev); + + if (event[0] != '\0' && strcmp(trace_probe_name(&tf->tp), event)) + return false; + + if (system && strcmp(trace_probe_group_name(&tf->tp), system)) + return false; + + return trace_fprobe_match_command_head(tf, argc, argv); +} + +static bool trace_fprobe_is_registered(struct trace_fprobe *tf) +{ + return fprobe_is_registered(&tf->fp); +} + +/* + * Note that we don't verify the fetch_insn code, since it does not come + * from user space. + */ +static int +process_fetch_insn(struct fetch_insn *code, void *rec, void *dest, + void *base) +{ + struct pt_regs *regs = rec; + unsigned long val; + int ret; + +retry: + /* 1st stage: get value from context */ + switch (code->op) { + case FETCH_OP_STACK: + val = regs_get_kernel_stack_nth(regs, code->param); + break; + case FETCH_OP_STACKP: + val = kernel_stack_pointer(regs); + break; + case FETCH_OP_RETVAL: + val = regs_return_value(regs); + break; +#ifdef CONFIG_HAVE_FUNCTION_ARG_ACCESS_API + case FETCH_OP_ARG: + val = regs_get_kernel_argument(regs, code->param); + break; +#endif + case FETCH_NOP_SYMBOL: /* Ignore a place holder */ + code++; + goto retry; + default: + ret = process_common_fetch_insn(code, &val); + if (ret < 0) + return ret; + } + code++; + + return process_fetch_insn_bottom(code, val, dest, base); +} +NOKPROBE_SYMBOL(process_fetch_insn) + +/* function entry handler */ +static nokprobe_inline void +__fentry_trace_func(struct trace_fprobe *tf, unsigned long entry_ip, + struct pt_regs *regs, + struct trace_event_file *trace_file) +{ + struct fentry_trace_entry_head *entry; + struct trace_event_call *call = trace_probe_event_call(&tf->tp); + struct trace_event_buffer fbuffer; + int dsize; + + if (WARN_ON_ONCE(call != trace_file->event_call)) + return; + + if (trace_trigger_soft_disabled(trace_file)) + return; + + dsize = __get_data_size(&tf->tp, regs); + + entry = trace_event_buffer_reserve(&fbuffer, trace_file, + sizeof(*entry) + tf->tp.size + dsize); + if (!entry) + return; + + fbuffer.regs = regs; + entry = fbuffer.entry = ring_buffer_event_data(fbuffer.event); + entry->ip = entry_ip; + store_trace_args(&entry[1], &tf->tp, regs, sizeof(*entry), dsize); + + trace_event_buffer_commit(&fbuffer); +} + +static void +fentry_trace_func(struct trace_fprobe *tf, unsigned long entry_ip, + struct pt_regs *regs) +{ + struct event_file_link *link; + + trace_probe_for_each_link_rcu(link, &tf->tp) + __fentry_trace_func(tf, entry_ip, regs, link->file); +} +NOKPROBE_SYMBOL(fentry_trace_func); + +/* Kretprobe handler */ +static nokprobe_inline void +__fexit_trace_func(struct trace_fprobe *tf, unsigned long entry_ip, + unsigned long ret_ip, struct pt_regs *regs, + struct trace_event_file *trace_file) +{ + struct fexit_trace_entry_head *entry; + struct trace_event_buffer fbuffer; + struct trace_event_call *call = 
trace_probe_event_call(&tf->tp); + int dsize; + + if (WARN_ON_ONCE(call != trace_file->event_call)) + return; + + if (trace_trigger_soft_disabled(trace_file)) + return; + + dsize = __get_data_size(&tf->tp, regs); + + entry = trace_event_buffer_reserve(&fbuffer, trace_file, + sizeof(*entry) + tf->tp.size + dsize); + if (!entry) + return; + + fbuffer.regs = regs; + entry = fbuffer.entry = ring_buffer_event_data(fbuffer.event); + entry->func = entry_ip; + entry->ret_ip = ret_ip; + store_trace_args(&entry[1], &tf->tp, regs, sizeof(*entry), dsize); + + trace_event_buffer_commit(&fbuffer); +} + +static void +fexit_trace_func(struct trace_fprobe *tf, unsigned long entry_ip, + unsigned long ret_ip, struct pt_regs *regs) +{ + struct event_file_link *link; + + trace_probe_for_each_link_rcu(link, &tf->tp) + __fexit_trace_func(tf, entry_ip, ret_ip, regs, link->file); +} +NOKPROBE_SYMBOL(fexit_trace_func); + +#ifdef CONFIG_PERF_EVENTS + +static int fentry_perf_func(struct trace_fprobe *tf, unsigned long entry_ip, + struct pt_regs *regs) +{ + struct trace_event_call *call = trace_probe_event_call(&tf->tp); + struct fentry_trace_entry_head *entry; + struct hlist_head *head; + int size, __size, dsize; + int rctx; + + head = this_cpu_ptr(call->perf_events); + if (hlist_empty(head)) + return 0; + + dsize = __get_data_size(&tf->tp, regs); + __size = sizeof(*entry) + tf->tp.size + dsize; + size = ALIGN(__size + sizeof(u32), sizeof(u64)); + size -= sizeof(u32); + + entry = perf_trace_buf_alloc(size, NULL, &rctx); + if (!entry) + return 0; + + entry->ip = entry_ip; + memset(&entry[1], 0, dsize); + store_trace_args(&entry[1], &tf->tp, regs, sizeof(*entry), dsize); + perf_trace_buf_submit(entry, size, rctx, call->event.type, 1, regs, + head, NULL); + return 0; +} +NOKPROBE_SYMBOL(fentry_perf_func); + +static void +fexit_perf_func(struct trace_fprobe *tf, unsigned long entry_ip, + unsigned long ret_ip, struct pt_regs *regs) +{ + struct trace_event_call *call = trace_probe_event_call(&tf->tp); + struct fexit_trace_entry_head *entry; + struct hlist_head *head; + int size, __size, dsize; + int rctx; + + head = this_cpu_ptr(call->perf_events); + if (hlist_empty(head)) + return; + + dsize = __get_data_size(&tf->tp, regs); + __size = sizeof(*entry) + tf->tp.size + dsize; + size = ALIGN(__size + sizeof(u32), sizeof(u64)); + size -= sizeof(u32); + + entry = perf_trace_buf_alloc(size, NULL, &rctx); + if (!entry) + return; + + entry->func = entry_ip; + entry->ret_ip = ret_ip; + store_trace_args(&entry[1], &tf->tp, regs, sizeof(*entry), dsize); + perf_trace_buf_submit(entry, size, rctx, call->event.type, 1, regs, + head, NULL); +} +NOKPROBE_SYMBOL(fexit_perf_func); +#endif /* CONFIG_PERF_EVENTS */ + +static int fentry_dispatcher(struct fprobe *fp, unsigned long entry_ip, + unsigned long ret_ip, struct pt_regs *regs, + void *entry_data) +{ + struct trace_fprobe *tf = container_of(fp, struct trace_fprobe, fp); + int ret = 0; + + if (trace_probe_test_flag(&tf->tp, TP_FLAG_TRACE)) + fentry_trace_func(tf, entry_ip, regs); +#ifdef CONFIG_PERF_EVENTS + if (trace_probe_test_flag(&tf->tp, TP_FLAG_PROFILE)) + ret = fentry_perf_func(tf, entry_ip, regs); +#endif + return ret; +} +NOKPROBE_SYMBOL(fentry_dispatcher); + +static void fexit_dispatcher(struct fprobe *fp, unsigned long entry_ip, + unsigned long ret_ip, struct pt_regs *regs, + void *entry_data) +{ + struct trace_fprobe *tf = container_of(fp, struct trace_fprobe, fp); + + if (trace_probe_test_flag(&tf->tp, TP_FLAG_TRACE)) + fexit_trace_func(tf, entry_ip, ret_ip, regs); +#ifdef 
CONFIG_PERF_EVENTS + if (trace_probe_test_flag(&tf->tp, TP_FLAG_PROFILE)) + fexit_perf_func(tf, entry_ip, ret_ip, regs); +#endif +} +NOKPROBE_SYMBOL(fexit_dispatcher); + +static void free_trace_fprobe(struct trace_fprobe *tf) +{ + if (tf) { + trace_probe_cleanup(&tf->tp); + kfree(tf->symbol); + kfree(tf); + } +} + +/* + * Allocate new trace_probe and initialize it (including fprobe). + */ +static struct trace_fprobe *alloc_trace_fprobe(const char *group, + const char *event, + const char *symbol, + struct tracepoint *tpoint, + int maxactive, + int nargs, bool is_return) +{ + struct trace_fprobe *tf; + int ret = -ENOMEM; + + tf = kzalloc(struct_size(tf, tp.args, nargs), GFP_KERNEL); + if (!tf) + return ERR_PTR(ret); + + tf->symbol = kstrdup(symbol, GFP_KERNEL); + if (!tf->symbol) + goto error; + + if (is_return) + tf->fp.exit_handler = fexit_dispatcher; + else + tf->fp.entry_handler = fentry_dispatcher; + + tf->tpoint = tpoint; + tf->fp.nr_maxactive = maxactive; + + ret = trace_probe_init(&tf->tp, event, group, false); + if (ret < 0) + goto error; + + dyn_event_init(&tf->devent, &trace_fprobe_ops); + return tf; +error: + free_trace_fprobe(tf); + return ERR_PTR(ret); +} + +static struct trace_fprobe *find_trace_fprobe(const char *event, + const char *group) +{ + struct dyn_event *pos; + struct trace_fprobe *tf; + + for_each_trace_fprobe(tf, pos) + if (strcmp(trace_probe_name(&tf->tp), event) == 0 && + strcmp(trace_probe_group_name(&tf->tp), group) == 0) + return tf; + return NULL; +} + +static inline int __enable_trace_fprobe(struct trace_fprobe *tf) +{ + if (trace_fprobe_is_registered(tf)) + enable_fprobe(&tf->fp); + + return 0; +} + +static void __disable_trace_fprobe(struct trace_probe *tp) +{ + struct trace_fprobe *tf; + + list_for_each_entry(tf, trace_probe_probe_list(tp), tp.list) { + if (!trace_fprobe_is_registered(tf)) + continue; + disable_fprobe(&tf->fp); + } +} + +/* + * Enable trace_probe + * if the file is NULL, enable "perf" handler, or enable "trace" handler. + */ +static int enable_trace_fprobe(struct trace_event_call *call, + struct trace_event_file *file) +{ + struct trace_probe *tp; + struct trace_fprobe *tf; + bool enabled; + int ret = 0; + + tp = trace_probe_primary_from_call(call); + if (WARN_ON_ONCE(!tp)) + return -ENODEV; + enabled = trace_probe_is_enabled(tp); + + /* This also changes "enabled" state */ + if (file) { + ret = trace_probe_add_file(tp, file); + if (ret) + return ret; + } else + trace_probe_set_flag(tp, TP_FLAG_PROFILE); + + if (!enabled) { + list_for_each_entry(tf, trace_probe_probe_list(tp), tp.list) { + /* TODO: check the fprobe is gone */ + __enable_trace_fprobe(tf); + } + } + + return 0; +} + +/* + * Disable trace_probe + * if the file is NULL, disable "perf" handler, or disable "trace" handler. + */ +static int disable_trace_fprobe(struct trace_event_call *call, + struct trace_event_file *file) +{ + struct trace_probe *tp; + + tp = trace_probe_primary_from_call(call); + if (WARN_ON_ONCE(!tp)) + return -ENODEV; + + if (file) { + if (!trace_probe_get_file_link(tp, file)) + return -ENOENT; + if (!trace_probe_has_single_file(tp)) + goto out; + trace_probe_clear_flag(tp, TP_FLAG_TRACE); + } else + trace_probe_clear_flag(tp, TP_FLAG_PROFILE); + + if (!trace_probe_is_enabled(tp)) + __disable_trace_fprobe(tp); + + out: + if (file) + /* + * Synchronization is done in below function. For perf event, + * file == NULL and perf_trace_event_unreg() calls + * tracepoint_synchronize_unregister() to ensure synchronize + * event. We don't need to care about it. 
+ */ + trace_probe_remove_file(tp, file); + + return 0; +} + +/* Event entry printers */ +static enum print_line_t +print_fentry_event(struct trace_iterator *iter, int flags, + struct trace_event *event) +{ + struct fentry_trace_entry_head *field; + struct trace_seq *s = &iter->seq; + struct trace_probe *tp; + + field = (struct fentry_trace_entry_head *)iter->ent; + tp = trace_probe_primary_from_call( + container_of(event, struct trace_event_call, event)); + if (WARN_ON_ONCE(!tp)) + goto out; + + trace_seq_printf(s, "%s: (", trace_probe_name(tp)); + + if (!seq_print_ip_sym(s, field->ip, flags | TRACE_ITER_SYM_OFFSET)) + goto out; + + trace_seq_putc(s, ')'); + + if (trace_probe_print_args(s, tp->args, tp->nr_args, + (u8 *)&field[1], field) < 0) + goto out; + + trace_seq_putc(s, '\n'); + out: + return trace_handle_return(s); +} + +static enum print_line_t +print_fexit_event(struct trace_iterator *iter, int flags, + struct trace_event *event) +{ + struct fexit_trace_entry_head *field; + struct trace_seq *s = &iter->seq; + struct trace_probe *tp; + + field = (struct fexit_trace_entry_head *)iter->ent; + tp = trace_probe_primary_from_call( + container_of(event, struct trace_event_call, event)); + if (WARN_ON_ONCE(!tp)) + goto out; + + trace_seq_printf(s, "%s: (", trace_probe_name(tp)); + + if (!seq_print_ip_sym(s, field->ret_ip, flags | TRACE_ITER_SYM_OFFSET)) + goto out; + + trace_seq_puts(s, " <- "); + + if (!seq_print_ip_sym(s, field->func, flags & ~TRACE_ITER_SYM_OFFSET)) + goto out; + + trace_seq_putc(s, ')'); + + if (trace_probe_print_args(s, tp->args, tp->nr_args, + (u8 *)&field[1], field) < 0) + goto out; + + trace_seq_putc(s, '\n'); + + out: + return trace_handle_return(s); +} + +static int fentry_event_define_fields(struct trace_event_call *event_call) +{ + int ret; + struct fentry_trace_entry_head field; + struct trace_probe *tp; + + tp = trace_probe_primary_from_call(event_call); + if (WARN_ON_ONCE(!tp)) + return -ENOENT; + + DEFINE_FIELD(unsigned long, ip, FIELD_STRING_IP, 0); + + return traceprobe_define_arg_fields(event_call, sizeof(field), tp); +} + +static int fexit_event_define_fields(struct trace_event_call *event_call) +{ + int ret; + struct fexit_trace_entry_head field; + struct trace_probe *tp; + + tp = trace_probe_primary_from_call(event_call); + if (WARN_ON_ONCE(!tp)) + return -ENOENT; + + DEFINE_FIELD(unsigned long, func, FIELD_STRING_FUNC, 0); + DEFINE_FIELD(unsigned long, ret_ip, FIELD_STRING_RETIP, 0); + + return traceprobe_define_arg_fields(event_call, sizeof(field), tp); +} + +static struct trace_event_functions fentry_funcs = { + .trace = print_fentry_event +}; + +static struct trace_event_functions fexit_funcs = { + .trace = print_fexit_event +}; + +static struct trace_event_fields fentry_fields_array[] = { + { .type = TRACE_FUNCTION_TYPE, + .define_fields = fentry_event_define_fields }, + {} +}; + +static struct trace_event_fields fexit_fields_array[] = { + { .type = TRACE_FUNCTION_TYPE, + .define_fields = fexit_event_define_fields }, + {} +}; + +static int fprobe_register(struct trace_event_call *event, + enum trace_reg type, void *data); + +static inline void init_trace_event_call(struct trace_fprobe *tf) +{ + struct trace_event_call *call = trace_probe_event_call(&tf->tp); + + if (trace_fprobe_is_return(tf)) { + call->event.funcs = &fexit_funcs; + call->class->fields_array = fexit_fields_array; + } else { + call->event.funcs = &fentry_funcs; + call->class->fields_array = fentry_fields_array; + } + + call->flags = TRACE_EVENT_FL_FPROBE; + call->class->reg = 
fprobe_register; +} + +static int register_fprobe_event(struct trace_fprobe *tf) +{ + init_trace_event_call(tf); + + return trace_probe_register_event_call(&tf->tp); +} + +static int unregister_fprobe_event(struct trace_fprobe *tf) +{ + return trace_probe_unregister_event_call(&tf->tp); +} + +/* Internal register function - just handle fprobe and flags */ +static int __register_trace_fprobe(struct trace_fprobe *tf) +{ + int i, ret; + + /* Should we need new LOCKDOWN flag for fprobe? */ + ret = security_locked_down(LOCKDOWN_KPROBES); + if (ret) + return ret; + + if (trace_fprobe_is_registered(tf)) + return -EINVAL; + + for (i = 0; i < tf->tp.nr_args; i++) { + ret = traceprobe_update_arg(&tf->tp.args[i]); + if (ret) + return ret; + } + + /* Set/clear disabled flag according to tp->flag */ + if (trace_probe_is_enabled(&tf->tp)) + tf->fp.flags &= ~FPROBE_FL_DISABLED; + else + tf->fp.flags |= FPROBE_FL_DISABLED; + + if (trace_fprobe_is_tracepoint(tf)) { + struct tracepoint *tpoint = tf->tpoint; + unsigned long ip = (unsigned long)tpoint->probestub; + /* + * Here, we do 2 steps to enable fprobe on a tracepoint. + * At first, put __probestub_##TP function on the tracepoint + * and put a fprobe on the stub function. + */ + ret = tracepoint_probe_register_prio_may_exist(tpoint, + tpoint->probestub, NULL, 0); + if (ret < 0) + return ret; + return register_fprobe_ips(&tf->fp, &ip, 1); + } + + /* TODO: handle filter, nofilter or symbol list */ + return register_fprobe(&tf->fp, tf->symbol, NULL); +} + +/* Internal unregister function - just handle fprobe and flags */ +static void __unregister_trace_fprobe(struct trace_fprobe *tf) +{ + if (trace_fprobe_is_registered(tf)) { + unregister_fprobe(&tf->fp); + memset(&tf->fp, 0, sizeof(tf->fp)); + if (trace_fprobe_is_tracepoint(tf)) { + tracepoint_probe_unregister(tf->tpoint, + tf->tpoint->probestub, NULL); + tf->tpoint = NULL; + tf->mod = NULL; + } + } +} + +/* TODO: make this trace_*probe common function */ +/* Unregister a trace_probe and probe_event */ +static int unregister_trace_fprobe(struct trace_fprobe *tf) +{ + /* If other probes are on the event, just unregister fprobe */ + if (trace_probe_has_sibling(&tf->tp)) + goto unreg; + + /* Enabled event can not be unregistered */ + if (trace_probe_is_enabled(&tf->tp)) + return -EBUSY; + + /* If there's a reference to the dynamic event */ + if (trace_event_dyn_busy(trace_probe_event_call(&tf->tp))) + return -EBUSY; + + /* Will fail if probe is being used by ftrace or perf */ + if (unregister_fprobe_event(tf)) + return -EBUSY; + +unreg: + __unregister_trace_fprobe(tf); + dyn_event_remove(&tf->devent); + trace_probe_unlink(&tf->tp); + + return 0; +} + +static bool trace_fprobe_has_same_fprobe(struct trace_fprobe *orig, + struct trace_fprobe *comp) +{ + struct trace_probe_event *tpe = orig->tp.event; + int i; + + list_for_each_entry(orig, &tpe->probes, tp.list) { + if (strcmp(trace_fprobe_symbol(orig), + trace_fprobe_symbol(comp))) + continue; + + /* + * trace_probe_compare_arg_type() ensured that nr_args and + * each argument name and type are same. Let's compare comm. 
+ */ + for (i = 0; i < orig->tp.nr_args; i++) { + if (strcmp(orig->tp.args[i].comm, + comp->tp.args[i].comm)) + break; + } + + if (i == orig->tp.nr_args) + return true; + } + + return false; +} + +static int append_trace_fprobe(struct trace_fprobe *tf, struct trace_fprobe *to) +{ + int ret; + + if (trace_fprobe_is_return(tf) != trace_fprobe_is_return(to) || + trace_fprobe_is_tracepoint(tf) != trace_fprobe_is_tracepoint(to)) { + trace_probe_log_set_index(0); + trace_probe_log_err(0, DIFF_PROBE_TYPE); + return -EEXIST; + } + ret = trace_probe_compare_arg_type(&tf->tp, &to->tp); + if (ret) { + /* Note that argument starts index = 2 */ + trace_probe_log_set_index(ret + 1); + trace_probe_log_err(0, DIFF_ARG_TYPE); + return -EEXIST; + } + if (trace_fprobe_has_same_fprobe(to, tf)) { + trace_probe_log_set_index(0); + trace_probe_log_err(0, SAME_PROBE); + return -EEXIST; + } + + /* Append to existing event */ + ret = trace_probe_append(&tf->tp, &to->tp); + if (ret) + return ret; + + ret = __register_trace_fprobe(tf); + if (ret) + trace_probe_unlink(&tf->tp); + else + dyn_event_add(&tf->devent, trace_probe_event_call(&tf->tp)); + + return ret; +} + +/* Register a trace_probe and probe_event */ +static int register_trace_fprobe(struct trace_fprobe *tf) +{ + struct trace_fprobe *old_tf; + int ret; + + mutex_lock(&event_mutex); + + old_tf = find_trace_fprobe(trace_probe_name(&tf->tp), + trace_probe_group_name(&tf->tp)); + if (old_tf) { + ret = append_trace_fprobe(tf, old_tf); + goto end; + } + + /* Register new event */ + ret = register_fprobe_event(tf); + if (ret) { + if (ret == -EEXIST) { + trace_probe_log_set_index(0); + trace_probe_log_err(0, EVENT_EXIST); + } else + pr_warn("Failed to register probe event(%d)\n", ret); + goto end; + } + + /* Register fprobe */ + ret = __register_trace_fprobe(tf); + if (ret < 0) + unregister_fprobe_event(tf); + else + dyn_event_add(&tf->devent, trace_probe_event_call(&tf->tp)); + +end: + mutex_unlock(&event_mutex); + return ret; +} + +#ifdef CONFIG_MODULES +static int __tracepoint_probe_module_cb(struct notifier_block *self, + unsigned long val, void *data) +{ + struct tp_module *tp_mod = data; + struct trace_fprobe *tf; + struct dyn_event *pos; + + if (val != MODULE_STATE_GOING) + return NOTIFY_DONE; + + mutex_lock(&event_mutex); + for_each_trace_fprobe(tf, pos) { + if (tp_mod->mod == tf->mod) { + tracepoint_probe_unregister(tf->tpoint, + tf->tpoint->probestub, NULL); + tf->tpoint = NULL; + tf->mod = NULL; + } + } + mutex_unlock(&event_mutex); + + return NOTIFY_DONE; +} + +static struct notifier_block tracepoint_module_nb = { + .notifier_call = __tracepoint_probe_module_cb, +}; +#endif /* CONFIG_MODULES */ + +struct __find_tracepoint_cb_data { + const char *tp_name; + struct tracepoint *tpoint; +}; + +static void __find_tracepoint_cb(struct tracepoint *tp, void *priv) +{ + struct __find_tracepoint_cb_data *data = priv; + + if (!data->tpoint && !strcmp(data->tp_name, tp->name)) + data->tpoint = tp; +} + +static struct tracepoint *find_tracepoint(const char *tp_name) +{ + struct __find_tracepoint_cb_data data = { + .tp_name = tp_name, + }; + + for_each_kernel_tracepoint(__find_tracepoint_cb, &data); + + return data.tpoint; +} + +static int __trace_fprobe_create(int argc, const char *argv[]) +{ + /* + * Argument syntax: + * - Add fentry probe: + * f[:[GRP/][EVENT]] [MOD:]KSYM [FETCHARGS] + * - Add fexit probe: + * f[N][:[GRP/][EVENT]] [MOD:]KSYM%return [FETCHARGS] + * - Add tracepoint probe: + * t[:[GRP/][EVENT]] TRACEPOINT [FETCHARGS] + * + * Fetch args: + * $retval 
: fetch return value + * $stack : fetch stack address + * $stackN : fetch Nth entry of stack (N:0-) + * $argN : fetch Nth argument (N:1-) + * $comm : fetch current task comm + * @ADDR : fetch memory at ADDR (ADDR should be in kernel) + * @SYM[+|-offs] : fetch memory at SYM +|- offs (SYM is a data symbol) + * Dereferencing memory fetch: + * +|-offs(ARG) : fetch memory at ARG +|- offs address. + * Alias name of args: + * NAME=FETCHARG : set NAME as alias of FETCHARG. + * Type of args: + * FETCHARG:TYPE : use TYPE instead of unsigned long. + */ + struct trace_fprobe *tf = NULL; + int i, len, new_argc = 0, ret = 0; + bool is_return = false; + char *symbol = NULL, *tmp = NULL; + const char *event = NULL, *group = FPROBE_EVENT_SYSTEM; + const char **new_argv = NULL; + int maxactive = 0; + char buf[MAX_EVENT_NAME_LEN]; + char gbuf[MAX_EVENT_NAME_LEN]; + char sbuf[KSYM_NAME_LEN]; + char abuf[MAX_BTF_ARGS_LEN]; + bool is_tracepoint = false; + struct tracepoint *tpoint = NULL; + struct traceprobe_parse_context ctx = { + .flags = TPARG_FL_KERNEL | TPARG_FL_FPROBE, + }; + + if ((argv[0][0] != 'f' && argv[0][0] != 't') || argc < 2) + return -ECANCELED; + + if (argv[0][0] == 't') { + is_tracepoint = true; + group = TRACEPOINT_EVENT_SYSTEM; + } + + trace_probe_log_init("trace_fprobe", argc, argv); + + event = strchr(&argv[0][1], ':'); + if (event) + event++; + + if (isdigit(argv[0][1])) { + if (event) + len = event - &argv[0][1] - 1; + else + len = strlen(&argv[0][1]); + if (len > MAX_EVENT_NAME_LEN - 1) { + trace_probe_log_err(1, BAD_MAXACT); + goto parse_error; + } + memcpy(buf, &argv[0][1], len); + buf[len] = '\0'; + ret = kstrtouint(buf, 0, &maxactive); + if (ret || !maxactive) { + trace_probe_log_err(1, BAD_MAXACT); + goto parse_error; + } + /* fprobe rethook instances are iterated over via a list. The + * maximum should stay reasonable. + */ + if (maxactive > RETHOOK_MAXACTIVE_MAX) { + trace_probe_log_err(1, MAXACT_TOO_BIG); + goto parse_error; + } + } + + trace_probe_log_set_index(1); + + /* a symbol(or tracepoint) must be specified */ + symbol = kstrdup(argv[1], GFP_KERNEL); + if (!symbol) + return -ENOMEM; + + tmp = strchr(symbol, '%'); + if (tmp) { + if (!is_tracepoint && !strcmp(tmp, "%return")) { + *tmp = '\0'; + is_return = true; + } else { + trace_probe_log_err(tmp - symbol, BAD_ADDR_SUFFIX); + goto parse_error; + } + } + if (!is_return && maxactive) { + trace_probe_log_set_index(0); + trace_probe_log_err(1, BAD_MAXACT_TYPE); + goto parse_error; + } + + trace_probe_log_set_index(0); + if (event) { + ret = traceprobe_parse_event_name(&event, &group, gbuf, + event - argv[0]); + if (ret) + goto parse_error; + } + + if (!event) { + /* Make a new event name */ + if (is_tracepoint) + snprintf(buf, MAX_EVENT_NAME_LEN, "%s%s", + isdigit(*symbol) ? "_" : "", symbol); + else + snprintf(buf, MAX_EVENT_NAME_LEN, "%s__%s", symbol, + is_return ? 
"exit" : "entry"); + sanitize_event_name(buf); + event = buf; + } + + if (is_return) + ctx.flags |= TPARG_FL_RETURN; + else + ctx.flags |= TPARG_FL_FENTRY; + + if (is_tracepoint) { + ctx.flags |= TPARG_FL_TPOINT; + tpoint = find_tracepoint(symbol); + if (!tpoint) { + trace_probe_log_set_index(1); + trace_probe_log_err(0, NO_TRACEPOINT); + goto parse_error; + } + ctx.funcname = kallsyms_lookup( + (unsigned long)tpoint->probestub, + NULL, NULL, NULL, sbuf); + } else + ctx.funcname = symbol; + + argc -= 2; argv += 2; + new_argv = traceprobe_expand_meta_args(argc, argv, &new_argc, + abuf, MAX_BTF_ARGS_LEN, &ctx); + if (IS_ERR(new_argv)) { + ret = PTR_ERR(new_argv); + new_argv = NULL; + goto out; + } + if (new_argv) { + argc = new_argc; + argv = new_argv; + } + + /* setup a probe */ + tf = alloc_trace_fprobe(group, event, symbol, tpoint, maxactive, + argc, is_return); + if (IS_ERR(tf)) { + ret = PTR_ERR(tf); + /* This must return -ENOMEM, else there is a bug */ + WARN_ON_ONCE(ret != -ENOMEM); + goto out; /* We know tf is not allocated */ + } + + if (is_tracepoint) + tf->mod = __module_text_address( + (unsigned long)tf->tpoint->probestub); + + /* parse arguments */ + for (i = 0; i < argc && i < MAX_TRACE_ARGS; i++) { + trace_probe_log_set_index(i + 2); + ctx.offset = 0; + ret = traceprobe_parse_probe_arg(&tf->tp, i, argv[i], &ctx); + if (ret) + goto error; /* This can be -ENOMEM */ + } + + ret = traceprobe_set_print_fmt(&tf->tp, + is_return ? PROBE_PRINT_RETURN : PROBE_PRINT_NORMAL); + if (ret < 0) + goto error; + + ret = register_trace_fprobe(tf); + if (ret) { + trace_probe_log_set_index(1); + if (ret == -EILSEQ) + trace_probe_log_err(0, BAD_INSN_BNDRY); + else if (ret == -ENOENT) + trace_probe_log_err(0, BAD_PROBE_ADDR); + else if (ret != -ENOMEM && ret != -EEXIST) + trace_probe_log_err(0, FAIL_REG_PROBE); + goto error; + } + +out: + trace_probe_log_clear(); + kfree(new_argv); + kfree(symbol); + return ret; + +parse_error: + ret = -EINVAL; +error: + free_trace_fprobe(tf); + goto out; +} + +static int trace_fprobe_create(const char *raw_command) +{ + return trace_probe_create(raw_command, __trace_fprobe_create); +} + +static int trace_fprobe_release(struct dyn_event *ev) +{ + struct trace_fprobe *tf = to_trace_fprobe(ev); + int ret = unregister_trace_fprobe(tf); + + if (!ret) + free_trace_fprobe(tf); + return ret; +} + +static int trace_fprobe_show(struct seq_file *m, struct dyn_event *ev) +{ + struct trace_fprobe *tf = to_trace_fprobe(ev); + int i; + + if (trace_fprobe_is_tracepoint(tf)) + seq_putc(m, 't'); + else + seq_putc(m, 'f'); + if (trace_fprobe_is_return(tf) && tf->fp.nr_maxactive) + seq_printf(m, "%d", tf->fp.nr_maxactive); + seq_printf(m, ":%s/%s", trace_probe_group_name(&tf->tp), + trace_probe_name(&tf->tp)); + + seq_printf(m, " %s%s", trace_fprobe_symbol(tf), + trace_fprobe_is_return(tf) ? "%return" : ""); + + for (i = 0; i < tf->tp.nr_args; i++) + seq_printf(m, " %s=%s", tf->tp.args[i].name, tf->tp.args[i].comm); + seq_putc(m, '\n'); + + return 0; +} + +/* + * called by perf_trace_init() or __ftrace_set_clr_event() under event_mutex. 
+ */ +static int fprobe_register(struct trace_event_call *event, + enum trace_reg type, void *data) +{ + struct trace_event_file *file = data; + + switch (type) { + case TRACE_REG_REGISTER: + return enable_trace_fprobe(event, file); + case TRACE_REG_UNREGISTER: + return disable_trace_fprobe(event, file); + +#ifdef CONFIG_PERF_EVENTS + case TRACE_REG_PERF_REGISTER: + return enable_trace_fprobe(event, NULL); + case TRACE_REG_PERF_UNREGISTER: + return disable_trace_fprobe(event, NULL); + case TRACE_REG_PERF_OPEN: + case TRACE_REG_PERF_CLOSE: + case TRACE_REG_PERF_ADD: + case TRACE_REG_PERF_DEL: + return 0; +#endif + } + return 0; +} + +/* + * Register dynevent at core_initcall. This allows kernel to setup fprobe + * events in postcore_initcall without tracefs. + */ +static __init int init_fprobe_trace_early(void) +{ + int ret; + + ret = dyn_event_register(&trace_fprobe_ops); + if (ret) + return ret; + +#ifdef CONFIG_MODULES + ret = register_tracepoint_module_notifier(&tracepoint_module_nb); + if (ret) + return ret; +#endif + + return 0; +} +core_initcall(init_fprobe_trace_early); diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index 203204cadf92..c35fbaab2a47 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -58,6 +58,12 @@ static struct tracer_opt trace_opts[] = { { TRACER_OPT(funcgraph-irqs, TRACE_GRAPH_PRINT_IRQS) }, /* Display function name after trailing } */ { TRACER_OPT(funcgraph-tail, TRACE_GRAPH_PRINT_TAIL) }, +#ifdef CONFIG_FUNCTION_GRAPH_RETVAL + /* Display function return value ? */ + { TRACER_OPT(funcgraph-retval, TRACE_GRAPH_PRINT_RETVAL) }, + /* Display function return value in hexadecimal format ? */ + { TRACER_OPT(funcgraph-retval-hex, TRACE_GRAPH_PRINT_RETVAL_HEX) }, +#endif /* Include sleep time (scheduled out) between entry and return */ { TRACER_OPT(sleep-time, TRACE_GRAPH_SLEEP_TIME) }, @@ -619,6 +625,56 @@ print_graph_duration(struct trace_array *tr, unsigned long long duration, trace_seq_puts(s, "| "); } +#ifdef CONFIG_FUNCTION_GRAPH_RETVAL + +#define __TRACE_GRAPH_PRINT_RETVAL TRACE_GRAPH_PRINT_RETVAL + +static void print_graph_retval(struct trace_seq *s, unsigned long retval, + bool leaf, void *func, bool hex_format) +{ + unsigned long err_code = 0; + + if (retval == 0 || hex_format) + goto done; + + /* Check if the return value matches the negative format */ + if (IS_ENABLED(CONFIG_64BIT) && (retval & BIT(31)) && + (((u64)retval) >> 32) == 0) { + /* sign extension */ + err_code = (unsigned long)(s32)retval; + } else { + err_code = retval; + } + + if (!IS_ERR_VALUE(err_code)) + err_code = 0; + +done: + if (leaf) { + if (hex_format || (err_code == 0)) + trace_seq_printf(s, "%ps(); /* = 0x%lx */\n", + func, retval); + else + trace_seq_printf(s, "%ps(); /* = %ld */\n", + func, err_code); + } else { + if (hex_format || (err_code == 0)) + trace_seq_printf(s, "} /* %ps = 0x%lx */\n", + func, retval); + else + trace_seq_printf(s, "} /* %ps = %ld */\n", + func, err_code); + } +} + +#else + +#define __TRACE_GRAPH_PRINT_RETVAL 0 + +#define print_graph_retval(_seq, _retval, _leaf, _func, _format) do {} while (0) + +#endif + /* Case of a leaf function on its call entry */ static enum print_line_t print_graph_entry_leaf(struct trace_iterator *iter, @@ -663,7 +719,15 @@ print_graph_entry_leaf(struct trace_iterator *iter, for (i = 0; i < call->depth * TRACE_GRAPH_INDENT; i++) trace_seq_putc(s, ' '); - trace_seq_printf(s, "%ps();\n", (void *)call->func); + /* + * Write out the function return 
value if the option function-retval is + * enabled. + */ + if (flags & __TRACE_GRAPH_PRINT_RETVAL) + print_graph_retval(s, graph_ret->retval, true, (void *)call->func, + !!(flags & TRACE_GRAPH_PRINT_RETVAL_HEX)); + else + trace_seq_printf(s, "%ps();\n", (void *)call->func); print_graph_irq(iter, graph_ret->func, TRACE_GRAPH_RET, cpu, iter->ent->pid, flags); @@ -942,16 +1006,25 @@ print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s, trace_seq_putc(s, ' '); /* - * If the return function does not have a matching entry, - * then the entry was lost. Instead of just printing - * the '}' and letting the user guess what function this - * belongs to, write out the function name. Always do - * that if the funcgraph-tail option is enabled. + * Always write out the function name and its return value if the + * function-retval option is enabled. */ - if (func_match && !(flags & TRACE_GRAPH_PRINT_TAIL)) - trace_seq_puts(s, "}\n"); - else - trace_seq_printf(s, "} /* %ps */\n", (void *)trace->func); + if (flags & __TRACE_GRAPH_PRINT_RETVAL) { + print_graph_retval(s, trace->retval, false, (void *)trace->func, + !!(flags & TRACE_GRAPH_PRINT_RETVAL_HEX)); + } else { + /* + * If the return function does not have a matching entry, + * then the entry was lost. Instead of just printing + * the '}' and letting the user guess what function this + * belongs to, write out the function name. Always do + * that if the funcgraph-tail option is enabled. + */ + if (func_match && !(flags & TRACE_GRAPH_PRINT_TAIL)) + trace_seq_puts(s, "}\n"); + else + trace_seq_printf(s, "} /* %ps */\n", (void *)trace->func); + } /* Overrun */ if (flags & TRACE_GRAPH_PRINT_OVERRUN) diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 59cda19a9033..23dba01831f7 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -30,7 +30,7 @@ static char kprobe_boot_events_buf[COMMAND_LINE_SIZE] __initdata; static int __init set_kprobe_boot_events(char *str) { - strlcpy(kprobe_boot_events_buf, str, COMMAND_LINE_SIZE); + strscpy(kprobe_boot_events_buf, str, COMMAND_LINE_SIZE); disable_tracing_selftest("running kprobe events"); return 1; @@ -732,9 +732,10 @@ static int __trace_kprobe_create(int argc, const char *argv[]) * FETCHARG:TYPE : use TYPE instead of unsigned long. 
*/ struct trace_kprobe *tk = NULL; - int i, len, ret = 0; + int i, len, new_argc = 0, ret = 0; bool is_return = false; char *symbol = NULL, *tmp = NULL; + const char **new_argv = NULL; const char *event = NULL, *group = KPROBE_EVENT_SYSTEM; enum probe_print_type ptype; int maxactive = 0; @@ -742,7 +743,8 @@ static int __trace_kprobe_create(int argc, const char *argv[]) void *addr = NULL; char buf[MAX_EVENT_NAME_LEN]; char gbuf[MAX_EVENT_NAME_LEN]; - unsigned int flags = TPARG_FL_KERNEL; + char abuf[MAX_BTF_ARGS_LEN]; + struct traceprobe_parse_context ctx = { .flags = TPARG_FL_KERNEL }; switch (argv[0][0]) { case 'r': @@ -764,7 +766,7 @@ static int __trace_kprobe_create(int argc, const char *argv[]) if (isdigit(argv[0][1])) { if (!is_return) { - trace_probe_log_err(1, MAXACT_NO_KPROBE); + trace_probe_log_err(1, BAD_MAXACT_TYPE); goto parse_error; } if (event) @@ -823,10 +825,10 @@ static int __trace_kprobe_create(int argc, const char *argv[]) goto parse_error; } if (is_return) - flags |= TPARG_FL_RETURN; + ctx.flags |= TPARG_FL_RETURN; ret = kprobe_on_func_entry(NULL, symbol, offset); - if (ret == 0) - flags |= TPARG_FL_FENTRY; + if (ret == 0 && !is_return) + ctx.flags |= TPARG_FL_FENTRY; /* Defer the ENOENT case until register kprobe */ if (ret == -EINVAL && is_return) { trace_probe_log_err(0, BAD_RETPROBE); @@ -854,21 +856,35 @@ static int __trace_kprobe_create(int argc, const char *argv[]) event = buf; } + argc -= 2; argv += 2; + ctx.funcname = symbol; + new_argv = traceprobe_expand_meta_args(argc, argv, &new_argc, + abuf, MAX_BTF_ARGS_LEN, &ctx); + if (IS_ERR(new_argv)) { + ret = PTR_ERR(new_argv); + new_argv = NULL; + goto out; + } + if (new_argv) { + argc = new_argc; + argv = new_argv; + } + /* setup a probe */ tk = alloc_trace_kprobe(group, event, addr, symbol, offset, maxactive, - argc - 2, is_return); + argc, is_return); if (IS_ERR(tk)) { ret = PTR_ERR(tk); /* This must return -ENOMEM, else there is a bug */ WARN_ON_ONCE(ret != -ENOMEM); goto out; /* We know tk is not allocated */ } - argc -= 2; argv += 2; /* parse arguments */ for (i = 0; i < argc && i < MAX_TRACE_ARGS; i++) { trace_probe_log_set_index(i + 2); - ret = traceprobe_parse_probe_arg(&tk->tp, i, argv[i], flags); + ctx.offset = 0; + ret = traceprobe_parse_probe_arg(&tk->tp, i, argv[i], &ctx); if (ret) goto error; /* This can be -ENOMEM */ } @@ -892,6 +908,7 @@ static int __trace_kprobe_create(int argc, const char *argv[]) out: trace_probe_log_clear(); + kfree(new_argv); kfree(symbol); return ret; diff --git a/kernel/trace/trace_osnoise.c b/kernel/trace/trace_osnoise.c index efbbec2caff8..bd0d01d00fb9 100644 --- a/kernel/trace/trace_osnoise.c +++ b/kernel/trace/trace_osnoise.c @@ -181,6 +181,7 @@ struct osn_irq { #define IRQ_CONTEXT 0 #define THREAD_CONTEXT 1 +#define THREAD_URET 2 /* * sofirq runtime info. */ @@ -238,6 +239,7 @@ struct timerlat_variables { u64 abs_period; bool tracing_thread; u64 count; + bool uthread_migrate; }; static DEFINE_PER_CPU(struct timerlat_variables, per_cpu_timerlat_var); @@ -1181,6 +1183,78 @@ thread_exit(struct osnoise_variables *osn_var, struct task_struct *t) osn_var->thread.arrival_time = 0; } +#ifdef CONFIG_TIMERLAT_TRACER +/* + * osnoise_stop_exception - Stop tracing and the tracer. 
+ */ +static __always_inline void osnoise_stop_exception(char *msg, int cpu) +{ + struct osnoise_instance *inst; + struct trace_array *tr; + + rcu_read_lock(); + list_for_each_entry_rcu(inst, &osnoise_instances, list) { + tr = inst->tr; + trace_array_printk_buf(tr->array_buffer.buffer, _THIS_IP_, + "stop tracing hit on cpu %d due to exception: %s\n", + smp_processor_id(), + msg); + + if (test_bit(OSN_PANIC_ON_STOP, &osnoise_options)) + panic("tracer hit on cpu %d due to exception: %s\n", + smp_processor_id(), + msg); + + tracer_tracing_off(tr); + } + rcu_read_unlock(); +} + +/* + * trace_sched_migrate_callback - sched:sched_migrate_task trace event handler + * + * his function is hooked to the sched:sched_migrate_task trace event, and monitors + * timerlat user-space thread migration. + */ +static void trace_sched_migrate_callback(void *data, struct task_struct *p, int dest_cpu) +{ + struct osnoise_variables *osn_var; + long cpu = task_cpu(p); + + osn_var = per_cpu_ptr(&per_cpu_osnoise_var, cpu); + if (osn_var->pid == p->pid && dest_cpu != cpu) { + per_cpu_ptr(&per_cpu_timerlat_var, cpu)->uthread_migrate = 1; + osnoise_taint("timerlat user-thread migrated\n"); + osnoise_stop_exception("timerlat user-thread migrated", cpu); + } +} + +static int register_migration_monitor(void) +{ + int ret = 0; + + /* + * Timerlat thread migration check is only required when running timerlat in user-space. + * Thus, enable callback only if timerlat is set with no workload. + */ + if (timerlat_enabled() && !test_bit(OSN_WORKLOAD, &osnoise_options)) + ret = register_trace_sched_migrate_task(trace_sched_migrate_callback, NULL); + + return ret; +} + +static void unregister_migration_monitor(void) +{ + if (timerlat_enabled() && !test_bit(OSN_WORKLOAD, &osnoise_options)) + unregister_trace_sched_migrate_task(trace_sched_migrate_callback, NULL); +} +#else +static int register_migration_monitor(void) +{ + return 0; +} +static void unregister_migration_monitor(void) {} +#endif /* * trace_sched_switch - sched:sched_switch trace event handler * @@ -1204,7 +1278,7 @@ trace_sched_switch_callback(void *data, bool preempt, } /* - * hook_thread_events - Hook the insturmentation for thread noise + * hook_thread_events - Hook the instrumentation for thread noise * * Hook the osnoise tracer callbacks to handle the noise from other * threads on the necessary kernel events. @@ -1217,11 +1291,19 @@ static int hook_thread_events(void) if (ret) return -EINVAL; + ret = register_migration_monitor(); + if (ret) + goto out_unreg; + return 0; + +out_unreg: + unregister_trace_sched_switch(trace_sched_switch_callback, NULL); + return -EINVAL; } /* - * unhook_thread_events - *nhook the insturmentation for thread noise + * unhook_thread_events - unhook the instrumentation for thread noise * * Unook the osnoise tracer callbacks to handle the noise from other * threads on the necessary kernel events. 
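
For reference, a user-space sketch of driving the new 'f' dynamic event syntax parsed by __trace_fprobe_create() earlier in this patch. This is an illustration, not part of the patch: the tracefs mount point, the probed symbol (vfs_read) and the group/event names are assumptions, and $argN/$retval fetching additionally depends on CONFIG_HAVE_FUNCTION_ARG_ACCESS_API or the BTF argument support added below.

/*
 * Illustrative only: add one fentry and one fexit probe through
 * /sys/kernel/tracing/dynamic_events using the syntax documented in
 * __trace_fprobe_create(). Symbol, group and event names are examples.
 */
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

static int add_dyn_event(const char *cmd)
{
	int fd = open("/sys/kernel/tracing/dynamic_events", O_WRONLY | O_APPEND);
	ssize_t len = (ssize_t)strlen(cmd);
	int ret = -1;

	if (fd < 0)
		return -1;
	if (write(fd, cmd, len) == len)
		ret = 0;
	close(fd);
	return ret;
}

int main(void)
{
	/* f[:[GRP/][EVENT]] KSYM [FETCHARGS]        : function entry probe */
	if (add_dyn_event("f:fprobes/vfs_read_entry vfs_read count=$arg3\n"))
		perror("add fentry probe");
	/* f[:[GRP/][EVENT]] KSYM%return [FETCHARGS] : function exit probe */
	if (add_dyn_event("f:fprobes/vfs_read_exit vfs_read%return ret=$retval\n"))
		perror("add fexit probe");
	return 0;
}

If the writes succeed, the events show up under events/fprobes/ and can be enabled and filtered like any other trace event.
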
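Likewise, a hedged sketch of enabling the funcgraph-retval option added to trace_functions_graph.c above. With the option set, a leaf call is printed as "func(); /* = -2 */": values that decode to an error code are shown in decimal, everything else in hex, and funcgraph-retval-hex forces raw hex. Paths assume tracefs at /sys/kernel/tracing and a kernel built with CONFIG_FUNCTION_GRAPH_RETVAL=y.

/*
 * Illustrative only: turn on the funcgraph-retval option from user space.
 */
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

static int write_str(const char *path, const char *val)
{
	int fd = open(path, O_WRONLY);
	ssize_t len = (ssize_t)strlen(val);
	int ret = -1;

	if (fd < 0)
		return -1;
	if (write(fd, val, len) == len)
		ret = 0;
	close(fd);
	return ret;
}

int main(void)
{
	/* the per-tracer option only exists while function_graph is active */
	if (write_str("/sys/kernel/tracing/current_tracer", "function_graph"))
		perror("current_tracer");
	if (write_str("/sys/kernel/tracing/options/funcgraph-retval", "1"))
		perror("funcgraph-retval");
	/* optional: print raw hex values instead of decoded error codes */
	/* write_str("/sys/kernel/tracing/options/funcgraph-retval-hex", "1"); */
	return 0;
}
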
@@ -1229,6 +1311,7 @@ static int hook_thread_events(void) static void unhook_thread_events(void) { unregister_trace_sched_switch(trace_sched_switch_callback, NULL); + unregister_migration_monitor(); } /* @@ -1286,6 +1369,22 @@ static __always_inline void osnoise_stop_tracing(void) } /* + * osnoise_has_tracing_on - Check if there is at least one instance on + */ +static __always_inline int osnoise_has_tracing_on(void) +{ + struct osnoise_instance *inst; + int trace_is_on = 0; + + rcu_read_lock(); + list_for_each_entry_rcu(inst, &osnoise_instances, list) + trace_is_on += tracer_tracing_is_on(inst->tr); + rcu_read_unlock(); + + return trace_is_on; +} + +/* * notify_new_max_latency - Notify a new max latency via fsnotify interface. */ static void notify_new_max_latency(u64 latency) @@ -1517,13 +1616,16 @@ static struct cpumask save_cpumask; /* * osnoise_sleep - sleep until the next period */ -static void osnoise_sleep(void) +static void osnoise_sleep(bool skip_period) { u64 interval; ktime_t wake_time; mutex_lock(&interface_lock); - interval = osnoise_data.sample_period - osnoise_data.sample_runtime; + if (skip_period) + interval = osnoise_data.sample_period; + else + interval = osnoise_data.sample_period - osnoise_data.sample_runtime; mutex_unlock(&interface_lock); /* @@ -1546,6 +1648,39 @@ static void osnoise_sleep(void) } /* + * osnoise_migration_pending - checks if the task needs to migrate + * + * osnoise/timerlat threads are per-cpu. If there is a pending request to + * migrate the thread away from the current CPU, something bad has happened. + * Play the good citizen and leave. + * + * Returns 0 if it is safe to continue, 1 otherwise. + */ +static inline int osnoise_migration_pending(void) +{ + if (!current->migration_pending) + return 0; + + /* + * If migration is pending, there is a task waiting for the + * tracer to enable migration. The tracer does not allow migration, + * thus: taint and leave to unblock the blocked thread. + */ + osnoise_taint("migration requested to osnoise threads, leaving."); + + /* + * Unset this thread from the threads managed by the interface. + * The tracers are responsible for cleaning their env before + * exiting. + */ + mutex_lock(&interface_lock); + this_cpu_osn_var()->kthread = NULL; + mutex_unlock(&interface_lock); + + return 1; +} + +/* * osnoise_main - The osnoise detection kernel thread * * Calls run_osnoise() function to measure the osnoise for the configured runtime, @@ -1553,12 +1688,35 @@ static void osnoise_sleep(void) */ static int osnoise_main(void *data) { + unsigned long flags; + + /* + * This thread was created pinned to the CPU using PF_NO_SETAFFINITY. + * The problem is that cgroup does not allow PF_NO_SETAFFINITY thread. + * + * To work around this limitation, disable migration and remove the + * flag. 
+ */ + migrate_disable(); + raw_spin_lock_irqsave(¤t->pi_lock, flags); + current->flags &= ~(PF_NO_SETAFFINITY); + raw_spin_unlock_irqrestore(¤t->pi_lock, flags); while (!kthread_should_stop()) { + if (osnoise_migration_pending()) + break; + + /* skip a period if tracing is off on all instances */ + if (!osnoise_has_tracing_on()) { + osnoise_sleep(true); + continue; + } + run_osnoise(); - osnoise_sleep(); + osnoise_sleep(false); } + migrate_enable(); return 0; } @@ -1652,6 +1810,8 @@ static enum hrtimer_restart timerlat_irq(struct hrtimer *timer) osnoise_stop_tracing(); notify_new_max_latency(diff); + wake_up_process(tlat->kthread); + return HRTIMER_NORESTART; } } @@ -1704,6 +1864,7 @@ static int timerlat_main(void *data) struct timerlat_variables *tlat = this_cpu_tmr_var(); struct timerlat_sample s; struct sched_param sp; + unsigned long flags; u64 now, diff; /* @@ -1712,6 +1873,18 @@ static int timerlat_main(void *data) sp.sched_priority = DEFAULT_TIMERLAT_PRIO; sched_setscheduler_nocheck(current, SCHED_FIFO, &sp); + /* + * This thread was created pinned to the CPU using PF_NO_SETAFFINITY. + * The problem is that cgroup does not allow PF_NO_SETAFFINITY thread. + * + * To work around this limitation, disable migration and remove the + * flag. + */ + migrate_disable(); + raw_spin_lock_irqsave(¤t->pi_lock, flags); + current->flags &= ~(PF_NO_SETAFFINITY); + raw_spin_unlock_irqrestore(¤t->pi_lock, flags); + tlat->count = 0; tlat->tracing_thread = false; @@ -1729,6 +1902,7 @@ static int timerlat_main(void *data) osn_var->sampling = 1; while (!kthread_should_stop()) { + now = ktime_to_ns(hrtimer_cb_get_time(&tlat->timer)); diff = now - tlat->abs_period; @@ -1747,10 +1921,14 @@ static int timerlat_main(void *data) if (time_to_us(diff) >= osnoise_data.stop_tracing_total) osnoise_stop_tracing(); + if (osnoise_migration_pending()) + break; + wait_next_period(tlat); } hrtimer_cancel(&tlat->timer); + migrate_enable(); return 0; } #else /* CONFIG_TIMERLAT_TRACER */ @@ -1769,10 +1947,24 @@ static void stop_kthread(unsigned int cpu) kthread = per_cpu(per_cpu_osnoise_var, cpu).kthread; if (kthread) { - kthread_stop(kthread); + if (test_bit(OSN_WORKLOAD, &osnoise_options)) { + kthread_stop(kthread); + } else { + /* + * This is a user thread waiting on the timerlat_fd. We need + * to close all users, and the best way to guarantee this is + * by killing the thread. NOTE: this is a purpose specific file. + */ + kill_pid(kthread->thread_pid, SIGKILL, 1); + put_task_struct(kthread); + } per_cpu(per_cpu_osnoise_var, cpu).kthread = NULL; } else { + /* if no workload, just return */ if (!test_bit(OSN_WORKLOAD, &osnoise_options)) { + /* + * This is set in the osnoise tracer case. + */ per_cpu(per_cpu_osnoise_var, cpu).sampling = false; barrier(); return; @@ -1817,7 +2009,6 @@ static int start_kthread(unsigned int cpu) barrier(); return 0; } - snprintf(comm, 24, "osnoise/%d", cpu); } @@ -1846,6 +2037,11 @@ static int start_per_cpu_kthreads(void) int retval = 0; int cpu; + if (!test_bit(OSN_WORKLOAD, &osnoise_options)) { + if (timerlat_enabled()) + return 0; + } + cpus_read_lock(); /* * Run only on online CPUs in which osnoise is allowed to run. 
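
As context for the timerlat_fd interface added in the next hunk, a minimal user-space workload sketch: with the timerlat tracer selected and the NO_OSNOISE_WORKLOAD osnoise option set (so start_per_cpu_kthreads() above starts no kernel workload threads), one thread per monitored CPU pins itself to that CPU, opens per_cpu/cpuN/timerlat_fd and blocks in read() until the timer fires. The tracefs path and the CPU number are illustrative assumptions.

/*
 * Illustrative only: a per-CPU timerlat user-space workload. Setup
 * (not shown): select the timerlat tracer and set NO_OSNOISE_WORKLOAD
 * in osnoise/options so no kernel workload threads are started.
 */
#define _GNU_SOURCE
#include <fcntl.h>
#include <sched.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	long cpu = 0;	/* example: monitor CPU 0 only */
	char path[128];
	cpu_set_t set;
	char ch;
	int fd;

	/* timerlat_fd_open() rejects readers not pinned to the matching CPU */
	CPU_ZERO(&set);
	CPU_SET(cpu, &set);
	if (sched_setaffinity(0, sizeof(set), &set))
		return 1;

	snprintf(path, sizeof(path),
		 "/sys/kernel/tracing/osnoise/per_cpu/cpu%ld/timerlat_fd", cpu);
	fd = open(path, O_RDONLY);
	if (fd < 0)
		return 1;

	for (;;) {
		/* blocks until the next period; returns 0 on a normal wakeup */
		if (read(fd, &ch, 1) < 0)
			break;
		/* the user-space work to be measured goes here */
	}
	close(fd);
	return 0;
}

Each read() wakes the thread right after the timerlat IRQ, records the user-return and thread samples, and then waits for the next period, matching the flow in timerlat_fd_read() below.
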
@@ -2186,6 +2382,223 @@ err_free: return err; } +#ifdef CONFIG_TIMERLAT_TRACER +static int timerlat_fd_open(struct inode *inode, struct file *file) +{ + struct osnoise_variables *osn_var; + struct timerlat_variables *tlat; + long cpu = (long) inode->i_cdev; + + mutex_lock(&interface_lock); + + /* + * This file is accessible only if timerlat is enabled, and + * NO_OSNOISE_WORKLOAD is set. + */ + if (!timerlat_enabled() || test_bit(OSN_WORKLOAD, &osnoise_options)) { + mutex_unlock(&interface_lock); + return -EINVAL; + } + + migrate_disable(); + + osn_var = this_cpu_osn_var(); + + /* + * The osn_var->pid holds the single access to this file. + */ + if (osn_var->pid) { + mutex_unlock(&interface_lock); + migrate_enable(); + return -EBUSY; + } + + /* + * timerlat tracer is a per-cpu tracer. Check if the user-space too + * is pinned to a single CPU. The tracer laters monitor if the task + * migrates and then disables tracer if it does. However, it is + * worth doing this basic acceptance test to avoid obviusly wrong + * setup. + */ + if (current->nr_cpus_allowed > 1 || cpu != smp_processor_id()) { + mutex_unlock(&interface_lock); + migrate_enable(); + return -EPERM; + } + + /* + * From now on, it is good to go. + */ + file->private_data = inode->i_cdev; + + get_task_struct(current); + + osn_var->kthread = current; + osn_var->pid = current->pid; + + /* + * Setup is done. + */ + mutex_unlock(&interface_lock); + + tlat = this_cpu_tmr_var(); + tlat->count = 0; + + migrate_enable(); + return 0; +}; + +/* + * timerlat_fd_read - Read function for "timerlat_fd" file + * @file: The active open file structure + * @ubuf: The userspace provided buffer to read value into + * @cnt: The maximum number of bytes to read + * @ppos: The current "file" position + * + * Prints 1 on timerlat, the number of interferences on osnoise, -1 on error. + */ +static ssize_t +timerlat_fd_read(struct file *file, char __user *ubuf, size_t count, + loff_t *ppos) +{ + long cpu = (long) file->private_data; + struct osnoise_variables *osn_var; + struct timerlat_variables *tlat; + struct timerlat_sample s; + s64 diff; + u64 now; + + migrate_disable(); + + tlat = this_cpu_tmr_var(); + + /* + * While in user-space, the thread is migratable. There is nothing + * we can do about it. + * So, if the thread is running on another CPU, stop the machinery. + */ + if (cpu == smp_processor_id()) { + if (tlat->uthread_migrate) { + migrate_enable(); + return -EINVAL; + } + } else { + per_cpu_ptr(&per_cpu_timerlat_var, cpu)->uthread_migrate = 1; + osnoise_taint("timerlat user thread migrate\n"); + osnoise_stop_tracing(); + migrate_enable(); + return -EINVAL; + } + + osn_var = this_cpu_osn_var(); + + /* + * The timerlat in user-space runs in a different order: + * the read() starts from the execution of the previous occurrence, + * sleeping for the next occurrence. + * + * So, skip if we are entering on read() before the first wakeup + * from timerlat IRQ: + */ + if (likely(osn_var->sampling)) { + now = ktime_to_ns(hrtimer_cb_get_time(&tlat->timer)); + diff = now - tlat->abs_period; + + /* + * it was not a timer firing, but some other signal? 
+ */ + if (diff < 0) + goto out; + + s.seqnum = tlat->count; + s.timer_latency = diff; + s.context = THREAD_URET; + + trace_timerlat_sample(&s); + + notify_new_max_latency(diff); + + tlat->tracing_thread = false; + if (osnoise_data.stop_tracing_total) + if (time_to_us(diff) >= osnoise_data.stop_tracing_total) + osnoise_stop_tracing(); + } else { + tlat->tracing_thread = false; + tlat->kthread = current; + + hrtimer_init(&tlat->timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED_HARD); + tlat->timer.function = timerlat_irq; + + /* Annotate now to drift new period */ + tlat->abs_period = hrtimer_cb_get_time(&tlat->timer); + + osn_var->sampling = 1; + } + + /* wait for the next period */ + wait_next_period(tlat); + + /* This is the wakeup from this cycle */ + now = ktime_to_ns(hrtimer_cb_get_time(&tlat->timer)); + diff = now - tlat->abs_period; + + /* + * it was not a timer firing, but some other signal? + */ + if (diff < 0) + goto out; + + s.seqnum = tlat->count; + s.timer_latency = diff; + s.context = THREAD_CONTEXT; + + trace_timerlat_sample(&s); + + if (osnoise_data.stop_tracing_total) { + if (time_to_us(diff) >= osnoise_data.stop_tracing_total) { + timerlat_dump_stack(time_to_us(diff)); + notify_new_max_latency(diff); + osnoise_stop_tracing(); + } + } + +out: + migrate_enable(); + return 0; +} + +static int timerlat_fd_release(struct inode *inode, struct file *file) +{ + struct osnoise_variables *osn_var; + struct timerlat_variables *tlat_var; + long cpu = (long) file->private_data; + + migrate_disable(); + mutex_lock(&interface_lock); + + osn_var = per_cpu_ptr(&per_cpu_osnoise_var, cpu); + tlat_var = per_cpu_ptr(&per_cpu_timerlat_var, cpu); + + hrtimer_cancel(&tlat_var->timer); + memset(tlat_var, 0, sizeof(*tlat_var)); + + osn_var->sampling = 0; + osn_var->pid = 0; + + /* + * We are leaving, not being stopped... see stop_kthread(); + */ + if (osn_var->kthread) { + put_task_struct(osn_var->kthread); + osn_var->kthread = NULL; + } + + mutex_unlock(&interface_lock); + migrate_enable(); + return 0; +} +#endif + /* * osnoise/runtime_us: cannot be greater than the period. */ @@ -2249,6 +2662,13 @@ static struct trace_min_max_param timerlat_period = { .max = &timerlat_max_period, .min = &timerlat_min_period, }; + +static const struct file_operations timerlat_fd_fops = { + .open = timerlat_fd_open, + .read = timerlat_fd_read, + .release = timerlat_fd_release, + .llseek = generic_file_llseek, +}; #endif static const struct file_operations cpus_fops = { @@ -2286,18 +2706,63 @@ static int init_timerlat_stack_tracefs(struct dentry *top_dir) } #endif /* CONFIG_STACKTRACE */ +static int osnoise_create_cpu_timerlat_fd(struct dentry *top_dir) +{ + struct dentry *timerlat_fd; + struct dentry *per_cpu; + struct dentry *cpu_dir; + char cpu_str[30]; /* see trace.c: tracing_init_tracefs_percpu() */ + long cpu; + + /* + * Why not using tracing instance per_cpu/ dir? + * + * Because osnoise/timerlat have a single workload, having + * multiple files like these are wast of memory. 
+ */ + per_cpu = tracefs_create_dir("per_cpu", top_dir); + if (!per_cpu) + return -ENOMEM; + + for_each_possible_cpu(cpu) { + snprintf(cpu_str, 30, "cpu%ld", cpu); + cpu_dir = tracefs_create_dir(cpu_str, per_cpu); + if (!cpu_dir) + goto out_clean; + + timerlat_fd = trace_create_file("timerlat_fd", TRACE_MODE_READ, + cpu_dir, NULL, &timerlat_fd_fops); + if (!timerlat_fd) + goto out_clean; + + /* Record the CPU */ + d_inode(timerlat_fd)->i_cdev = (void *)(cpu); + } + + return 0; + +out_clean: + tracefs_remove(per_cpu); + return -ENOMEM; +} + /* * init_timerlat_tracefs - A function to initialize the timerlat interface files */ static int init_timerlat_tracefs(struct dentry *top_dir) { struct dentry *tmp; + int retval; tmp = tracefs_create_file("timerlat_period_us", TRACE_MODE_WRITE, top_dir, &timerlat_period, &trace_min_max_fops); if (!tmp) return -ENOMEM; + retval = osnoise_create_cpu_timerlat_fd(top_dir); + if (retval) + return retval; + return init_timerlat_stack_tracefs(top_dir); } #else /* CONFIG_TIMERLAT_TRACER */ diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index 15f05faaae44..db575094c498 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -847,7 +847,7 @@ static void print_fields(struct trace_iterator *iter, struct trace_event_call *c int ret; void *pos; - list_for_each_entry(field, head, link) { + list_for_each_entry_reverse(field, head, link) { trace_seq_printf(&iter->seq, " %s=", field->name); if (field->offset + field->size > iter->ent_size) { trace_seq_puts(&iter->seq, "<OVERFLOW>"); @@ -1446,6 +1446,8 @@ static struct trace_event trace_osnoise_event = { }; /* TRACE_TIMERLAT */ + +static char *timerlat_lat_context[] = {"irq", "thread", "user-ret"}; static enum print_line_t trace_timerlat_print(struct trace_iterator *iter, int flags, struct trace_event *event) @@ -1458,7 +1460,7 @@ trace_timerlat_print(struct trace_iterator *iter, int flags, trace_seq_printf(s, "#%-5u context %6s timer_latency %9llu ns\n", field->seqnum, - field->context ? 
"thread" : "irq", + timerlat_lat_context[field->context], field->timer_latency); return trace_handle_return(s); diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c index 2d2616678295..7ba371da0926 100644 --- a/kernel/trace/trace_probe.c +++ b/kernel/trace/trace_probe.c @@ -11,6 +11,8 @@ */ #define pr_fmt(fmt) "trace_probe: " fmt +#include <linux/bpf.h> + #include "trace_probe.h" #undef C @@ -254,7 +256,7 @@ int traceprobe_parse_event_name(const char **pevent, const char **pgroup, trace_probe_log_err(offset, GROUP_TOO_LONG); return -EINVAL; } - strlcpy(buf, event, slash - event + 1); + strscpy(buf, event, slash - event + 1); if (!is_good_system_name(buf)) { trace_probe_log_err(offset, BAD_GROUP_NAME); return -EINVAL; @@ -283,69 +285,354 @@ int traceprobe_parse_event_name(const char **pevent, const char **pgroup, return 0; } +static int parse_trace_event_arg(char *arg, struct fetch_insn *code, + struct traceprobe_parse_context *ctx) +{ + struct ftrace_event_field *field; + struct list_head *head; + + head = trace_get_fields(ctx->event); + list_for_each_entry(field, head, link) { + if (!strcmp(arg, field->name)) { + code->op = FETCH_OP_TP_ARG; + code->data = field; + return 0; + } + } + return -ENOENT; +} + +#ifdef CONFIG_PROBE_EVENTS_BTF_ARGS + +static struct btf *traceprobe_get_btf(void) +{ + struct btf *btf = bpf_get_btf_vmlinux(); + + if (IS_ERR_OR_NULL(btf)) + return NULL; + + return btf; +} + +static u32 btf_type_int(const struct btf_type *t) +{ + return *(u32 *)(t + 1); +} + +static const char *type_from_btf_id(struct btf *btf, s32 id) +{ + const struct btf_type *t; + u32 intdata; + s32 tid; + + /* TODO: const char * could be converted as a string */ + t = btf_type_skip_modifiers(btf, id, &tid); + + switch (BTF_INFO_KIND(t->info)) { + case BTF_KIND_ENUM: + /* enum is "int", so convert to "s32" */ + return "s32"; + case BTF_KIND_ENUM64: + return "s64"; + case BTF_KIND_PTR: + /* pointer will be converted to "x??" 
*/ + if (IS_ENABLED(CONFIG_64BIT)) + return "x64"; + else + return "x32"; + case BTF_KIND_INT: + intdata = btf_type_int(t); + if (BTF_INT_ENCODING(intdata) & BTF_INT_SIGNED) { + switch (BTF_INT_BITS(intdata)) { + case 8: + return "s8"; + case 16: + return "s16"; + case 32: + return "s32"; + case 64: + return "s64"; + } + } else { /* unsigned */ + switch (BTF_INT_BITS(intdata)) { + case 8: + return "u8"; + case 16: + return "u16"; + case 32: + return "u32"; + case 64: + return "u64"; + } + } + } + /* TODO: support other types */ + + return NULL; +} + +static const struct btf_type *find_btf_func_proto(const char *funcname) +{ + struct btf *btf = traceprobe_get_btf(); + const struct btf_type *t; + s32 id; + + if (!btf || !funcname) + return ERR_PTR(-EINVAL); + + id = btf_find_by_name_kind(btf, funcname, BTF_KIND_FUNC); + if (id <= 0) + return ERR_PTR(-ENOENT); + + /* Get BTF_KIND_FUNC type */ + t = btf_type_by_id(btf, id); + if (!btf_type_is_func(t)) + return ERR_PTR(-ENOENT); + + /* The type of BTF_KIND_FUNC is BTF_KIND_FUNC_PROTO */ + t = btf_type_by_id(btf, t->type); + if (!btf_type_is_func_proto(t)) + return ERR_PTR(-ENOENT); + + return t; +} + +static const struct btf_param *find_btf_func_param(const char *funcname, s32 *nr, + bool tracepoint) +{ + const struct btf_param *param; + const struct btf_type *t; + + if (!funcname || !nr) + return ERR_PTR(-EINVAL); + + t = find_btf_func_proto(funcname); + if (IS_ERR(t)) + return (const struct btf_param *)t; + + *nr = btf_type_vlen(t); + param = (const struct btf_param *)(t + 1); + + /* Hide the first 'data' argument of tracepoint */ + if (tracepoint) { + (*nr)--; + param++; + } + + if (*nr > 0) + return param; + else + return NULL; +} + +static int parse_btf_arg(const char *varname, struct fetch_insn *code, + struct traceprobe_parse_context *ctx) +{ + struct btf *btf = traceprobe_get_btf(); + const struct btf_param *params; + int i; + + if (!btf) { + trace_probe_log_err(ctx->offset, NOSUP_BTFARG); + return -EOPNOTSUPP; + } + + if (WARN_ON_ONCE(!ctx->funcname)) + return -EINVAL; + + if (!ctx->params) { + params = find_btf_func_param(ctx->funcname, &ctx->nr_params, + ctx->flags & TPARG_FL_TPOINT); + if (IS_ERR(params)) { + trace_probe_log_err(ctx->offset, NO_BTF_ENTRY); + return PTR_ERR(params); + } + ctx->params = params; + } else + params = ctx->params; + + for (i = 0; i < ctx->nr_params; i++) { + const char *name = btf_name_by_offset(btf, params[i].name_off); + + if (name && !strcmp(name, varname)) { + code->op = FETCH_OP_ARG; + if (ctx->flags & TPARG_FL_TPOINT) + code->param = i + 1; + else + code->param = i; + return 0; + } + } + trace_probe_log_err(ctx->offset, NO_BTFARG); + return -ENOENT; +} + +static const struct fetch_type *parse_btf_arg_type(int arg_idx, + struct traceprobe_parse_context *ctx) +{ + struct btf *btf = traceprobe_get_btf(); + const char *typestr = NULL; + + if (btf && ctx->params) { + if (ctx->flags & TPARG_FL_TPOINT) + arg_idx--; + typestr = type_from_btf_id(btf, ctx->params[arg_idx].type); + } + + return find_fetch_type(typestr, ctx->flags); +} + +static const struct fetch_type *parse_btf_retval_type( + struct traceprobe_parse_context *ctx) +{ + struct btf *btf = traceprobe_get_btf(); + const char *typestr = NULL; + const struct btf_type *t; + + if (btf && ctx->funcname) { + t = find_btf_func_proto(ctx->funcname); + if (!IS_ERR(t)) + typestr = type_from_btf_id(btf, t->type); + } + + return find_fetch_type(typestr, ctx->flags); +} + +static bool is_btf_retval_void(const char *funcname) +{ + const struct btf_type *t; + + 
t = find_btf_func_proto(funcname); + if (IS_ERR(t)) + return false; + + return t->type == 0; +} +#else +static struct btf *traceprobe_get_btf(void) +{ + return NULL; +} + +static const struct btf_param *find_btf_func_param(const char *funcname, s32 *nr, + bool tracepoint) +{ + return ERR_PTR(-EOPNOTSUPP); +} + +static int parse_btf_arg(const char *varname, struct fetch_insn *code, + struct traceprobe_parse_context *ctx) +{ + trace_probe_log_err(ctx->offset, NOSUP_BTFARG); + return -EOPNOTSUPP; +} + +#define parse_btf_arg_type(idx, ctx) \ + find_fetch_type(NULL, ctx->flags) + +#define parse_btf_retval_type(ctx) \ + find_fetch_type(NULL, ctx->flags) + +#define is_btf_retval_void(funcname) (false) + +#endif + #define PARAM_MAX_STACK (THREAD_SIZE / sizeof(unsigned long)) static int parse_probe_vars(char *arg, const struct fetch_type *t, - struct fetch_insn *code, unsigned int flags, int offs) + struct fetch_insn *code, + struct traceprobe_parse_context *ctx) { unsigned long param; + int err = TP_ERR_BAD_VAR; int ret = 0; int len; - if (flags & TPARG_FL_TPOINT) { + if (ctx->flags & TPARG_FL_TEVENT) { if (code->data) return -EFAULT; - code->data = kstrdup(arg, GFP_KERNEL); - if (!code->data) - return -ENOMEM; - code->op = FETCH_OP_TP_ARG; - } else if (strcmp(arg, "retval") == 0) { - if (flags & TPARG_FL_RETURN) { + ret = parse_trace_event_arg(arg, code, ctx); + if (!ret) + return 0; + if (strcmp(arg, "comm") == 0 || strcmp(arg, "COMM") == 0) { + code->op = FETCH_OP_COMM; + return 0; + } + /* backward compatibility */ + ctx->offset = 0; + goto inval; + } + + if (strcmp(arg, "retval") == 0) { + if (ctx->flags & TPARG_FL_RETURN) { + if ((ctx->flags & TPARG_FL_KERNEL) && + is_btf_retval_void(ctx->funcname)) { + err = TP_ERR_NO_RETVAL; + goto inval; + } code->op = FETCH_OP_RETVAL; - } else { - trace_probe_log_err(offs, RETVAL_ON_PROBE); - ret = -EINVAL; + return 0; } - } else if ((len = str_has_prefix(arg, "stack"))) { + err = TP_ERR_RETVAL_ON_PROBE; + goto inval; + } + + len = str_has_prefix(arg, "stack"); + if (len) { + if (arg[len] == '\0') { code->op = FETCH_OP_STACKP; - } else if (isdigit(arg[len])) { + return 0; + } + + if (isdigit(arg[len])) { ret = kstrtoul(arg + len, 10, ¶m); - if (ret) { - goto inval_var; - } else if ((flags & TPARG_FL_KERNEL) && - param > PARAM_MAX_STACK) { - trace_probe_log_err(offs, BAD_STACK_NUM); - ret = -EINVAL; - } else { - code->op = FETCH_OP_STACK; - code->param = (unsigned int)param; + if (ret) + goto inval; + + if ((ctx->flags & TPARG_FL_KERNEL) && + param > PARAM_MAX_STACK) { + err = TP_ERR_BAD_STACK_NUM; + goto inval; } - } else - goto inval_var; - } else if (strcmp(arg, "comm") == 0 || strcmp(arg, "COMM") == 0) { + code->op = FETCH_OP_STACK; + code->param = (unsigned int)param; + return 0; + } + goto inval; + } + + if (strcmp(arg, "comm") == 0 || strcmp(arg, "COMM") == 0) { code->op = FETCH_OP_COMM; + return 0; + } + #ifdef CONFIG_HAVE_FUNCTION_ARG_ACCESS_API - } else if (((flags & TPARG_FL_MASK) == - (TPARG_FL_KERNEL | TPARG_FL_FENTRY)) && - (len = str_has_prefix(arg, "arg"))) { + len = str_has_prefix(arg, "arg"); + if (len && tparg_is_function_entry(ctx->flags)) { ret = kstrtoul(arg + len, 10, ¶m); - if (ret) { - goto inval_var; - } else if (!param || param > PARAM_MAX_STACK) { - trace_probe_log_err(offs, BAD_ARG_NUM); - return -EINVAL; + if (ret) + goto inval; + + if (!param || param > PARAM_MAX_STACK) { + err = TP_ERR_BAD_ARG_NUM; + goto inval; } + code->op = FETCH_OP_ARG; code->param = (unsigned int)param - 1; + /* + * The tracepoint probe will probe a 
stub function, and the + * first parameter of the stub is a dummy and should be ignored. + */ + if (ctx->flags & TPARG_FL_TPOINT) + code->param++; + return 0; + } #endif - } else - goto inval_var; - return ret; - -inval_var: - trace_probe_log_err(offs, BAD_VAR); +inval: + __trace_probe_log_err(ctx->offset, err); return -EINVAL; } @@ -378,7 +665,7 @@ static int __parse_imm_string(char *str, char **pbuf, int offs) static int parse_probe_arg(char *arg, const struct fetch_type *type, struct fetch_insn **pcode, struct fetch_insn *end, - unsigned int flags, int offs) + struct traceprobe_parse_context *ctx) { struct fetch_insn *code = *pcode; unsigned long param; @@ -389,13 +676,13 @@ parse_probe_arg(char *arg, const struct fetch_type *type, switch (arg[0]) { case '$': - ret = parse_probe_vars(arg + 1, type, code, flags, offs); + ret = parse_probe_vars(arg + 1, type, code, ctx); break; case '%': /* named register */ - if (flags & TPARG_FL_TPOINT) { - /* eprobes do not handle registers */ - trace_probe_log_err(offs, BAD_VAR); + if (ctx->flags & (TPARG_FL_TEVENT | TPARG_FL_FPROBE)) { + /* eprobe and fprobe do not handle registers */ + trace_probe_log_err(ctx->offset, BAD_VAR); break; } ret = regs_query_register_offset(arg + 1); @@ -404,14 +691,14 @@ parse_probe_arg(char *arg, const struct fetch_type *type, code->param = (unsigned int)ret; ret = 0; } else - trace_probe_log_err(offs, BAD_REG_NAME); + trace_probe_log_err(ctx->offset, BAD_REG_NAME); break; case '@': /* memory, file-offset or symbol */ if (isdigit(arg[1])) { ret = kstrtoul(arg + 1, 0, ¶m); if (ret) { - trace_probe_log_err(offs, BAD_MEM_ADDR); + trace_probe_log_err(ctx->offset, BAD_MEM_ADDR); break; } /* load address */ @@ -419,13 +706,13 @@ parse_probe_arg(char *arg, const struct fetch_type *type, code->immediate = param; } else if (arg[1] == '+') { /* kprobes don't support file offsets */ - if (flags & TPARG_FL_KERNEL) { - trace_probe_log_err(offs, FILE_ON_KPROBE); + if (ctx->flags & TPARG_FL_KERNEL) { + trace_probe_log_err(ctx->offset, FILE_ON_KPROBE); return -EINVAL; } ret = kstrtol(arg + 2, 0, &offset); if (ret) { - trace_probe_log_err(offs, BAD_FILE_OFFS); + trace_probe_log_err(ctx->offset, BAD_FILE_OFFS); break; } @@ -433,8 +720,8 @@ parse_probe_arg(char *arg, const struct fetch_type *type, code->immediate = (unsigned long)offset; // imm64? } else { /* uprobes don't support symbols */ - if (!(flags & TPARG_FL_KERNEL)) { - trace_probe_log_err(offs, SYM_ON_UPROBE); + if (!(ctx->flags & TPARG_FL_KERNEL)) { + trace_probe_log_err(ctx->offset, SYM_ON_UPROBE); return -EINVAL; } /* Preserve symbol for updating */ @@ -443,7 +730,7 @@ parse_probe_arg(char *arg, const struct fetch_type *type, if (!code->data) return -ENOMEM; if (++code == end) { - trace_probe_log_err(offs, TOO_MANY_OPS); + trace_probe_log_err(ctx->offset, TOO_MANY_OPS); return -EINVAL; } code->op = FETCH_OP_IMM; @@ -451,7 +738,7 @@ parse_probe_arg(char *arg, const struct fetch_type *type, } /* These are fetching from memory */ if (++code == end) { - trace_probe_log_err(offs, TOO_MANY_OPS); + trace_probe_log_err(ctx->offset, TOO_MANY_OPS); return -EINVAL; } *pcode = code; @@ -470,36 +757,38 @@ parse_probe_arg(char *arg, const struct fetch_type *type, arg++; /* Skip '+', because kstrtol() rejects it. 
*/ tmp = strchr(arg, '('); if (!tmp) { - trace_probe_log_err(offs, DEREF_NEED_BRACE); + trace_probe_log_err(ctx->offset, DEREF_NEED_BRACE); return -EINVAL; } *tmp = '\0'; ret = kstrtol(arg, 0, &offset); if (ret) { - trace_probe_log_err(offs, BAD_DEREF_OFFS); + trace_probe_log_err(ctx->offset, BAD_DEREF_OFFS); break; } - offs += (tmp + 1 - arg) + (arg[0] != '-' ? 1 : 0); + ctx->offset += (tmp + 1 - arg) + (arg[0] != '-' ? 1 : 0); arg = tmp + 1; tmp = strrchr(arg, ')'); if (!tmp) { - trace_probe_log_err(offs + strlen(arg), + trace_probe_log_err(ctx->offset + strlen(arg), DEREF_OPEN_BRACE); return -EINVAL; } else { - const struct fetch_type *t2 = find_fetch_type(NULL, flags); + const struct fetch_type *t2 = find_fetch_type(NULL, ctx->flags); + int cur_offs = ctx->offset; *tmp = '\0'; - ret = parse_probe_arg(arg, t2, &code, end, flags, offs); + ret = parse_probe_arg(arg, t2, &code, end, ctx); if (ret) break; + ctx->offset = cur_offs; if (code->op == FETCH_OP_COMM || code->op == FETCH_OP_DATA) { - trace_probe_log_err(offs, COMM_CANT_DEREF); + trace_probe_log_err(ctx->offset, COMM_CANT_DEREF); return -EINVAL; } if (++code == end) { - trace_probe_log_err(offs, TOO_MANY_OPS); + trace_probe_log_err(ctx->offset, TOO_MANY_OPS); return -EINVAL; } *pcode = code; @@ -510,7 +799,7 @@ parse_probe_arg(char *arg, const struct fetch_type *type, break; case '\\': /* Immediate value */ if (arg[1] == '"') { /* Immediate string */ - ret = __parse_imm_string(arg + 2, &tmp, offs + 2); + ret = __parse_imm_string(arg + 2, &tmp, ctx->offset + 2); if (ret) break; code->op = FETCH_OP_DATA; @@ -518,15 +807,24 @@ parse_probe_arg(char *arg, const struct fetch_type *type, } else { ret = str_to_immediate(arg + 1, &code->immediate); if (ret) - trace_probe_log_err(offs + 1, BAD_IMM); + trace_probe_log_err(ctx->offset + 1, BAD_IMM); else code->op = FETCH_OP_IMM; } break; + default: + if (isalpha(arg[0]) || arg[0] == '_') { /* BTF variable */ + if (!tparg_is_function_entry(ctx->flags)) { + trace_probe_log_err(ctx->offset, NOSUP_BTFARG); + return -EINVAL; + } + ret = parse_btf_arg(arg, code, ctx); + break; + } } if (!ret && code->op == FETCH_OP_NOP) { /* Parsed, but do not find fetch method */ - trace_probe_log_err(offs, BAD_FETCH_ARG); + trace_probe_log_err(ctx->offset, BAD_FETCH_ARG); ret = -EINVAL; } return ret; @@ -571,12 +869,13 @@ static int __parse_bitfield_probe_arg(const char *bf, /* String length checking wrapper */ static int traceprobe_parse_probe_arg_body(const char *argv, ssize_t *size, - struct probe_arg *parg, unsigned int flags, int offset) + struct probe_arg *parg, + struct traceprobe_parse_context *ctx) { struct fetch_insn *code, *scode, *tmp = NULL; char *t, *t2, *t3; - char *arg; int ret, len; + char *arg; arg = kstrdup(argv, GFP_KERNEL); if (!arg) @@ -585,10 +884,10 @@ static int traceprobe_parse_probe_arg_body(const char *argv, ssize_t *size, ret = -EINVAL; len = strlen(arg); if (len > MAX_ARGSTR_LEN) { - trace_probe_log_err(offset, ARG_TOO_LONG); + trace_probe_log_err(ctx->offset, ARG_TOO_LONG); goto out; } else if (len == 0) { - trace_probe_log_err(offset, NO_ARG_BODY); + trace_probe_log_err(ctx->offset, NO_ARG_BODY); goto out; } @@ -606,23 +905,24 @@ static int traceprobe_parse_probe_arg_body(const char *argv, ssize_t *size, *t2++ = '\0'; t3 = strchr(t2, ']'); if (!t3) { - offset += t2 + strlen(t2) - arg; - trace_probe_log_err(offset, + int offs = t2 + strlen(t2) - arg; + + trace_probe_log_err(ctx->offset + offs, ARRAY_NO_CLOSE); goto out; } else if (t3[1] != '\0') { - trace_probe_log_err(offset + t3 + 
1 - arg, + trace_probe_log_err(ctx->offset + t3 + 1 - arg, BAD_ARRAY_SUFFIX); goto out; } *t3 = '\0'; if (kstrtouint(t2, 0, &parg->count) || !parg->count) { - trace_probe_log_err(offset + t2 - arg, + trace_probe_log_err(ctx->offset + t2 - arg, BAD_ARRAY_NUM); goto out; } if (parg->count > MAX_ARRAY_LEN) { - trace_probe_log_err(offset + t2 - arg, + trace_probe_log_err(ctx->offset + t2 - arg, ARRAY_TOO_BIG); goto out; } @@ -633,17 +933,17 @@ static int traceprobe_parse_probe_arg_body(const char *argv, ssize_t *size, * Since $comm and immediate string can not be dereferenced, * we can find those by strcmp. But ignore for eprobes. */ - if (!(flags & TPARG_FL_TPOINT) && + if (!(ctx->flags & TPARG_FL_TEVENT) && (strcmp(arg, "$comm") == 0 || strcmp(arg, "$COMM") == 0 || strncmp(arg, "\\\"", 2) == 0)) { /* The type of $comm must be "string", and not an array. */ if (parg->count || (t && strcmp(t, "string"))) goto out; - parg->type = find_fetch_type("string", flags); + parg->type = find_fetch_type("string", ctx->flags); } else - parg->type = find_fetch_type(t, flags); + parg->type = find_fetch_type(t, ctx->flags); if (!parg->type) { - trace_probe_log_err(offset + (t ? (t - arg) : 0), BAD_TYPE); + trace_probe_log_err(ctx->offset + (t ? (t - arg) : 0), BAD_TYPE); goto out; } parg->offset = *size; @@ -665,10 +965,18 @@ static int traceprobe_parse_probe_arg_body(const char *argv, ssize_t *size, code[FETCH_INSN_MAX - 1].op = FETCH_OP_END; ret = parse_probe_arg(arg, parg->type, &code, &code[FETCH_INSN_MAX - 1], - flags, offset); + ctx); if (ret) goto fail; + /* Update storing type if BTF is available */ + if (IS_ENABLED(CONFIG_PROBE_EVENTS_BTF_ARGS) && !t) { + if (code->op == FETCH_OP_ARG) + parg->type = parse_btf_arg_type(code->param, ctx); + else if (code->op == FETCH_OP_RETVAL) + parg->type = parse_btf_retval_type(ctx); + } + ret = -EINVAL; /* Store operation */ if (parg->type->is_string) { @@ -676,7 +984,7 @@ static int traceprobe_parse_probe_arg_body(const char *argv, ssize_t *size, if (code->op != FETCH_OP_REG && code->op != FETCH_OP_STACK && code->op != FETCH_OP_RETVAL && code->op != FETCH_OP_ARG && code->op != FETCH_OP_DEREF && code->op != FETCH_OP_TP_ARG) { - trace_probe_log_err(offset + (t ? (t - arg) : 0), + trace_probe_log_err(ctx->offset + (t ? (t - arg) : 0), BAD_SYMSTRING); goto fail; } @@ -684,7 +992,7 @@ static int traceprobe_parse_probe_arg_body(const char *argv, ssize_t *size, if (code->op != FETCH_OP_DEREF && code->op != FETCH_OP_UDEREF && code->op != FETCH_OP_IMM && code->op != FETCH_OP_COMM && code->op != FETCH_OP_DATA && code->op != FETCH_OP_TP_ARG) { - trace_probe_log_err(offset + (t ? (t - arg) : 0), + trace_probe_log_err(ctx->offset + (t ? 
(t - arg) : 0), BAD_STRING); goto fail; } @@ -703,7 +1011,7 @@ static int traceprobe_parse_probe_arg_body(const char *argv, ssize_t *size, */ code++; if (code->op != FETCH_OP_NOP) { - trace_probe_log_err(offset, TOO_MANY_OPS); + trace_probe_log_err(ctx->offset, TOO_MANY_OPS); goto fail; } } @@ -726,7 +1034,7 @@ static int traceprobe_parse_probe_arg_body(const char *argv, ssize_t *size, } else { code++; if (code->op != FETCH_OP_NOP) { - trace_probe_log_err(offset, TOO_MANY_OPS); + trace_probe_log_err(ctx->offset, TOO_MANY_OPS); goto fail; } code->op = FETCH_OP_ST_RAW; @@ -737,7 +1045,7 @@ static int traceprobe_parse_probe_arg_body(const char *argv, ssize_t *size, if (t != NULL) { ret = __parse_bitfield_probe_arg(t, parg->type, &code); if (ret) { - trace_probe_log_err(offset + t - arg, BAD_BITFIELD); + trace_probe_log_err(ctx->offset + t - arg, BAD_BITFIELD); goto fail; } } @@ -747,13 +1055,13 @@ static int traceprobe_parse_probe_arg_body(const char *argv, ssize_t *size, if (scode->op != FETCH_OP_ST_MEM && scode->op != FETCH_OP_ST_STRING && scode->op != FETCH_OP_ST_USTRING) { - trace_probe_log_err(offset + (t ? (t - arg) : 0), + trace_probe_log_err(ctx->offset + (t ? (t - arg) : 0), BAD_STRING); goto fail; } code++; if (code->op != FETCH_OP_NOP) { - trace_probe_log_err(offset, TOO_MANY_OPS); + trace_probe_log_err(ctx->offset, TOO_MANY_OPS); goto fail; } code->op = FETCH_OP_LP_ARRAY; @@ -801,8 +1109,35 @@ static int traceprobe_conflict_field_name(const char *name, return 0; } +static char *generate_probe_arg_name(const char *arg, int idx) +{ + char *name = NULL; + const char *end; + + /* + * If argument name is omitted, try arg as a name (BTF variable) + * or "argN". + */ + if (IS_ENABLED(CONFIG_PROBE_EVENTS_BTF_ARGS)) { + end = strchr(arg, ':'); + if (!end) + end = arg + strlen(arg); + + name = kmemdup_nul(arg, end - arg, GFP_KERNEL); + if (!name || !is_good_name(name)) { + kfree(name); + name = NULL; + } + } + + if (!name) + name = kasprintf(GFP_KERNEL, "arg%d", idx + 1); + + return name; +} + int traceprobe_parse_probe_arg(struct trace_probe *tp, int i, const char *arg, - unsigned int flags) + struct traceprobe_parse_context *ctx) { struct probe_arg *parg = &tp->args[i]; const char *body; @@ -822,8 +1157,7 @@ int traceprobe_parse_probe_arg(struct trace_probe *tp, int i, const char *arg, parg->name = kmemdup_nul(arg, body - arg, GFP_KERNEL); body++; } else { - /* If argument name is omitted, set "argN" */ - parg->name = kasprintf(GFP_KERNEL, "arg%d", i + 1); + parg->name = generate_probe_arg_name(arg, i); body = arg; } if (!parg->name) @@ -837,9 +1171,9 @@ int traceprobe_parse_probe_arg(struct trace_probe *tp, int i, const char *arg, trace_probe_log_err(0, USED_ARG_NAME); return -EINVAL; } + ctx->offset = body - arg; /* Parse fetch argument */ - return traceprobe_parse_probe_arg_body(body, &tp->size, parg, flags, - body - arg); + return traceprobe_parse_probe_arg_body(body, &tp->size, parg, ctx); } void traceprobe_free_probe_arg(struct probe_arg *arg) @@ -858,6 +1192,151 @@ void traceprobe_free_probe_arg(struct probe_arg *arg) kfree(arg->fmt); } +static int argv_has_var_arg(int argc, const char *argv[], int *args_idx, + struct traceprobe_parse_context *ctx) +{ + int i, found = 0; + + for (i = 0; i < argc; i++) + if (str_has_prefix(argv[i], "$arg")) { + trace_probe_log_set_index(i + 2); + + if (!tparg_is_function_entry(ctx->flags)) { + trace_probe_log_err(0, NOFENTRY_ARGS); + return -EINVAL; + } + + if (isdigit(argv[i][4])) { + found = 1; + continue; + } + + if (argv[i][4] != '*') { + 
trace_probe_log_err(0, BAD_VAR); + return -EINVAL; + } + + if (*args_idx >= 0 && *args_idx < argc) { + trace_probe_log_err(0, DOUBLE_ARGS); + return -EINVAL; + } + found = 1; + *args_idx = i; + } + + return found; +} + +static int sprint_nth_btf_arg(int idx, const char *type, + char *buf, int bufsize, + struct traceprobe_parse_context *ctx) +{ + struct btf *btf = traceprobe_get_btf(); + const char *name; + int ret; + + if (idx >= ctx->nr_params) { + trace_probe_log_err(0, NO_BTFARG); + return -ENOENT; + } + name = btf_name_by_offset(btf, ctx->params[idx].name_off); + if (!name) { + trace_probe_log_err(0, NO_BTF_ENTRY); + return -ENOENT; + } + ret = snprintf(buf, bufsize, "%s%s", name, type); + if (ret >= bufsize) { + trace_probe_log_err(0, ARGS_2LONG); + return -E2BIG; + } + return ret; +} + +/* Return new_argv which must be freed after use */ +const char **traceprobe_expand_meta_args(int argc, const char *argv[], + int *new_argc, char *buf, int bufsize, + struct traceprobe_parse_context *ctx) +{ + const struct btf_param *params = NULL; + int i, j, n, used, ret, args_idx = -1; + const char **new_argv = NULL; + int nr_params; + + ret = argv_has_var_arg(argc, argv, &args_idx, ctx); + if (ret < 0) + return ERR_PTR(ret); + + if (!ret) { + *new_argc = argc; + return NULL; + } + + params = find_btf_func_param(ctx->funcname, &nr_params, + ctx->flags & TPARG_FL_TPOINT); + if (IS_ERR(params)) { + if (args_idx != -1) { + /* $arg* requires BTF info */ + trace_probe_log_err(0, NOSUP_BTFARG); + return (const char **)params; + } + *new_argc = argc; + return NULL; + } + ctx->params = params; + ctx->nr_params = nr_params; + + if (args_idx >= 0) + *new_argc = argc + ctx->nr_params - 1; + else + *new_argc = argc; + + new_argv = kcalloc(*new_argc, sizeof(char *), GFP_KERNEL); + if (!new_argv) + return ERR_PTR(-ENOMEM); + + used = 0; + for (i = 0, j = 0; i < argc; i++) { + trace_probe_log_set_index(i + 2); + if (i == args_idx) { + for (n = 0; n < nr_params; n++) { + ret = sprint_nth_btf_arg(n, "", buf + used, + bufsize - used, ctx); + if (ret < 0) + goto error; + + new_argv[j++] = buf + used; + used += ret + 1; + } + continue; + } + + if (str_has_prefix(argv[i], "$arg")) { + char *type = NULL; + + n = simple_strtoul(argv[i] + 4, &type, 10); + if (type && !(*type == ':' || *type == '\0')) { + trace_probe_log_err(0, BAD_VAR); + ret = -ENOENT; + goto error; + } + /* Note: $argN starts from $arg1 */ + ret = sprint_nth_btf_arg(n - 1, type, buf + used, + bufsize - used, ctx); + if (ret < 0) + goto error; + new_argv[j++] = buf + used; + used += ret + 1; + } else + new_argv[j++] = argv[i]; + } + + return new_argv; + +error: + kfree(new_argv); + return ERR_PTR(ret); +} + int traceprobe_update_arg(struct probe_arg *arg) { struct fetch_insn *code = arg->code; diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h index ef8ed3b65d05..01ea148723de 100644 --- a/kernel/trace/trace_probe.h +++ b/kernel/trace/trace_probe.h @@ -23,6 +23,7 @@ #include <linux/limits.h> #include <linux/uaccess.h> #include <linux/bitops.h> +#include <linux/btf.h> #include <asm/bitsperlong.h> #include "trace.h" @@ -32,7 +33,9 @@ #define MAX_ARGSTR_LEN 63 #define MAX_ARRAY_LEN 64 #define MAX_ARG_NAME_LEN 32 +#define MAX_BTF_ARGS_LEN 128 #define MAX_STRING_SIZE PATH_MAX +#define MAX_ARG_BUF_LEN (MAX_TRACE_ARGS * MAX_ARG_NAME_LEN) /* Reserved field names */ #define FIELD_STRING_IP "__probe_ip" @@ -308,7 +311,7 @@ trace_probe_primary_from_call(struct trace_event_call *call) { struct trace_probe_event *tpe = 
trace_probe_event_from_call(call); - return list_first_entry(&tpe->probes, struct trace_probe, list); + return list_first_entry_or_null(&tpe->probes, struct trace_probe, list); } static inline struct list_head *trace_probe_probe_list(struct trace_probe *tp) @@ -357,15 +360,42 @@ int trace_probe_print_args(struct trace_seq *s, struct probe_arg *args, int nr_a #define trace_probe_for_each_link_rcu(pos, tp) \ list_for_each_entry_rcu(pos, &(tp)->event->files, list) +/* + * The flags used for parsing trace_probe arguments. + * TPARG_FL_RETURN, TPARG_FL_FENTRY and TPARG_FL_TEVENT are mutually exclusive. + * TPARG_FL_KERNEL and TPARG_FL_USER are also mutually exclusive. + * TPARG_FL_FPROBE and TPARG_FL_TPOINT are optional but it should be with + * TPARG_FL_KERNEL. + */ #define TPARG_FL_RETURN BIT(0) #define TPARG_FL_KERNEL BIT(1) #define TPARG_FL_FENTRY BIT(2) -#define TPARG_FL_TPOINT BIT(3) +#define TPARG_FL_TEVENT BIT(3) #define TPARG_FL_USER BIT(4) -#define TPARG_FL_MASK GENMASK(4, 0) +#define TPARG_FL_FPROBE BIT(5) +#define TPARG_FL_TPOINT BIT(6) +#define TPARG_FL_LOC_MASK GENMASK(4, 0) + +static inline bool tparg_is_function_entry(unsigned int flags) +{ + return (flags & TPARG_FL_LOC_MASK) == (TPARG_FL_KERNEL | TPARG_FL_FENTRY); +} + +struct traceprobe_parse_context { + struct trace_event_call *event; + const struct btf_param *params; + s32 nr_params; + const char *funcname; + unsigned int flags; + int offset; +}; extern int traceprobe_parse_probe_arg(struct trace_probe *tp, int i, - const char *argv, unsigned int flags); + const char *argv, + struct traceprobe_parse_context *ctx); +const char **traceprobe_expand_meta_args(int argc, const char *argv[], + int *new_argc, char *buf, int bufsize, + struct traceprobe_parse_context *ctx); extern int traceprobe_update_arg(struct probe_arg *arg); extern void traceprobe_free_probe_arg(struct probe_arg *arg); @@ -404,11 +434,12 @@ extern int traceprobe_define_arg_fields(struct trace_event_call *event_call, C(REFCNT_OPEN_BRACE, "Reference counter brace is not closed"), \ C(BAD_REFCNT_SUFFIX, "Reference counter has wrong suffix"), \ C(BAD_UPROBE_OFFS, "Invalid uprobe offset"), \ - C(MAXACT_NO_KPROBE, "Maxactive is not for kprobe"), \ + C(BAD_MAXACT_TYPE, "Maxactive is only for function exit"), \ C(BAD_MAXACT, "Invalid maxactive number"), \ C(MAXACT_TOO_BIG, "Maxactive is too big"), \ C(BAD_PROBE_ADDR, "Invalid probed address or symbol"), \ C(BAD_RETPROBE, "Retprobe address must be an function entry"), \ + C(NO_TRACEPOINT, "Tracepoint is not found"), \ C(BAD_ADDR_SUFFIX, "Invalid probed address suffix"), \ C(NO_GROUP_NAME, "Group name is not specified"), \ C(GROUP_TOO_LONG, "Group name is too long"), \ @@ -418,6 +449,7 @@ extern int traceprobe_define_arg_fields(struct trace_event_call *event_call, C(BAD_EVENT_NAME, "Event name must follow the same rules as C identifiers"), \ C(EVENT_EXIST, "Given group/event name is already used by another event"), \ C(RETVAL_ON_PROBE, "$retval is not available on probe"), \ + C(NO_RETVAL, "This function returns 'void' type"), \ C(BAD_STACK_NUM, "Invalid stack number"), \ C(BAD_ARG_NUM, "Invalid argument number"), \ C(BAD_VAR, "Invalid $-valiable specified"), \ @@ -456,7 +488,14 @@ extern int traceprobe_define_arg_fields(struct trace_event_call *event_call, C(NO_EVENT_INFO, "This requires both group and event name to attach"),\ C(BAD_ATTACH_EVENT, "Attached event does not exist"),\ C(BAD_ATTACH_ARG, "Attached event does not have this field"),\ - C(NO_EP_FILTER, "No filter rule after 'if'"), + C(NO_EP_FILTER, "No filter 
rule after 'if'"), \ + C(NOSUP_BTFARG, "BTF is not available or not supported"), \ + C(NO_BTFARG, "This variable is not found at this probe point"),\ + C(NO_BTF_ENTRY, "No BTF entry for this probe point"), \ + C(BAD_VAR_ARGS, "$arg* must be an independent parameter without name etc."),\ + C(NOFENTRY_ARGS, "$arg* can be used only on function entry"), \ + C(DOUBLE_ARGS, "$arg* can be used only once in the parameters"), \ + C(ARGS_2LONG, "$arg* failed because the argument list is too long"), #undef C #define C(a, b) TP_ERR_##a diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c index a931d9aaea26..529590499b1f 100644 --- a/kernel/trace/trace_selftest.c +++ b/kernel/trace/trace_selftest.c @@ -848,6 +848,12 @@ trace_selftest_startup_function_graph(struct tracer *trace, } #ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS + /* + * These tests can take some time to run. Make sure on non PREEMPT + * kernels, we do not trigger the softlockup detector. + */ + cond_resched(); + tracing_reset_online_cpus(&tr->array_buffer); set_graph_array(tr); @@ -869,6 +875,8 @@ trace_selftest_startup_function_graph(struct tracer *trace, if (ret) goto out; + cond_resched(); + ret = register_ftrace_graph(&fgraph_ops); if (ret) { warn_failed_init_tracer(trace, ret); @@ -891,6 +899,8 @@ trace_selftest_startup_function_graph(struct tracer *trace, if (ret) goto out; + cond_resched(); + tracing_start(); if (!ret && !count) { diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index 8b92e34ff0c8..fa09b33ee731 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -686,10 +686,12 @@ static int __trace_uprobe_create(int argc, const char **argv) /* parse arguments */ for (i = 0; i < argc && i < MAX_TRACE_ARGS; i++) { + struct traceprobe_parse_context ctx = { + .flags = (is_return ? TPARG_FL_RETURN : 0) | TPARG_FL_USER, + }; + trace_probe_log_set_index(i + 2); - ret = traceprobe_parse_probe_arg(&tu->tp, i, argv[i], - (is_return ? 
TPARG_FL_RETURN : 0) | - TPARG_FL_USER); + ret = traceprobe_parse_probe_arg(&tu->tp, i, argv[i], &ctx); if (ret) goto error; } diff --git a/kernel/umh.c b/kernel/umh.c index 60aa9e764a38..1b13c5d34624 100644 --- a/kernel/umh.c +++ b/kernel/umh.c @@ -494,6 +494,7 @@ int call_usermodehelper(const char *path, char **argv, char **envp, int wait) } EXPORT_SYMBOL(call_usermodehelper); +#if defined(CONFIG_SYSCTL) static int proc_cap_handler(struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { @@ -544,7 +545,7 @@ static int proc_cap_handler(struct ctl_table *table, int write, return 0; } -struct ctl_table usermodehelper_table[] = { +static struct ctl_table usermodehelper_table[] = { { .procname = "bset", .data = &usermodehelper_bset, @@ -561,3 +562,11 @@ struct ctl_table usermodehelper_table[] = { }, { } }; + +static int __init init_umh_sysctls(void) +{ + register_sysctl_init("kernel/usermodehelper", usermodehelper_table); + return 0; +} +early_initcall(init_umh_sysctls); +#endif /* CONFIG_SYSCTL */ diff --git a/kernel/vhost_task.c b/kernel/vhost_task.c index b7cbd66f889e..da35e5b7f047 100644 --- a/kernel/vhost_task.c +++ b/kernel/vhost_task.c @@ -12,58 +12,90 @@ enum vhost_task_flags { VHOST_TASK_FLAGS_STOP, }; +struct vhost_task { + bool (*fn)(void *data); + void *data; + struct completion exited; + unsigned long flags; + struct task_struct *task; +}; + static int vhost_task_fn(void *data) { struct vhost_task *vtsk = data; - int ret; + bool dead = false; + + for (;;) { + bool did_work; + + if (!dead && signal_pending(current)) { + struct ksignal ksig; + /* + * Calling get_signal will block in SIGSTOP, + * or clear fatal_signal_pending, but remember + * what was set. + * + * This thread won't actually exit until all + * of the file descriptors are closed, and + * the release function is called. + */ + dead = get_signal(&ksig); + if (dead) + clear_thread_flag(TIF_SIGPENDING); + } + + /* mb paired w/ vhost_task_stop */ + set_current_state(TASK_INTERRUPTIBLE); + + if (test_bit(VHOST_TASK_FLAGS_STOP, &vtsk->flags)) { + __set_current_state(TASK_RUNNING); + break; + } + + did_work = vtsk->fn(vtsk->data); + if (!did_work) + schedule(); + } - ret = vtsk->fn(vtsk->data); complete(&vtsk->exited); - do_exit(ret); + do_exit(0); } /** + * vhost_task_wake - wakeup the vhost_task + * @vtsk: vhost_task to wake + * + * wake up the vhost_task worker thread + */ +void vhost_task_wake(struct vhost_task *vtsk) +{ + wake_up_process(vtsk->task); +} +EXPORT_SYMBOL_GPL(vhost_task_wake); + +/** * vhost_task_stop - stop a vhost_task * @vtsk: vhost_task to stop * - * Callers must call vhost_task_should_stop and return from their worker - * function when it returns true; + * vhost_task_fn ensures the worker thread exits after + * VHOST_TASK_FLAGS_SOP becomes true. */ void vhost_task_stop(struct vhost_task *vtsk) { - pid_t pid = vtsk->task->pid; - set_bit(VHOST_TASK_FLAGS_STOP, &vtsk->flags); - wake_up_process(vtsk->task); + vhost_task_wake(vtsk); /* * Make sure vhost_task_fn is no longer accessing the vhost_task before - * freeing it below. If userspace crashed or exited without closing, - * then the vhost_task->task could already be marked dead so - * kernel_wait will return early. + * freeing it below. */ wait_for_completion(&vtsk->exited); - /* - * If we are just closing/removing a device and the parent process is - * not exiting then reap the task. 
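Editor's note: the umh.c hunk above now registers the usermode-helper capability knobs from the file itself via register_sysctl_init("kernel/usermodehelper", ...). A quick userspace check is sketched below; the paths are inferred from that registration and the existing "bset"/"inheritable" table entries, not spelled out in the patch.

/* Sketch: dump the usermode-helper capability sysctls at their
 * (assumed) /proc/sys/kernel/usermodehelper/ location.
 */
#include <stdio.h>

static void dump(const char *path)
{
	char buf[128];
	FILE *f = fopen(path, "r");

	if (!f) {
		perror(path);
		return;
	}
	if (fgets(buf, sizeof(buf), f))
		printf("%s: %s", path, buf);
	fclose(f);
}

int main(void)
{
	dump("/proc/sys/kernel/usermodehelper/bset");
	dump("/proc/sys/kernel/usermodehelper/inheritable");
	return 0;
}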
- */ - kernel_wait4(pid, NULL, __WCLONE, NULL); kfree(vtsk); } EXPORT_SYMBOL_GPL(vhost_task_stop); /** - * vhost_task_should_stop - should the vhost task return from the work function - * @vtsk: vhost_task to stop - */ -bool vhost_task_should_stop(struct vhost_task *vtsk) -{ - return test_bit(VHOST_TASK_FLAGS_STOP, &vtsk->flags); -} -EXPORT_SYMBOL_GPL(vhost_task_should_stop); - -/** - * vhost_task_create - create a copy of a process to be used by the kernel - * @fn: thread stack + * vhost_task_create - create a copy of a task to be used by the kernel + * @fn: vhost worker function * @arg: data to be passed to fn * @name: the thread's name * @@ -71,17 +103,17 @@ EXPORT_SYMBOL_GPL(vhost_task_should_stop); * failure. The returned task is inactive, and the caller must fire it up * through vhost_task_start(). */ -struct vhost_task *vhost_task_create(int (*fn)(void *), void *arg, +struct vhost_task *vhost_task_create(bool (*fn)(void *), void *arg, const char *name) { struct kernel_clone_args args = { - .flags = CLONE_FS | CLONE_UNTRACED | CLONE_VM, + .flags = CLONE_FS | CLONE_UNTRACED | CLONE_VM | + CLONE_THREAD | CLONE_SIGHAND, .exit_signal = 0, .fn = vhost_task_fn, .name = name, .user_worker = 1, .no_files = 1, - .ignore_signals = 1, }; struct vhost_task *vtsk; struct task_struct *tsk; diff --git a/kernel/watch_queue.c b/kernel/watch_queue.c index e91cb4c2833f..d0b6b390ee42 100644 --- a/kernel/watch_queue.c +++ b/kernel/watch_queue.c @@ -42,7 +42,7 @@ MODULE_AUTHOR("Red Hat, Inc."); static inline bool lock_wqueue(struct watch_queue *wqueue) { spin_lock_bh(&wqueue->lock); - if (unlikely(wqueue->defunct)) { + if (unlikely(!wqueue->pipe)) { spin_unlock_bh(&wqueue->lock); return false; } @@ -104,9 +104,6 @@ static bool post_one_notification(struct watch_queue *wqueue, unsigned int head, tail, mask, note, offset, len; bool done = false; - if (!pipe) - return false; - spin_lock_irq(&pipe->rd_wait.lock); mask = pipe->ring_size - 1; @@ -603,8 +600,11 @@ void watch_queue_clear(struct watch_queue *wqueue) rcu_read_lock(); spin_lock_bh(&wqueue->lock); - /* Prevent new notifications from being stored. */ - wqueue->defunct = true; + /* + * This pipe can be freed by callers like free_pipe_info(). + * Removing this reference also prevents new notifications. 
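Editor's note: with the vhost_task rework above, the worker callback returns bool (true if it found work, false to let vhost_task_fn() sleep), and vhost_task_fn() itself handles signals, sleeping and the stop flag. A minimal, hypothetical caller sketch follows; "my_dev", my_dev_work() and the llist usage are illustrative only and not taken from the vhost driver.

/* Sketch: driving the reworked vhost_task API. */
#include <linux/sched/vhost_task.h>
#include <linux/llist.h>
#include <linux/errno.h>

struct my_dev {
	struct vhost_task *vtsk;
	struct llist_head work_list;
};

static bool my_dev_work(void *data)
{
	struct my_dev *dev = data;
	struct llist_node *node = llist_del_all(&dev->work_list);

	if (!node)
		return false;	/* nothing to do; vhost_task_fn will sleep */

	/* ... process the items popped from work_list ... */
	return true;		/* did work; poll again before sleeping */
}

static int my_dev_start(struct my_dev *dev)
{
	dev->vtsk = vhost_task_create(my_dev_work, dev, "my-dev-worker");
	if (!dev->vtsk)
		return -ENOMEM;
	vhost_task_start(dev->vtsk);
	return 0;
}

static void my_dev_queue(struct my_dev *dev, struct llist_node *item)
{
	llist_add(item, &dev->work_list);
	vhost_task_wake(dev->vtsk);	/* kick the worker thread */
}

static void my_dev_stop(struct my_dev *dev)
{
	vhost_task_stop(dev->vtsk);	/* set stop flag, wake, wait for exit */
}

Because the task is now created with CLONE_THREAD and CLONE_SIGHAND, the worker appears as a thread of the owning process rather than a detached kthread, which is the point of the signal handling added to vhost_task_fn().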
+ */ + wqueue->pipe = NULL; while (!hlist_empty(&wqueue->watches)) { watch = hlist_entry(wqueue->watches.first, struct watch, queue_node); diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 8e61f21e7e33..be38276a365f 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -29,20 +29,18 @@ static DEFINE_MUTEX(watchdog_mutex); -#if defined(CONFIG_HARDLOCKUP_DETECTOR) || defined(CONFIG_HAVE_NMI_WATCHDOG) -# define WATCHDOG_DEFAULT (SOFT_WATCHDOG_ENABLED | NMI_WATCHDOG_ENABLED) -# define NMI_WATCHDOG_DEFAULT 1 +#if defined(CONFIG_HARDLOCKUP_DETECTOR) || defined(CONFIG_HARDLOCKUP_DETECTOR_SPARC64) +# define WATCHDOG_HARDLOCKUP_DEFAULT 1 #else -# define WATCHDOG_DEFAULT (SOFT_WATCHDOG_ENABLED) -# define NMI_WATCHDOG_DEFAULT 0 +# define WATCHDOG_HARDLOCKUP_DEFAULT 0 #endif unsigned long __read_mostly watchdog_enabled; int __read_mostly watchdog_user_enabled = 1; -int __read_mostly nmi_watchdog_user_enabled = NMI_WATCHDOG_DEFAULT; -int __read_mostly soft_watchdog_user_enabled = 1; +static int __read_mostly watchdog_hardlockup_user_enabled = WATCHDOG_HARDLOCKUP_DEFAULT; +static int __read_mostly watchdog_softlockup_user_enabled = 1; int __read_mostly watchdog_thresh = 10; -static int __read_mostly nmi_watchdog_available; +static int __read_mostly watchdog_hardlockup_available; struct cpumask watchdog_cpumask __read_mostly; unsigned long *watchdog_cpumask_bits = cpumask_bits(&watchdog_cpumask); @@ -68,7 +66,7 @@ unsigned int __read_mostly hardlockup_panic = */ void __init hardlockup_detector_disable(void) { - nmi_watchdog_user_enabled = 0; + watchdog_hardlockup_user_enabled = 0; } static int __init hardlockup_panic_setup(char *str) @@ -78,54 +76,163 @@ static int __init hardlockup_panic_setup(char *str) else if (!strncmp(str, "nopanic", 7)) hardlockup_panic = 0; else if (!strncmp(str, "0", 1)) - nmi_watchdog_user_enabled = 0; + watchdog_hardlockup_user_enabled = 0; else if (!strncmp(str, "1", 1)) - nmi_watchdog_user_enabled = 1; + watchdog_hardlockup_user_enabled = 1; return 1; } __setup("nmi_watchdog=", hardlockup_panic_setup); #endif /* CONFIG_HARDLOCKUP_DETECTOR */ -/* - * These functions can be overridden if an architecture implements its - * own hardlockup detector. - * - * watchdog_nmi_enable/disable can be implemented to start and stop when - * softlockup watchdog start and stop. The arch must select the - * SOFTLOCKUP_DETECTOR Kconfig. - */ -int __weak watchdog_nmi_enable(unsigned int cpu) +#if defined(CONFIG_HARDLOCKUP_DETECTOR_COUNTS_HRTIMER) + +static DEFINE_PER_CPU(atomic_t, hrtimer_interrupts); +static DEFINE_PER_CPU(int, hrtimer_interrupts_saved); +static DEFINE_PER_CPU(bool, watchdog_hardlockup_warned); +static DEFINE_PER_CPU(bool, watchdog_hardlockup_touched); +static unsigned long watchdog_hardlockup_all_cpu_dumped; + +notrace void arch_touch_nmi_watchdog(void) { - hardlockup_detector_perf_enable(); - return 0; + /* + * Using __raw here because some code paths have + * preemption enabled. If preemption is enabled + * then interrupts should be enabled too, in which + * case we shouldn't have to worry about the watchdog + * going off. 
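Editor's note: arch_touch_nmi_watchdog(), moved above into the generic hrtimer-counting block, is what touch_nmi_watchdog() calls; touch_nmi_watchdog() also resets the softlockup timestamp. The usual consumer pattern in long busy-wait or interrupt-off sections looks roughly like the sketch below; wait_for_hw() and poll_hw_status() are made-up stand-ins.

/* Sketch: petting the lockup detectors from a long polling loop. */
#include <linux/nmi.h>
#include <linux/delay.h>
#include <linux/types.h>

static void wait_for_hw(void (*poll_hw_status)(bool *done))
{
	bool done = false;

	while (!done) {
		poll_hw_status(&done);
		udelay(100);
		touch_nmi_watchdog();	/* covers hard and soft lockup checks */
	}
}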
+ */ + raw_cpu_write(watchdog_hardlockup_touched, true); +} +EXPORT_SYMBOL(arch_touch_nmi_watchdog); + +void watchdog_hardlockup_touch_cpu(unsigned int cpu) +{ + per_cpu(watchdog_hardlockup_touched, cpu) = true; } -void __weak watchdog_nmi_disable(unsigned int cpu) +static bool is_hardlockup(unsigned int cpu) { - hardlockup_detector_perf_disable(); + int hrint = atomic_read(&per_cpu(hrtimer_interrupts, cpu)); + + if (per_cpu(hrtimer_interrupts_saved, cpu) == hrint) + return true; + + /* + * NOTE: we don't need any fancy atomic_t or READ_ONCE/WRITE_ONCE + * for hrtimer_interrupts_saved. hrtimer_interrupts_saved is + * written/read by a single CPU. + */ + per_cpu(hrtimer_interrupts_saved, cpu) = hrint; + + return false; } -/* Return 0, if a NMI watchdog is available. Error code otherwise */ -int __weak __init watchdog_nmi_probe(void) +static void watchdog_hardlockup_kick(void) { - return hardlockup_detector_perf_init(); + int new_interrupts; + + new_interrupts = atomic_inc_return(this_cpu_ptr(&hrtimer_interrupts)); + watchdog_buddy_check_hardlockup(new_interrupts); +} + +void watchdog_hardlockup_check(unsigned int cpu, struct pt_regs *regs) +{ + if (per_cpu(watchdog_hardlockup_touched, cpu)) { + per_cpu(watchdog_hardlockup_touched, cpu) = false; + return; + } + + /* + * Check for a hardlockup by making sure the CPU's timer + * interrupt is incrementing. The timer interrupt should have + * fired multiple times before we overflow'd. If it hasn't + * then this is a good indication the cpu is stuck + */ + if (is_hardlockup(cpu)) { + unsigned int this_cpu = smp_processor_id(); + struct cpumask backtrace_mask; + + cpumask_copy(&backtrace_mask, cpu_online_mask); + + /* Only print hardlockups once. */ + if (per_cpu(watchdog_hardlockup_warned, cpu)) + return; + + pr_emerg("Watchdog detected hard LOCKUP on cpu %d\n", cpu); + print_modules(); + print_irqtrace_events(current); + if (cpu == this_cpu) { + if (regs) + show_regs(regs); + else + dump_stack(); + cpumask_clear_cpu(cpu, &backtrace_mask); + } else { + if (trigger_single_cpu_backtrace(cpu)) + cpumask_clear_cpu(cpu, &backtrace_mask); + } + + /* + * Perform multi-CPU dump only once to avoid multiple + * hardlockups generating interleaving traces + */ + if (sysctl_hardlockup_all_cpu_backtrace && + !test_and_set_bit(0, &watchdog_hardlockup_all_cpu_dumped)) + trigger_cpumask_backtrace(&backtrace_mask); + + if (hardlockup_panic) + nmi_panic(regs, "Hard LOCKUP"); + + per_cpu(watchdog_hardlockup_warned, cpu) = true; + } else { + per_cpu(watchdog_hardlockup_warned, cpu) = false; + } +} + +#else /* CONFIG_HARDLOCKUP_DETECTOR_COUNTS_HRTIMER */ + +static inline void watchdog_hardlockup_kick(void) { } + +#endif /* !CONFIG_HARDLOCKUP_DETECTOR_COUNTS_HRTIMER */ + +/* + * These functions can be overridden based on the configured hardlockdup detector. + * + * watchdog_hardlockup_enable/disable can be implemented to start and stop when + * softlockup watchdog start and stop. The detector must select the + * SOFTLOCKUP_DETECTOR Kconfig. + */ +void __weak watchdog_hardlockup_enable(unsigned int cpu) { } + +void __weak watchdog_hardlockup_disable(unsigned int cpu) { } + +/* + * Watchdog-detector specific API. + * + * Return 0 when hardlockup watchdog is available, negative value otherwise. + * Note that the negative value means that a delayed probe might + * succeed later. 
+ */ +int __weak __init watchdog_hardlockup_probe(void) +{ + return -ENODEV; } /** - * watchdog_nmi_stop - Stop the watchdog for reconfiguration + * watchdog_hardlockup_stop - Stop the watchdog for reconfiguration * * The reconfiguration steps are: - * watchdog_nmi_stop(); + * watchdog_hardlockup_stop(); * update_variables(); - * watchdog_nmi_start(); + * watchdog_hardlockup_start(); */ -void __weak watchdog_nmi_stop(void) { } +void __weak watchdog_hardlockup_stop(void) { } /** - * watchdog_nmi_start - Start the watchdog after reconfiguration + * watchdog_hardlockup_start - Start the watchdog after reconfiguration * - * Counterpart to watchdog_nmi_stop(). + * Counterpart to watchdog_hardlockup_stop(). * * The following variables have been updated in update_variables() and * contain the currently valid configuration: @@ -133,23 +240,23 @@ void __weak watchdog_nmi_stop(void) { } * - watchdog_thresh * - watchdog_cpumask */ -void __weak watchdog_nmi_start(void) { } +void __weak watchdog_hardlockup_start(void) { } /** * lockup_detector_update_enable - Update the sysctl enable bit * - * Caller needs to make sure that the NMI/perf watchdogs are off, so this - * can't race with watchdog_nmi_disable(). + * Caller needs to make sure that the hard watchdogs are off, so this + * can't race with watchdog_hardlockup_disable(). */ static void lockup_detector_update_enable(void) { watchdog_enabled = 0; if (!watchdog_user_enabled) return; - if (nmi_watchdog_available && nmi_watchdog_user_enabled) - watchdog_enabled |= NMI_WATCHDOG_ENABLED; - if (soft_watchdog_user_enabled) - watchdog_enabled |= SOFT_WATCHDOG_ENABLED; + if (watchdog_hardlockup_available && watchdog_hardlockup_user_enabled) + watchdog_enabled |= WATCHDOG_HARDLOCKUP_ENABLED; + if (watchdog_softlockup_user_enabled) + watchdog_enabled |= WATCHDOG_SOFTOCKUP_ENABLED; } #ifdef CONFIG_SOFTLOCKUP_DETECTOR @@ -179,8 +286,6 @@ static DEFINE_PER_CPU(unsigned long, watchdog_touch_ts); static DEFINE_PER_CPU(unsigned long, watchdog_report_ts); static DEFINE_PER_CPU(struct hrtimer, watchdog_hrtimer); static DEFINE_PER_CPU(bool, softlockup_touch_sync); -static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts); -static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts_saved); static unsigned long soft_lockup_nmi_warn; static int __init nowatchdog_setup(char *str) @@ -192,7 +297,7 @@ __setup("nowatchdog", nowatchdog_setup); static int __init nosoftlockup_setup(char *str) { - soft_watchdog_user_enabled = 0; + watchdog_softlockup_user_enabled = 0; return 1; } __setup("nosoftlockup", nosoftlockup_setup); @@ -306,7 +411,7 @@ static int is_softlockup(unsigned long touch_ts, unsigned long period_ts, unsigned long now) { - if ((watchdog_enabled & SOFT_WATCHDOG_ENABLED) && watchdog_thresh){ + if ((watchdog_enabled & WATCHDOG_SOFTOCKUP_ENABLED) && watchdog_thresh) { /* Warn about unreasonable delays. 
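Editor's note: the __weak watchdog_hardlockup_enable()/disable() stubs and the -ENODEV probe above are what a concrete backend overrides (the perf and buddy implementations later in this diff, or an arch-specific detector). A purely illustrative override is sketched below under the assumption of some arch_hld_* helpers, which are hypothetical and exist only for this sketch.

/* Sketch only: an arch-specific hardlockup backend replacing the
 * __weak stubs. arch_hld_init/start_cpu/stop_cpu are hypothetical.
 */
#include <linux/nmi.h>
#include <linux/init.h>

extern int arch_hld_init(void);			/* hypothetical */
extern void arch_hld_start_cpu(unsigned int cpu);	/* hypothetical */
extern void arch_hld_stop_cpu(unsigned int cpu);	/* hypothetical */

int __init watchdog_hardlockup_probe(void)
{
	/* 0: detector available now; negative: not yet, a later
	 * lockup_detector_retry_init() may still succeed. */
	return arch_hld_init();
}

void watchdog_hardlockup_enable(unsigned int cpu)
{
	arch_hld_start_cpu(cpu);
}

void watchdog_hardlockup_disable(unsigned int cpu)
{
	arch_hld_stop_cpu(cpu);
}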
*/ if (time_after(now, period_ts + get_softlockup_thresh())) return now - touch_ts; @@ -315,22 +420,6 @@ static int is_softlockup(unsigned long touch_ts, } /* watchdog detector functions */ -bool is_hardlockup(void) -{ - unsigned long hrint = __this_cpu_read(hrtimer_interrupts); - - if (__this_cpu_read(hrtimer_interrupts_saved) == hrint) - return true; - - __this_cpu_write(hrtimer_interrupts_saved, hrint); - return false; -} - -static void watchdog_interrupt_count(void) -{ - __this_cpu_inc(hrtimer_interrupts); -} - static DEFINE_PER_CPU(struct completion, softlockup_completion); static DEFINE_PER_CPU(struct cpu_stop_work, softlockup_stop_work); @@ -361,8 +450,7 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer) if (!watchdog_enabled) return HRTIMER_NORESTART; - /* kick the hardlockup detector */ - watchdog_interrupt_count(); + watchdog_hardlockup_kick(); /* kick the softlockup detector */ if (completion_done(this_cpu_ptr(&softlockup_completion))) { @@ -458,7 +546,7 @@ static void watchdog_enable(unsigned int cpu) complete(done); /* - * Start the timer first to prevent the NMI watchdog triggering + * Start the timer first to prevent the hardlockup watchdog triggering * before the timer has a chance to fire. */ hrtimer_init(hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD); @@ -468,9 +556,9 @@ static void watchdog_enable(unsigned int cpu) /* Initialize timestamp */ update_touch_ts(); - /* Enable the perf event */ - if (watchdog_enabled & NMI_WATCHDOG_ENABLED) - watchdog_nmi_enable(cpu); + /* Enable the hardlockup detector */ + if (watchdog_enabled & WATCHDOG_HARDLOCKUP_ENABLED) + watchdog_hardlockup_enable(cpu); } static void watchdog_disable(unsigned int cpu) @@ -480,11 +568,11 @@ static void watchdog_disable(unsigned int cpu) WARN_ON_ONCE(cpu != smp_processor_id()); /* - * Disable the perf event first. That prevents that a large delay - * between disabling the timer and disabling the perf event causes - * the perf NMI to detect a false positive. + * Disable the hardlockup detector first. That prevents that a large + * delay between disabling the timer and disabling the hardlockup + * detector causes a false positive. 
*/ - watchdog_nmi_disable(cpu); + watchdog_hardlockup_disable(cpu); hrtimer_cancel(hrtimer); wait_for_completion(this_cpu_ptr(&softlockup_completion)); } @@ -540,7 +628,7 @@ int lockup_detector_offline_cpu(unsigned int cpu) static void __lockup_detector_reconfigure(void) { cpus_read_lock(); - watchdog_nmi_stop(); + watchdog_hardlockup_stop(); softlockup_stop_all(); set_sample_period(); @@ -548,7 +636,7 @@ static void __lockup_detector_reconfigure(void) if (watchdog_enabled && watchdog_thresh) softlockup_start_all(); - watchdog_nmi_start(); + watchdog_hardlockup_start(); cpus_read_unlock(); /* * Must be called outside the cpus locked section to prevent @@ -589,9 +677,9 @@ static __init void lockup_detector_setup(void) static void __lockup_detector_reconfigure(void) { cpus_read_lock(); - watchdog_nmi_stop(); + watchdog_hardlockup_stop(); lockup_detector_update_enable(); - watchdog_nmi_start(); + watchdog_hardlockup_start(); cpus_read_unlock(); } void lockup_detector_reconfigure(void) @@ -646,14 +734,14 @@ static void proc_watchdog_update(void) /* * common function for watchdog, nmi_watchdog and soft_watchdog parameter * - * caller | table->data points to | 'which' - * -------------------|----------------------------|-------------------------- - * proc_watchdog | watchdog_user_enabled | NMI_WATCHDOG_ENABLED | - * | | SOFT_WATCHDOG_ENABLED - * -------------------|----------------------------|-------------------------- - * proc_nmi_watchdog | nmi_watchdog_user_enabled | NMI_WATCHDOG_ENABLED - * -------------------|----------------------------|-------------------------- - * proc_soft_watchdog | soft_watchdog_user_enabled | SOFT_WATCHDOG_ENABLED + * caller | table->data points to | 'which' + * -------------------|----------------------------------|------------------------------- + * proc_watchdog | watchdog_user_enabled | WATCHDOG_HARDLOCKUP_ENABLED | + * | | WATCHDOG_SOFTOCKUP_ENABLED + * -------------------|----------------------------------|------------------------------- + * proc_nmi_watchdog | watchdog_hardlockup_user_enabled | WATCHDOG_HARDLOCKUP_ENABLED + * -------------------|----------------------------------|------------------------------- + * proc_soft_watchdog | watchdog_softlockup_user_enabled | WATCHDOG_SOFTOCKUP_ENABLED */ static int proc_watchdog_common(int which, struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) @@ -685,7 +773,8 @@ static int proc_watchdog_common(int which, struct ctl_table *table, int write, int proc_watchdog(struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { - return proc_watchdog_common(NMI_WATCHDOG_ENABLED|SOFT_WATCHDOG_ENABLED, + return proc_watchdog_common(WATCHDOG_HARDLOCKUP_ENABLED | + WATCHDOG_SOFTOCKUP_ENABLED, table, write, buffer, lenp, ppos); } @@ -695,9 +784,9 @@ int proc_watchdog(struct ctl_table *table, int write, int proc_nmi_watchdog(struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { - if (!nmi_watchdog_available && write) + if (!watchdog_hardlockup_available && write) return -ENOTSUPP; - return proc_watchdog_common(NMI_WATCHDOG_ENABLED, + return proc_watchdog_common(WATCHDOG_HARDLOCKUP_ENABLED, table, write, buffer, lenp, ppos); } @@ -707,7 +796,7 @@ int proc_nmi_watchdog(struct ctl_table *table, int write, int proc_soft_watchdog(struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { - return proc_watchdog_common(SOFT_WATCHDOG_ENABLED, + return proc_watchdog_common(WATCHDOG_SOFTOCKUP_ENABLED, table, write, buffer, lenp, ppos); } @@ 
-774,15 +863,6 @@ static struct ctl_table watchdog_sysctls[] = { .extra2 = (void *)&sixty, }, { - .procname = "nmi_watchdog", - .data = &nmi_watchdog_user_enabled, - .maxlen = sizeof(int), - .mode = NMI_WATCHDOG_SYSCTL_PERM, - .proc_handler = proc_nmi_watchdog, - .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_ONE, - }, - { .procname = "watchdog_cpumask", .data = &watchdog_cpumask_bits, .maxlen = NR_CPUS, @@ -792,7 +872,7 @@ static struct ctl_table watchdog_sysctls[] = { #ifdef CONFIG_SOFTLOCKUP_DETECTOR { .procname = "soft_watchdog", - .data = &soft_watchdog_user_enabled, + .data = &watchdog_softlockup_user_enabled, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_soft_watchdog, @@ -845,14 +925,90 @@ static struct ctl_table watchdog_sysctls[] = { {} }; +static struct ctl_table watchdog_hardlockup_sysctl[] = { + { + .procname = "nmi_watchdog", + .data = &watchdog_hardlockup_user_enabled, + .maxlen = sizeof(int), + .mode = 0444, + .proc_handler = proc_nmi_watchdog, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, + {} +}; + static void __init watchdog_sysctl_init(void) { register_sysctl_init("kernel", watchdog_sysctls); + + if (watchdog_hardlockup_available) + watchdog_hardlockup_sysctl[0].mode = 0644; + register_sysctl_init("kernel", watchdog_hardlockup_sysctl); } + #else #define watchdog_sysctl_init() do { } while (0) #endif /* CONFIG_SYSCTL */ +static void __init lockup_detector_delay_init(struct work_struct *work); +static bool allow_lockup_detector_init_retry __initdata; + +static struct work_struct detector_work __initdata = + __WORK_INITIALIZER(detector_work, lockup_detector_delay_init); + +static void __init lockup_detector_delay_init(struct work_struct *work) +{ + int ret; + + ret = watchdog_hardlockup_probe(); + if (ret) { + pr_info("Delayed init of the lockup detector failed: %d\n", ret); + pr_info("Hard watchdog permanently disabled\n"); + return; + } + + allow_lockup_detector_init_retry = false; + + watchdog_hardlockup_available = true; + lockup_detector_setup(); +} + +/* + * lockup_detector_retry_init - retry init lockup detector if possible. + * + * Retry hardlockup detector init. It is useful when it requires some + * functionality that has to be initialized later on a particular + * platform. + */ +void __init lockup_detector_retry_init(void) +{ + /* Must be called before late init calls */ + if (!allow_lockup_detector_init_retry) + return; + + schedule_work(&detector_work); +} + +/* + * Ensure that optional delayed hardlockup init is proceed before + * the init code and memory is freed. + */ +static int __init lockup_detector_check(void) +{ + /* Prevent any later retry. */ + allow_lockup_detector_init_retry = false; + + /* Make sure no work is pending. 
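Editor's note: with the split above, the nmi_watchdog sysctl is registered from its own table and stays read-only (0444) unless a hardlockup detector was found, in which case watchdog_sysctl_init() flips it to 0644; proc_nmi_watchdog() additionally rejects writes with -ENOTSUPP. A small userspace probe of that behaviour, assuming the usual /proc/sys layout:

/* Sketch: read nmi_watchdog and check whether it is writable. */
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	char buf[8] = "";
	int fd = open("/proc/sys/kernel/nmi_watchdog", O_RDONLY);

	if (fd < 0) {
		perror("open nmi_watchdog");
		return 1;
	}
	if (read(fd, buf, sizeof(buf) - 1) > 0)
		printf("nmi_watchdog=%s", buf);
	close(fd);

	fd = open("/proc/sys/kernel/nmi_watchdog", O_WRONLY);
	if (fd < 0)
		perror("write open (expected to fail without a hardlockup detector)");
	else
		close(fd);
	return 0;
}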
*/ + flush_work(&detector_work); + + watchdog_sysctl_init(); + + return 0; + +} +late_initcall_sync(lockup_detector_check); + void __init lockup_detector_init(void) { if (tick_nohz_full_enabled()) @@ -861,8 +1017,10 @@ void __init lockup_detector_init(void) cpumask_copy(&watchdog_cpumask, housekeeping_cpumask(HK_TYPE_TIMER)); - if (!watchdog_nmi_probe()) - nmi_watchdog_available = true; + if (!watchdog_hardlockup_probe()) + watchdog_hardlockup_available = true; + else + allow_lockup_detector_init_retry = true; + lockup_detector_setup(); - watchdog_sysctl_init(); } diff --git a/kernel/watchdog_buddy.c b/kernel/watchdog_buddy.c new file mode 100644 index 000000000000..34dbfe091f4b --- /dev/null +++ b/kernel/watchdog_buddy.c @@ -0,0 +1,113 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include <linux/cpu.h> +#include <linux/cpumask.h> +#include <linux/kernel.h> +#include <linux/nmi.h> +#include <linux/percpu-defs.h> + +static cpumask_t __read_mostly watchdog_cpus; + +static unsigned int watchdog_next_cpu(unsigned int cpu) +{ + unsigned int next_cpu; + + next_cpu = cpumask_next(cpu, &watchdog_cpus); + if (next_cpu >= nr_cpu_ids) + next_cpu = cpumask_first(&watchdog_cpus); + + if (next_cpu == cpu) + return nr_cpu_ids; + + return next_cpu; +} + +int __init watchdog_hardlockup_probe(void) +{ + return 0; +} + +void watchdog_hardlockup_enable(unsigned int cpu) +{ + unsigned int next_cpu; + + /* + * The new CPU will be marked online before the hrtimer interrupt + * gets a chance to run on it. If another CPU tests for a + * hardlockup on the new CPU before it has run its the hrtimer + * interrupt, it will get a false positive. Touch the watchdog on + * the new CPU to delay the check for at least 3 sampling periods + * to guarantee one hrtimer has run on the new CPU. + */ + watchdog_hardlockup_touch_cpu(cpu); + + /* + * We are going to check the next CPU. Our watchdog_hrtimer + * need not be zero if the CPU has already been online earlier. + * Touch the watchdog on the next CPU to avoid false positive + * if we try to check it in less then 3 interrupts. + */ + next_cpu = watchdog_next_cpu(cpu); + if (next_cpu < nr_cpu_ids) + watchdog_hardlockup_touch_cpu(next_cpu); + + /* + * Makes sure that watchdog is touched on this CPU before + * other CPUs could see it in watchdog_cpus. The counter + * part is in watchdog_buddy_check_hardlockup(). + */ + smp_wmb(); + + cpumask_set_cpu(cpu, &watchdog_cpus); +} + +void watchdog_hardlockup_disable(unsigned int cpu) +{ + unsigned int next_cpu = watchdog_next_cpu(cpu); + + /* + * Offlining this CPU will cause the CPU before this one to start + * checking the one after this one. If this CPU just finished checking + * the next CPU and updating hrtimer_interrupts_saved, and then the + * previous CPU checks it within one sample period, it will trigger a + * false positive. Touch the watchdog on the next CPU to prevent it. + */ + if (next_cpu < nr_cpu_ids) + watchdog_hardlockup_touch_cpu(next_cpu); + + /* + * Makes sure that watchdog is touched on the next CPU before + * this CPU disappear in watchdog_cpus. The counter part is in + * watchdog_buddy_check_hardlockup(). + */ + smp_wmb(); + + cpumask_clear_cpu(cpu, &watchdog_cpus); +} + +void watchdog_buddy_check_hardlockup(int hrtimer_interrupts) +{ + unsigned int next_cpu; + + /* + * Test for hardlockups every 3 samples. The sample period is + * watchdog_thresh * 2 / 5, so 3 samples gets us back to slightly over + * watchdog_thresh (over by 20%). 
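Editor's note: the cadence described in watchdog_buddy_check_hardlockup() above is worth making concrete. With the default watchdog_thresh of 10 s the hrtimer sample period is 10 * 2 / 5 = 4 s, and checking the buddy CPU only every third sample means a stuck CPU is flagged after roughly 12 s, i.e. about 20% past the nominal threshold. A throwaway calculation (ordinary userspace C, not kernel code):

/* Worked example of the buddy-check timing from the comment above. */
#include <stdio.h>

int main(void)
{
	int watchdog_thresh = 10;				/* seconds, default */
	double sample_period = watchdog_thresh * 2.0 / 5.0;	/* 4 s */
	double check_window = 3 * sample_period;		/* every 3rd sample */

	printf("sample period: %.1f s\n", sample_period);
	printf("hardlockup noticed after ~%.1f s (%.0f%% of watchdog_thresh)\n",
	       check_window, 100.0 * check_window / watchdog_thresh);
	return 0;
}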
+ */ + if (hrtimer_interrupts % 3 != 0) + return; + + /* check for a hardlockup on the next CPU */ + next_cpu = watchdog_next_cpu(smp_processor_id()); + if (next_cpu >= nr_cpu_ids) + return; + + /* + * Make sure that the watchdog was touched on next CPU when + * watchdog_next_cpu() returned another one because of + * a change in watchdog_hardlockup_enable()/disable(). + */ + smp_rmb(); + + watchdog_hardlockup_check(next_cpu, NULL); +} diff --git a/kernel/watchdog_hld.c b/kernel/watchdog_perf.c index 247bf0b1582c..8ea00c4a24b2 100644 --- a/kernel/watchdog_hld.c +++ b/kernel/watchdog_perf.c @@ -1,6 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 /* - * Detect hard lockups on a system + * Detect hard lockups on a system using perf * * started by Don Zickus, Copyright (C) 2010 Red Hat, Inc. * @@ -20,28 +20,12 @@ #include <asm/irq_regs.h> #include <linux/perf_event.h> -static DEFINE_PER_CPU(bool, hard_watchdog_warn); -static DEFINE_PER_CPU(bool, watchdog_nmi_touch); static DEFINE_PER_CPU(struct perf_event *, watchdog_ev); static DEFINE_PER_CPU(struct perf_event *, dead_event); static struct cpumask dead_events_mask; -static unsigned long hardlockup_allcpu_dumped; static atomic_t watchdog_cpus = ATOMIC_INIT(0); -notrace void arch_touch_nmi_watchdog(void) -{ - /* - * Using __raw here because some code paths have - * preemption enabled. If preemption is enabled - * then interrupts should be enabled too, in which - * case we shouldn't have to worry about the watchdog - * going off. - */ - raw_cpu_write(watchdog_nmi_touch, true); -} -EXPORT_SYMBOL(arch_touch_nmi_watchdog); - #ifdef CONFIG_HARDLOCKUP_CHECK_TIMESTAMP static DEFINE_PER_CPU(ktime_t, last_timestamp); static DEFINE_PER_CPU(unsigned int, nmi_rearmed); @@ -114,61 +98,24 @@ static void watchdog_overflow_callback(struct perf_event *event, /* Ensure the watchdog never gets throttled */ event->hw.interrupts = 0; - if (__this_cpu_read(watchdog_nmi_touch) == true) { - __this_cpu_write(watchdog_nmi_touch, false); - return; - } - if (!watchdog_check_timestamp()) return; - /* check for a hardlockup - * This is done by making sure our timer interrupt - * is incrementing. The timer interrupt should have - * fired multiple times before we overflow'd. If it hasn't - * then this is a good indication the cpu is stuck - */ - if (is_hardlockup()) { - int this_cpu = smp_processor_id(); - - /* only print hardlockups once */ - if (__this_cpu_read(hard_watchdog_warn) == true) - return; - - pr_emerg("Watchdog detected hard LOCKUP on cpu %d\n", - this_cpu); - print_modules(); - print_irqtrace_events(current); - if (regs) - show_regs(regs); - else - dump_stack(); - - /* - * Perform all-CPU dump only once to avoid multiple hardlockups - * generating interleaving traces - */ - if (sysctl_hardlockup_all_cpu_backtrace && - !test_and_set_bit(0, &hardlockup_allcpu_dumped)) - trigger_allbutself_cpu_backtrace(); - - if (hardlockup_panic) - nmi_panic(regs, "Hard LOCKUP"); - - __this_cpu_write(hard_watchdog_warn, true); - return; - } - - __this_cpu_write(hard_watchdog_warn, false); - return; + watchdog_hardlockup_check(smp_processor_id(), regs); } static int hardlockup_detector_event_create(void) { - unsigned int cpu = smp_processor_id(); + unsigned int cpu; struct perf_event_attr *wd_attr; struct perf_event *evt; + /* + * Preemption is not disabled because memory will be allocated. + * Ensure CPU-locality by calling this in per-CPU kthread. 
+ */ + WARN_ON(!is_percpu_thread()); + cpu = raw_smp_processor_id(); wd_attr = &wd_hw_attr; wd_attr->sample_period = hw_nmi_get_sample_period(watchdog_thresh); @@ -185,10 +132,14 @@ static int hardlockup_detector_event_create(void) } /** - * hardlockup_detector_perf_enable - Enable the local event + * watchdog_hardlockup_enable - Enable the local event + * + * @cpu: The CPU to enable hard lockup on. */ -void hardlockup_detector_perf_enable(void) +void watchdog_hardlockup_enable(unsigned int cpu) { + WARN_ON_ONCE(cpu != smp_processor_id()); + if (hardlockup_detector_event_create()) return; @@ -200,12 +151,16 @@ void hardlockup_detector_perf_enable(void) } /** - * hardlockup_detector_perf_disable - Disable the local event + * watchdog_hardlockup_disable - Disable the local event + * + * @cpu: The CPU to enable hard lockup on. */ -void hardlockup_detector_perf_disable(void) +void watchdog_hardlockup_disable(unsigned int cpu) { struct perf_event *event = this_cpu_read(watchdog_ev); + WARN_ON_ONCE(cpu != smp_processor_id()); + if (event) { perf_event_disable(event); this_cpu_write(watchdog_ev, NULL); @@ -268,7 +223,7 @@ void __init hardlockup_detector_perf_restart(void) lockdep_assert_cpus_held(); - if (!(watchdog_enabled & NMI_WATCHDOG_ENABLED)) + if (!(watchdog_enabled & WATCHDOG_HARDLOCKUP_ENABLED)) return; for_each_online_cpu(cpu) { @@ -279,12 +234,22 @@ void __init hardlockup_detector_perf_restart(void) } } +bool __weak __init arch_perf_nmi_is_available(void) +{ + return true; +} + /** - * hardlockup_detector_perf_init - Probe whether NMI event is available at all + * watchdog_hardlockup_probe - Probe whether NMI event is available at all */ -int __init hardlockup_detector_perf_init(void) +int __init watchdog_hardlockup_probe(void) { - int ret = hardlockup_detector_event_create(); + int ret; + + if (!arch_perf_nmi_is_available()) + return -ENODEV; + + ret = hardlockup_detector_event_create(); if (ret) { pr_info("Perf NMI watchdog permanently disabled\n"); diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 4666a1a92a31..02a8f402eeb5 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -126,6 +126,12 @@ enum { * cpu or grabbing pool->lock is enough for read access. If * POOL_DISASSOCIATED is set, it's identical to L. * + * K: Only modified by worker while holding pool->lock. Can be safely read by + * self, while holding pool->lock or from IRQ context if %current is the + * kworker. + * + * S: Only modified by worker self. + * * A: wq_pool_attach_mutex protected. * * PL: wq_pool_mutex protected. @@ -200,6 +206,22 @@ struct worker_pool { }; /* + * Per-pool_workqueue statistics. These can be monitored using + * tools/workqueue/wq_monitor.py. + */ +enum pool_workqueue_stats { + PWQ_STAT_STARTED, /* work items started execution */ + PWQ_STAT_COMPLETED, /* work items completed execution */ + PWQ_STAT_CPU_TIME, /* total CPU time consumed */ + PWQ_STAT_CPU_INTENSIVE, /* wq_cpu_intensive_thresh_us violations */ + PWQ_STAT_CM_WAKEUP, /* concurrency-management worker wakeups */ + PWQ_STAT_MAYDAY, /* maydays to rescuer */ + PWQ_STAT_RESCUED, /* linked work items executed by rescuer */ + + PWQ_NR_STATS, +}; + +/* * The per-pool workqueue. 
While queued, the lower WORK_STRUCT_FLAG_BITS * of work_struct->data are used for flags and the remaining high bits * point to the pwq; thus, pwqs need to be aligned at two's power of the @@ -236,6 +258,8 @@ struct pool_workqueue { struct list_head pwqs_node; /* WR: node on wq->pwqs */ struct list_head mayday_node; /* MD: node on wq->maydays */ + u64 stats[PWQ_NR_STATS]; + /* * Release of unbound pwq is punted to system_wq. See put_pwq() * and pwq_unbound_release_workfn() for details. pool_workqueue @@ -310,6 +334,14 @@ static struct kmem_cache *pwq_cache; static cpumask_var_t *wq_numa_possible_cpumask; /* possible CPUs of each node */ +/* + * Per-cpu work items which run for longer than the following threshold are + * automatically considered CPU intensive and excluded from concurrency + * management to prevent them from noticeably delaying other per-cpu work items. + */ +static unsigned long wq_cpu_intensive_thresh_us = 10000; +module_param_named(cpu_intensive_thresh_us, wq_cpu_intensive_thresh_us, ulong, 0644); + static bool wq_disable_numa; module_param_named(disable_numa, wq_disable_numa, bool, 0444); @@ -705,12 +737,17 @@ static void clear_work_data(struct work_struct *work) set_work_data(work, WORK_STRUCT_NO_POOL, 0); } +static inline struct pool_workqueue *work_struct_pwq(unsigned long data) +{ + return (struct pool_workqueue *)(data & WORK_STRUCT_WQ_DATA_MASK); +} + static struct pool_workqueue *get_work_pwq(struct work_struct *work) { unsigned long data = atomic_long_read(&work->data); if (data & WORK_STRUCT_PWQ) - return (void *)(data & WORK_STRUCT_WQ_DATA_MASK); + return work_struct_pwq(data); else return NULL; } @@ -738,8 +775,7 @@ static struct worker_pool *get_work_pool(struct work_struct *work) assert_rcu_or_pool_mutex(); if (data & WORK_STRUCT_PWQ) - return ((struct pool_workqueue *) - (data & WORK_STRUCT_WQ_DATA_MASK))->pool; + return work_struct_pwq(data)->pool; pool_id = data >> WORK_OFFQ_POOL_SHIFT; if (pool_id == WORK_OFFQ_POOL_NONE) @@ -760,8 +796,7 @@ static int get_work_pool_id(struct work_struct *work) unsigned long data = atomic_long_read(&work->data); if (data & WORK_STRUCT_PWQ) - return ((struct pool_workqueue *) - (data & WORK_STRUCT_WQ_DATA_MASK))->pool->id; + return work_struct_pwq(data)->pool->id; return data >> WORK_OFFQ_POOL_SHIFT; } @@ -864,6 +899,152 @@ static void wake_up_worker(struct worker_pool *pool) } /** + * worker_set_flags - set worker flags and adjust nr_running accordingly + * @worker: self + * @flags: flags to set + * + * Set @flags in @worker->flags and adjust nr_running accordingly. + * + * CONTEXT: + * raw_spin_lock_irq(pool->lock) + */ +static inline void worker_set_flags(struct worker *worker, unsigned int flags) +{ + struct worker_pool *pool = worker->pool; + + WARN_ON_ONCE(worker->task != current); + + /* If transitioning into NOT_RUNNING, adjust nr_running. */ + if ((flags & WORKER_NOT_RUNNING) && + !(worker->flags & WORKER_NOT_RUNNING)) { + pool->nr_running--; + } + + worker->flags |= flags; +} + +/** + * worker_clr_flags - clear worker flags and adjust nr_running accordingly + * @worker: self + * @flags: flags to clear + * + * Clear @flags in @worker->flags and adjust nr_running accordingly. 
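Editor's note: wq_cpu_intensive_thresh_us above is exported as a writable module parameter, so the auto CPU_INTENSIVE threshold can be tuned at runtime. Assuming the usual sysfs layout for built-in module parameters, the knob should appear as /sys/module/workqueue/parameters/cpu_intensive_thresh_us; the path is inferred from module_param_named(), not stated in the patch.

/* Sketch: raise the threshold from the default 10000us to 20000us. */
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	const char *path =
		"/sys/module/workqueue/parameters/cpu_intensive_thresh_us";
	const char *val = "20000\n";
	int fd = open(path, O_WRONLY);

	if (fd < 0) {
		perror(path);
		return 1;
	}
	if (write(fd, val, strlen(val)) < 0)
		perror("write");
	close(fd);
	return 0;
}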
+ * + * CONTEXT: + * raw_spin_lock_irq(pool->lock) + */ +static inline void worker_clr_flags(struct worker *worker, unsigned int flags) +{ + struct worker_pool *pool = worker->pool; + unsigned int oflags = worker->flags; + + WARN_ON_ONCE(worker->task != current); + + worker->flags &= ~flags; + + /* + * If transitioning out of NOT_RUNNING, increment nr_running. Note + * that the nested NOT_RUNNING is not a noop. NOT_RUNNING is mask + * of multiple flags, not a single flag. + */ + if ((flags & WORKER_NOT_RUNNING) && (oflags & WORKER_NOT_RUNNING)) + if (!(worker->flags & WORKER_NOT_RUNNING)) + pool->nr_running++; +} + +#ifdef CONFIG_WQ_CPU_INTENSIVE_REPORT + +/* + * Concurrency-managed per-cpu work items that hog CPU for longer than + * wq_cpu_intensive_thresh_us trigger the automatic CPU_INTENSIVE mechanism, + * which prevents them from stalling other concurrency-managed work items. If a + * work function keeps triggering this mechanism, it's likely that the work item + * should be using an unbound workqueue instead. + * + * wq_cpu_intensive_report() tracks work functions which trigger such conditions + * and report them so that they can be examined and converted to use unbound + * workqueues as appropriate. To avoid flooding the console, each violating work + * function is tracked and reported with exponential backoff. + */ +#define WCI_MAX_ENTS 128 + +struct wci_ent { + work_func_t func; + atomic64_t cnt; + struct hlist_node hash_node; +}; + +static struct wci_ent wci_ents[WCI_MAX_ENTS]; +static int wci_nr_ents; +static DEFINE_RAW_SPINLOCK(wci_lock); +static DEFINE_HASHTABLE(wci_hash, ilog2(WCI_MAX_ENTS)); + +static struct wci_ent *wci_find_ent(work_func_t func) +{ + struct wci_ent *ent; + + hash_for_each_possible_rcu(wci_hash, ent, hash_node, + (unsigned long)func) { + if (ent->func == func) + return ent; + } + return NULL; +} + +static void wq_cpu_intensive_report(work_func_t func) +{ + struct wci_ent *ent; + +restart: + ent = wci_find_ent(func); + if (ent) { + u64 cnt; + + /* + * Start reporting from the fourth time and back off + * exponentially. + */ + cnt = atomic64_inc_return_relaxed(&ent->cnt); + if (cnt >= 4 && is_power_of_2(cnt)) + printk_deferred(KERN_WARNING "workqueue: %ps hogged CPU for >%luus %llu times, consider switching to WQ_UNBOUND\n", + ent->func, wq_cpu_intensive_thresh_us, + atomic64_read(&ent->cnt)); + return; + } + + /* + * @func is a new violation. Allocate a new entry for it. If wcn_ents[] + * is exhausted, something went really wrong and we probably made enough + * noise already. 
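Editor's note: the throttling in wq_cpu_intensive_report() above only prints when the per-function violation count is at least 4 and a power of two, so a repeat offender is reported on its 4th, 8th, 16th, ... violation. A quick standalone illustration of that schedule:

/* Illustration of the exponential backoff used by the report above. */
#include <stdio.h>
#include <stdint.h>

static int is_power_of_2(uint64_t n)
{
	return n != 0 && (n & (n - 1)) == 0;
}

int main(void)
{
	for (uint64_t cnt = 1; cnt <= 64; cnt++)
		if (cnt >= 4 && is_power_of_2(cnt))
			printf("report at violation #%llu\n",
			       (unsigned long long)cnt);
	return 0;
}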
+ */ + if (wci_nr_ents >= WCI_MAX_ENTS) + return; + + raw_spin_lock(&wci_lock); + + if (wci_nr_ents >= WCI_MAX_ENTS) { + raw_spin_unlock(&wci_lock); + return; + } + + if (wci_find_ent(func)) { + raw_spin_unlock(&wci_lock); + goto restart; + } + + ent = &wci_ents[wci_nr_ents++]; + ent->func = func; + atomic64_set(&ent->cnt, 1); + hash_add_rcu(wci_hash, &ent->hash_node, (unsigned long)func); + + raw_spin_unlock(&wci_lock); +} + +#else /* CONFIG_WQ_CPU_INTENSIVE_REPORT */ +static void wq_cpu_intensive_report(work_func_t func) {} +#endif /* CONFIG_WQ_CPU_INTENSIVE_REPORT */ + +/** * wq_worker_running - a worker is running again * @task: task waking up * @@ -873,7 +1054,7 @@ void wq_worker_running(struct task_struct *task) { struct worker *worker = kthread_data(task); - if (!worker->sleeping) + if (!READ_ONCE(worker->sleeping)) return; /* @@ -886,7 +1067,14 @@ void wq_worker_running(struct task_struct *task) if (!(worker->flags & WORKER_NOT_RUNNING)) worker->pool->nr_running++; preempt_enable(); - worker->sleeping = 0; + + /* + * CPU intensive auto-detection cares about how long a work item hogged + * CPU without sleeping. Reset the starting timestamp on wakeup. + */ + worker->current_at = worker->task->se.sum_exec_runtime; + + WRITE_ONCE(worker->sleeping, 0); } /** @@ -912,10 +1100,10 @@ void wq_worker_sleeping(struct task_struct *task) pool = worker->pool; /* Return if preempted before wq_worker_running() was reached */ - if (worker->sleeping) + if (READ_ONCE(worker->sleeping)) return; - worker->sleeping = 1; + WRITE_ONCE(worker->sleeping, 1); raw_spin_lock_irq(&pool->lock); /* @@ -929,12 +1117,66 @@ void wq_worker_sleeping(struct task_struct *task) } pool->nr_running--; - if (need_more_worker(pool)) + if (need_more_worker(pool)) { + worker->current_pwq->stats[PWQ_STAT_CM_WAKEUP]++; wake_up_worker(pool); + } raw_spin_unlock_irq(&pool->lock); } /** + * wq_worker_tick - a scheduler tick occurred while a kworker is running + * @task: task currently running + * + * Called from scheduler_tick(). We're in the IRQ context and the current + * worker's fields which follow the 'K' locking rule can be accessed safely. + */ +void wq_worker_tick(struct task_struct *task) +{ + struct worker *worker = kthread_data(task); + struct pool_workqueue *pwq = worker->current_pwq; + struct worker_pool *pool = worker->pool; + + if (!pwq) + return; + + pwq->stats[PWQ_STAT_CPU_TIME] += TICK_USEC; + + if (!wq_cpu_intensive_thresh_us) + return; + + /* + * If the current worker is concurrency managed and hogged the CPU for + * longer than wq_cpu_intensive_thresh_us, it's automatically marked + * CPU_INTENSIVE to avoid stalling other concurrency-managed work items. + * + * Set @worker->sleeping means that @worker is in the process of + * switching out voluntarily and won't be contributing to + * @pool->nr_running until it wakes up. As wq_worker_sleeping() also + * decrements ->nr_running, setting CPU_INTENSIVE here can lead to + * double decrements. The task is releasing the CPU anyway. Let's skip. + * We probably want to make this prettier in the future. 
+ */ + if ((worker->flags & WORKER_NOT_RUNNING) || READ_ONCE(worker->sleeping) || + worker->task->se.sum_exec_runtime - worker->current_at < + wq_cpu_intensive_thresh_us * NSEC_PER_USEC) + return; + + raw_spin_lock(&pool->lock); + + worker_set_flags(worker, WORKER_CPU_INTENSIVE); + wq_cpu_intensive_report(worker->current_func); + pwq->stats[PWQ_STAT_CPU_INTENSIVE]++; + + if (need_more_worker(pool)) { + pwq->stats[PWQ_STAT_CM_WAKEUP]++; + wake_up_worker(pool); + } + + raw_spin_unlock(&pool->lock); +} + +/** * wq_worker_last_func - retrieve worker's last work function * @task: Task to retrieve last work function of. * @@ -966,60 +1208,6 @@ work_func_t wq_worker_last_func(struct task_struct *task) } /** - * worker_set_flags - set worker flags and adjust nr_running accordingly - * @worker: self - * @flags: flags to set - * - * Set @flags in @worker->flags and adjust nr_running accordingly. - * - * CONTEXT: - * raw_spin_lock_irq(pool->lock) - */ -static inline void worker_set_flags(struct worker *worker, unsigned int flags) -{ - struct worker_pool *pool = worker->pool; - - WARN_ON_ONCE(worker->task != current); - - /* If transitioning into NOT_RUNNING, adjust nr_running. */ - if ((flags & WORKER_NOT_RUNNING) && - !(worker->flags & WORKER_NOT_RUNNING)) { - pool->nr_running--; - } - - worker->flags |= flags; -} - -/** - * worker_clr_flags - clear worker flags and adjust nr_running accordingly - * @worker: self - * @flags: flags to clear - * - * Clear @flags in @worker->flags and adjust nr_running accordingly. - * - * CONTEXT: - * raw_spin_lock_irq(pool->lock) - */ -static inline void worker_clr_flags(struct worker *worker, unsigned int flags) -{ - struct worker_pool *pool = worker->pool; - unsigned int oflags = worker->flags; - - WARN_ON_ONCE(worker->task != current); - - worker->flags &= ~flags; - - /* - * If transitioning out of NOT_RUNNING, increment nr_running. Note - * that the nested NOT_RUNNING is not a noop. NOT_RUNNING is mask - * of multiple flags, not a single flag. - */ - if ((flags & WORKER_NOT_RUNNING) && (oflags & WORKER_NOT_RUNNING)) - if (!(worker->flags & WORKER_NOT_RUNNING)) - pool->nr_running++; -} - -/** * find_worker_executing_work - find worker which is executing a work * @pool: pool of interest * @work: work to find worker for @@ -1539,6 +1727,8 @@ out: * We queue the work to a specific CPU, the caller must ensure it * can't go away. Callers that fail to ensure that the specified * CPU cannot go away will execute on a randomly chosen CPU. + * But note well that callers specifying a CPU that never has been + * online will get a splat. * * Return: %false if @work was already on a queue, %true otherwise. 
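Editor's note: to make the trigger in wq_worker_tick() above concrete, a concurrency-managed worker is marked CPU_INTENSIVE once the CPU time accumulated since it started the current item (or since its last wakeup) reaches wq_cpu_intensive_thresh_us, 10 ms by default. A standalone model of the same comparison, with made-up runtimes:

/* Standalone model of the comparison above (example values only). */
#include <stdio.h>
#include <stdint.h>

#define NSEC_PER_USEC 1000ULL

int main(void)
{
	uint64_t thresh_us = 10000;		   /* default threshold */
	uint64_t current_at = 5000000000ULL;	   /* ns at work start */
	uint64_t sum_exec_runtime = 5012000000ULL; /* ns now */
	uint64_t ran_ns = sum_exec_runtime - current_at;

	if (ran_ns >= thresh_us * NSEC_PER_USEC)
		printf("worker would be marked CPU_INTENSIVE (ran %llu us)\n",
		       (unsigned long long)(ran_ns / 1000));
	else
		printf("still under the threshold\n");
	return 0;
}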
*/ @@ -2163,6 +2353,7 @@ static void send_mayday(struct work_struct *work) get_pwq(pwq); list_add_tail(&pwq->mayday_node, &wq->maydays); wake_up_process(wq->rescuer->task); + pwq->stats[PWQ_STAT_MAYDAY]++; } } @@ -2300,7 +2491,6 @@ __acquires(&pool->lock) { struct pool_workqueue *pwq = get_work_pwq(work); struct worker_pool *pool = worker->pool; - bool cpu_intensive = pwq->wq->flags & WQ_CPU_INTENSIVE; unsigned long work_data; struct worker *collision; #ifdef CONFIG_LOCKDEP @@ -2337,6 +2527,7 @@ __acquires(&pool->lock) worker->current_work = work; worker->current_func = work->func; worker->current_pwq = pwq; + worker->current_at = worker->task->se.sum_exec_runtime; work_data = *work_data_bits(work); worker->current_color = get_work_color(work_data); @@ -2354,7 +2545,7 @@ __acquires(&pool->lock) * of concurrency management and the next code block will chain * execution of the pending work items. */ - if (unlikely(cpu_intensive)) + if (unlikely(pwq->wq->flags & WQ_CPU_INTENSIVE)) worker_set_flags(worker, WORKER_CPU_INTENSIVE); /* @@ -2401,6 +2592,7 @@ __acquires(&pool->lock) * workqueues), so hiding them isn't a problem. */ lockdep_invariant_state(true); + pwq->stats[PWQ_STAT_STARTED]++; trace_workqueue_execute_start(work); worker->current_func(work); /* @@ -2408,6 +2600,7 @@ __acquires(&pool->lock) * point will only record its address. */ trace_workqueue_execute_end(work, worker->current_func); + pwq->stats[PWQ_STAT_COMPLETED]++; lock_map_release(&lockdep_map); lock_map_release(&pwq->wq->lockdep_map); @@ -2432,9 +2625,12 @@ __acquires(&pool->lock) raw_spin_lock_irq(&pool->lock); - /* clear cpu intensive status */ - if (unlikely(cpu_intensive)) - worker_clr_flags(worker, WORKER_CPU_INTENSIVE); + /* + * In addition to %WQ_CPU_INTENSIVE, @worker may also have been marked + * CPU intensive by wq_worker_tick() if @work hogged CPU longer than + * wq_cpu_intensive_thresh_us. Clear it. + */ + worker_clr_flags(worker, WORKER_CPU_INTENSIVE); /* tag the worker for identification in schedule() */ worker->last_func = worker->current_func; @@ -2651,6 +2847,7 @@ repeat: if (first) pool->watchdog_ts = jiffies; move_linked_works(work, scheduled, &n); + pwq->stats[PWQ_STAT_RESCUED]++; } first = false; } diff --git a/kernel/workqueue_internal.h b/kernel/workqueue_internal.h index e00b1204a8e9..6b1d66e28269 100644 --- a/kernel/workqueue_internal.h +++ b/kernel/workqueue_internal.h @@ -28,13 +28,18 @@ struct worker { struct hlist_node hentry; /* L: while busy */ }; - struct work_struct *current_work; /* L: work being processed */ - work_func_t current_func; /* L: current_work's fn */ - struct pool_workqueue *current_pwq; /* L: current_work's pwq */ - unsigned int current_color; /* L: current_work's color */ - struct list_head scheduled; /* L: scheduled works */ + struct work_struct *current_work; /* K: work being processed and its */ + work_func_t current_func; /* K: function */ + struct pool_workqueue *current_pwq; /* K: pwq */ + u64 current_at; /* K: runtime at start or last wakeup */ + unsigned int current_color; /* K: color */ + + int sleeping; /* S: is worker sleeping? 
*/ - /* 64 bytes boundary on 64bit, 32 on 32bit */ + /* used by the scheduler to determine a worker's last known identity */ + work_func_t last_func; /* K: last work's fn */ + + struct list_head scheduled; /* L: scheduled works */ struct task_struct *task; /* I: worker task */ struct worker_pool *pool; /* A: the associated pool */ @@ -42,10 +47,9 @@ struct worker { struct list_head node; /* A: anchored at pool->workers */ /* A: runs through worker->node */ - unsigned long last_active; /* L: last active timestamp */ + unsigned long last_active; /* K: last active timestamp */ unsigned int flags; /* X: flags */ int id; /* I: worker id */ - int sleeping; /* None */ /* * Opaque string set with work_set_desc(). Printed out with task @@ -55,9 +59,6 @@ struct worker { /* used only by rescuers to point to the target workqueue */ struct workqueue_struct *rescue_wq; /* I: the workqueue to rescue */ - - /* used by the scheduler to determine a worker's last known identity */ - work_func_t last_func; }; /** @@ -76,6 +77,7 @@ static inline struct worker *current_wq_worker(void) */ void wq_worker_running(struct task_struct *task); void wq_worker_sleeping(struct task_struct *task); +void wq_worker_tick(struct task_struct *task); work_func_t wq_worker_last_func(struct task_struct *task); #endif /* _KERNEL_WORKQUEUE_INTERNAL_H */ |
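Editor's note: when the new "hogged CPU ... consider switching to WQ_UNBOUND" report keeps firing for a work function, the suggested remedy is to move that work off the per-cpu, concurrency-managed pools. A minimal conversion sketch follows; my_wq, my_work and my_heavy_fn() are hypothetical names, not from this patch.

/* Sketch: running a CPU-hogging work item on an unbound workqueue,
 * whose workers are not concurrency managed and hence never trip the
 * CPU_INTENSIVE auto-detection added above.
 */
#include <linux/workqueue.h>
#include <linux/init.h>
#include <linux/errno.h>

static struct workqueue_struct *my_wq;

static void my_heavy_fn(struct work_struct *work)
{
	/* long number crunching that would exceed the 10ms threshold */
}
static DECLARE_WORK(my_work, my_heavy_fn);

static int __init my_heavy_init(void)
{
	my_wq = alloc_workqueue("my_wq", WQ_UNBOUND, 0);
	if (!my_wq)
		return -ENOMEM;
	queue_work(my_wq, &my_work);
	return 0;
}

static void my_heavy_exit(void)
{
	flush_workqueue(my_wq);
	destroy_workqueue(my_wq);
}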