diff options
Diffstat (limited to 'kernel')
201 files changed, 10501 insertions, 4465 deletions
diff --git a/kernel/Kconfig.preempt b/kernel/Kconfig.preempt index ce77f0265660..8c6de5a9ecc4 100644 --- a/kernel/Kconfig.preempt +++ b/kernel/Kconfig.preempt @@ -96,8 +96,9 @@ config PREEMPTION config PREEMPT_DYNAMIC bool "Preemption behaviour defined on boot" depends on HAVE_PREEMPT_DYNAMIC && !PREEMPT_RT + select JUMP_LABEL if HAVE_PREEMPT_DYNAMIC_KEY select PREEMPT_BUILD - default y + default y if HAVE_PREEMPT_DYNAMIC_CALL help This option allows to define the preemption model on the kernel command line parameter and thus override the default preemption @@ -132,4 +133,14 @@ config SCHED_CORE which is the likely usage by Linux distributions, there should be no measurable impact on performance. - +config ARCH_WANTS_RT_DELAYED_SIGNALS + bool + help + This option is selected by architectures where raising signals + can happen in atomic contexts on PREEMPT_RT enabled kernels. This + option delays raising the signal until the return to user space + loop where it is also delivered. X86 requires this to deliver + signals from trap handlers which run on IST stacks. + +config RT_DELAYED_SIGNALS + def_bool PREEMPT_RT && ARCH_WANTS_RT_DELAYED_SIGNALS diff --git a/kernel/Makefile b/kernel/Makefile index 186c49582f45..56f4ee97f328 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -67,6 +67,7 @@ obj-y += up.o endif obj-$(CONFIG_UID16) += uid16.o obj-$(CONFIG_MODULES) += module.o +obj-$(CONFIG_MODULE_DECOMPRESS) += module_decompress.o obj-$(CONFIG_MODULE_SIG) += module_signing.o obj-$(CONFIG_MODULE_SIG_FORMAT) += module_signature.o obj-$(CONFIG_KALLSYMS) += kallsyms.o diff --git a/kernel/async.c b/kernel/async.c index b8d7a663497f..b2c4ba5686ee 100644 --- a/kernel/async.c +++ b/kernel/async.c @@ -205,9 +205,6 @@ async_cookie_t async_schedule_node_domain(async_func_t func, void *data, atomic_inc(&entry_count); spin_unlock_irqrestore(&async_lock, flags); - /* mark that this task has queued an async job, used by module init */ - current->flags |= PF_USED_ASYNC; - /* schedule for execution */ queue_work_node(node, system_unbound_wq, &entry->work); diff --git a/kernel/audit.c b/kernel/audit.c index e4bbe2c70c26..7690c29d4ee4 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -541,20 +541,22 @@ static void kauditd_printk_skb(struct sk_buff *skb) /** * kauditd_rehold_skb - Handle a audit record send failure in the hold queue * @skb: audit record + * @error: error code (unused) * * Description: * This should only be used by the kauditd_thread when it fails to flush the * hold queue. */ -static void kauditd_rehold_skb(struct sk_buff *skb) +static void kauditd_rehold_skb(struct sk_buff *skb, __always_unused int error) { - /* put the record back in the queue at the same place */ - skb_queue_head(&audit_hold_queue, skb); + /* put the record back in the queue */ + skb_queue_tail(&audit_hold_queue, skb); } /** * kauditd_hold_skb - Queue an audit record, waiting for auditd * @skb: audit record + * @error: error code * * Description: * Queue the audit record, waiting for an instance of auditd. When this @@ -564,19 +566,31 @@ static void kauditd_rehold_skb(struct sk_buff *skb) * and queue it, if we have room. If we want to hold on to the record, but we * don't have room, record a record lost message. */ -static void kauditd_hold_skb(struct sk_buff *skb) +static void kauditd_hold_skb(struct sk_buff *skb, int error) { /* at this point it is uncertain if we will ever send this to auditd so * try to send the message via printk before we go any further */ kauditd_printk_skb(skb); /* can we just silently drop the message? */ - if (!audit_default) { - kfree_skb(skb); - return; + if (!audit_default) + goto drop; + + /* the hold queue is only for when the daemon goes away completely, + * not -EAGAIN failures; if we are in a -EAGAIN state requeue the + * record on the retry queue unless it's full, in which case drop it + */ + if (error == -EAGAIN) { + if (!audit_backlog_limit || + skb_queue_len(&audit_retry_queue) < audit_backlog_limit) { + skb_queue_tail(&audit_retry_queue, skb); + return; + } + audit_log_lost("kauditd retry queue overflow"); + goto drop; } - /* if we have room, queue the message */ + /* if we have room in the hold queue, queue the message */ if (!audit_backlog_limit || skb_queue_len(&audit_hold_queue) < audit_backlog_limit) { skb_queue_tail(&audit_hold_queue, skb); @@ -585,24 +599,32 @@ static void kauditd_hold_skb(struct sk_buff *skb) /* we have no other options - drop the message */ audit_log_lost("kauditd hold queue overflow"); +drop: kfree_skb(skb); } /** * kauditd_retry_skb - Queue an audit record, attempt to send again to auditd * @skb: audit record + * @error: error code (unused) * * Description: * Not as serious as kauditd_hold_skb() as we still have a connected auditd, * but for some reason we are having problems sending it audit records so * queue the given record and attempt to resend. */ -static void kauditd_retry_skb(struct sk_buff *skb) +static void kauditd_retry_skb(struct sk_buff *skb, __always_unused int error) { - /* NOTE: because records should only live in the retry queue for a - * short period of time, before either being sent or moved to the hold - * queue, we don't currently enforce a limit on this queue */ - skb_queue_tail(&audit_retry_queue, skb); + if (!audit_backlog_limit || + skb_queue_len(&audit_retry_queue) < audit_backlog_limit) { + skb_queue_tail(&audit_retry_queue, skb); + return; + } + + /* we have to drop the record, send it via printk as a last effort */ + kauditd_printk_skb(skb); + audit_log_lost("kauditd retry queue overflow"); + kfree_skb(skb); } /** @@ -640,7 +662,7 @@ static void auditd_reset(const struct auditd_connection *ac) /* flush the retry queue to the hold queue, but don't touch the main * queue since we need to process that normally for multicast */ while ((skb = skb_dequeue(&audit_retry_queue))) - kauditd_hold_skb(skb); + kauditd_hold_skb(skb, -ECONNREFUSED); } /** @@ -714,16 +736,18 @@ static int kauditd_send_queue(struct sock *sk, u32 portid, struct sk_buff_head *queue, unsigned int retry_limit, void (*skb_hook)(struct sk_buff *skb), - void (*err_hook)(struct sk_buff *skb)) + void (*err_hook)(struct sk_buff *skb, int error)) { int rc = 0; - struct sk_buff *skb; + struct sk_buff *skb = NULL; + struct sk_buff *skb_tail; unsigned int failed = 0; /* NOTE: kauditd_thread takes care of all our locking, we just use * the netlink info passed to us (e.g. sk and portid) */ - while ((skb = skb_dequeue(queue))) { + skb_tail = skb_peek_tail(queue); + while ((skb != skb_tail) && (skb = skb_dequeue(queue))) { /* call the skb_hook for each skb we touch */ if (skb_hook) (*skb_hook)(skb); @@ -731,7 +755,7 @@ static int kauditd_send_queue(struct sock *sk, u32 portid, /* can we send to anyone via unicast? */ if (!sk) { if (err_hook) - (*err_hook)(skb); + (*err_hook)(skb, -ECONNREFUSED); continue; } @@ -745,7 +769,7 @@ retry: rc == -ECONNREFUSED || rc == -EPERM) { sk = NULL; if (err_hook) - (*err_hook)(skb); + (*err_hook)(skb, rc); if (rc == -EAGAIN) rc = 0; /* continue to drain the queue */ diff --git a/kernel/audit.h b/kernel/audit.h index c4498090a5bd..58b66543b4d5 100644 --- a/kernel/audit.h +++ b/kernel/audit.h @@ -201,6 +201,10 @@ struct audit_context { struct { char *name; } module; + struct { + struct audit_ntp_data ntp_data; + struct timespec64 tk_injoffset; + } time; }; int fds[2]; struct audit_proctitle proctitle; diff --git a/kernel/auditsc.c b/kernel/auditsc.c index fce5d43a933f..ea2ee1181921 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -185,7 +185,7 @@ static int audit_match_perm(struct audit_context *ctx, int mask) case AUDITSC_EXECVE: return mask & AUDIT_PERM_EXEC; case AUDITSC_OPENAT2: - return mask & ACC_MODE((u32)((struct open_how *)ctx->argv[2])->flags); + return mask & ACC_MODE((u32)ctx->openat2.flags); default: return 0; } @@ -1340,6 +1340,53 @@ static void audit_log_fcaps(struct audit_buffer *ab, struct audit_names *name) from_kuid(&init_user_ns, name->fcap.rootid)); } +static void audit_log_time(struct audit_context *context, struct audit_buffer **ab) +{ + const struct audit_ntp_data *ntp = &context->time.ntp_data; + const struct timespec64 *tk = &context->time.tk_injoffset; + static const char * const ntp_name[] = { + "offset", + "freq", + "status", + "tai", + "tick", + "adjust", + }; + int type; + + if (context->type == AUDIT_TIME_ADJNTPVAL) { + for (type = 0; type < AUDIT_NTP_NVALS; type++) { + if (ntp->vals[type].newval != ntp->vals[type].oldval) { + if (!*ab) { + *ab = audit_log_start(context, + GFP_KERNEL, + AUDIT_TIME_ADJNTPVAL); + if (!*ab) + return; + } + audit_log_format(*ab, "op=%s old=%lli new=%lli", + ntp_name[type], + ntp->vals[type].oldval, + ntp->vals[type].newval); + audit_log_end(*ab); + *ab = NULL; + } + } + } + if (tk->tv_sec != 0 || tk->tv_nsec != 0) { + if (!*ab) { + *ab = audit_log_start(context, GFP_KERNEL, + AUDIT_TIME_INJOFFSET); + if (!*ab) + return; + } + audit_log_format(*ab, "sec=%lli nsec=%li", + (long long)tk->tv_sec, tk->tv_nsec); + audit_log_end(*ab); + *ab = NULL; + } +} + static void show_special(struct audit_context *context, int *call_panic) { struct audit_buffer *ab; @@ -1454,6 +1501,11 @@ static void show_special(struct audit_context *context, int *call_panic) audit_log_format(ab, "(null)"); break; + case AUDIT_TIME_ADJNTPVAL: + case AUDIT_TIME_INJOFFSET: + /* this call deviates from the rest, eating the buffer */ + audit_log_time(context, &ab); + break; } audit_log_end(ab); } @@ -2849,31 +2901,26 @@ void __audit_fanotify(unsigned int response) void __audit_tk_injoffset(struct timespec64 offset) { - audit_log(audit_context(), GFP_KERNEL, AUDIT_TIME_INJOFFSET, - "sec=%lli nsec=%li", - (long long)offset.tv_sec, offset.tv_nsec); -} - -static void audit_log_ntp_val(const struct audit_ntp_data *ad, - const char *op, enum audit_ntp_type type) -{ - const struct audit_ntp_val *val = &ad->vals[type]; - - if (val->newval == val->oldval) - return; + struct audit_context *context = audit_context(); - audit_log(audit_context(), GFP_KERNEL, AUDIT_TIME_ADJNTPVAL, - "op=%s old=%lli new=%lli", op, val->oldval, val->newval); + /* only set type if not already set by NTP */ + if (!context->type) + context->type = AUDIT_TIME_INJOFFSET; + memcpy(&context->time.tk_injoffset, &offset, sizeof(offset)); } void __audit_ntp_log(const struct audit_ntp_data *ad) { - audit_log_ntp_val(ad, "offset", AUDIT_NTP_OFFSET); - audit_log_ntp_val(ad, "freq", AUDIT_NTP_FREQ); - audit_log_ntp_val(ad, "status", AUDIT_NTP_STATUS); - audit_log_ntp_val(ad, "tai", AUDIT_NTP_TAI); - audit_log_ntp_val(ad, "tick", AUDIT_NTP_TICK); - audit_log_ntp_val(ad, "adjust", AUDIT_NTP_ADJUST); + struct audit_context *context = audit_context(); + int type; + + for (type = 0; type < AUDIT_NTP_NVALS; type++) + if (ad->vals[type].newval != ad->vals[type].oldval) { + /* unconditionally set type, overwriting TK */ + context->type = AUDIT_TIME_ADJNTPVAL; + memcpy(&context->time.ntp_data, ad, sizeof(*ad)); + break; + } } void __audit_log_nfcfg(const char *name, u8 af, unsigned int nentries, diff --git a/kernel/bpf/Kconfig b/kernel/bpf/Kconfig index d24d518ddd63..d56ee177d5f8 100644 --- a/kernel/bpf/Kconfig +++ b/kernel/bpf/Kconfig @@ -30,6 +30,7 @@ config BPF_SYSCALL select TASKS_TRACE_RCU select BINARY_PRINTF select NET_SOCK_MSG if NET + select PAGE_POOL if NET default n help Enable the bpf() system call that allows to manipulate BPF programs @@ -58,6 +59,10 @@ config BPF_JIT_ALWAYS_ON Enables BPF JIT and removes BPF interpreter to avoid speculative execution of BPF instructions by the interpreter. + When CONFIG_BPF_JIT_ALWAYS_ON is enabled, /proc/sys/net/core/bpf_jit_enable + is permanently set to 1 and setting any other value than that will + return failure. + config BPF_JIT_DEFAULT_ON def_bool ARCH_WANT_DEFAULT_BPF_JIT || BPF_JIT_ALWAYS_ON depends on HAVE_EBPF_JIT && BPF_JIT diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index c7a5be3bf8be..7f145aefbff8 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -837,13 +837,12 @@ static int fd_array_map_delete_elem(struct bpf_map *map, void *key) static void *prog_fd_array_get_ptr(struct bpf_map *map, struct file *map_file, int fd) { - struct bpf_array *array = container_of(map, struct bpf_array, map); struct bpf_prog *prog = bpf_prog_get(fd); if (IS_ERR(prog)) return prog; - if (!bpf_prog_array_compatible(array, prog)) { + if (!bpf_prog_map_compatible(map, prog)) { bpf_prog_put(prog); return ERR_PTR(-EINVAL); } @@ -1071,7 +1070,6 @@ static struct bpf_map *prog_array_map_alloc(union bpf_attr *attr) INIT_WORK(&aux->work, prog_array_map_clear_deferred); INIT_LIST_HEAD(&aux->poke_progs); mutex_init(&aux->poke_mutex); - spin_lock_init(&aux->owner.lock); map = array_map_alloc(attr); if (IS_ERR(map)) { diff --git a/kernel/bpf/bpf_inode_storage.c b/kernel/bpf/bpf_inode_storage.c index e29d9e3d853e..96be8d518885 100644 --- a/kernel/bpf/bpf_inode_storage.c +++ b/kernel/bpf/bpf_inode_storage.c @@ -136,7 +136,7 @@ static int bpf_fd_inode_storage_update_elem(struct bpf_map *map, void *key, sdata = bpf_local_storage_update(f->f_inode, (struct bpf_local_storage_map *)map, - value, map_flags); + value, map_flags, GFP_ATOMIC); fput(f); return PTR_ERR_OR_ZERO(sdata); } @@ -169,8 +169,9 @@ static int bpf_fd_inode_storage_delete_elem(struct bpf_map *map, void *key) return err; } -BPF_CALL_4(bpf_inode_storage_get, struct bpf_map *, map, struct inode *, inode, - void *, value, u64, flags) +/* *gfp_flags* is a hidden argument provided by the verifier */ +BPF_CALL_5(bpf_inode_storage_get, struct bpf_map *, map, struct inode *, inode, + void *, value, u64, flags, gfp_t, gfp_flags) { struct bpf_local_storage_data *sdata; @@ -196,7 +197,7 @@ BPF_CALL_4(bpf_inode_storage_get, struct bpf_map *, map, struct inode *, inode, if (flags & BPF_LOCAL_STORAGE_GET_F_CREATE) { sdata = bpf_local_storage_update( inode, (struct bpf_local_storage_map *)map, value, - BPF_NOEXIST); + BPF_NOEXIST, gfp_flags); return IS_ERR(sdata) ? (unsigned long)NULL : (unsigned long)sdata->data; } diff --git a/kernel/bpf/bpf_iter.c b/kernel/bpf/bpf_iter.c index b7aef5b3416d..110029ede71e 100644 --- a/kernel/bpf/bpf_iter.c +++ b/kernel/bpf/bpf_iter.c @@ -5,6 +5,7 @@ #include <linux/anon_inodes.h> #include <linux/filter.h> #include <linux/bpf.h> +#include <linux/rcupdate_trace.h> struct bpf_iter_target_info { struct list_head list; @@ -684,11 +685,20 @@ int bpf_iter_run_prog(struct bpf_prog *prog, void *ctx) { int ret; - rcu_read_lock(); - migrate_disable(); - ret = bpf_prog_run(prog, ctx); - migrate_enable(); - rcu_read_unlock(); + if (prog->aux->sleepable) { + rcu_read_lock_trace(); + migrate_disable(); + might_fault(); + ret = bpf_prog_run(prog, ctx); + migrate_enable(); + rcu_read_unlock_trace(); + } else { + rcu_read_lock(); + migrate_disable(); + ret = bpf_prog_run(prog, ctx); + migrate_enable(); + rcu_read_unlock(); + } /* bpf program can only return 0 or 1: * 0 : okay diff --git a/kernel/bpf/bpf_local_storage.c b/kernel/bpf/bpf_local_storage.c index 71de2a89869c..01aa2b51ec4d 100644 --- a/kernel/bpf/bpf_local_storage.c +++ b/kernel/bpf/bpf_local_storage.c @@ -63,7 +63,7 @@ static bool selem_linked_to_map(const struct bpf_local_storage_elem *selem) struct bpf_local_storage_elem * bpf_selem_alloc(struct bpf_local_storage_map *smap, void *owner, - void *value, bool charge_mem) + void *value, bool charge_mem, gfp_t gfp_flags) { struct bpf_local_storage_elem *selem; @@ -71,7 +71,7 @@ bpf_selem_alloc(struct bpf_local_storage_map *smap, void *owner, return NULL; selem = bpf_map_kzalloc(&smap->map, smap->elem_size, - GFP_ATOMIC | __GFP_NOWARN); + gfp_flags | __GFP_NOWARN); if (selem) { if (value) memcpy(SDATA(selem)->data, value, smap->map.value_size); @@ -136,7 +136,7 @@ bool bpf_selem_unlink_storage_nolock(struct bpf_local_storage *local_storage, * will be done by the caller. * * Although the unlock will be done under - * rcu_read_lock(), it is more intutivie to + * rcu_read_lock(), it is more intuitive to * read if the freeing of the storage is done * after the raw_spin_unlock_bh(&local_storage->lock). * @@ -282,7 +282,8 @@ static int check_flags(const struct bpf_local_storage_data *old_sdata, int bpf_local_storage_alloc(void *owner, struct bpf_local_storage_map *smap, - struct bpf_local_storage_elem *first_selem) + struct bpf_local_storage_elem *first_selem, + gfp_t gfp_flags) { struct bpf_local_storage *prev_storage, *storage; struct bpf_local_storage **owner_storage_ptr; @@ -293,7 +294,7 @@ int bpf_local_storage_alloc(void *owner, return err; storage = bpf_map_kzalloc(&smap->map, sizeof(*storage), - GFP_ATOMIC | __GFP_NOWARN); + gfp_flags | __GFP_NOWARN); if (!storage) { err = -ENOMEM; goto uncharge; @@ -350,10 +351,10 @@ uncharge: */ struct bpf_local_storage_data * bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap, - void *value, u64 map_flags) + void *value, u64 map_flags, gfp_t gfp_flags) { struct bpf_local_storage_data *old_sdata = NULL; - struct bpf_local_storage_elem *selem; + struct bpf_local_storage_elem *selem = NULL; struct bpf_local_storage *local_storage; unsigned long flags; int err; @@ -365,6 +366,9 @@ bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap, !map_value_has_spin_lock(&smap->map))) return ERR_PTR(-EINVAL); + if (gfp_flags == GFP_KERNEL && (map_flags & ~BPF_F_LOCK) != BPF_NOEXIST) + return ERR_PTR(-EINVAL); + local_storage = rcu_dereference_check(*owner_storage(smap, owner), bpf_rcu_lock_held()); if (!local_storage || hlist_empty(&local_storage->list)) { @@ -373,11 +377,11 @@ bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap, if (err) return ERR_PTR(err); - selem = bpf_selem_alloc(smap, owner, value, true); + selem = bpf_selem_alloc(smap, owner, value, true, gfp_flags); if (!selem) return ERR_PTR(-ENOMEM); - err = bpf_local_storage_alloc(owner, smap, selem); + err = bpf_local_storage_alloc(owner, smap, selem, gfp_flags); if (err) { kfree(selem); mem_uncharge(smap, owner, smap->elem_size); @@ -404,6 +408,12 @@ bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap, } } + if (gfp_flags == GFP_KERNEL) { + selem = bpf_selem_alloc(smap, owner, value, true, gfp_flags); + if (!selem) + return ERR_PTR(-ENOMEM); + } + raw_spin_lock_irqsave(&local_storage->lock, flags); /* Recheck local_storage->list under local_storage->lock */ @@ -429,19 +439,21 @@ bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap, goto unlock; } - /* local_storage->lock is held. Hence, we are sure - * we can unlink and uncharge the old_sdata successfully - * later. Hence, instead of charging the new selem now - * and then uncharge the old selem later (which may cause - * a potential but unnecessary charge failure), avoid taking - * a charge at all here (the "!old_sdata" check) and the - * old_sdata will not be uncharged later during - * bpf_selem_unlink_storage_nolock(). - */ - selem = bpf_selem_alloc(smap, owner, value, !old_sdata); - if (!selem) { - err = -ENOMEM; - goto unlock_err; + if (gfp_flags != GFP_KERNEL) { + /* local_storage->lock is held. Hence, we are sure + * we can unlink and uncharge the old_sdata successfully + * later. Hence, instead of charging the new selem now + * and then uncharge the old selem later (which may cause + * a potential but unnecessary charge failure), avoid taking + * a charge at all here (the "!old_sdata" check) and the + * old_sdata will not be uncharged later during + * bpf_selem_unlink_storage_nolock(). + */ + selem = bpf_selem_alloc(smap, owner, value, !old_sdata, gfp_flags); + if (!selem) { + err = -ENOMEM; + goto unlock_err; + } } /* First, link the new selem to the map */ @@ -463,6 +475,10 @@ unlock: unlock_err: raw_spin_unlock_irqrestore(&local_storage->lock, flags); + if (selem) { + mem_uncharge(smap, owner, smap->elem_size); + kfree(selem); + } return ERR_PTR(err); } diff --git a/kernel/bpf/bpf_lsm.c b/kernel/bpf/bpf_lsm.c index 06062370c3b8..064eccba641d 100644 --- a/kernel/bpf/bpf_lsm.c +++ b/kernel/bpf/bpf_lsm.c @@ -99,6 +99,24 @@ static const struct bpf_func_proto bpf_ima_inode_hash_proto = { .allowed = bpf_ima_inode_hash_allowed, }; +BPF_CALL_3(bpf_ima_file_hash, struct file *, file, void *, dst, u32, size) +{ + return ima_file_hash(file, dst, size); +} + +BTF_ID_LIST_SINGLE(bpf_ima_file_hash_btf_ids, struct, file) + +static const struct bpf_func_proto bpf_ima_file_hash_proto = { + .func = bpf_ima_file_hash, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_BTF_ID, + .arg1_btf_id = &bpf_ima_file_hash_btf_ids[0], + .arg2_type = ARG_PTR_TO_UNINIT_MEM, + .arg3_type = ARG_CONST_SIZE, + .allowed = bpf_ima_inode_hash_allowed, +}; + static const struct bpf_func_proto * bpf_lsm_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { @@ -121,6 +139,8 @@ bpf_lsm_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_bprm_opts_set_proto; case BPF_FUNC_ima_inode_hash: return prog->aux->sleepable ? &bpf_ima_inode_hash_proto : NULL; + case BPF_FUNC_ima_file_hash: + return prog->aux->sleepable ? &bpf_ima_file_hash_proto : NULL; default: return tracing_prog_func_proto(func_id, prog); } @@ -167,6 +187,7 @@ BTF_ID(func, bpf_lsm_inode_setxattr) BTF_ID(func, bpf_lsm_inode_symlink) BTF_ID(func, bpf_lsm_inode_unlink) BTF_ID(func, bpf_lsm_kernel_module_request) +BTF_ID(func, bpf_lsm_kernel_read_file) BTF_ID(func, bpf_lsm_kernfs_init_security) #ifdef CONFIG_KEYS @@ -207,7 +228,7 @@ BTF_ID(func, bpf_lsm_socket_socketpair) BTF_ID(func, bpf_lsm_syslog) BTF_ID(func, bpf_lsm_task_alloc) -BTF_ID(func, bpf_lsm_task_getsecid_subj) +BTF_ID(func, bpf_lsm_current_getsecid_subj) BTF_ID(func, bpf_lsm_task_getsecid_obj) BTF_ID(func, bpf_lsm_task_prctl) BTF_ID(func, bpf_lsm_task_setscheduler) diff --git a/kernel/bpf/bpf_task_storage.c b/kernel/bpf/bpf_task_storage.c index 5da7bed0f5f6..6638a0ecc3d2 100644 --- a/kernel/bpf/bpf_task_storage.c +++ b/kernel/bpf/bpf_task_storage.c @@ -174,7 +174,8 @@ static int bpf_pid_task_storage_update_elem(struct bpf_map *map, void *key, bpf_task_storage_lock(); sdata = bpf_local_storage_update( - task, (struct bpf_local_storage_map *)map, value, map_flags); + task, (struct bpf_local_storage_map *)map, value, map_flags, + GFP_ATOMIC); bpf_task_storage_unlock(); err = PTR_ERR_OR_ZERO(sdata); @@ -226,8 +227,9 @@ out: return err; } -BPF_CALL_4(bpf_task_storage_get, struct bpf_map *, map, struct task_struct *, - task, void *, value, u64, flags) +/* *gfp_flags* is a hidden argument provided by the verifier */ +BPF_CALL_5(bpf_task_storage_get, struct bpf_map *, map, struct task_struct *, + task, void *, value, u64, flags, gfp_t, gfp_flags) { struct bpf_local_storage_data *sdata; @@ -250,7 +252,7 @@ BPF_CALL_4(bpf_task_storage_get, struct bpf_map *, map, struct task_struct *, (flags & BPF_LOCAL_STORAGE_GET_F_CREATE)) sdata = bpf_local_storage_update( task, (struct bpf_local_storage_map *)map, value, - BPF_NOEXIST); + BPF_NOEXIST, gfp_flags); unlock: bpf_task_storage_unlock(); diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 33bb8ae4a804..24788ce564a0 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -1,4 +1,4 @@ -/* SPDX-License-Identifier: GPL-2.0 */ +// SPDX-License-Identifier: GPL-2.0 /* Copyright (c) 2018 Facebook */ #include <uapi/linux/btf.h> @@ -198,6 +198,21 @@ DEFINE_IDR(btf_idr); DEFINE_SPINLOCK(btf_idr_lock); +enum btf_kfunc_hook { + BTF_KFUNC_HOOK_XDP, + BTF_KFUNC_HOOK_TC, + BTF_KFUNC_HOOK_STRUCT_OPS, + BTF_KFUNC_HOOK_MAX, +}; + +enum { + BTF_KFUNC_SET_MAX_CNT = 32, +}; + +struct btf_kfunc_set_tab { + struct btf_id_set *sets[BTF_KFUNC_HOOK_MAX][BTF_KFUNC_TYPE_MAX]; +}; + struct btf { void *data; struct btf_type **types; @@ -212,6 +227,7 @@ struct btf { refcount_t refcnt; u32 id; struct rcu_head rcu; + struct btf_kfunc_set_tab *kfunc_set_tab; /* split BTF support */ struct btf *base_btf; @@ -403,6 +419,9 @@ static struct btf_type btf_void; static int btf_resolve(struct btf_verifier_env *env, const struct btf_type *t, u32 type_id); +static int btf_func_check(struct btf_verifier_env *env, + const struct btf_type *t); + static bool btf_type_is_modifier(const struct btf_type *t) { /* Some of them is not strictly a C modifier @@ -506,6 +525,50 @@ s32 btf_find_by_name_kind(const struct btf *btf, const char *name, u8 kind) return -ENOENT; } +static s32 bpf_find_btf_id(const char *name, u32 kind, struct btf **btf_p) +{ + struct btf *btf; + s32 ret; + int id; + + btf = bpf_get_btf_vmlinux(); + if (IS_ERR(btf)) + return PTR_ERR(btf); + if (!btf) + return -EINVAL; + + ret = btf_find_by_name_kind(btf, name, kind); + /* ret is never zero, since btf_find_by_name_kind returns + * positive btf_id or negative error. + */ + if (ret > 0) { + btf_get(btf); + *btf_p = btf; + return ret; + } + + /* If name is not found in vmlinux's BTF then search in module's BTFs */ + spin_lock_bh(&btf_idr_lock); + idr_for_each_entry(&btf_idr, btf, id) { + if (!btf_is_module(btf)) + continue; + /* linear search could be slow hence unlock/lock + * the IDR to avoiding holding it for too long + */ + btf_get(btf); + spin_unlock_bh(&btf_idr_lock); + ret = btf_find_by_name_kind(btf, name, kind); + if (ret > 0) { + *btf_p = btf; + return ret; + } + spin_lock_bh(&btf_idr_lock); + btf_put(btf); + } + spin_unlock_bh(&btf_idr_lock); + return ret; +} + const struct btf_type *btf_type_skip_modifiers(const struct btf *btf, u32 id, u32 *res_id) { @@ -579,6 +642,7 @@ static bool btf_type_needs_resolve(const struct btf_type *t) btf_type_is_struct(t) || btf_type_is_array(t) || btf_type_is_var(t) || + btf_type_is_func(t) || btf_type_is_decl_tag(t) || btf_type_is_datasec(t); } @@ -1531,8 +1595,30 @@ static void btf_free_id(struct btf *btf) spin_unlock_irqrestore(&btf_idr_lock, flags); } +static void btf_free_kfunc_set_tab(struct btf *btf) +{ + struct btf_kfunc_set_tab *tab = btf->kfunc_set_tab; + int hook, type; + + if (!tab) + return; + /* For module BTF, we directly assign the sets being registered, so + * there is nothing to free except kfunc_set_tab. + */ + if (btf_is_module(btf)) + goto free_tab; + for (hook = 0; hook < ARRAY_SIZE(tab->sets); hook++) { + for (type = 0; type < ARRAY_SIZE(tab->sets[0]); type++) + kfree(tab->sets[hook][type]); + } +free_tab: + kfree(tab); + btf->kfunc_set_tab = NULL; +} + static void btf_free(struct btf *btf) { + btf_free_kfunc_set_tab(btf); kvfree(btf->types); kvfree(btf->resolved_sizes); kvfree(btf->resolved_ids); @@ -2505,7 +2591,7 @@ static int btf_ptr_resolve(struct btf_verifier_env *env, * * We now need to continue from the last-resolved-ptr to * ensure the last-resolved-ptr will not referring back to - * the currenct ptr (t). + * the current ptr (t). */ if (btf_type_is_modifier(next_type)) { const struct btf_type *resolved_type; @@ -3533,9 +3619,24 @@ static s32 btf_func_check_meta(struct btf_verifier_env *env, return 0; } +static int btf_func_resolve(struct btf_verifier_env *env, + const struct resolve_vertex *v) +{ + const struct btf_type *t = v->t; + u32 next_type_id = t->type; + int err; + + err = btf_func_check(env, t); + if (err) + return err; + + env_stack_pop_resolved(env, next_type_id, 0); + return 0; +} + static struct btf_kind_operations func_ops = { .check_meta = btf_func_check_meta, - .resolve = btf_df_resolve, + .resolve = btf_func_resolve, .check_member = btf_df_check_member, .check_kflag_member = btf_df_check_kflag_member, .log_details = btf_ref_type_log, @@ -4156,7 +4257,7 @@ static bool btf_resolve_valid(struct btf_verifier_env *env, return !btf_resolved_type_id(btf, type_id) && !btf_resolved_type_size(btf, type_id); - if (btf_type_is_decl_tag(t)) + if (btf_type_is_decl_tag(t) || btf_type_is_func(t)) return btf_resolved_type_id(btf, type_id) && !btf_resolved_type_size(btf, type_id); @@ -4246,12 +4347,6 @@ static int btf_check_all_types(struct btf_verifier_env *env) if (err) return err; } - - if (btf_type_is_func(t)) { - err = btf_func_check(env, t); - if (err) - return err; - } } return 0; @@ -4387,8 +4482,7 @@ static int btf_parse_hdr(struct btf_verifier_env *env) btf = env->btf; btf_data_size = btf->data_size; - if (btf_data_size < - offsetof(struct btf_header, hdr_len) + sizeof(hdr->hdr_len)) { + if (btf_data_size < offsetofend(struct btf_header, hdr_len)) { btf_verifier_log(env, "hdr_len not found"); return -EINVAL; } @@ -4848,6 +4942,7 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type, const char *tname = prog->aux->attach_func_name; struct bpf_verifier_log *log = info->log; const struct btf_param *args; + const char *tag_value; u32 nr_args, arg; int i, ret; @@ -5000,6 +5095,15 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type, info->btf = btf; info->btf_id = t->type; t = btf_type_by_id(btf, t->type); + + if (btf_type_is_type_tag(t)) { + tag_value = __btf_name_by_offset(btf, t->name_off); + if (strcmp(tag_value, "user") == 0) + info->reg_type |= MEM_USER; + if (strcmp(tag_value, "percpu") == 0) + info->reg_type |= MEM_PERCPU; + } + /* skip modifiers */ while (btf_type_is_modifier(t)) { info->btf_id = t->type; @@ -5026,12 +5130,12 @@ enum bpf_struct_walk_result { static int btf_struct_walk(struct bpf_verifier_log *log, const struct btf *btf, const struct btf_type *t, int off, int size, - u32 *next_btf_id) + u32 *next_btf_id, enum bpf_type_flag *flag) { u32 i, moff, mtrue_end, msize = 0, total_nelems = 0; const struct btf_type *mtype, *elem_type = NULL; const struct btf_member *member; - const char *tname, *mname; + const char *tname, *mname, *tag_value; u32 vlen, elem_id, mid; again: @@ -5215,7 +5319,8 @@ error: } if (btf_type_is_ptr(mtype)) { - const struct btf_type *stype; + const struct btf_type *stype, *t; + enum bpf_type_flag tmp_flag = 0; u32 id; if (msize != size || off != moff) { @@ -5224,9 +5329,23 @@ error: mname, moff, tname, off, size); return -EACCES; } + + /* check type tag */ + t = btf_type_by_id(btf, mtype->type); + if (btf_type_is_type_tag(t)) { + tag_value = __btf_name_by_offset(btf, t->name_off); + /* check __user tag */ + if (strcmp(tag_value, "user") == 0) + tmp_flag = MEM_USER; + /* check __percpu tag */ + if (strcmp(tag_value, "percpu") == 0) + tmp_flag = MEM_PERCPU; + } + stype = btf_type_skip_modifiers(btf, mtype->type, &id); if (btf_type_is_struct(stype)) { *next_btf_id = id; + *flag = tmp_flag; return WALK_PTR; } } @@ -5253,13 +5372,14 @@ error: int btf_struct_access(struct bpf_verifier_log *log, const struct btf *btf, const struct btf_type *t, int off, int size, enum bpf_access_type atype __maybe_unused, - u32 *next_btf_id) + u32 *next_btf_id, enum bpf_type_flag *flag) { + enum bpf_type_flag tmp_flag = 0; int err; u32 id; do { - err = btf_struct_walk(log, btf, t, off, size, &id); + err = btf_struct_walk(log, btf, t, off, size, &id, &tmp_flag); switch (err) { case WALK_PTR: @@ -5267,6 +5387,7 @@ int btf_struct_access(struct bpf_verifier_log *log, const struct btf *btf, * we're done. */ *next_btf_id = id; + *flag = tmp_flag; return PTR_TO_BTF_ID; case WALK_SCALAR: return SCALAR_VALUE; @@ -5311,6 +5432,7 @@ bool btf_struct_ids_match(struct bpf_verifier_log *log, const struct btf *need_btf, u32 need_type_id) { const struct btf_type *type; + enum bpf_type_flag flag; int err; /* Are we already done? */ @@ -5321,7 +5443,7 @@ again: type = btf_type_by_id(btf, id); if (!type) return false; - err = btf_struct_walk(log, btf, type, off, 1, &id); + err = btf_struct_walk(log, btf, type, off, 1, &id, &flag); if (err != WALK_STRUCT) return false; @@ -5616,17 +5738,45 @@ static bool __btf_type_is_scalar_struct(struct bpf_verifier_log *log, return true; } +static bool is_kfunc_arg_mem_size(const struct btf *btf, + const struct btf_param *arg, + const struct bpf_reg_state *reg) +{ + int len, sfx_len = sizeof("__sz") - 1; + const struct btf_type *t; + const char *param_name; + + t = btf_type_skip_modifiers(btf, arg->type, NULL); + if (!btf_type_is_scalar(t) || reg->type != SCALAR_VALUE) + return false; + + /* In the future, this can be ported to use BTF tagging */ + param_name = btf_name_by_offset(btf, arg->name_off); + if (str_is_empty(param_name)) + return false; + len = strlen(param_name); + if (len < sfx_len) + return false; + param_name += len - sfx_len; + if (strncmp(param_name, "__sz", sfx_len)) + return false; + + return true; +} + static int btf_check_func_arg_match(struct bpf_verifier_env *env, const struct btf *btf, u32 func_id, struct bpf_reg_state *regs, bool ptr_to_mem_ok) { struct bpf_verifier_log *log = &env->log; + u32 i, nargs, ref_id, ref_obj_id = 0; bool is_kfunc = btf_is_kernel(btf); const char *func_name, *ref_tname; const struct btf_type *t, *ref_t; const struct btf_param *args; - u32 i, nargs, ref_id; + int ref_regno = 0, ret; + bool rel = false; t = btf_type_by_id(btf, func_id); if (!t || !btf_type_is_func(t)) { @@ -5652,6 +5802,10 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env, return -EINVAL; } + /* Only kfunc can be release func */ + if (is_kfunc) + rel = btf_kfunc_id_set_contains(btf, resolve_prog_type(env->prog), + BTF_KFUNC_TYPE_RELEASE, func_id); /* check that BTF function arguments match actual types that the * verifier sees. */ @@ -5675,6 +5829,11 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env, ref_t = btf_type_skip_modifiers(btf, t->type, &ref_id); ref_tname = btf_name_by_offset(btf, ref_t->name_off); + + ret = check_func_arg_reg_off(env, reg, regno, ARG_DONTCARE, rel); + if (ret < 0) + return ret; + if (btf_get_prog_ctx_type(log, btf, t, env->prog->type, i)) { /* If function expects ctx type in BTF check that caller @@ -5686,9 +5845,8 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env, i, btf_type_str(t)); return -EINVAL; } - if (check_ctx_reg(env, reg, regno)) - return -EINVAL; - } else if (is_kfunc && (reg->type == PTR_TO_BTF_ID || reg2btf_ids[reg->type])) { + } else if (is_kfunc && (reg->type == PTR_TO_BTF_ID || + (reg2btf_ids[base_type(reg->type)] && !type_flag(reg->type)))) { const struct btf_type *reg_ref_t; const struct btf *reg_btf; const char *reg_ref_tname; @@ -5704,9 +5862,23 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env, if (reg->type == PTR_TO_BTF_ID) { reg_btf = reg->btf; reg_ref_id = reg->btf_id; + /* Ensure only one argument is referenced + * PTR_TO_BTF_ID, check_func_arg_reg_off relies + * on only one referenced register being allowed + * for kfuncs. + */ + if (reg->ref_obj_id) { + if (ref_obj_id) { + bpf_log(log, "verifier internal error: more than one arg with ref_obj_id R%d %u %u\n", + regno, reg->ref_obj_id, ref_obj_id); + return -EFAULT; + } + ref_regno = regno; + ref_obj_id = reg->ref_obj_id; + } } else { reg_btf = btf_vmlinux; - reg_ref_id = *reg2btf_ids[reg->type]; + reg_ref_id = *reg2btf_ids[base_type(reg->type)]; } reg_ref_t = btf_type_skip_modifiers(reg_btf, reg_ref_id, @@ -5727,17 +5899,33 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env, u32 type_size; if (is_kfunc) { + bool arg_mem_size = i + 1 < nargs && is_kfunc_arg_mem_size(btf, &args[i + 1], ®s[regno + 1]); + /* Permit pointer to mem, but only when argument * type is pointer to scalar, or struct composed * (recursively) of scalars. + * When arg_mem_size is true, the pointer can be + * void *. */ if (!btf_type_is_scalar(ref_t) && - !__btf_type_is_scalar_struct(log, btf, ref_t, 0)) { + !__btf_type_is_scalar_struct(log, btf, ref_t, 0) && + (arg_mem_size ? !btf_type_is_void(ref_t) : 1)) { bpf_log(log, - "arg#%d pointer type %s %s must point to scalar or struct with scalar\n", - i, btf_type_str(ref_t), ref_tname); + "arg#%d pointer type %s %s must point to %sscalar, or struct with scalar\n", + i, btf_type_str(ref_t), ref_tname, arg_mem_size ? "void, " : ""); return -EINVAL; } + + /* Check for mem, len pair */ + if (arg_mem_size) { + if (check_kfunc_mem_size_reg(env, ®s[regno + 1], regno + 1)) { + bpf_log(log, "arg#%d arg#%d memory, len pair leads to invalid memory access\n", + i, i + 1); + return -EINVAL; + } + i++; + continue; + } } resolve_ret = btf_resolve_size(btf, ref_t, &type_size); @@ -5758,7 +5946,20 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env, } } - return 0; + /* Either both are set, or neither */ + WARN_ON_ONCE((ref_obj_id && !ref_regno) || (!ref_obj_id && ref_regno)); + /* We already made sure ref_obj_id is set only for one argument. We do + * allow (!rel && ref_obj_id), so that passing such referenced + * PTR_TO_BTF_ID to other kfuncs works. Note that rel is only true when + * is_kfunc is true. + */ + if (rel && !ref_obj_id) { + bpf_log(log, "release kernel function %s expects refcounted PTR_TO_BTF_ID\n", + func_name); + return -EINVAL; + } + /* returns argument register number > 0 in case of reference release kfunc */ + return rel ? ref_regno : 0; } /* Compare BTF of a function with given bpf_reg_state. @@ -6004,7 +6205,7 @@ int btf_type_snprintf_show(const struct btf *btf, u32 type_id, void *obj, btf_type_show(btf, type_id, obj, (struct btf_show *)&ssnprintf); - /* If we encontered an error, return it. */ + /* If we encountered an error, return it. */ if (ssnprintf.show.state.status) return ssnprintf.show.state.status; @@ -6200,12 +6401,17 @@ bool btf_id_set_contains(const struct btf_id_set *set, u32 id) return bsearch(&id, set->ids, set->cnt, sizeof(u32), btf_id_cmp_func) != NULL; } +enum { + BTF_MODULE_F_LIVE = (1 << 0), +}; + #ifdef CONFIG_DEBUG_INFO_BTF_MODULES struct btf_module { struct list_head list; struct module *module; struct btf *btf; struct bin_attribute *sysfs_attr; + int flags; }; static LIST_HEAD(btf_modules); @@ -6233,7 +6439,8 @@ static int btf_module_notify(struct notifier_block *nb, unsigned long op, int err = 0; if (mod->btf_data_size == 0 || - (op != MODULE_STATE_COMING && op != MODULE_STATE_GOING)) + (op != MODULE_STATE_COMING && op != MODULE_STATE_LIVE && + op != MODULE_STATE_GOING)) goto out; switch (op) { @@ -6248,7 +6455,8 @@ static int btf_module_notify(struct notifier_block *nb, unsigned long op, pr_warn("failed to validate module [%s] BTF: %ld\n", mod->name, PTR_ERR(btf)); kfree(btf_mod); - err = PTR_ERR(btf); + if (!IS_ENABLED(CONFIG_MODULE_ALLOW_BTF_MISMATCH)) + err = PTR_ERR(btf); goto out; } err = btf_alloc_id(btf); @@ -6292,6 +6500,17 @@ static int btf_module_notify(struct notifier_block *nb, unsigned long op, } break; + case MODULE_STATE_LIVE: + mutex_lock(&btf_module_mutex); + list_for_each_entry_safe(btf_mod, tmp, &btf_modules, list) { + if (btf_mod->module != module) + continue; + + btf_mod->flags |= BTF_MODULE_F_LIVE; + break; + } + mutex_unlock(&btf_module_mutex); + break; case MODULE_STATE_GOING: mutex_lock(&btf_module_mutex); list_for_each_entry_safe(btf_mod, tmp, &btf_modules, list) { @@ -6338,7 +6557,12 @@ struct module *btf_try_get_module(const struct btf *btf) if (btf_mod->btf != btf) continue; - if (try_module_get(btf_mod->module)) + /* We must only consider module whose __init routine has + * finished, hence we must check for BTF_MODULE_F_LIVE flag, + * which is set from the notifier callback for + * MODULE_STATE_LIVE. + */ + if ((btf_mod->flags & BTF_MODULE_F_LIVE) && try_module_get(btf_mod->module)) res = btf_mod->module; break; @@ -6349,9 +6573,43 @@ struct module *btf_try_get_module(const struct btf *btf) return res; } +/* Returns struct btf corresponding to the struct module. + * This function can return NULL or ERR_PTR. + */ +static struct btf *btf_get_module_btf(const struct module *module) +{ +#ifdef CONFIG_DEBUG_INFO_BTF_MODULES + struct btf_module *btf_mod, *tmp; +#endif + struct btf *btf = NULL; + + if (!module) { + btf = bpf_get_btf_vmlinux(); + if (!IS_ERR_OR_NULL(btf)) + btf_get(btf); + return btf; + } + +#ifdef CONFIG_DEBUG_INFO_BTF_MODULES + mutex_lock(&btf_module_mutex); + list_for_each_entry_safe(btf_mod, tmp, &btf_modules, list) { + if (btf_mod->module != module) + continue; + + btf_get(btf_mod->btf); + btf = btf_mod->btf; + break; + } + mutex_unlock(&btf_module_mutex); +#endif + + return btf; +} + BPF_CALL_4(bpf_btf_find_by_name_kind, char *, name, int, name_sz, u32, kind, int, flags) { - struct btf *btf; + struct btf *btf = NULL; + int btf_obj_fd = 0; long ret; if (flags) @@ -6360,44 +6618,17 @@ BPF_CALL_4(bpf_btf_find_by_name_kind, char *, name, int, name_sz, u32, kind, int if (name_sz <= 1 || name[name_sz - 1]) return -EINVAL; - btf = bpf_get_btf_vmlinux(); - if (IS_ERR(btf)) - return PTR_ERR(btf); - - ret = btf_find_by_name_kind(btf, name, kind); - /* ret is never zero, since btf_find_by_name_kind returns - * positive btf_id or negative error. - */ - if (ret < 0) { - struct btf *mod_btf; - int id; - - /* If name is not found in vmlinux's BTF then search in module's BTFs */ - spin_lock_bh(&btf_idr_lock); - idr_for_each_entry(&btf_idr, mod_btf, id) { - if (!btf_is_module(mod_btf)) - continue; - /* linear search could be slow hence unlock/lock - * the IDR to avoiding holding it for too long - */ - btf_get(mod_btf); - spin_unlock_bh(&btf_idr_lock); - ret = btf_find_by_name_kind(mod_btf, name, kind); - if (ret > 0) { - int btf_obj_fd; - - btf_obj_fd = __btf_new_fd(mod_btf); - if (btf_obj_fd < 0) { - btf_put(mod_btf); - return btf_obj_fd; - } - return ret | (((u64)btf_obj_fd) << 32); - } - spin_lock_bh(&btf_idr_lock); - btf_put(mod_btf); + ret = bpf_find_btf_id(name, kind, &btf); + if (ret > 0 && btf_is_module(btf)) { + btf_obj_fd = __btf_new_fd(btf); + if (btf_obj_fd < 0) { + btf_put(btf); + return btf_obj_fd; } - spin_unlock_bh(&btf_idr_lock); + return ret | (((u64)btf_obj_fd) << 32); } + if (ret > 0) + btf_put(btf); return ret; } @@ -6416,58 +6647,298 @@ BTF_ID_LIST_GLOBAL(btf_tracing_ids, MAX_BTF_TRACING_TYPE) BTF_TRACING_TYPE_xxx #undef BTF_TRACING_TYPE -/* BTF ID set registration API for modules */ +/* Kernel Function (kfunc) BTF ID set registration API */ -#ifdef CONFIG_DEBUG_INFO_BTF_MODULES +static int __btf_populate_kfunc_set(struct btf *btf, enum btf_kfunc_hook hook, + enum btf_kfunc_type type, + struct btf_id_set *add_set, bool vmlinux_set) +{ + struct btf_kfunc_set_tab *tab; + struct btf_id_set *set; + u32 set_cnt; + int ret; -void register_kfunc_btf_id_set(struct kfunc_btf_id_list *l, - struct kfunc_btf_id_set *s) + if (hook >= BTF_KFUNC_HOOK_MAX || type >= BTF_KFUNC_TYPE_MAX) { + ret = -EINVAL; + goto end; + } + + if (!add_set->cnt) + return 0; + + tab = btf->kfunc_set_tab; + if (!tab) { + tab = kzalloc(sizeof(*tab), GFP_KERNEL | __GFP_NOWARN); + if (!tab) + return -ENOMEM; + btf->kfunc_set_tab = tab; + } + + set = tab->sets[hook][type]; + /* Warn when register_btf_kfunc_id_set is called twice for the same hook + * for module sets. + */ + if (WARN_ON_ONCE(set && !vmlinux_set)) { + ret = -EINVAL; + goto end; + } + + /* We don't need to allocate, concatenate, and sort module sets, because + * only one is allowed per hook. Hence, we can directly assign the + * pointer and return. + */ + if (!vmlinux_set) { + tab->sets[hook][type] = add_set; + return 0; + } + + /* In case of vmlinux sets, there may be more than one set being + * registered per hook. To create a unified set, we allocate a new set + * and concatenate all individual sets being registered. While each set + * is individually sorted, they may become unsorted when concatenated, + * hence re-sorting the final set again is required to make binary + * searching the set using btf_id_set_contains function work. + */ + set_cnt = set ? set->cnt : 0; + + if (set_cnt > U32_MAX - add_set->cnt) { + ret = -EOVERFLOW; + goto end; + } + + if (set_cnt + add_set->cnt > BTF_KFUNC_SET_MAX_CNT) { + ret = -E2BIG; + goto end; + } + + /* Grow set */ + set = krealloc(tab->sets[hook][type], + offsetof(struct btf_id_set, ids[set_cnt + add_set->cnt]), + GFP_KERNEL | __GFP_NOWARN); + if (!set) { + ret = -ENOMEM; + goto end; + } + + /* For newly allocated set, initialize set->cnt to 0 */ + if (!tab->sets[hook][type]) + set->cnt = 0; + tab->sets[hook][type] = set; + + /* Concatenate the two sets */ + memcpy(set->ids + set->cnt, add_set->ids, add_set->cnt * sizeof(set->ids[0])); + set->cnt += add_set->cnt; + + sort(set->ids, set->cnt, sizeof(set->ids[0]), btf_id_cmp_func, NULL); + + return 0; +end: + btf_free_kfunc_set_tab(btf); + return ret; +} + +static int btf_populate_kfunc_set(struct btf *btf, enum btf_kfunc_hook hook, + const struct btf_kfunc_id_set *kset) { - mutex_lock(&l->mutex); - list_add(&s->list, &l->list); - mutex_unlock(&l->mutex); + bool vmlinux_set = !btf_is_module(btf); + int type, ret = 0; + + for (type = 0; type < ARRAY_SIZE(kset->sets); type++) { + if (!kset->sets[type]) + continue; + + ret = __btf_populate_kfunc_set(btf, hook, type, kset->sets[type], vmlinux_set); + if (ret) + break; + } + return ret; } -EXPORT_SYMBOL_GPL(register_kfunc_btf_id_set); -void unregister_kfunc_btf_id_set(struct kfunc_btf_id_list *l, - struct kfunc_btf_id_set *s) +static bool __btf_kfunc_id_set_contains(const struct btf *btf, + enum btf_kfunc_hook hook, + enum btf_kfunc_type type, + u32 kfunc_btf_id) { - mutex_lock(&l->mutex); - list_del_init(&s->list); - mutex_unlock(&l->mutex); + struct btf_id_set *set; + + if (hook >= BTF_KFUNC_HOOK_MAX || type >= BTF_KFUNC_TYPE_MAX) + return false; + if (!btf->kfunc_set_tab) + return false; + set = btf->kfunc_set_tab->sets[hook][type]; + if (!set) + return false; + return btf_id_set_contains(set, kfunc_btf_id); } -EXPORT_SYMBOL_GPL(unregister_kfunc_btf_id_set); -bool bpf_check_mod_kfunc_call(struct kfunc_btf_id_list *klist, u32 kfunc_id, - struct module *owner) +static int bpf_prog_type_to_kfunc_hook(enum bpf_prog_type prog_type) { - struct kfunc_btf_id_set *s; + switch (prog_type) { + case BPF_PROG_TYPE_XDP: + return BTF_KFUNC_HOOK_XDP; + case BPF_PROG_TYPE_SCHED_CLS: + return BTF_KFUNC_HOOK_TC; + case BPF_PROG_TYPE_STRUCT_OPS: + return BTF_KFUNC_HOOK_STRUCT_OPS; + default: + return BTF_KFUNC_HOOK_MAX; + } +} - mutex_lock(&klist->mutex); - list_for_each_entry(s, &klist->list, list) { - if (s->owner == owner && btf_id_set_contains(s->set, kfunc_id)) { - mutex_unlock(&klist->mutex); - return true; +/* Caution: + * Reference to the module (obtained using btf_try_get_module) corresponding to + * the struct btf *MUST* be held when calling this function from verifier + * context. This is usually true as we stash references in prog's kfunc_btf_tab; + * keeping the reference for the duration of the call provides the necessary + * protection for looking up a well-formed btf->kfunc_set_tab. + */ +bool btf_kfunc_id_set_contains(const struct btf *btf, + enum bpf_prog_type prog_type, + enum btf_kfunc_type type, u32 kfunc_btf_id) +{ + enum btf_kfunc_hook hook; + + hook = bpf_prog_type_to_kfunc_hook(prog_type); + return __btf_kfunc_id_set_contains(btf, hook, type, kfunc_btf_id); +} + +/* This function must be invoked only from initcalls/module init functions */ +int register_btf_kfunc_id_set(enum bpf_prog_type prog_type, + const struct btf_kfunc_id_set *kset) +{ + enum btf_kfunc_hook hook; + struct btf *btf; + int ret; + + btf = btf_get_module_btf(kset->owner); + if (!btf) { + if (!kset->owner && IS_ENABLED(CONFIG_DEBUG_INFO_BTF)) { + pr_err("missing vmlinux BTF, cannot register kfuncs\n"); + return -ENOENT; + } + if (kset->owner && IS_ENABLED(CONFIG_DEBUG_INFO_BTF_MODULES)) { + pr_err("missing module BTF, cannot register kfuncs\n"); + return -ENOENT; } + return 0; } - mutex_unlock(&klist->mutex); - return false; + if (IS_ERR(btf)) + return PTR_ERR(btf); + + hook = bpf_prog_type_to_kfunc_hook(prog_type); + ret = btf_populate_kfunc_set(btf, hook, kset); + btf_put(btf); + return ret; } +EXPORT_SYMBOL_GPL(register_btf_kfunc_id_set); + +#define MAX_TYPES_ARE_COMPAT_DEPTH 2 -#define DEFINE_KFUNC_BTF_ID_LIST(name) \ - struct kfunc_btf_id_list name = { LIST_HEAD_INIT(name.list), \ - __MUTEX_INITIALIZER(name.mutex) }; \ - EXPORT_SYMBOL_GPL(name) +static +int __bpf_core_types_are_compat(const struct btf *local_btf, __u32 local_id, + const struct btf *targ_btf, __u32 targ_id, + int level) +{ + const struct btf_type *local_type, *targ_type; + int depth = 32; /* max recursion depth */ -DEFINE_KFUNC_BTF_ID_LIST(bpf_tcp_ca_kfunc_list); -DEFINE_KFUNC_BTF_ID_LIST(prog_test_kfunc_list); + /* caller made sure that names match (ignoring flavor suffix) */ + local_type = btf_type_by_id(local_btf, local_id); + targ_type = btf_type_by_id(targ_btf, targ_id); + if (btf_kind(local_type) != btf_kind(targ_type)) + return 0; -#endif +recur: + depth--; + if (depth < 0) + return -EINVAL; + + local_type = btf_type_skip_modifiers(local_btf, local_id, &local_id); + targ_type = btf_type_skip_modifiers(targ_btf, targ_id, &targ_id); + if (!local_type || !targ_type) + return -EINVAL; + + if (btf_kind(local_type) != btf_kind(targ_type)) + return 0; + + switch (btf_kind(local_type)) { + case BTF_KIND_UNKN: + case BTF_KIND_STRUCT: + case BTF_KIND_UNION: + case BTF_KIND_ENUM: + case BTF_KIND_FWD: + return 1; + case BTF_KIND_INT: + /* just reject deprecated bitfield-like integers; all other + * integers are by default compatible between each other + */ + return btf_int_offset(local_type) == 0 && btf_int_offset(targ_type) == 0; + case BTF_KIND_PTR: + local_id = local_type->type; + targ_id = targ_type->type; + goto recur; + case BTF_KIND_ARRAY: + local_id = btf_array(local_type)->type; + targ_id = btf_array(targ_type)->type; + goto recur; + case BTF_KIND_FUNC_PROTO: { + struct btf_param *local_p = btf_params(local_type); + struct btf_param *targ_p = btf_params(targ_type); + __u16 local_vlen = btf_vlen(local_type); + __u16 targ_vlen = btf_vlen(targ_type); + int i, err; + + if (local_vlen != targ_vlen) + return 0; + + for (i = 0; i < local_vlen; i++, local_p++, targ_p++) { + if (level <= 0) + return -EINVAL; + btf_type_skip_modifiers(local_btf, local_p->type, &local_id); + btf_type_skip_modifiers(targ_btf, targ_p->type, &targ_id); + err = __bpf_core_types_are_compat(local_btf, local_id, + targ_btf, targ_id, + level - 1); + if (err <= 0) + return err; + } + + /* tail recurse for return type check */ + btf_type_skip_modifiers(local_btf, local_type->type, &local_id); + btf_type_skip_modifiers(targ_btf, targ_type->type, &targ_id); + goto recur; + } + default: + return 0; + } +} + +/* Check local and target types for compatibility. This check is used for + * type-based CO-RE relocations and follow slightly different rules than + * field-based relocations. This function assumes that root types were already + * checked for name match. Beyond that initial root-level name check, names + * are completely ignored. Compatibility rules are as follows: + * - any two STRUCTs/UNIONs/FWDs/ENUMs/INTs are considered compatible, but + * kind should match for local and target types (i.e., STRUCT is not + * compatible with UNION); + * - for ENUMs, the size is ignored; + * - for INT, size and signedness are ignored; + * - for ARRAY, dimensionality is ignored, element types are checked for + * compatibility recursively; + * - CONST/VOLATILE/RESTRICT modifiers are ignored; + * - TYPEDEFs/PTRs are compatible if types they pointing to are compatible; + * - FUNC_PROTOs are compatible if they have compatible signature: same + * number of input args and compatible return and argument types. + * These rules are not set in stone and probably will be adjusted as we get + * more experience with using BPF CO-RE relocations. + */ int bpf_core_types_are_compat(const struct btf *local_btf, __u32 local_id, const struct btf *targ_btf, __u32 targ_id) { - return -EOPNOTSUPP; + return __bpf_core_types_are_compat(local_btf, local_id, + targ_btf, targ_id, + MAX_TYPES_ARE_COMPAT_DEPTH); } static bool bpf_core_is_flavor_sep(const char *s) @@ -6710,6 +7181,8 @@ bpf_core_find_cands(struct bpf_core_ctx *ctx, u32 local_type_id) main_btf = bpf_get_btf_vmlinux(); if (IS_ERR(main_btf)) return ERR_CAST(main_btf); + if (!main_btf) + return ERR_PTR(-EINVAL); local_type = btf_type_by_id(local_btf, local_type_id); if (!local_type) @@ -6788,6 +7261,7 @@ int bpf_core_apply(struct bpf_core_ctx *ctx, const struct bpf_core_relo *relo, { bool need_cands = relo->kind != BPF_CORE_TYPE_ID_LOCAL; struct bpf_core_cand_list cands = {}; + struct bpf_core_relo_res targ_res; struct bpf_core_spec *specs; int err; @@ -6827,13 +7301,19 @@ int bpf_core_apply(struct bpf_core_ctx *ctx, const struct bpf_core_relo *relo, cands.len = cc->cnt; /* cand_cache_mutex needs to span the cache lookup and * copy of btf pointer into bpf_core_cand_list, - * since module can be unloaded while bpf_core_apply_relo_insn + * since module can be unloaded while bpf_core_calc_relo_insn * is working with module's btf. */ } - err = bpf_core_apply_relo_insn((void *)ctx->log, insn, relo->insn_off / 8, - relo, relo_idx, ctx->btf, &cands, specs); + err = bpf_core_calc_relo_insn((void *)ctx->log, relo, relo_idx, ctx->btf, &cands, specs, + &targ_res); + if (err) + goto out; + + err = bpf_core_patch_insn((void *)ctx->log, insn, relo->insn_off / 8, relo, relo_idx, + &targ_res); + out: kfree(specs); if (need_cands) { diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index 514b4681a90a..128028efda64 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -1031,7 +1031,7 @@ int cgroup_bpf_prog_query(const union bpf_attr *attr, * __cgroup_bpf_run_filter_skb() - Run a program for packet filtering * @sk: The socket sending or receiving traffic * @skb: The skb that is being sent or received - * @type: The type of program to be exectuted + * @type: The type of program to be executed * * If no socket is passed, or the socket is not of type INET or INET6, * this function does nothing and returns 0. @@ -1044,7 +1044,7 @@ int cgroup_bpf_prog_query(const union bpf_attr *attr, * NET_XMIT_DROP (1) - drop packet and notify TCP to call cwr * NET_XMIT_CN (2) - continue with packet output and notify TCP * to call cwr - * -EPERM - drop packet + * -err - drop packet * * For ingress packets, this function will return -EPERM if any * attached program was found and if it returned != 1 during execution. @@ -1079,8 +1079,9 @@ int __cgroup_bpf_run_filter_skb(struct sock *sk, cgrp->bpf.effective[atype], skb, __bpf_prog_run_save_cb); } else { ret = BPF_PROG_RUN_ARRAY_CG(cgrp->bpf.effective[atype], skb, - __bpf_prog_run_save_cb); - ret = (ret == 1 ? 0 : -EPERM); + __bpf_prog_run_save_cb, 0); + if (ret && !IS_ERR_VALUE((long)ret)) + ret = -EFAULT; } bpf_restore_data_end(skb, saved_data_end); __skb_pull(skb, offset); @@ -1093,7 +1094,7 @@ EXPORT_SYMBOL(__cgroup_bpf_run_filter_skb); /** * __cgroup_bpf_run_filter_sk() - Run a program on a sock * @sk: sock structure to manipulate - * @type: The type of program to be exectuted + * @type: The type of program to be executed * * socket is passed is expected to be of type INET or INET6. * @@ -1107,10 +1108,9 @@ int __cgroup_bpf_run_filter_sk(struct sock *sk, enum cgroup_bpf_attach_type atype) { struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data); - int ret; - ret = BPF_PROG_RUN_ARRAY_CG(cgrp->bpf.effective[atype], sk, bpf_prog_run); - return ret == 1 ? 0 : -EPERM; + return BPF_PROG_RUN_ARRAY_CG(cgrp->bpf.effective[atype], sk, + bpf_prog_run, 0); } EXPORT_SYMBOL(__cgroup_bpf_run_filter_sk); @@ -1119,7 +1119,7 @@ EXPORT_SYMBOL(__cgroup_bpf_run_filter_sk); * provided by user sockaddr * @sk: sock struct that will use sockaddr * @uaddr: sockaddr struct provided by user - * @type: The type of program to be exectuted + * @type: The type of program to be executed * @t_ctx: Pointer to attach type specific context * @flags: Pointer to u32 which contains higher bits of BPF program * return value (OR'ed together). @@ -1142,7 +1142,6 @@ int __cgroup_bpf_run_filter_sock_addr(struct sock *sk, }; struct sockaddr_storage unspec; struct cgroup *cgrp; - int ret; /* Check socket family since not all sockets represent network * endpoint (e.g. AF_UNIX). @@ -1156,10 +1155,8 @@ int __cgroup_bpf_run_filter_sock_addr(struct sock *sk, } cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data); - ret = BPF_PROG_RUN_ARRAY_CG_FLAGS(cgrp->bpf.effective[atype], &ctx, - bpf_prog_run, flags); - - return ret == 1 ? 0 : -EPERM; + return BPF_PROG_RUN_ARRAY_CG_FLAGS(cgrp->bpf.effective[atype], &ctx, + bpf_prog_run, 0, flags); } EXPORT_SYMBOL(__cgroup_bpf_run_filter_sock_addr); @@ -1169,7 +1166,7 @@ EXPORT_SYMBOL(__cgroup_bpf_run_filter_sock_addr); * @sock_ops: bpf_sock_ops_kern struct to pass to program. Contains * sk with connection information (IP addresses, etc.) May not contain * cgroup info if it is a req sock. - * @type: The type of program to be exectuted + * @type: The type of program to be executed * * socket passed is expected to be of type INET or INET6. * @@ -1184,11 +1181,9 @@ int __cgroup_bpf_run_filter_sock_ops(struct sock *sk, enum cgroup_bpf_attach_type atype) { struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data); - int ret; - ret = BPF_PROG_RUN_ARRAY_CG(cgrp->bpf.effective[atype], sock_ops, - bpf_prog_run); - return ret == 1 ? 0 : -EPERM; + return BPF_PROG_RUN_ARRAY_CG(cgrp->bpf.effective[atype], sock_ops, + bpf_prog_run, 0); } EXPORT_SYMBOL(__cgroup_bpf_run_filter_sock_ops); @@ -1201,17 +1196,47 @@ int __cgroup_bpf_check_dev_permission(short dev_type, u32 major, u32 minor, .major = major, .minor = minor, }; - int allow; + int ret; rcu_read_lock(); cgrp = task_dfl_cgroup(current); - allow = BPF_PROG_RUN_ARRAY_CG(cgrp->bpf.effective[atype], &ctx, - bpf_prog_run); + ret = BPF_PROG_RUN_ARRAY_CG(cgrp->bpf.effective[atype], &ctx, + bpf_prog_run, 0); rcu_read_unlock(); - return !allow; + return ret; +} + +BPF_CALL_0(bpf_get_retval) +{ + struct bpf_cg_run_ctx *ctx = + container_of(current->bpf_ctx, struct bpf_cg_run_ctx, run_ctx); + + return ctx->retval; +} + +static const struct bpf_func_proto bpf_get_retval_proto = { + .func = bpf_get_retval, + .gpl_only = false, + .ret_type = RET_INTEGER, +}; + +BPF_CALL_1(bpf_set_retval, int, retval) +{ + struct bpf_cg_run_ctx *ctx = + container_of(current->bpf_ctx, struct bpf_cg_run_ctx, run_ctx); + + ctx->retval = retval; + return 0; } +static const struct bpf_func_proto bpf_set_retval_proto = { + .func = bpf_set_retval, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_ANYTHING, +}; + static const struct bpf_func_proto * cgroup_base_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { @@ -1224,6 +1249,10 @@ cgroup_base_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_get_current_cgroup_id_proto; case BPF_FUNC_perf_event_output: return &bpf_event_output_data_proto; + case BPF_FUNC_get_retval: + return &bpf_get_retval_proto; + case BPF_FUNC_set_retval: + return &bpf_set_retval_proto; default: return bpf_base_func_proto(func_id); } @@ -1337,7 +1366,8 @@ int __cgroup_bpf_run_filter_sysctl(struct ctl_table_header *head, rcu_read_lock(); cgrp = task_dfl_cgroup(current); - ret = BPF_PROG_RUN_ARRAY_CG(cgrp->bpf.effective[atype], &ctx, bpf_prog_run); + ret = BPF_PROG_RUN_ARRAY_CG(cgrp->bpf.effective[atype], &ctx, + bpf_prog_run, 0); rcu_read_unlock(); kfree(ctx.cur_val); @@ -1350,24 +1380,10 @@ int __cgroup_bpf_run_filter_sysctl(struct ctl_table_header *head, kfree(ctx.new_val); } - return ret == 1 ? 0 : -EPERM; + return ret; } #ifdef CONFIG_NET -static bool __cgroup_bpf_prog_array_is_empty(struct cgroup *cgrp, - enum cgroup_bpf_attach_type attach_type) -{ - struct bpf_prog_array *prog_array; - bool empty; - - rcu_read_lock(); - prog_array = rcu_dereference(cgrp->bpf.effective[attach_type]); - empty = bpf_prog_array_is_empty(prog_array); - rcu_read_unlock(); - - return empty; -} - static int sockopt_alloc_buf(struct bpf_sockopt_kern *ctx, int max_optlen, struct bpf_sockopt_buf *buf) { @@ -1426,19 +1442,11 @@ int __cgroup_bpf_run_filter_setsockopt(struct sock *sk, int *level, }; int ret, max_optlen; - /* Opportunistic check to see whether we have any BPF program - * attached to the hook so we don't waste time allocating - * memory and locking the socket. - */ - if (__cgroup_bpf_prog_array_is_empty(cgrp, CGROUP_SETSOCKOPT)) - return 0; - /* Allocate a bit more than the initial user buffer for * BPF program. The canonical use case is overriding * TCP_CONGESTION(nv) to TCP_CONGESTION(cubic). */ max_optlen = max_t(int, 16, *optlen); - max_optlen = sockopt_alloc_buf(&ctx, max_optlen, &buf); if (max_optlen < 0) return max_optlen; @@ -1452,13 +1460,11 @@ int __cgroup_bpf_run_filter_setsockopt(struct sock *sk, int *level, lock_sock(sk); ret = BPF_PROG_RUN_ARRAY_CG(cgrp->bpf.effective[CGROUP_SETSOCKOPT], - &ctx, bpf_prog_run); + &ctx, bpf_prog_run, 0); release_sock(sk); - if (!ret) { - ret = -EPERM; + if (ret) goto out; - } if (ctx.optlen == -1) { /* optlen set to -1, bypass kernel */ @@ -1518,19 +1524,11 @@ int __cgroup_bpf_run_filter_getsockopt(struct sock *sk, int level, .sk = sk, .level = level, .optname = optname, - .retval = retval, + .current_task = current, }; int ret; - /* Opportunistic check to see whether we have any BPF program - * attached to the hook so we don't waste time allocating - * memory and locking the socket. - */ - if (__cgroup_bpf_prog_array_is_empty(cgrp, CGROUP_GETSOCKOPT)) - return retval; - ctx.optlen = max_optlen; - max_optlen = sockopt_alloc_buf(&ctx, max_optlen, &buf); if (max_optlen < 0) return max_optlen; @@ -1562,27 +1560,17 @@ int __cgroup_bpf_run_filter_getsockopt(struct sock *sk, int level, lock_sock(sk); ret = BPF_PROG_RUN_ARRAY_CG(cgrp->bpf.effective[CGROUP_GETSOCKOPT], - &ctx, bpf_prog_run); + &ctx, bpf_prog_run, retval); release_sock(sk); - if (!ret) { - ret = -EPERM; + if (ret < 0) goto out; - } if (ctx.optlen > max_optlen || ctx.optlen < 0) { ret = -EFAULT; goto out; } - /* BPF programs only allowed to set retval to 0, not some - * arbitrary value. - */ - if (ctx.retval != 0 && ctx.retval != retval) { - ret = -EFAULT; - goto out; - } - if (ctx.optlen != 0) { if (copy_to_user(optval, ctx.optval, ctx.optlen) || put_user(ctx.optlen, optlen)) { @@ -1591,8 +1579,6 @@ int __cgroup_bpf_run_filter_getsockopt(struct sock *sk, int level, } } - ret = ctx.retval; - out: sockopt_free_buf(&ctx, &buf); return ret; @@ -1607,10 +1593,10 @@ int __cgroup_bpf_run_filter_getsockopt_kern(struct sock *sk, int level, .sk = sk, .level = level, .optname = optname, - .retval = retval, .optlen = *optlen, .optval = optval, .optval_end = optval + *optlen, + .current_task = current, }; int ret; @@ -1623,25 +1609,19 @@ int __cgroup_bpf_run_filter_getsockopt_kern(struct sock *sk, int level, */ ret = BPF_PROG_RUN_ARRAY_CG(cgrp->bpf.effective[CGROUP_GETSOCKOPT], - &ctx, bpf_prog_run); - if (!ret) - return -EPERM; + &ctx, bpf_prog_run, retval); + if (ret < 0) + return ret; if (ctx.optlen > *optlen) return -EFAULT; - /* BPF programs only allowed to set retval to 0, not some - * arbitrary value. - */ - if (ctx.retval != 0 && ctx.retval != retval) - return -EFAULT; - /* BPF programs can shrink the buffer, export the modifications. */ if (ctx.optlen != 0) *optlen = ctx.optlen; - return ctx.retval; + return ret; } #endif @@ -2057,10 +2037,39 @@ static u32 cg_sockopt_convert_ctx_access(enum bpf_access_type type, *insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_LDX_MEM, optlen); break; case offsetof(struct bpf_sockopt, retval): - if (type == BPF_WRITE) - *insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_STX_MEM, retval); - else - *insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_LDX_MEM, retval); + BUILD_BUG_ON(offsetof(struct bpf_cg_run_ctx, run_ctx) != 0); + + if (type == BPF_WRITE) { + int treg = BPF_REG_9; + + if (si->src_reg == treg || si->dst_reg == treg) + --treg; + if (si->src_reg == treg || si->dst_reg == treg) + --treg; + *insn++ = BPF_STX_MEM(BPF_DW, si->dst_reg, treg, + offsetof(struct bpf_sockopt_kern, tmp_reg)); + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_sockopt_kern, current_task), + treg, si->dst_reg, + offsetof(struct bpf_sockopt_kern, current_task)); + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct task_struct, bpf_ctx), + treg, treg, + offsetof(struct task_struct, bpf_ctx)); + *insn++ = BPF_STX_MEM(BPF_FIELD_SIZEOF(struct bpf_cg_run_ctx, retval), + treg, si->src_reg, + offsetof(struct bpf_cg_run_ctx, retval)); + *insn++ = BPF_LDX_MEM(BPF_DW, treg, si->dst_reg, + offsetof(struct bpf_sockopt_kern, tmp_reg)); + } else { + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_sockopt_kern, current_task), + si->dst_reg, si->src_reg, + offsetof(struct bpf_sockopt_kern, current_task)); + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct task_struct, bpf_ctx), + si->dst_reg, si->dst_reg, + offsetof(struct task_struct, bpf_ctx)); + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_cg_run_ctx, retval), + si->dst_reg, si->dst_reg, + offsetof(struct bpf_cg_run_ctx, retval)); + } break; case offsetof(struct bpf_sockopt, optval): *insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_LDX_MEM, optval); diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index de3e5bc6781f..13e9dbeeedf3 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -33,6 +33,7 @@ #include <linux/extable.h> #include <linux/log2.h> #include <linux/bpf_verifier.h> +#include <linux/nodemask.h> #include <asm/barrier.h> #include <asm/unaligned.h> @@ -105,6 +106,7 @@ struct bpf_prog *bpf_prog_alloc_no_stats(unsigned int size, gfp_t gfp_extra_flag fp->aux = aux; fp->aux->prog = fp; fp->jit_requested = ebpf_jit_enabled(); + fp->blinding_requested = bpf_jit_blinding_enabled(fp); INIT_LIST_HEAD_RCU(&fp->aux->ksym.lnode); mutex_init(&fp->aux->used_maps_mutex); @@ -537,13 +539,10 @@ long bpf_jit_limit_max __read_mostly; static void bpf_prog_ksym_set_addr(struct bpf_prog *prog) { - const struct bpf_binary_header *hdr = bpf_jit_binary_hdr(prog); - unsigned long addr = (unsigned long)hdr; - WARN_ON_ONCE(!bpf_prog_ebpf_jited(prog)); prog->aux->ksym.start = (unsigned long) prog->bpf_func; - prog->aux->ksym.end = addr + hdr->pages * PAGE_SIZE; + prog->aux->ksym.end = prog->aux->ksym.start + prog->jited_len; } static void @@ -808,6 +807,176 @@ int bpf_jit_add_poke_descriptor(struct bpf_prog *prog, return slot; } +/* + * BPF program pack allocator. + * + * Most BPF programs are pretty small. Allocating a hole page for each + * program is sometime a waste. Many small bpf program also adds pressure + * to instruction TLB. To solve this issue, we introduce a BPF program pack + * allocator. The prog_pack allocator uses HPAGE_PMD_SIZE page (2MB on x86) + * to host BPF programs. + */ +#define BPF_PROG_CHUNK_SHIFT 6 +#define BPF_PROG_CHUNK_SIZE (1 << BPF_PROG_CHUNK_SHIFT) +#define BPF_PROG_CHUNK_MASK (~(BPF_PROG_CHUNK_SIZE - 1)) + +struct bpf_prog_pack { + struct list_head list; + void *ptr; + unsigned long bitmap[]; +}; + +#define BPF_PROG_SIZE_TO_NBITS(size) (round_up(size, BPF_PROG_CHUNK_SIZE) / BPF_PROG_CHUNK_SIZE) + +static size_t bpf_prog_pack_size = -1; +static size_t bpf_prog_pack_mask = -1; + +static int bpf_prog_chunk_count(void) +{ + WARN_ON_ONCE(bpf_prog_pack_size == -1); + return bpf_prog_pack_size / BPF_PROG_CHUNK_SIZE; +} + +static DEFINE_MUTEX(pack_mutex); +static LIST_HEAD(pack_list); + +/* PMD_SIZE is not available in some special config, e.g. ARCH=arm with + * CONFIG_MMU=n. Use PAGE_SIZE in these cases. + */ +#ifdef PMD_SIZE +#define BPF_HPAGE_SIZE PMD_SIZE +#define BPF_HPAGE_MASK PMD_MASK +#else +#define BPF_HPAGE_SIZE PAGE_SIZE +#define BPF_HPAGE_MASK PAGE_MASK +#endif + +static size_t select_bpf_prog_pack_size(void) +{ + size_t size; + void *ptr; + + size = BPF_HPAGE_SIZE * num_online_nodes(); + ptr = module_alloc(size); + + /* Test whether we can get huge pages. If not just use PAGE_SIZE + * packs. + */ + if (!ptr || !is_vm_area_hugepages(ptr)) { + size = PAGE_SIZE; + bpf_prog_pack_mask = PAGE_MASK; + } else { + bpf_prog_pack_mask = BPF_HPAGE_MASK; + } + + vfree(ptr); + return size; +} + +static struct bpf_prog_pack *alloc_new_pack(void) +{ + struct bpf_prog_pack *pack; + + pack = kzalloc(struct_size(pack, bitmap, BITS_TO_LONGS(bpf_prog_chunk_count())), + GFP_KERNEL); + if (!pack) + return NULL; + pack->ptr = module_alloc(bpf_prog_pack_size); + if (!pack->ptr) { + kfree(pack); + return NULL; + } + bitmap_zero(pack->bitmap, bpf_prog_pack_size / BPF_PROG_CHUNK_SIZE); + list_add_tail(&pack->list, &pack_list); + + set_vm_flush_reset_perms(pack->ptr); + set_memory_ro((unsigned long)pack->ptr, bpf_prog_pack_size / PAGE_SIZE); + set_memory_x((unsigned long)pack->ptr, bpf_prog_pack_size / PAGE_SIZE); + return pack; +} + +static void *bpf_prog_pack_alloc(u32 size) +{ + unsigned int nbits = BPF_PROG_SIZE_TO_NBITS(size); + struct bpf_prog_pack *pack; + unsigned long pos; + void *ptr = NULL; + + mutex_lock(&pack_mutex); + if (bpf_prog_pack_size == -1) + bpf_prog_pack_size = select_bpf_prog_pack_size(); + + if (size > bpf_prog_pack_size) { + size = round_up(size, PAGE_SIZE); + ptr = module_alloc(size); + if (ptr) { + set_vm_flush_reset_perms(ptr); + set_memory_ro((unsigned long)ptr, size / PAGE_SIZE); + set_memory_x((unsigned long)ptr, size / PAGE_SIZE); + } + goto out; + } + list_for_each_entry(pack, &pack_list, list) { + pos = bitmap_find_next_zero_area(pack->bitmap, bpf_prog_chunk_count(), 0, + nbits, 0); + if (pos < bpf_prog_chunk_count()) + goto found_free_area; + } + + pack = alloc_new_pack(); + if (!pack) + goto out; + + pos = 0; + +found_free_area: + bitmap_set(pack->bitmap, pos, nbits); + ptr = (void *)(pack->ptr) + (pos << BPF_PROG_CHUNK_SHIFT); + +out: + mutex_unlock(&pack_mutex); + return ptr; +} + +static void bpf_prog_pack_free(struct bpf_binary_header *hdr) +{ + struct bpf_prog_pack *pack = NULL, *tmp; + unsigned int nbits; + unsigned long pos; + void *pack_ptr; + + mutex_lock(&pack_mutex); + if (hdr->size > bpf_prog_pack_size) { + module_memfree(hdr); + goto out; + } + + pack_ptr = (void *)((unsigned long)hdr & bpf_prog_pack_mask); + + list_for_each_entry(tmp, &pack_list, list) { + if (tmp->ptr == pack_ptr) { + pack = tmp; + break; + } + } + + if (WARN_ONCE(!pack, "bpf_prog_pack bug\n")) + goto out; + + nbits = BPF_PROG_SIZE_TO_NBITS(hdr->size); + pos = ((unsigned long)hdr - (unsigned long)pack_ptr) >> BPF_PROG_CHUNK_SHIFT; + + bitmap_clear(pack->bitmap, pos, nbits); + if (bitmap_find_next_zero_area(pack->bitmap, bpf_prog_chunk_count(), 0, + bpf_prog_chunk_count(), 0) == 0) { + list_del(&pack->list); + module_memfree(pack->ptr); + kfree(pack); + } +out: + mutex_unlock(&pack_mutex); +} + static atomic_long_t bpf_jit_current; /* Can be overridden by an arch's JIT compiler if it has a custom, @@ -833,12 +1002,11 @@ static int __init bpf_jit_charge_init(void) } pure_initcall(bpf_jit_charge_init); -int bpf_jit_charge_modmem(u32 pages) +int bpf_jit_charge_modmem(u32 size) { - if (atomic_long_add_return(pages, &bpf_jit_current) > - (bpf_jit_limit >> PAGE_SHIFT)) { + if (atomic_long_add_return(size, &bpf_jit_current) > bpf_jit_limit) { if (!bpf_capable()) { - atomic_long_sub(pages, &bpf_jit_current); + atomic_long_sub(size, &bpf_jit_current); return -EPERM; } } @@ -846,9 +1014,9 @@ int bpf_jit_charge_modmem(u32 pages) return 0; } -void bpf_jit_uncharge_modmem(u32 pages) +void bpf_jit_uncharge_modmem(u32 size) { - atomic_long_sub(pages, &bpf_jit_current); + atomic_long_sub(size, &bpf_jit_current); } void *__weak bpf_jit_alloc_exec(unsigned long size) @@ -867,7 +1035,7 @@ bpf_jit_binary_alloc(unsigned int proglen, u8 **image_ptr, bpf_jit_fill_hole_t bpf_fill_ill_insns) { struct bpf_binary_header *hdr; - u32 size, hole, start, pages; + u32 size, hole, start; WARN_ON_ONCE(!is_power_of_2(alignment) || alignment > BPF_IMAGE_ALIGNMENT); @@ -877,20 +1045,19 @@ bpf_jit_binary_alloc(unsigned int proglen, u8 **image_ptr, * random section of illegal instructions. */ size = round_up(proglen + sizeof(*hdr) + 128, PAGE_SIZE); - pages = size / PAGE_SIZE; - if (bpf_jit_charge_modmem(pages)) + if (bpf_jit_charge_modmem(size)) return NULL; hdr = bpf_jit_alloc_exec(size); if (!hdr) { - bpf_jit_uncharge_modmem(pages); + bpf_jit_uncharge_modmem(size); return NULL; } /* Fill space with illegal/arch-dep instructions. */ bpf_fill_ill_insns(hdr, size); - hdr->pages = pages; + hdr->size = size; hole = min_t(unsigned int, size - (proglen + sizeof(*hdr)), PAGE_SIZE - sizeof(*hdr)); start = (get_random_int() % hole) & ~(alignment - 1); @@ -903,10 +1070,117 @@ bpf_jit_binary_alloc(unsigned int proglen, u8 **image_ptr, void bpf_jit_binary_free(struct bpf_binary_header *hdr) { - u32 pages = hdr->pages; + u32 size = hdr->size; bpf_jit_free_exec(hdr); - bpf_jit_uncharge_modmem(pages); + bpf_jit_uncharge_modmem(size); +} + +/* Allocate jit binary from bpf_prog_pack allocator. + * Since the allocated memory is RO+X, the JIT engine cannot write directly + * to the memory. To solve this problem, a RW buffer is also allocated at + * as the same time. The JIT engine should calculate offsets based on the + * RO memory address, but write JITed program to the RW buffer. Once the + * JIT engine finishes, it calls bpf_jit_binary_pack_finalize, which copies + * the JITed program to the RO memory. + */ +struct bpf_binary_header * +bpf_jit_binary_pack_alloc(unsigned int proglen, u8 **image_ptr, + unsigned int alignment, + struct bpf_binary_header **rw_header, + u8 **rw_image, + bpf_jit_fill_hole_t bpf_fill_ill_insns) +{ + struct bpf_binary_header *ro_header; + u32 size, hole, start; + + WARN_ON_ONCE(!is_power_of_2(alignment) || + alignment > BPF_IMAGE_ALIGNMENT); + + /* add 16 bytes for a random section of illegal instructions */ + size = round_up(proglen + sizeof(*ro_header) + 16, BPF_PROG_CHUNK_SIZE); + + if (bpf_jit_charge_modmem(size)) + return NULL; + ro_header = bpf_prog_pack_alloc(size); + if (!ro_header) { + bpf_jit_uncharge_modmem(size); + return NULL; + } + + *rw_header = kvmalloc(size, GFP_KERNEL); + if (!*rw_header) { + bpf_arch_text_copy(&ro_header->size, &size, sizeof(size)); + bpf_prog_pack_free(ro_header); + bpf_jit_uncharge_modmem(size); + return NULL; + } + + /* Fill space with illegal/arch-dep instructions. */ + bpf_fill_ill_insns(*rw_header, size); + (*rw_header)->size = size; + + hole = min_t(unsigned int, size - (proglen + sizeof(*ro_header)), + BPF_PROG_CHUNK_SIZE - sizeof(*ro_header)); + start = (get_random_int() % hole) & ~(alignment - 1); + + *image_ptr = &ro_header->image[start]; + *rw_image = &(*rw_header)->image[start]; + + return ro_header; +} + +/* Copy JITed text from rw_header to its final location, the ro_header. */ +int bpf_jit_binary_pack_finalize(struct bpf_prog *prog, + struct bpf_binary_header *ro_header, + struct bpf_binary_header *rw_header) +{ + void *ptr; + + ptr = bpf_arch_text_copy(ro_header, rw_header, rw_header->size); + + kvfree(rw_header); + + if (IS_ERR(ptr)) { + bpf_prog_pack_free(ro_header); + return PTR_ERR(ptr); + } + prog->aux->use_bpf_prog_pack = true; + return 0; +} + +/* bpf_jit_binary_pack_free is called in two different scenarios: + * 1) when the program is freed after; + * 2) when the JIT engine fails (before bpf_jit_binary_pack_finalize). + * For case 2), we need to free both the RO memory and the RW buffer. + * + * bpf_jit_binary_pack_free requires proper ro_header->size. However, + * bpf_jit_binary_pack_alloc does not set it. Therefore, ro_header->size + * must be set with either bpf_jit_binary_pack_finalize (normal path) or + * bpf_arch_text_copy (when jit fails). + */ +void bpf_jit_binary_pack_free(struct bpf_binary_header *ro_header, + struct bpf_binary_header *rw_header) +{ + u32 size = ro_header->size; + + bpf_prog_pack_free(ro_header); + kvfree(rw_header); + bpf_jit_uncharge_modmem(size); +} + +static inline struct bpf_binary_header * +bpf_jit_binary_hdr(const struct bpf_prog *fp) +{ + unsigned long real_start = (unsigned long)fp->bpf_func; + unsigned long addr; + + if (fp->aux->use_bpf_prog_pack) + addr = real_start & BPF_PROG_CHUNK_MASK; + else + addr = real_start & PAGE_MASK; + + return (void *)addr; } /* This symbol is only overridden by archs that have different @@ -918,7 +1192,10 @@ void __weak bpf_jit_free(struct bpf_prog *fp) if (fp->jited) { struct bpf_binary_header *hdr = bpf_jit_binary_hdr(fp); - bpf_jit_binary_free(hdr); + if (fp->aux->use_bpf_prog_pack) + bpf_jit_binary_pack_free(hdr, NULL /* rw_buffer */); + else + bpf_jit_binary_free(hdr); WARN_ON_ONCE(!bpf_prog_kallsyms_verify_off(fp)); } @@ -1146,7 +1423,7 @@ struct bpf_prog *bpf_jit_blind_constants(struct bpf_prog *prog) struct bpf_insn *insn; int i, rewritten; - if (!bpf_jit_blinding_enabled(prog) || prog->blinded) + if (!prog->blinding_requested || prog->blinded) return prog; clone = bpf_prog_clone_create(prog, GFP_USER); @@ -1829,28 +2106,30 @@ static unsigned int __bpf_prog_ret0_warn(const void *ctx, } #endif -bool bpf_prog_array_compatible(struct bpf_array *array, - const struct bpf_prog *fp) +bool bpf_prog_map_compatible(struct bpf_map *map, + const struct bpf_prog *fp) { bool ret; if (fp->kprobe_override) return false; - spin_lock(&array->aux->owner.lock); - - if (!array->aux->owner.type) { + spin_lock(&map->owner.lock); + if (!map->owner.type) { /* There's no owner yet where we could check for * compatibility. */ - array->aux->owner.type = fp->type; - array->aux->owner.jited = fp->jited; + map->owner.type = fp->type; + map->owner.jited = fp->jited; + map->owner.xdp_has_frags = fp->aux->xdp_has_frags; ret = true; } else { - ret = array->aux->owner.type == fp->type && - array->aux->owner.jited == fp->jited; + ret = map->owner.type == fp->type && + map->owner.jited == fp->jited && + map->owner.xdp_has_frags == fp->aux->xdp_has_frags; } - spin_unlock(&array->aux->owner.lock); + spin_unlock(&map->owner.lock); + return ret; } @@ -1862,13 +2141,11 @@ static int bpf_check_tail_call(const struct bpf_prog *fp) mutex_lock(&aux->used_maps_mutex); for (i = 0; i < aux->used_map_cnt; i++) { struct bpf_map *map = aux->used_maps[i]; - struct bpf_array *array; - if (map->map_type != BPF_MAP_TYPE_PROG_ARRAY) + if (!map_type_contains_progs(map)) continue; - array = container_of(map, struct bpf_array, map); - if (!bpf_prog_array_compatible(array, fp)) { + if (!bpf_prog_map_compatible(map, fp)) { ret = -EINVAL; goto out; } @@ -1968,18 +2245,10 @@ static struct bpf_prog_dummy { }, }; -/* to avoid allocating empty bpf_prog_array for cgroups that - * don't have bpf program attached use one global 'empty_prog_array' - * It will not be modified the caller of bpf_prog_array_alloc() - * (since caller requested prog_cnt == 0) - * that pointer should be 'freed' by bpf_prog_array_free() - */ -static struct { - struct bpf_prog_array hdr; - struct bpf_prog *null_prog; -} empty_prog_array = { +struct bpf_empty_prog_array bpf_empty_prog_array = { .null_prog = NULL, }; +EXPORT_SYMBOL(bpf_empty_prog_array); struct bpf_prog_array *bpf_prog_array_alloc(u32 prog_cnt, gfp_t flags) { @@ -1989,12 +2258,12 @@ struct bpf_prog_array *bpf_prog_array_alloc(u32 prog_cnt, gfp_t flags) (prog_cnt + 1), flags); - return &empty_prog_array.hdr; + return &bpf_empty_prog_array.hdr; } void bpf_prog_array_free(struct bpf_prog_array *progs) { - if (!progs || progs == &empty_prog_array.hdr) + if (!progs || progs == &bpf_empty_prog_array.hdr) return; kfree_rcu(progs, rcu); } @@ -2453,6 +2722,11 @@ int __weak bpf_arch_text_poke(void *ip, enum bpf_text_poke_type t, return -ENOTSUPP; } +void * __weak bpf_arch_text_copy(void *dst, void *src, size_t len) +{ + return ERR_PTR(-ENOTSUPP); +} + DEFINE_STATIC_KEY_FALSE(bpf_stats_enabled_key); EXPORT_SYMBOL(bpf_stats_enabled_key); diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c index b3e6b9422238..650e5d21f90d 100644 --- a/kernel/bpf/cpumap.c +++ b/kernel/bpf/cpumap.c @@ -397,7 +397,8 @@ static int cpu_map_kthread_run(void *data) return 0; } -static int __cpu_map_load_bpf_program(struct bpf_cpu_map_entry *rcpu, int fd) +static int __cpu_map_load_bpf_program(struct bpf_cpu_map_entry *rcpu, + struct bpf_map *map, int fd) { struct bpf_prog *prog; @@ -405,7 +406,8 @@ static int __cpu_map_load_bpf_program(struct bpf_cpu_map_entry *rcpu, int fd) if (IS_ERR(prog)) return PTR_ERR(prog); - if (prog->expected_attach_type != BPF_XDP_CPUMAP) { + if (prog->expected_attach_type != BPF_XDP_CPUMAP || + !bpf_prog_map_compatible(map, prog)) { bpf_prog_put(prog); return -EINVAL; } @@ -457,7 +459,7 @@ __cpu_map_entry_alloc(struct bpf_map *map, struct bpf_cpumap_val *value, rcpu->map_id = map->id; rcpu->value.qsize = value->qsize; - if (fd > 0 && __cpu_map_load_bpf_program(rcpu, fd)) + if (fd > 0 && __cpu_map_load_bpf_program(rcpu, map, fd)) goto free_ptr_ring; /* Setup kthread */ diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c index fe019dbdb3f0..038f6d7a83e4 100644 --- a/kernel/bpf/devmap.c +++ b/kernel/bpf/devmap.c @@ -858,7 +858,8 @@ static struct bpf_dtab_netdev *__dev_map_alloc_node(struct net *net, BPF_PROG_TYPE_XDP, false); if (IS_ERR(prog)) goto err_put_dev; - if (prog->expected_attach_type != BPF_XDP_DEVMAP) + if (prog->expected_attach_type != BPF_XDP_DEVMAP || + !bpf_prog_map_compatible(&dtab->map, prog)) goto err_put_prog; } diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index d29af9988f37..65877967f414 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -1636,7 +1636,7 @@ __htab_map_lookup_and_delete_batch(struct bpf_map *map, value_size = size * num_possible_cpus(); total = 0; /* while experimenting with hash tables with sizes ranging from 10 to - * 1000, it was observed that a bucket can have upto 5 entries. + * 1000, it was observed that a bucket can have up to 5 entries. */ bucket_size = 5; diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 01cfdf40c838..315053ef6a75 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -2,6 +2,7 @@ /* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com */ #include <linux/bpf.h> +#include <linux/btf.h> #include <linux/bpf-cgroup.h> #include <linux/rcupdate.h> #include <linux/random.h> @@ -16,6 +17,7 @@ #include <linux/pid_namespace.h> #include <linux/proc_ns.h> #include <linux/security.h> +#include <linux/btf_ids.h> #include "../../lib/kstrtox.h" @@ -223,13 +225,8 @@ BPF_CALL_2(bpf_get_current_comm, char *, buf, u32, size) if (unlikely(!task)) goto err_clear; - strncpy(buf, task->comm, size); - - /* Verifier guarantees that size > 0. For task->comm exceeding - * size, guarantee that buf is %NUL-terminated. Unconditionally - * done here to save the size test. - */ - buf[size - 1] = 0; + /* Verifier guarantees that size > 0 */ + strscpy(buf, task->comm, size); return 0; err_clear: memset(buf, 0, size); @@ -671,6 +668,39 @@ const struct bpf_func_proto bpf_copy_from_user_proto = { .arg3_type = ARG_ANYTHING, }; +BPF_CALL_5(bpf_copy_from_user_task, void *, dst, u32, size, + const void __user *, user_ptr, struct task_struct *, tsk, u64, flags) +{ + int ret; + + /* flags is not used yet */ + if (unlikely(flags)) + return -EINVAL; + + if (unlikely(!size)) + return 0; + + ret = access_process_vm(tsk, (unsigned long)user_ptr, dst, size, 0); + if (ret == size) + return 0; + + memset(dst, 0, size); + /* Return -EFAULT for partial read */ + return ret < 0 ? ret : -EFAULT; +} + +const struct bpf_func_proto bpf_copy_from_user_task_proto = { + .func = bpf_copy_from_user_task, + .gpl_only = true, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_UNINIT_MEM, + .arg2_type = ARG_CONST_SIZE_OR_ZERO, + .arg3_type = ARG_ANYTHING, + .arg4_type = ARG_PTR_TO_BTF_ID, + .arg4_btf_id = &btf_tracing_ids[BTF_TRACING_TYPE_TASK], + .arg5_type = ARG_ANYTHING +}; + BPF_CALL_2(bpf_per_cpu_ptr, const void *, ptr, u32, cpu) { if (cpu >= nr_cpu_ids) @@ -1058,7 +1088,7 @@ struct bpf_hrtimer { struct bpf_timer_kern { struct bpf_hrtimer *timer; /* bpf_spin_lock is used here instead of spinlock_t to make - * sure that it always fits into space resereved by struct bpf_timer + * sure that it always fits into space reserved by struct bpf_timer * regardless of LOCKDEP and spinlock debug flags. */ struct bpf_spin_lock lock; @@ -1075,6 +1105,7 @@ static enum hrtimer_restart bpf_timer_cb(struct hrtimer *hrtimer) void *key; u32 idx; + BTF_TYPE_EMIT(struct bpf_timer); callback_fn = rcu_dereference_check(t->callback_fn, rcu_read_lock_bh_held()); if (!callback_fn) goto out; diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c index 80da1db47c68..4f841e16779e 100644 --- a/kernel/bpf/inode.c +++ b/kernel/bpf/inode.c @@ -648,12 +648,22 @@ static int bpf_parse_param(struct fs_context *fc, struct fs_parameter *param) int opt; opt = fs_parse(fc, bpf_fs_parameters, param, &result); - if (opt < 0) + if (opt < 0) { /* We might like to report bad mount options here, but * traditionally we've ignored all mount options, so we'd * better continue to ignore non-existing options for bpf. */ - return opt == -ENOPARAM ? 0 : opt; + if (opt == -ENOPARAM) { + opt = vfs_parse_fs_param_source(fc, param); + if (opt != -ENOPARAM) + return opt; + + return 0; + } + + if (opt < 0) + return opt; + } switch (opt) { case OPT_MODE: @@ -700,11 +710,10 @@ static DEFINE_MUTEX(bpf_preload_lock); static int populate_bpffs(struct dentry *parent) { struct bpf_preload_info objs[BPF_PRELOAD_LINKS] = {}; - struct bpf_link *links[BPF_PRELOAD_LINKS] = {}; int err = 0, i; /* grab the mutex to make sure the kernel interactions with bpf_preload - * UMD are serialized + * are serialized */ mutex_lock(&bpf_preload_lock); @@ -712,40 +721,22 @@ static int populate_bpffs(struct dentry *parent) if (!bpf_preload_mod_get()) goto out; - if (!bpf_preload_ops->info.tgid) { - /* preload() will start UMD that will load BPF iterator programs */ - err = bpf_preload_ops->preload(objs); - if (err) + err = bpf_preload_ops->preload(objs); + if (err) + goto out_put; + for (i = 0; i < BPF_PRELOAD_LINKS; i++) { + bpf_link_inc(objs[i].link); + err = bpf_iter_link_pin_kernel(parent, + objs[i].link_name, objs[i].link); + if (err) { + bpf_link_put(objs[i].link); goto out_put; - for (i = 0; i < BPF_PRELOAD_LINKS; i++) { - links[i] = bpf_link_by_id(objs[i].link_id); - if (IS_ERR(links[i])) { - err = PTR_ERR(links[i]); - goto out_put; - } - } - for (i = 0; i < BPF_PRELOAD_LINKS; i++) { - err = bpf_iter_link_pin_kernel(parent, - objs[i].link_name, links[i]); - if (err) - goto out_put; - /* do not unlink successfully pinned links even - * if later link fails to pin - */ - links[i] = NULL; } - /* finish() will tell UMD process to exit */ - err = bpf_preload_ops->finish(); - if (err) - goto out_put; } out_put: bpf_preload_mod_put(); out: mutex_unlock(&bpf_preload_lock); - for (i = 0; i < BPF_PRELOAD_LINKS && err; i++) - if (!IS_ERR_OR_NULL(links[i])) - bpf_link_put(links[i]); return err; } diff --git a/kernel/bpf/local_storage.c b/kernel/bpf/local_storage.c index 23f7f9d08a62..497916060ac7 100644 --- a/kernel/bpf/local_storage.c +++ b/kernel/bpf/local_storage.c @@ -1,4 +1,4 @@ -//SPDX-License-Identifier: GPL-2.0 +// SPDX-License-Identifier: GPL-2.0 #include <linux/bpf-cgroup.h> #include <linux/bpf.h> #include <linux/bpf_local_storage.h> diff --git a/kernel/bpf/preload/Kconfig b/kernel/bpf/preload/Kconfig index 26bced262473..c9d45c9d6918 100644 --- a/kernel/bpf/preload/Kconfig +++ b/kernel/bpf/preload/Kconfig @@ -18,10 +18,9 @@ menuconfig BPF_PRELOAD if BPF_PRELOAD config BPF_PRELOAD_UMD - tristate "bpf_preload kernel module with user mode driver" - depends on CC_CAN_LINK - depends on m || CC_CAN_LINK_STATIC + tristate "bpf_preload kernel module" default m help - This builds bpf_preload kernel module with embedded user mode driver. + This builds bpf_preload kernel module with embedded BPF programs for + introspection in bpffs. endif diff --git a/kernel/bpf/preload/Makefile b/kernel/bpf/preload/Makefile index 1400ac58178e..20f89cc0a0a6 100644 --- a/kernel/bpf/preload/Makefile +++ b/kernel/bpf/preload/Makefile @@ -1,42 +1,7 @@ # SPDX-License-Identifier: GPL-2.0 -LIBBPF_SRCS = $(srctree)/tools/lib/bpf/ -LIBBPF_OUT = $(abspath $(obj))/libbpf -LIBBPF_A = $(LIBBPF_OUT)/libbpf.a -LIBBPF_DESTDIR = $(LIBBPF_OUT) -LIBBPF_INCLUDE = $(LIBBPF_DESTDIR)/include - -# Although not in use by libbpf's Makefile, set $(O) so that the "dummy" test -# in tools/scripts/Makefile.include always succeeds when building the kernel -# with $(O) pointing to a relative path, as in "make O=build bindeb-pkg". -$(LIBBPF_A): | $(LIBBPF_OUT) - $(Q)$(MAKE) -C $(LIBBPF_SRCS) O=$(LIBBPF_OUT)/ OUTPUT=$(LIBBPF_OUT)/ \ - DESTDIR=$(LIBBPF_DESTDIR) prefix= \ - $(LIBBPF_OUT)/libbpf.a install_headers - -libbpf_hdrs: $(LIBBPF_A) - -.PHONY: libbpf_hdrs - -$(LIBBPF_OUT): - $(call msg,MKDIR,$@) - $(Q)mkdir -p $@ - -userccflags += -I $(srctree)/tools/include/ -I $(srctree)/tools/include/uapi \ - -I $(LIBBPF_INCLUDE) -Wno-unused-result - -userprogs := bpf_preload_umd - -clean-files := libbpf/ - -$(obj)/iterators/iterators.o: | libbpf_hdrs - -bpf_preload_umd-objs := iterators/iterators.o -bpf_preload_umd-userldlibs := $(LIBBPF_A) -lelf -lz - -$(obj)/bpf_preload_umd: $(LIBBPF_A) - -$(obj)/bpf_preload_umd_blob.o: $(obj)/bpf_preload_umd +LIBBPF_INCLUDE = $(srctree)/tools/lib obj-$(CONFIG_BPF_PRELOAD_UMD) += bpf_preload.o -bpf_preload-objs += bpf_preload_kern.o bpf_preload_umd_blob.o +CFLAGS_bpf_preload_kern.o += -I$(LIBBPF_INCLUDE) +bpf_preload-objs += bpf_preload_kern.o diff --git a/kernel/bpf/preload/bpf_preload.h b/kernel/bpf/preload/bpf_preload.h index 2f9932276f2e..f065c91213a0 100644 --- a/kernel/bpf/preload/bpf_preload.h +++ b/kernel/bpf/preload/bpf_preload.h @@ -2,13 +2,13 @@ #ifndef _BPF_PRELOAD_H #define _BPF_PRELOAD_H -#include <linux/usermode_driver.h> -#include "iterators/bpf_preload_common.h" +struct bpf_preload_info { + char link_name[16]; + struct bpf_link *link; +}; struct bpf_preload_ops { - struct umd_info info; int (*preload)(struct bpf_preload_info *); - int (*finish)(void); struct module *owner; }; extern struct bpf_preload_ops *bpf_preload_ops; diff --git a/kernel/bpf/preload/bpf_preload_kern.c b/kernel/bpf/preload/bpf_preload_kern.c index 53736e52c1df..5106b5372f0c 100644 --- a/kernel/bpf/preload/bpf_preload_kern.c +++ b/kernel/bpf/preload/bpf_preload_kern.c @@ -2,101 +2,87 @@ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/init.h> #include <linux/module.h> -#include <linux/pid.h> -#include <linux/fs.h> -#include <linux/sched/signal.h> #include "bpf_preload.h" +#include "iterators/iterators.lskel.h" -extern char bpf_preload_umd_start; -extern char bpf_preload_umd_end; +static struct bpf_link *maps_link, *progs_link; +static struct iterators_bpf *skel; -static int preload(struct bpf_preload_info *obj); -static int finish(void); +static void free_links_and_skel(void) +{ + if (!IS_ERR_OR_NULL(maps_link)) + bpf_link_put(maps_link); + if (!IS_ERR_OR_NULL(progs_link)) + bpf_link_put(progs_link); + iterators_bpf__destroy(skel); +} + +static int preload(struct bpf_preload_info *obj) +{ + strlcpy(obj[0].link_name, "maps.debug", sizeof(obj[0].link_name)); + obj[0].link = maps_link; + strlcpy(obj[1].link_name, "progs.debug", sizeof(obj[1].link_name)); + obj[1].link = progs_link; + return 0; +} -static struct bpf_preload_ops umd_ops = { - .info.driver_name = "bpf_preload", +static struct bpf_preload_ops ops = { .preload = preload, - .finish = finish, .owner = THIS_MODULE, }; -static int preload(struct bpf_preload_info *obj) +static int load_skel(void) { - int magic = BPF_PRELOAD_START; - loff_t pos = 0; - int i, err; - ssize_t n; + int err; - err = fork_usermode_driver(&umd_ops.info); + skel = iterators_bpf__open(); + if (!skel) + return -ENOMEM; + err = iterators_bpf__load(skel); if (err) - return err; - - /* send the start magic to let UMD proceed with loading BPF progs */ - n = kernel_write(umd_ops.info.pipe_to_umh, - &magic, sizeof(magic), &pos); - if (n != sizeof(magic)) - return -EPIPE; - - /* receive bpf_link IDs and names from UMD */ - pos = 0; - for (i = 0; i < BPF_PRELOAD_LINKS; i++) { - n = kernel_read(umd_ops.info.pipe_from_umh, - &obj[i], sizeof(*obj), &pos); - if (n != sizeof(*obj)) - return -EPIPE; + goto out; + err = iterators_bpf__attach(skel); + if (err) + goto out; + maps_link = bpf_link_get_from_fd(skel->links.dump_bpf_map_fd); + if (IS_ERR(maps_link)) { + err = PTR_ERR(maps_link); + goto out; } - return 0; -} - -static int finish(void) -{ - int magic = BPF_PRELOAD_END; - struct pid *tgid; - loff_t pos = 0; - ssize_t n; - - /* send the last magic to UMD. It will do a normal exit. */ - n = kernel_write(umd_ops.info.pipe_to_umh, - &magic, sizeof(magic), &pos); - if (n != sizeof(magic)) - return -EPIPE; - - tgid = umd_ops.info.tgid; - if (tgid) { - wait_event(tgid->wait_pidfd, thread_group_exited(tgid)); - umd_cleanup_helper(&umd_ops.info); + progs_link = bpf_link_get_from_fd(skel->links.dump_bpf_prog_fd); + if (IS_ERR(progs_link)) { + err = PTR_ERR(progs_link); + goto out; } + /* Avoid taking over stdin/stdout/stderr of init process. Zeroing out + * makes skel_closenz() a no-op later in iterators_bpf__destroy(). + */ + close_fd(skel->links.dump_bpf_map_fd); + skel->links.dump_bpf_map_fd = 0; + close_fd(skel->links.dump_bpf_prog_fd); + skel->links.dump_bpf_prog_fd = 0; return 0; +out: + free_links_and_skel(); + return err; } -static int __init load_umd(void) +static int __init load(void) { int err; - err = umd_load_blob(&umd_ops.info, &bpf_preload_umd_start, - &bpf_preload_umd_end - &bpf_preload_umd_start); + err = load_skel(); if (err) return err; - bpf_preload_ops = &umd_ops; + bpf_preload_ops = &ops; return err; } -static void __exit fini_umd(void) +static void __exit fini(void) { - struct pid *tgid; - bpf_preload_ops = NULL; - - /* kill UMD in case it's still there due to earlier error */ - tgid = umd_ops.info.tgid; - if (tgid) { - kill_pid(tgid, SIGKILL, 1); - - wait_event(tgid->wait_pidfd, thread_group_exited(tgid)); - umd_cleanup_helper(&umd_ops.info); - } - umd_unload_blob(&umd_ops.info); + free_links_and_skel(); } -late_initcall(load_umd); -module_exit(fini_umd); +late_initcall(load); +module_exit(fini); MODULE_LICENSE("GPL"); diff --git a/kernel/bpf/preload/bpf_preload_umd_blob.S b/kernel/bpf/preload/bpf_preload_umd_blob.S deleted file mode 100644 index f1f40223b5c3..000000000000 --- a/kernel/bpf/preload/bpf_preload_umd_blob.S +++ /dev/null @@ -1,7 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ - .section .init.rodata, "a" - .global bpf_preload_umd_start -bpf_preload_umd_start: - .incbin "kernel/bpf/preload/bpf_preload_umd" - .global bpf_preload_umd_end -bpf_preload_umd_end: diff --git a/kernel/bpf/preload/iterators/Makefile b/kernel/bpf/preload/iterators/Makefile index b8bd60511227..bfe24f8c5a20 100644 --- a/kernel/bpf/preload/iterators/Makefile +++ b/kernel/bpf/preload/iterators/Makefile @@ -35,15 +35,15 @@ endif .PHONY: all clean -all: iterators.skel.h +all: iterators.lskel.h clean: $(call msg,CLEAN) $(Q)rm -rf $(OUTPUT) iterators -iterators.skel.h: $(OUTPUT)/iterators.bpf.o | $(BPFTOOL) +iterators.lskel.h: $(OUTPUT)/iterators.bpf.o | $(BPFTOOL) $(call msg,GEN-SKEL,$@) - $(Q)$(BPFTOOL) gen skeleton $< > $@ + $(Q)$(BPFTOOL) gen skeleton -L $< > $@ $(OUTPUT)/iterators.bpf.o: iterators.bpf.c $(BPFOBJ) | $(OUTPUT) diff --git a/kernel/bpf/preload/iterators/bpf_preload_common.h b/kernel/bpf/preload/iterators/bpf_preload_common.h deleted file mode 100644 index 8464d1a48c05..000000000000 --- a/kernel/bpf/preload/iterators/bpf_preload_common.h +++ /dev/null @@ -1,13 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BPF_PRELOAD_COMMON_H -#define _BPF_PRELOAD_COMMON_H - -#define BPF_PRELOAD_START 0x5555 -#define BPF_PRELOAD_END 0xAAAA - -struct bpf_preload_info { - char link_name[16]; - int link_id; -}; - -#endif diff --git a/kernel/bpf/preload/iterators/iterators.c b/kernel/bpf/preload/iterators/iterators.c deleted file mode 100644 index 5d872a705470..000000000000 --- a/kernel/bpf/preload/iterators/iterators.c +++ /dev/null @@ -1,94 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* Copyright (c) 2020 Facebook */ -#include <errno.h> -#include <stdio.h> -#include <stdlib.h> -#include <string.h> -#include <unistd.h> -#include <fcntl.h> -#include <sys/resource.h> -#include <bpf/libbpf.h> -#include <bpf/bpf.h> -#include <sys/mount.h> -#include "iterators.skel.h" -#include "bpf_preload_common.h" - -int to_kernel = -1; -int from_kernel = 0; - -static int send_link_to_kernel(struct bpf_link *link, const char *link_name) -{ - struct bpf_preload_info obj = {}; - struct bpf_link_info info = {}; - __u32 info_len = sizeof(info); - int err; - - err = bpf_obj_get_info_by_fd(bpf_link__fd(link), &info, &info_len); - if (err) - return err; - obj.link_id = info.id; - if (strlen(link_name) >= sizeof(obj.link_name)) - return -E2BIG; - strcpy(obj.link_name, link_name); - if (write(to_kernel, &obj, sizeof(obj)) != sizeof(obj)) - return -EPIPE; - return 0; -} - -int main(int argc, char **argv) -{ - struct rlimit rlim = { RLIM_INFINITY, RLIM_INFINITY }; - struct iterators_bpf *skel; - int err, magic; - int debug_fd; - - debug_fd = open("/dev/console", O_WRONLY | O_NOCTTY | O_CLOEXEC); - if (debug_fd < 0) - return 1; - to_kernel = dup(1); - close(1); - dup(debug_fd); - /* now stdin and stderr point to /dev/console */ - - read(from_kernel, &magic, sizeof(magic)); - if (magic != BPF_PRELOAD_START) { - printf("bad start magic %d\n", magic); - return 1; - } - setrlimit(RLIMIT_MEMLOCK, &rlim); - /* libbpf opens BPF object and loads it into the kernel */ - skel = iterators_bpf__open_and_load(); - if (!skel) { - /* iterators.skel.h is little endian. - * libbpf doesn't support automatic little->big conversion - * of BPF bytecode yet. - * The program load will fail in such case. - */ - printf("Failed load could be due to wrong endianness\n"); - return 1; - } - err = iterators_bpf__attach(skel); - if (err) - goto cleanup; - - /* send two bpf_link IDs with names to the kernel */ - err = send_link_to_kernel(skel->links.dump_bpf_map, "maps.debug"); - if (err) - goto cleanup; - err = send_link_to_kernel(skel->links.dump_bpf_prog, "progs.debug"); - if (err) - goto cleanup; - - /* The kernel will proceed with pinnging the links in bpffs. - * UMD will wait on read from pipe. - */ - read(from_kernel, &magic, sizeof(magic)); - if (magic != BPF_PRELOAD_END) { - printf("bad final magic %d\n", magic); - err = -EINVAL; - } -cleanup: - iterators_bpf__destroy(skel); - - return err != 0; -} diff --git a/kernel/bpf/preload/iterators/iterators.lskel.h b/kernel/bpf/preload/iterators/iterators.lskel.h new file mode 100644 index 000000000000..70f236a82fe1 --- /dev/null +++ b/kernel/bpf/preload/iterators/iterators.lskel.h @@ -0,0 +1,425 @@ +/* SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) */ +/* THIS FILE IS AUTOGENERATED! */ +#ifndef __ITERATORS_BPF_SKEL_H__ +#define __ITERATORS_BPF_SKEL_H__ + +#include <bpf/skel_internal.h> + +struct iterators_bpf { + struct bpf_loader_ctx ctx; + struct { + struct bpf_map_desc rodata; + } maps; + struct { + struct bpf_prog_desc dump_bpf_map; + struct bpf_prog_desc dump_bpf_prog; + } progs; + struct { + int dump_bpf_map_fd; + int dump_bpf_prog_fd; + } links; + struct iterators_bpf__rodata { + } *rodata; +}; + +static inline int +iterators_bpf__dump_bpf_map__attach(struct iterators_bpf *skel) +{ + int prog_fd = skel->progs.dump_bpf_map.prog_fd; + int fd = skel_link_create(prog_fd, 0, BPF_TRACE_ITER); + + if (fd > 0) + skel->links.dump_bpf_map_fd = fd; + return fd; +} + +static inline int +iterators_bpf__dump_bpf_prog__attach(struct iterators_bpf *skel) +{ + int prog_fd = skel->progs.dump_bpf_prog.prog_fd; + int fd = skel_link_create(prog_fd, 0, BPF_TRACE_ITER); + + if (fd > 0) + skel->links.dump_bpf_prog_fd = fd; + return fd; +} + +static inline int +iterators_bpf__attach(struct iterators_bpf *skel) +{ + int ret = 0; + + ret = ret < 0 ? ret : iterators_bpf__dump_bpf_map__attach(skel); + ret = ret < 0 ? ret : iterators_bpf__dump_bpf_prog__attach(skel); + return ret < 0 ? ret : 0; +} + +static inline void +iterators_bpf__detach(struct iterators_bpf *skel) +{ + skel_closenz(skel->links.dump_bpf_map_fd); + skel_closenz(skel->links.dump_bpf_prog_fd); +} +static void +iterators_bpf__destroy(struct iterators_bpf *skel) +{ + if (!skel) + return; + iterators_bpf__detach(skel); + skel_closenz(skel->progs.dump_bpf_map.prog_fd); + skel_closenz(skel->progs.dump_bpf_prog.prog_fd); + skel_free_map_data(skel->rodata, skel->maps.rodata.initial_value, 4096); + skel_closenz(skel->maps.rodata.map_fd); + skel_free(skel); +} +static inline struct iterators_bpf * +iterators_bpf__open(void) +{ + struct iterators_bpf *skel; + + skel = skel_alloc(sizeof(*skel)); + if (!skel) + goto cleanup; + skel->ctx.sz = (void *)&skel->links - (void *)skel; + skel->rodata = skel_prep_map_data((void *)"\ +\x20\x20\x69\x64\x20\x6e\x61\x6d\x65\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\ +\x20\x20\x20\x6d\x61\x78\x5f\x65\x6e\x74\x72\x69\x65\x73\x0a\0\x25\x34\x75\x20\ +\x25\x2d\x31\x36\x73\x25\x36\x64\x0a\0\x20\x20\x69\x64\x20\x6e\x61\x6d\x65\x20\ +\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x61\x74\x74\x61\x63\x68\x65\ +\x64\x0a\0\x25\x34\x75\x20\x25\x2d\x31\x36\x73\x20\x25\x73\x20\x25\x73\x0a\0", 4096, 98); + if (!skel->rodata) + goto cleanup; + skel->maps.rodata.initial_value = (__u64) (long) skel->rodata; + return skel; +cleanup: + iterators_bpf__destroy(skel); + return NULL; +} + +static inline int +iterators_bpf__load(struct iterators_bpf *skel) +{ + struct bpf_load_and_run_opts opts = {}; + int err; + + opts.ctx = (struct bpf_loader_ctx *)skel; + opts.data_sz = 6056; + opts.data = (void *)"\ +\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\ +\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\ +\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\ +\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\ +\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\ +\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\ +\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\ +\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\ +\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\ +\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\ +\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\ +\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\ +\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\ +\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\ +\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\ +\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\ +\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\ +\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\ +\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\ +\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\ +\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\ +\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\ +\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\ +\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\ +\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\ +\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\ +\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\ +\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\ +\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\ +\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\ +\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\ +\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\ +\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x9f\xeb\x01\0\ +\x18\0\0\0\0\0\0\0\x1c\x04\0\0\x1c\x04\0\0\xf9\x04\0\0\0\0\0\0\0\0\0\x02\x02\0\ +\0\0\x01\0\0\0\x02\0\0\x04\x10\0\0\0\x13\0\0\0\x03\0\0\0\0\0\0\0\x18\0\0\0\x04\ +\0\0\0\x40\0\0\0\0\0\0\0\0\0\0\x02\x08\0\0\0\0\0\0\0\0\0\0\x02\x0d\0\0\0\0\0\0\ +\0\x01\0\0\x0d\x06\0\0\0\x1c\0\0\0\x01\0\0\0\x20\0\0\0\0\0\0\x01\x04\0\0\0\x20\ +\0\0\x01\x24\0\0\0\x01\0\0\x0c\x05\0\0\0\xa3\0\0\0\x03\0\0\x04\x18\0\0\0\xb1\0\ +\0\0\x09\0\0\0\0\0\0\0\xb5\0\0\0\x0b\0\0\0\x40\0\0\0\xc0\0\0\0\x0b\0\0\0\x80\0\ +\0\0\0\0\0\0\0\0\0\x02\x0a\0\0\0\xc8\0\0\0\0\0\0\x07\0\0\0\0\xd1\0\0\0\0\0\0\ +\x08\x0c\0\0\0\xd7\0\0\0\0\0\0\x01\x08\0\0\0\x40\0\0\0\x94\x01\0\0\x03\0\0\x04\ +\x18\0\0\0\x9c\x01\0\0\x0e\0\0\0\0\0\0\0\x9f\x01\0\0\x11\0\0\0\x20\0\0\0\xa4\ +\x01\0\0\x0e\0\0\0\xa0\0\0\0\xb0\x01\0\0\0\0\0\x08\x0f\0\0\0\xb6\x01\0\0\0\0\0\ +\x01\x04\0\0\0\x20\0\0\0\xc3\x01\0\0\0\0\0\x01\x01\0\0\0\x08\0\0\x01\0\0\0\0\0\ +\0\0\x03\0\0\0\0\x10\0\0\0\x12\0\0\0\x10\0\0\0\xc8\x01\0\0\0\0\0\x01\x04\0\0\0\ +\x20\0\0\0\0\0\0\0\0\0\0\x02\x14\0\0\0\x2c\x02\0\0\x02\0\0\x04\x10\0\0\0\x13\0\ +\0\0\x03\0\0\0\0\0\0\0\x3f\x02\0\0\x15\0\0\0\x40\0\0\0\0\0\0\0\0\0\0\x02\x18\0\ +\0\0\0\0\0\0\x01\0\0\x0d\x06\0\0\0\x1c\0\0\0\x13\0\0\0\x44\x02\0\0\x01\0\0\x0c\ +\x16\0\0\0\x90\x02\0\0\x01\0\0\x04\x08\0\0\0\x99\x02\0\0\x19\0\0\0\0\0\0\0\0\0\ +\0\0\0\0\0\x02\x1a\0\0\0\xea\x02\0\0\x06\0\0\x04\x38\0\0\0\x9c\x01\0\0\x0e\0\0\ +\0\0\0\0\0\x9f\x01\0\0\x11\0\0\0\x20\0\0\0\xf7\x02\0\0\x1b\0\0\0\xc0\0\0\0\x08\ +\x03\0\0\x15\0\0\0\0\x01\0\0\x11\x03\0\0\x1d\0\0\0\x40\x01\0\0\x1b\x03\0\0\x1e\ +\0\0\0\x80\x01\0\0\0\0\0\0\0\0\0\x02\x1c\0\0\0\0\0\0\0\0\0\0\x0a\x10\0\0\0\0\0\ +\0\0\0\0\0\x02\x1f\0\0\0\0\0\0\0\0\0\0\x02\x20\0\0\0\x65\x03\0\0\x02\0\0\x04\ +\x08\0\0\0\x73\x03\0\0\x0e\0\0\0\0\0\0\0\x7c\x03\0\0\x0e\0\0\0\x20\0\0\0\x1b\ +\x03\0\0\x03\0\0\x04\x18\0\0\0\x86\x03\0\0\x1b\0\0\0\0\0\0\0\x8e\x03\0\0\x21\0\ +\0\0\x40\0\0\0\x94\x03\0\0\x23\0\0\0\x80\0\0\0\0\0\0\0\0\0\0\x02\x22\0\0\0\0\0\ +\0\0\0\0\0\x02\x24\0\0\0\x98\x03\0\0\x01\0\0\x04\x04\0\0\0\xa3\x03\0\0\x0e\0\0\ +\0\0\0\0\0\x0c\x04\0\0\x01\0\0\x04\x04\0\0\0\x15\x04\0\0\x0e\0\0\0\0\0\0\0\0\0\ +\0\0\0\0\0\x03\0\0\0\0\x1c\0\0\0\x12\0\0\0\x23\0\0\0\x8b\x04\0\0\0\0\0\x0e\x25\ +\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x03\0\0\0\0\x1c\0\0\0\x12\0\0\0\x0e\0\0\0\x9f\x04\ +\0\0\0\0\0\x0e\x27\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x03\0\0\0\0\x1c\0\0\0\x12\0\0\0\ +\x20\0\0\0\xb5\x04\0\0\0\0\0\x0e\x29\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x03\0\0\0\0\ +\x1c\0\0\0\x12\0\0\0\x11\0\0\0\xca\x04\0\0\0\0\0\x0e\x2b\0\0\0\0\0\0\0\0\0\0\0\ +\0\0\0\x03\0\0\0\0\x10\0\0\0\x12\0\0\0\x04\0\0\0\xe1\x04\0\0\0\0\0\x0e\x2d\0\0\ +\0\x01\0\0\0\xe9\x04\0\0\x04\0\0\x0f\x62\0\0\0\x26\0\0\0\0\0\0\0\x23\0\0\0\x28\ +\0\0\0\x23\0\0\0\x0e\0\0\0\x2a\0\0\0\x31\0\0\0\x20\0\0\0\x2c\0\0\0\x51\0\0\0\ +\x11\0\0\0\xf1\x04\0\0\x01\0\0\x0f\x04\0\0\0\x2e\0\0\0\0\0\0\0\x04\0\0\0\0\x62\ +\x70\x66\x5f\x69\x74\x65\x72\x5f\x5f\x62\x70\x66\x5f\x6d\x61\x70\0\x6d\x65\x74\ +\x61\0\x6d\x61\x70\0\x63\x74\x78\0\x69\x6e\x74\0\x64\x75\x6d\x70\x5f\x62\x70\ +\x66\x5f\x6d\x61\x70\0\x69\x74\x65\x72\x2f\x62\x70\x66\x5f\x6d\x61\x70\0\x30\ +\x3a\x30\0\x2f\x77\x2f\x6e\x65\x74\x2d\x6e\x65\x78\x74\x2f\x6b\x65\x72\x6e\x65\ +\x6c\x2f\x62\x70\x66\x2f\x70\x72\x65\x6c\x6f\x61\x64\x2f\x69\x74\x65\x72\x61\ +\x74\x6f\x72\x73\x2f\x69\x74\x65\x72\x61\x74\x6f\x72\x73\x2e\x62\x70\x66\x2e\ +\x63\0\x09\x73\x74\x72\x75\x63\x74\x20\x73\x65\x71\x5f\x66\x69\x6c\x65\x20\x2a\ +\x73\x65\x71\x20\x3d\x20\x63\x74\x78\x2d\x3e\x6d\x65\x74\x61\x2d\x3e\x73\x65\ +\x71\x3b\0\x62\x70\x66\x5f\x69\x74\x65\x72\x5f\x6d\x65\x74\x61\0\x73\x65\x71\0\ +\x73\x65\x73\x73\x69\x6f\x6e\x5f\x69\x64\0\x73\x65\x71\x5f\x6e\x75\x6d\0\x73\ +\x65\x71\x5f\x66\x69\x6c\x65\0\x5f\x5f\x75\x36\x34\0\x75\x6e\x73\x69\x67\x6e\ +\x65\x64\x20\x6c\x6f\x6e\x67\x20\x6c\x6f\x6e\x67\0\x30\x3a\x31\0\x09\x73\x74\ +\x72\x75\x63\x74\x20\x62\x70\x66\x5f\x6d\x61\x70\x20\x2a\x6d\x61\x70\x20\x3d\ +\x20\x63\x74\x78\x2d\x3e\x6d\x61\x70\x3b\0\x09\x69\x66\x20\x28\x21\x6d\x61\x70\ +\x29\0\x09\x5f\x5f\x75\x36\x34\x20\x73\x65\x71\x5f\x6e\x75\x6d\x20\x3d\x20\x63\ +\x74\x78\x2d\x3e\x6d\x65\x74\x61\x2d\x3e\x73\x65\x71\x5f\x6e\x75\x6d\x3b\0\x30\ +\x3a\x32\0\x09\x69\x66\x20\x28\x73\x65\x71\x5f\x6e\x75\x6d\x20\x3d\x3d\x20\x30\ +\x29\0\x09\x09\x42\x50\x46\x5f\x53\x45\x51\x5f\x50\x52\x49\x4e\x54\x46\x28\x73\ +\x65\x71\x2c\x20\x22\x20\x20\x69\x64\x20\x6e\x61\x6d\x65\x20\x20\x20\x20\x20\ +\x20\x20\x20\x20\x20\x20\x20\x20\x6d\x61\x78\x5f\x65\x6e\x74\x72\x69\x65\x73\ +\x5c\x6e\x22\x29\x3b\0\x62\x70\x66\x5f\x6d\x61\x70\0\x69\x64\0\x6e\x61\x6d\x65\ +\0\x6d\x61\x78\x5f\x65\x6e\x74\x72\x69\x65\x73\0\x5f\x5f\x75\x33\x32\0\x75\x6e\ +\x73\x69\x67\x6e\x65\x64\x20\x69\x6e\x74\0\x63\x68\x61\x72\0\x5f\x5f\x41\x52\ +\x52\x41\x59\x5f\x53\x49\x5a\x45\x5f\x54\x59\x50\x45\x5f\x5f\0\x09\x42\x50\x46\ +\x5f\x53\x45\x51\x5f\x50\x52\x49\x4e\x54\x46\x28\x73\x65\x71\x2c\x20\x22\x25\ +\x34\x75\x20\x25\x2d\x31\x36\x73\x25\x36\x64\x5c\x6e\x22\x2c\x20\x6d\x61\x70\ +\x2d\x3e\x69\x64\x2c\x20\x6d\x61\x70\x2d\x3e\x6e\x61\x6d\x65\x2c\x20\x6d\x61\ +\x70\x2d\x3e\x6d\x61\x78\x5f\x65\x6e\x74\x72\x69\x65\x73\x29\x3b\0\x7d\0\x62\ +\x70\x66\x5f\x69\x74\x65\x72\x5f\x5f\x62\x70\x66\x5f\x70\x72\x6f\x67\0\x70\x72\ +\x6f\x67\0\x64\x75\x6d\x70\x5f\x62\x70\x66\x5f\x70\x72\x6f\x67\0\x69\x74\x65\ +\x72\x2f\x62\x70\x66\x5f\x70\x72\x6f\x67\0\x09\x73\x74\x72\x75\x63\x74\x20\x62\ +\x70\x66\x5f\x70\x72\x6f\x67\x20\x2a\x70\x72\x6f\x67\x20\x3d\x20\x63\x74\x78\ +\x2d\x3e\x70\x72\x6f\x67\x3b\0\x09\x69\x66\x20\x28\x21\x70\x72\x6f\x67\x29\0\ +\x62\x70\x66\x5f\x70\x72\x6f\x67\0\x61\x75\x78\0\x09\x61\x75\x78\x20\x3d\x20\ +\x70\x72\x6f\x67\x2d\x3e\x61\x75\x78\x3b\0\x09\x09\x42\x50\x46\x5f\x53\x45\x51\ +\x5f\x50\x52\x49\x4e\x54\x46\x28\x73\x65\x71\x2c\x20\x22\x20\x20\x69\x64\x20\ +\x6e\x61\x6d\x65\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x61\x74\ +\x74\x61\x63\x68\x65\x64\x5c\x6e\x22\x29\x3b\0\x62\x70\x66\x5f\x70\x72\x6f\x67\ +\x5f\x61\x75\x78\0\x61\x74\x74\x61\x63\x68\x5f\x66\x75\x6e\x63\x5f\x6e\x61\x6d\ +\x65\0\x64\x73\x74\x5f\x70\x72\x6f\x67\0\x66\x75\x6e\x63\x5f\x69\x6e\x66\x6f\0\ +\x62\x74\x66\0\x09\x42\x50\x46\x5f\x53\x45\x51\x5f\x50\x52\x49\x4e\x54\x46\x28\ +\x73\x65\x71\x2c\x20\x22\x25\x34\x75\x20\x25\x2d\x31\x36\x73\x20\x25\x73\x20\ +\x25\x73\x5c\x6e\x22\x2c\x20\x61\x75\x78\x2d\x3e\x69\x64\x2c\0\x30\x3a\x34\0\ +\x30\x3a\x35\0\x09\x69\x66\x20\x28\x21\x62\x74\x66\x29\0\x62\x70\x66\x5f\x66\ +\x75\x6e\x63\x5f\x69\x6e\x66\x6f\0\x69\x6e\x73\x6e\x5f\x6f\x66\x66\0\x74\x79\ +\x70\x65\x5f\x69\x64\0\x30\0\x73\x74\x72\x69\x6e\x67\x73\0\x74\x79\x70\x65\x73\ +\0\x68\x64\x72\0\x62\x74\x66\x5f\x68\x65\x61\x64\x65\x72\0\x73\x74\x72\x5f\x6c\ +\x65\x6e\0\x09\x74\x79\x70\x65\x73\x20\x3d\x20\x62\x74\x66\x2d\x3e\x74\x79\x70\ +\x65\x73\x3b\0\x09\x62\x70\x66\x5f\x70\x72\x6f\x62\x65\x5f\x72\x65\x61\x64\x5f\ +\x6b\x65\x72\x6e\x65\x6c\x28\x26\x74\x2c\x20\x73\x69\x7a\x65\x6f\x66\x28\x74\ +\x29\x2c\x20\x74\x79\x70\x65\x73\x20\x2b\x20\x62\x74\x66\x5f\x69\x64\x29\x3b\0\ +\x09\x73\x74\x72\x20\x3d\x20\x62\x74\x66\x2d\x3e\x73\x74\x72\x69\x6e\x67\x73\ +\x3b\0\x62\x74\x66\x5f\x74\x79\x70\x65\0\x6e\x61\x6d\x65\x5f\x6f\x66\x66\0\x09\ +\x6e\x61\x6d\x65\x5f\x6f\x66\x66\x20\x3d\x20\x42\x50\x46\x5f\x43\x4f\x52\x45\ +\x5f\x52\x45\x41\x44\x28\x74\x2c\x20\x6e\x61\x6d\x65\x5f\x6f\x66\x66\x29\x3b\0\ +\x30\x3a\x32\x3a\x30\0\x09\x69\x66\x20\x28\x6e\x61\x6d\x65\x5f\x6f\x66\x66\x20\ +\x3e\x3d\x20\x62\x74\x66\x2d\x3e\x68\x64\x72\x2e\x73\x74\x72\x5f\x6c\x65\x6e\ +\x29\0\x09\x72\x65\x74\x75\x72\x6e\x20\x73\x74\x72\x20\x2b\x20\x6e\x61\x6d\x65\ +\x5f\x6f\x66\x66\x3b\0\x30\x3a\x33\0\x64\x75\x6d\x70\x5f\x62\x70\x66\x5f\x6d\ +\x61\x70\x2e\x5f\x5f\x5f\x66\x6d\x74\0\x64\x75\x6d\x70\x5f\x62\x70\x66\x5f\x6d\ +\x61\x70\x2e\x5f\x5f\x5f\x66\x6d\x74\x2e\x31\0\x64\x75\x6d\x70\x5f\x62\x70\x66\ +\x5f\x70\x72\x6f\x67\x2e\x5f\x5f\x5f\x66\x6d\x74\0\x64\x75\x6d\x70\x5f\x62\x70\ +\x66\x5f\x70\x72\x6f\x67\x2e\x5f\x5f\x5f\x66\x6d\x74\x2e\x32\0\x4c\x49\x43\x45\ +\x4e\x53\x45\0\x2e\x72\x6f\x64\x61\x74\x61\0\x6c\x69\x63\x65\x6e\x73\x65\0\0\0\ +\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x2d\x09\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x02\0\0\ +\0\x04\0\0\0\x62\0\0\0\x01\0\0\0\x80\x04\0\0\0\0\0\0\0\0\0\0\x69\x74\x65\x72\ +\x61\x74\x6f\x72\x2e\x72\x6f\x64\x61\x74\x61\0\0\0\0\0\0\0\0\0\0\0\0\0\x2f\0\0\ +\0\0\0\0\0\0\0\0\0\0\0\0\0\x20\x20\x69\x64\x20\x6e\x61\x6d\x65\x20\x20\x20\x20\ +\x20\x20\x20\x20\x20\x20\x20\x20\x20\x6d\x61\x78\x5f\x65\x6e\x74\x72\x69\x65\ +\x73\x0a\0\x25\x34\x75\x20\x25\x2d\x31\x36\x73\x25\x36\x64\x0a\0\x20\x20\x69\ +\x64\x20\x6e\x61\x6d\x65\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\ +\x61\x74\x74\x61\x63\x68\x65\x64\x0a\0\x25\x34\x75\x20\x25\x2d\x31\x36\x73\x20\ +\x25\x73\x20\x25\x73\x0a\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\ +\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x47\x50\x4c\0\0\0\0\0\ +\x79\x12\0\0\0\0\0\0\x79\x26\0\0\0\0\0\0\x79\x17\x08\0\0\0\0\0\x15\x07\x1b\0\0\ +\0\0\0\x79\x11\0\0\0\0\0\0\x79\x11\x10\0\0\0\0\0\x55\x01\x08\0\0\0\0\0\xbf\xa4\ +\0\0\0\0\0\0\x07\x04\0\0\xe8\xff\xff\xff\xbf\x61\0\0\0\0\0\0\x18\x62\0\0\0\0\0\ +\0\0\0\0\0\0\0\0\0\xb7\x03\0\0\x23\0\0\0\xb7\x05\0\0\0\0\0\0\x85\0\0\0\x7e\0\0\ +\0\x61\x71\0\0\0\0\0\0\x7b\x1a\xe8\xff\0\0\0\0\xb7\x01\0\0\x04\0\0\0\xbf\x72\0\ +\0\0\0\0\0\x0f\x12\0\0\0\0\0\0\x7b\x2a\xf0\xff\0\0\0\0\x61\x71\x14\0\0\0\0\0\ +\x7b\x1a\xf8\xff\0\0\0\0\xbf\xa4\0\0\0\0\0\0\x07\x04\0\0\xe8\xff\xff\xff\xbf\ +\x61\0\0\0\0\0\0\x18\x62\0\0\0\0\0\0\0\0\0\0\x23\0\0\0\xb7\x03\0\0\x0e\0\0\0\ +\xb7\x05\0\0\x18\0\0\0\x85\0\0\0\x7e\0\0\0\xb7\0\0\0\0\0\0\0\x95\0\0\0\0\0\0\0\ +\0\0\0\0\x07\0\0\0\0\0\0\0\x42\0\0\0\x7b\0\0\0\x1e\x3c\x01\0\x01\0\0\0\x42\0\0\ +\0\x7b\0\0\0\x24\x3c\x01\0\x02\0\0\0\x42\0\0\0\xee\0\0\0\x1d\x44\x01\0\x03\0\0\ +\0\x42\0\0\0\x0f\x01\0\0\x06\x4c\x01\0\x04\0\0\0\x42\0\0\0\x1a\x01\0\0\x17\x40\ +\x01\0\x05\0\0\0\x42\0\0\0\x1a\x01\0\0\x1d\x40\x01\0\x06\0\0\0\x42\0\0\0\x43\ +\x01\0\0\x06\x58\x01\0\x08\0\0\0\x42\0\0\0\x56\x01\0\0\x03\x5c\x01\0\x0f\0\0\0\ +\x42\0\0\0\xdc\x01\0\0\x02\x64\x01\0\x1f\0\0\0\x42\0\0\0\x2a\x02\0\0\x01\x6c\ +\x01\0\0\0\0\0\x02\0\0\0\x3e\0\0\0\0\0\0\0\x08\0\0\0\x08\0\0\0\x3e\0\0\0\0\0\0\ +\0\x10\0\0\0\x02\0\0\0\xea\0\0\0\0\0\0\0\x20\0\0\0\x02\0\0\0\x3e\0\0\0\0\0\0\0\ +\x28\0\0\0\x08\0\0\0\x3f\x01\0\0\0\0\0\0\x78\0\0\0\x0d\0\0\0\x3e\0\0\0\0\0\0\0\ +\x88\0\0\0\x0d\0\0\0\xea\0\0\0\0\0\0\0\xa8\0\0\0\x0d\0\0\0\x3f\x01\0\0\0\0\0\0\ +\x1a\0\0\0\x21\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\ +\0\0\0\0\0\0\0\0\0\0\0\x64\x75\x6d\x70\x5f\x62\x70\x66\x5f\x6d\x61\x70\0\0\0\0\ +\0\0\0\0\x1c\0\0\0\0\0\0\0\x08\0\0\0\0\0\0\0\0\0\0\0\x01\0\0\0\x10\0\0\0\0\0\0\ +\0\0\0\0\0\x0a\0\0\0\x01\0\0\0\0\0\0\0\x08\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\ +\0\x10\0\0\0\0\0\0\0\x62\x70\x66\x5f\x69\x74\x65\x72\x5f\x62\x70\x66\x5f\x6d\ +\x61\x70\0\0\0\0\0\0\0\0\x47\x50\x4c\0\0\0\0\0\x79\x12\0\0\0\0\0\0\x79\x26\0\0\ +\0\0\0\0\x79\x12\x08\0\0\0\0\0\x15\x02\x3c\0\0\0\0\0\x79\x11\0\0\0\0\0\0\x79\ +\x27\0\0\0\0\0\0\x79\x11\x10\0\0\0\0\0\x55\x01\x08\0\0\0\0\0\xbf\xa4\0\0\0\0\0\ +\0\x07\x04\0\0\xd0\xff\xff\xff\xbf\x61\0\0\0\0\0\0\x18\x62\0\0\0\0\0\0\0\0\0\0\ +\x31\0\0\0\xb7\x03\0\0\x20\0\0\0\xb7\x05\0\0\0\0\0\0\x85\0\0\0\x7e\0\0\0\x7b\ +\x6a\xc8\xff\0\0\0\0\x61\x71\0\0\0\0\0\0\x7b\x1a\xd0\xff\0\0\0\0\xb7\x03\0\0\ +\x04\0\0\0\xbf\x79\0\0\0\0\0\0\x0f\x39\0\0\0\0\0\0\x79\x71\x28\0\0\0\0\0\x79\ +\x78\x30\0\0\0\0\0\x15\x08\x18\0\0\0\0\0\xb7\x02\0\0\0\0\0\0\x0f\x21\0\0\0\0\0\ +\0\x61\x11\x04\0\0\0\0\0\x79\x83\x08\0\0\0\0\0\x67\x01\0\0\x03\0\0\0\x0f\x13\0\ +\0\0\0\0\0\x79\x86\0\0\0\0\0\0\xbf\xa1\0\0\0\0\0\0\x07\x01\0\0\xf8\xff\xff\xff\ +\xb7\x02\0\0\x08\0\0\0\x85\0\0\0\x71\0\0\0\xb7\x01\0\0\0\0\0\0\x79\xa3\xf8\xff\ +\0\0\0\0\x0f\x13\0\0\0\0\0\0\xbf\xa1\0\0\0\0\0\0\x07\x01\0\0\xf4\xff\xff\xff\ +\xb7\x02\0\0\x04\0\0\0\x85\0\0\0\x71\0\0\0\xb7\x03\0\0\x04\0\0\0\x61\xa1\xf4\ +\xff\0\0\0\0\x61\x82\x10\0\0\0\0\0\x3d\x21\x02\0\0\0\0\0\x0f\x16\0\0\0\0\0\0\ +\xbf\x69\0\0\0\0\0\0\x7b\x9a\xd8\xff\0\0\0\0\x79\x71\x18\0\0\0\0\0\x7b\x1a\xe0\ +\xff\0\0\0\0\x79\x71\x20\0\0\0\0\0\x79\x11\0\0\0\0\0\0\x0f\x31\0\0\0\0\0\0\x7b\ +\x1a\xe8\xff\0\0\0\0\xbf\xa4\0\0\0\0\0\0\x07\x04\0\0\xd0\xff\xff\xff\x79\xa1\ +\xc8\xff\0\0\0\0\x18\x62\0\0\0\0\0\0\0\0\0\0\x51\0\0\0\xb7\x03\0\0\x11\0\0\0\ +\xb7\x05\0\0\x20\0\0\0\x85\0\0\0\x7e\0\0\0\xb7\0\0\0\0\0\0\0\x95\0\0\0\0\0\0\0\ +\0\0\0\0\x17\0\0\0\0\0\0\0\x42\0\0\0\x7b\0\0\0\x1e\x80\x01\0\x01\0\0\0\x42\0\0\ +\0\x7b\0\0\0\x24\x80\x01\0\x02\0\0\0\x42\0\0\0\x60\x02\0\0\x1f\x88\x01\0\x03\0\ +\0\0\x42\0\0\0\x84\x02\0\0\x06\x94\x01\0\x04\0\0\0\x42\0\0\0\x1a\x01\0\0\x17\ +\x84\x01\0\x05\0\0\0\x42\0\0\0\x9d\x02\0\0\x0e\xa0\x01\0\x06\0\0\0\x42\0\0\0\ +\x1a\x01\0\0\x1d\x84\x01\0\x07\0\0\0\x42\0\0\0\x43\x01\0\0\x06\xa4\x01\0\x09\0\ +\0\0\x42\0\0\0\xaf\x02\0\0\x03\xa8\x01\0\x11\0\0\0\x42\0\0\0\x1f\x03\0\0\x02\ +\xb0\x01\0\x18\0\0\0\x42\0\0\0\x5a\x03\0\0\x06\x04\x01\0\x1b\0\0\0\x42\0\0\0\0\ +\0\0\0\0\0\0\0\x1c\0\0\0\x42\0\0\0\xab\x03\0\0\x0f\x10\x01\0\x1d\0\0\0\x42\0\0\ +\0\xc0\x03\0\0\x2d\x14\x01\0\x1f\0\0\0\x42\0\0\0\xf7\x03\0\0\x0d\x0c\x01\0\x21\ +\0\0\0\x42\0\0\0\0\0\0\0\0\0\0\0\x22\0\0\0\x42\0\0\0\xc0\x03\0\0\x02\x14\x01\0\ +\x25\0\0\0\x42\0\0\0\x1e\x04\0\0\x0d\x18\x01\0\x28\0\0\0\x42\0\0\0\0\0\0\0\0\0\ +\0\0\x29\0\0\0\x42\0\0\0\x1e\x04\0\0\x0d\x18\x01\0\x2c\0\0\0\x42\0\0\0\x1e\x04\ +\0\0\x0d\x18\x01\0\x2d\0\0\0\x42\0\0\0\x4c\x04\0\0\x1b\x1c\x01\0\x2e\0\0\0\x42\ +\0\0\0\x4c\x04\0\0\x06\x1c\x01\0\x2f\0\0\0\x42\0\0\0\x6f\x04\0\0\x0d\x24\x01\0\ +\x31\0\0\0\x42\0\0\0\x1f\x03\0\0\x02\xb0\x01\0\x40\0\0\0\x42\0\0\0\x2a\x02\0\0\ +\x01\xc0\x01\0\0\0\0\0\x14\0\0\0\x3e\0\0\0\0\0\0\0\x08\0\0\0\x08\0\0\0\x3e\0\0\ +\0\0\0\0\0\x10\0\0\0\x14\0\0\0\xea\0\0\0\0\0\0\0\x20\0\0\0\x14\0\0\0\x3e\0\0\0\ +\0\0\0\0\x28\0\0\0\x18\0\0\0\x3e\0\0\0\0\0\0\0\x30\0\0\0\x08\0\0\0\x3f\x01\0\0\ +\0\0\0\0\x88\0\0\0\x1a\0\0\0\x3e\0\0\0\0\0\0\0\x98\0\0\0\x1a\0\0\0\xea\0\0\0\0\ +\0\0\0\xb0\0\0\0\x1a\0\0\0\x52\x03\0\0\0\0\0\0\xb8\0\0\0\x1a\0\0\0\x56\x03\0\0\ +\0\0\0\0\xc8\0\0\0\x1f\0\0\0\x84\x03\0\0\0\0\0\0\xe0\0\0\0\x20\0\0\0\xea\0\0\0\ +\0\0\0\0\xf8\0\0\0\x20\0\0\0\x3e\0\0\0\0\0\0\0\x20\x01\0\0\x24\0\0\0\x3e\0\0\0\ +\0\0\0\0\x58\x01\0\0\x1a\0\0\0\xea\0\0\0\0\0\0\0\x68\x01\0\0\x20\0\0\0\x46\x04\ +\0\0\0\0\0\0\x90\x01\0\0\x1a\0\0\0\x3f\x01\0\0\0\0\0\0\xa0\x01\0\0\x1a\0\0\0\ +\x87\x04\0\0\0\0\0\0\xa8\x01\0\0\x18\0\0\0\x3e\0\0\0\0\0\0\0\x1a\0\0\0\x42\0\0\ +\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\ +\0\0\x64\x75\x6d\x70\x5f\x62\x70\x66\x5f\x70\x72\x6f\x67\0\0\0\0\0\0\0\x1c\0\0\ +\0\0\0\0\0\x08\0\0\0\0\0\0\0\0\0\0\0\x01\0\0\0\x10\0\0\0\0\0\0\0\0\0\0\0\x1a\0\ +\0\0\x01\0\0\0\0\0\0\0\x13\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x10\0\0\0\0\0\ +\0\0\x62\x70\x66\x5f\x69\x74\x65\x72\x5f\x62\x70\x66\x5f\x70\x72\x6f\x67\0\0\0\ +\0\0\0\0"; + opts.insns_sz = 2216; + opts.insns = (void *)"\ +\xbf\x16\0\0\0\0\0\0\xbf\xa1\0\0\0\0\0\0\x07\x01\0\0\x78\xff\xff\xff\xb7\x02\0\ +\0\x88\0\0\0\xb7\x03\0\0\0\0\0\0\x85\0\0\0\x71\0\0\0\x05\0\x14\0\0\0\0\0\x61\ +\xa1\x78\xff\0\0\0\0\xd5\x01\x01\0\0\0\0\0\x85\0\0\0\xa8\0\0\0\x61\xa1\x7c\xff\ +\0\0\0\0\xd5\x01\x01\0\0\0\0\0\x85\0\0\0\xa8\0\0\0\x61\xa1\x80\xff\0\0\0\0\xd5\ +\x01\x01\0\0\0\0\0\x85\0\0\0\xa8\0\0\0\x61\xa1\x84\xff\0\0\0\0\xd5\x01\x01\0\0\ +\0\0\0\x85\0\0\0\xa8\0\0\0\x18\x60\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x61\x01\0\0\0\0\ +\0\0\xd5\x01\x02\0\0\0\0\0\xbf\x19\0\0\0\0\0\0\x85\0\0\0\xa8\0\0\0\xbf\x70\0\0\ +\0\0\0\0\x95\0\0\0\0\0\0\0\x61\x60\x08\0\0\0\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\ +\x48\x0e\0\0\x63\x01\0\0\0\0\0\0\x61\x60\x0c\0\0\0\0\0\x18\x61\0\0\0\0\0\0\0\0\ +\0\0\x44\x0e\0\0\x63\x01\0\0\0\0\0\0\x79\x60\x10\0\0\0\0\0\x18\x61\0\0\0\0\0\0\ +\0\0\0\0\x38\x0e\0\0\x7b\x01\0\0\0\0\0\0\x18\x60\0\0\0\0\0\0\0\0\0\0\0\x05\0\0\ +\x18\x61\0\0\0\0\0\0\0\0\0\0\x30\x0e\0\0\x7b\x01\0\0\0\0\0\0\xb7\x01\0\0\x12\0\ +\0\0\x18\x62\0\0\0\0\0\0\0\0\0\0\x30\x0e\0\0\xb7\x03\0\0\x1c\0\0\0\x85\0\0\0\ +\xa6\0\0\0\xbf\x07\0\0\0\0\0\0\xc5\x07\xd4\xff\0\0\0\0\x63\x7a\x78\xff\0\0\0\0\ +\x61\xa0\x78\xff\0\0\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\x80\x0e\0\0\x63\x01\0\0\0\ +\0\0\0\x61\x60\x1c\0\0\0\0\0\x15\0\x03\0\0\0\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\ +\x5c\x0e\0\0\x63\x01\0\0\0\0\0\0\xb7\x01\0\0\0\0\0\0\x18\x62\0\0\0\0\0\0\0\0\0\ +\0\x50\x0e\0\0\xb7\x03\0\0\x48\0\0\0\x85\0\0\0\xa6\0\0\0\xbf\x07\0\0\0\0\0\0\ +\xc5\x07\xc3\xff\0\0\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x63\x71\0\0\0\0\0\ +\0\x79\x63\x20\0\0\0\0\0\x15\x03\x08\0\0\0\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\x98\ +\x0e\0\0\xb7\x02\0\0\x62\0\0\0\x61\x60\x04\0\0\0\0\0\x45\0\x02\0\x01\0\0\0\x85\ +\0\0\0\x94\0\0\0\x05\0\x01\0\0\0\0\0\x85\0\0\0\x71\0\0\0\x18\x62\0\0\0\0\0\0\0\ +\0\0\0\0\0\0\0\x61\x20\0\0\0\0\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\x08\x0f\0\0\x63\ +\x01\0\0\0\0\0\0\x18\x60\0\0\0\0\0\0\0\0\0\0\0\x0f\0\0\x18\x61\0\0\0\0\0\0\0\0\ +\0\0\x10\x0f\0\0\x7b\x01\0\0\0\0\0\0\x18\x60\0\0\0\0\0\0\0\0\0\0\x98\x0e\0\0\ +\x18\x61\0\0\0\0\0\0\0\0\0\0\x18\x0f\0\0\x7b\x01\0\0\0\0\0\0\xb7\x01\0\0\x02\0\ +\0\0\x18\x62\0\0\0\0\0\0\0\0\0\0\x08\x0f\0\0\xb7\x03\0\0\x20\0\0\0\x85\0\0\0\ +\xa6\0\0\0\xbf\x07\0\0\0\0\0\0\xc5\x07\x9f\xff\0\0\0\0\x18\x62\0\0\0\0\0\0\0\0\ +\0\0\0\0\0\0\x61\x20\0\0\0\0\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\x28\x0f\0\0\x63\ +\x01\0\0\0\0\0\0\xb7\x01\0\0\x16\0\0\0\x18\x62\0\0\0\0\0\0\0\0\0\0\x28\x0f\0\0\ +\xb7\x03\0\0\x04\0\0\0\x85\0\0\0\xa6\0\0\0\xbf\x07\0\0\0\0\0\0\xc5\x07\x92\xff\ +\0\0\0\0\x18\x60\0\0\0\0\0\0\0\0\0\0\x30\x0f\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\ +\x78\x11\0\0\x7b\x01\0\0\0\0\0\0\x18\x60\0\0\0\0\0\0\0\0\0\0\x38\x0f\0\0\x18\ +\x61\0\0\0\0\0\0\0\0\0\0\x70\x11\0\0\x7b\x01\0\0\0\0\0\0\x18\x60\0\0\0\0\0\0\0\ +\0\0\0\x40\x10\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\xb8\x11\0\0\x7b\x01\0\0\0\0\0\0\ +\x18\x60\0\0\0\0\0\0\0\0\0\0\x48\x10\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\xc8\x11\0\ +\0\x7b\x01\0\0\0\0\0\0\x18\x60\0\0\0\0\0\0\0\0\0\0\xe8\x10\0\0\x18\x61\0\0\0\0\ +\0\0\0\0\0\0\xe8\x11\0\0\x7b\x01\0\0\0\0\0\0\x18\x60\0\0\0\0\0\0\0\0\0\0\0\0\0\ +\0\x18\x61\0\0\0\0\0\0\0\0\0\0\xe0\x11\0\0\x7b\x01\0\0\0\0\0\0\x61\x60\x08\0\0\ +\0\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\x80\x11\0\0\x63\x01\0\0\0\0\0\0\x61\x60\x0c\ +\0\0\0\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\x84\x11\0\0\x63\x01\0\0\0\0\0\0\x79\x60\ +\x10\0\0\0\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\x88\x11\0\0\x7b\x01\0\0\0\0\0\0\x61\ +\xa0\x78\xff\0\0\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\xb0\x11\0\0\x63\x01\0\0\0\0\0\ +\0\x18\x61\0\0\0\0\0\0\0\0\0\0\xf8\x11\0\0\xb7\x02\0\0\x11\0\0\0\xb7\x03\0\0\ +\x0c\0\0\0\xb7\x04\0\0\0\0\0\0\x85\0\0\0\xa7\0\0\0\xbf\x07\0\0\0\0\0\0\xc5\x07\ +\x5c\xff\0\0\0\0\x18\x60\0\0\0\0\0\0\0\0\0\0\x68\x11\0\0\x63\x70\x6c\0\0\0\0\0\ +\x77\x07\0\0\x20\0\0\0\x63\x70\x70\0\0\0\0\0\xb7\x01\0\0\x05\0\0\0\x18\x62\0\0\ +\0\0\0\0\0\0\0\0\x68\x11\0\0\xb7\x03\0\0\x8c\0\0\0\x85\0\0\0\xa6\0\0\0\xbf\x07\ +\0\0\0\0\0\0\x18\x60\0\0\0\0\0\0\0\0\0\0\xd8\x11\0\0\x61\x01\0\0\0\0\0\0\xd5\ +\x01\x02\0\0\0\0\0\xbf\x19\0\0\0\0\0\0\x85\0\0\0\xa8\0\0\0\xc5\x07\x4a\xff\0\0\ +\0\0\x63\x7a\x80\xff\0\0\0\0\x18\x60\0\0\0\0\0\0\0\0\0\0\x10\x12\0\0\x18\x61\0\ +\0\0\0\0\0\0\0\0\0\x10\x17\0\0\x7b\x01\0\0\0\0\0\0\x18\x60\0\0\0\0\0\0\0\0\0\0\ +\x18\x12\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\x08\x17\0\0\x7b\x01\0\0\0\0\0\0\x18\ +\x60\0\0\0\0\0\0\0\0\0\0\x28\x14\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\x50\x17\0\0\ +\x7b\x01\0\0\0\0\0\0\x18\x60\0\0\0\0\0\0\0\0\0\0\x30\x14\0\0\x18\x61\0\0\0\0\0\ +\0\0\0\0\0\x60\x17\0\0\x7b\x01\0\0\0\0\0\0\x18\x60\0\0\0\0\0\0\0\0\0\0\xd0\x15\ +\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\x80\x17\0\0\x7b\x01\0\0\0\0\0\0\x18\x60\0\0\0\ +\0\0\0\0\0\0\0\0\0\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\x78\x17\0\0\x7b\x01\0\0\0\0\ +\0\0\x61\x60\x08\0\0\0\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\x18\x17\0\0\x63\x01\0\0\ +\0\0\0\0\x61\x60\x0c\0\0\0\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\x1c\x17\0\0\x63\x01\ +\0\0\0\0\0\0\x79\x60\x10\0\0\0\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\x20\x17\0\0\x7b\ +\x01\0\0\0\0\0\0\x61\xa0\x78\xff\0\0\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\x48\x17\0\ +\0\x63\x01\0\0\0\0\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\x90\x17\0\0\xb7\x02\0\0\x12\ +\0\0\0\xb7\x03\0\0\x0c\0\0\0\xb7\x04\0\0\0\0\0\0\x85\0\0\0\xa7\0\0\0\xbf\x07\0\ +\0\0\0\0\0\xc5\x07\x13\xff\0\0\0\0\x18\x60\0\0\0\0\0\0\0\0\0\0\0\x17\0\0\x63\ +\x70\x6c\0\0\0\0\0\x77\x07\0\0\x20\0\0\0\x63\x70\x70\0\0\0\0\0\xb7\x01\0\0\x05\ +\0\0\0\x18\x62\0\0\0\0\0\0\0\0\0\0\0\x17\0\0\xb7\x03\0\0\x8c\0\0\0\x85\0\0\0\ +\xa6\0\0\0\xbf\x07\0\0\0\0\0\0\x18\x60\0\0\0\0\0\0\0\0\0\0\x70\x17\0\0\x61\x01\ +\0\0\0\0\0\0\xd5\x01\x02\0\0\0\0\0\xbf\x19\0\0\0\0\0\0\x85\0\0\0\xa8\0\0\0\xc5\ +\x07\x01\xff\0\0\0\0\x63\x7a\x84\xff\0\0\0\0\x61\xa1\x78\xff\0\0\0\0\xd5\x01\ +\x02\0\0\0\0\0\xbf\x19\0\0\0\0\0\0\x85\0\0\0\xa8\0\0\0\x61\xa0\x80\xff\0\0\0\0\ +\x63\x06\x28\0\0\0\0\0\x61\xa0\x84\xff\0\0\0\0\x63\x06\x2c\0\0\0\0\0\x18\x61\0\ +\0\0\0\0\0\0\0\0\0\0\0\0\0\x61\x10\0\0\0\0\0\0\x63\x06\x18\0\0\0\0\0\xb7\0\0\0\ +\0\0\0\0\x95\0\0\0\0\0\0\0"; + err = bpf_load_and_run(&opts); + if (err < 0) + return err; + skel->rodata = skel_finalize_map_data(&skel->maps.rodata.initial_value, + 4096, PROT_READ, skel->maps.rodata.map_fd); + if (!skel->rodata) + return -ENOMEM; + return 0; +} + +static inline struct iterators_bpf * +iterators_bpf__open_and_load(void) +{ + struct iterators_bpf *skel; + + skel = iterators_bpf__open(); + if (!skel) + return NULL; + if (iterators_bpf__load(skel)) { + iterators_bpf__destroy(skel); + return NULL; + } + return skel; +} + +#endif /* __ITERATORS_BPF_SKEL_H__ */ diff --git a/kernel/bpf/preload/iterators/iterators.skel.h b/kernel/bpf/preload/iterators/iterators.skel.h deleted file mode 100644 index cf9a6a94b3a4..000000000000 --- a/kernel/bpf/preload/iterators/iterators.skel.h +++ /dev/null @@ -1,412 +0,0 @@ -/* SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) */ - -/* THIS FILE IS AUTOGENERATED! */ -#ifndef __ITERATORS_BPF_SKEL_H__ -#define __ITERATORS_BPF_SKEL_H__ - -#include <stdlib.h> -#include <bpf/libbpf.h> - -struct iterators_bpf { - struct bpf_object_skeleton *skeleton; - struct bpf_object *obj; - struct { - struct bpf_map *rodata; - } maps; - struct { - struct bpf_program *dump_bpf_map; - struct bpf_program *dump_bpf_prog; - } progs; - struct { - struct bpf_link *dump_bpf_map; - struct bpf_link *dump_bpf_prog; - } links; - struct iterators_bpf__rodata { - char dump_bpf_map____fmt[35]; - char dump_bpf_map____fmt_1[14]; - char dump_bpf_prog____fmt[32]; - char dump_bpf_prog____fmt_2[17]; - } *rodata; -}; - -static void -iterators_bpf__destroy(struct iterators_bpf *obj) -{ - if (!obj) - return; - if (obj->skeleton) - bpf_object__destroy_skeleton(obj->skeleton); - free(obj); -} - -static inline int -iterators_bpf__create_skeleton(struct iterators_bpf *obj); - -static inline struct iterators_bpf * -iterators_bpf__open_opts(const struct bpf_object_open_opts *opts) -{ - struct iterators_bpf *obj; - - obj = (struct iterators_bpf *)calloc(1, sizeof(*obj)); - if (!obj) - return NULL; - if (iterators_bpf__create_skeleton(obj)) - goto err; - if (bpf_object__open_skeleton(obj->skeleton, opts)) - goto err; - - return obj; -err: - iterators_bpf__destroy(obj); - return NULL; -} - -static inline struct iterators_bpf * -iterators_bpf__open(void) -{ - return iterators_bpf__open_opts(NULL); -} - -static inline int -iterators_bpf__load(struct iterators_bpf *obj) -{ - return bpf_object__load_skeleton(obj->skeleton); -} - -static inline struct iterators_bpf * -iterators_bpf__open_and_load(void) -{ - struct iterators_bpf *obj; - - obj = iterators_bpf__open(); - if (!obj) - return NULL; - if (iterators_bpf__load(obj)) { - iterators_bpf__destroy(obj); - return NULL; - } - return obj; -} - -static inline int -iterators_bpf__attach(struct iterators_bpf *obj) -{ - return bpf_object__attach_skeleton(obj->skeleton); -} - -static inline void -iterators_bpf__detach(struct iterators_bpf *obj) -{ - return bpf_object__detach_skeleton(obj->skeleton); -} - -static inline int -iterators_bpf__create_skeleton(struct iterators_bpf *obj) -{ - struct bpf_object_skeleton *s; - - s = (struct bpf_object_skeleton *)calloc(1, sizeof(*s)); - if (!s) - return -1; - obj->skeleton = s; - - s->sz = sizeof(*s); - s->name = "iterators_bpf"; - s->obj = &obj->obj; - - /* maps */ - s->map_cnt = 1; - s->map_skel_sz = sizeof(*s->maps); - s->maps = (struct bpf_map_skeleton *)calloc(s->map_cnt, s->map_skel_sz); - if (!s->maps) - goto err; - - s->maps[0].name = "iterator.rodata"; - s->maps[0].map = &obj->maps.rodata; - s->maps[0].mmaped = (void **)&obj->rodata; - - /* programs */ - s->prog_cnt = 2; - s->prog_skel_sz = sizeof(*s->progs); - s->progs = (struct bpf_prog_skeleton *)calloc(s->prog_cnt, s->prog_skel_sz); - if (!s->progs) - goto err; - - s->progs[0].name = "dump_bpf_map"; - s->progs[0].prog = &obj->progs.dump_bpf_map; - s->progs[0].link = &obj->links.dump_bpf_map; - - s->progs[1].name = "dump_bpf_prog"; - s->progs[1].prog = &obj->progs.dump_bpf_prog; - s->progs[1].link = &obj->links.dump_bpf_prog; - - s->data_sz = 7176; - s->data = (void *)"\ -\x7f\x45\x4c\x46\x02\x01\x01\0\0\0\0\0\0\0\0\0\x01\0\xf7\0\x01\0\0\0\0\0\0\0\0\ -\0\0\0\0\0\0\0\0\0\0\0\x48\x18\0\0\0\0\0\0\0\0\0\0\x40\0\0\0\0\0\x40\0\x0f\0\ -\x0e\0\x79\x12\0\0\0\0\0\0\x79\x26\0\0\0\0\0\0\x79\x17\x08\0\0\0\0\0\x15\x07\ -\x1a\0\0\0\0\0\x79\x21\x10\0\0\0\0\0\x55\x01\x08\0\0\0\0\0\xbf\xa4\0\0\0\0\0\0\ -\x07\x04\0\0\xe8\xff\xff\xff\xbf\x61\0\0\0\0\0\0\x18\x02\0\0\0\0\0\0\0\0\0\0\0\ -\0\0\0\xb7\x03\0\0\x23\0\0\0\xb7\x05\0\0\0\0\0\0\x85\0\0\0\x7e\0\0\0\x61\x71\0\ -\0\0\0\0\0\x7b\x1a\xe8\xff\0\0\0\0\xb7\x01\0\0\x04\0\0\0\xbf\x72\0\0\0\0\0\0\ -\x0f\x12\0\0\0\0\0\0\x7b\x2a\xf0\xff\0\0\0\0\x61\x71\x14\0\0\0\0\0\x7b\x1a\xf8\ -\xff\0\0\0\0\xbf\xa4\0\0\0\0\0\0\x07\x04\0\0\xe8\xff\xff\xff\xbf\x61\0\0\0\0\0\ -\0\x18\x02\0\0\x23\0\0\0\0\0\0\0\0\0\0\0\xb7\x03\0\0\x0e\0\0\0\xb7\x05\0\0\x18\ -\0\0\0\x85\0\0\0\x7e\0\0\0\xb7\0\0\0\0\0\0\0\x95\0\0\0\0\0\0\0\x79\x12\0\0\0\0\ -\0\0\x79\x26\0\0\0\0\0\0\x79\x11\x08\0\0\0\0\0\x15\x01\x3b\0\0\0\0\0\x79\x17\0\ -\0\0\0\0\0\x79\x21\x10\0\0\0\0\0\x55\x01\x08\0\0\0\0\0\xbf\xa4\0\0\0\0\0\0\x07\ -\x04\0\0\xd0\xff\xff\xff\xbf\x61\0\0\0\0\0\0\x18\x02\0\0\x31\0\0\0\0\0\0\0\0\0\ -\0\0\xb7\x03\0\0\x20\0\0\0\xb7\x05\0\0\0\0\0\0\x85\0\0\0\x7e\0\0\0\x7b\x6a\xc8\ -\xff\0\0\0\0\x61\x71\0\0\0\0\0\0\x7b\x1a\xd0\xff\0\0\0\0\xb7\x03\0\0\x04\0\0\0\ -\xbf\x79\0\0\0\0\0\0\x0f\x39\0\0\0\0\0\0\x79\x71\x28\0\0\0\0\0\x79\x78\x30\0\0\ -\0\0\0\x15\x08\x18\0\0\0\0\0\xb7\x02\0\0\0\0\0\0\x0f\x21\0\0\0\0\0\0\x61\x11\ -\x04\0\0\0\0\0\x79\x83\x08\0\0\0\0\0\x67\x01\0\0\x03\0\0\0\x0f\x13\0\0\0\0\0\0\ -\x79\x86\0\0\0\0\0\0\xbf\xa1\0\0\0\0\0\0\x07\x01\0\0\xf8\xff\xff\xff\xb7\x02\0\ -\0\x08\0\0\0\x85\0\0\0\x71\0\0\0\xb7\x01\0\0\0\0\0\0\x79\xa3\xf8\xff\0\0\0\0\ -\x0f\x13\0\0\0\0\0\0\xbf\xa1\0\0\0\0\0\0\x07\x01\0\0\xf4\xff\xff\xff\xb7\x02\0\ -\0\x04\0\0\0\x85\0\0\0\x71\0\0\0\xb7\x03\0\0\x04\0\0\0\x61\xa1\xf4\xff\0\0\0\0\ -\x61\x82\x10\0\0\0\0\0\x3d\x21\x02\0\0\0\0\0\x0f\x16\0\0\0\0\0\0\xbf\x69\0\0\0\ -\0\0\0\x7b\x9a\xd8\xff\0\0\0\0\x79\x71\x18\0\0\0\0\0\x7b\x1a\xe0\xff\0\0\0\0\ -\x79\x71\x20\0\0\0\0\0\x79\x11\0\0\0\0\0\0\x0f\x31\0\0\0\0\0\0\x7b\x1a\xe8\xff\ -\0\0\0\0\xbf\xa4\0\0\0\0\0\0\x07\x04\0\0\xd0\xff\xff\xff\x79\xa1\xc8\xff\0\0\0\ -\0\x18\x02\0\0\x51\0\0\0\0\0\0\0\0\0\0\0\xb7\x03\0\0\x11\0\0\0\xb7\x05\0\0\x20\ -\0\0\0\x85\0\0\0\x7e\0\0\0\xb7\0\0\0\0\0\0\0\x95\0\0\0\0\0\0\0\x20\x20\x69\x64\ -\x20\x6e\x61\x6d\x65\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x6d\ -\x61\x78\x5f\x65\x6e\x74\x72\x69\x65\x73\x0a\0\x25\x34\x75\x20\x25\x2d\x31\x36\ -\x73\x25\x36\x64\x0a\0\x20\x20\x69\x64\x20\x6e\x61\x6d\x65\x20\x20\x20\x20\x20\ -\x20\x20\x20\x20\x20\x20\x20\x20\x61\x74\x74\x61\x63\x68\x65\x64\x0a\0\x25\x34\ -\x75\x20\x25\x2d\x31\x36\x73\x20\x25\x73\x20\x25\x73\x0a\0\x47\x50\x4c\0\x9f\ -\xeb\x01\0\x18\0\0\0\0\0\0\0\x1c\x04\0\0\x1c\x04\0\0\x09\x05\0\0\0\0\0\0\0\0\0\ -\x02\x02\0\0\0\x01\0\0\0\x02\0\0\x04\x10\0\0\0\x13\0\0\0\x03\0\0\0\0\0\0\0\x18\ -\0\0\0\x04\0\0\0\x40\0\0\0\0\0\0\0\0\0\0\x02\x08\0\0\0\0\0\0\0\0\0\0\x02\x0d\0\ -\0\0\0\0\0\0\x01\0\0\x0d\x06\0\0\0\x1c\0\0\0\x01\0\0\0\x20\0\0\0\0\0\0\x01\x04\ -\0\0\0\x20\0\0\x01\x24\0\0\0\x01\0\0\x0c\x05\0\0\0\xaf\0\0\0\x03\0\0\x04\x18\0\ -\0\0\xbd\0\0\0\x09\0\0\0\0\0\0\0\xc1\0\0\0\x0b\0\0\0\x40\0\0\0\xcc\0\0\0\x0b\0\ -\0\0\x80\0\0\0\0\0\0\0\0\0\0\x02\x0a\0\0\0\xd4\0\0\0\0\0\0\x07\0\0\0\0\xdd\0\0\ -\0\0\0\0\x08\x0c\0\0\0\xe3\0\0\0\0\0\0\x01\x08\0\0\0\x40\0\0\0\xa4\x01\0\0\x03\ -\0\0\x04\x18\0\0\0\xac\x01\0\0\x0e\0\0\0\0\0\0\0\xaf\x01\0\0\x11\0\0\0\x20\0\0\ -\0\xb4\x01\0\0\x0e\0\0\0\xa0\0\0\0\xc0\x01\0\0\0\0\0\x08\x0f\0\0\0\xc6\x01\0\0\ -\0\0\0\x01\x04\0\0\0\x20\0\0\0\xd3\x01\0\0\0\0\0\x01\x01\0\0\0\x08\0\0\x01\0\0\ -\0\0\0\0\0\x03\0\0\0\0\x10\0\0\0\x12\0\0\0\x10\0\0\0\xd8\x01\0\0\0\0\0\x01\x04\ -\0\0\0\x20\0\0\0\0\0\0\0\0\0\0\x02\x14\0\0\0\x3c\x02\0\0\x02\0\0\x04\x10\0\0\0\ -\x13\0\0\0\x03\0\0\0\0\0\0\0\x4f\x02\0\0\x15\0\0\0\x40\0\0\0\0\0\0\0\0\0\0\x02\ -\x18\0\0\0\0\0\0\0\x01\0\0\x0d\x06\0\0\0\x1c\0\0\0\x13\0\0\0\x54\x02\0\0\x01\0\ -\0\x0c\x16\0\0\0\xa0\x02\0\0\x01\0\0\x04\x08\0\0\0\xa9\x02\0\0\x19\0\0\0\0\0\0\ -\0\0\0\0\0\0\0\0\x02\x1a\0\0\0\xfa\x02\0\0\x06\0\0\x04\x38\0\0\0\xac\x01\0\0\ -\x0e\0\0\0\0\0\0\0\xaf\x01\0\0\x11\0\0\0\x20\0\0\0\x07\x03\0\0\x1b\0\0\0\xc0\0\ -\0\0\x18\x03\0\0\x15\0\0\0\0\x01\0\0\x21\x03\0\0\x1d\0\0\0\x40\x01\0\0\x2b\x03\ -\0\0\x1e\0\0\0\x80\x01\0\0\0\0\0\0\0\0\0\x02\x1c\0\0\0\0\0\0\0\0\0\0\x0a\x10\0\ -\0\0\0\0\0\0\0\0\0\x02\x1f\0\0\0\0\0\0\0\0\0\0\x02\x20\0\0\0\x75\x03\0\0\x02\0\ -\0\x04\x08\0\0\0\x83\x03\0\0\x0e\0\0\0\0\0\0\0\x8c\x03\0\0\x0e\0\0\0\x20\0\0\0\ -\x2b\x03\0\0\x03\0\0\x04\x18\0\0\0\x96\x03\0\0\x1b\0\0\0\0\0\0\0\x9e\x03\0\0\ -\x21\0\0\0\x40\0\0\0\xa4\x03\0\0\x23\0\0\0\x80\0\0\0\0\0\0\0\0\0\0\x02\x22\0\0\ -\0\0\0\0\0\0\0\0\x02\x24\0\0\0\xa8\x03\0\0\x01\0\0\x04\x04\0\0\0\xb3\x03\0\0\ -\x0e\0\0\0\0\0\0\0\x1c\x04\0\0\x01\0\0\x04\x04\0\0\0\x25\x04\0\0\x0e\0\0\0\0\0\ -\0\0\0\0\0\0\0\0\0\x03\0\0\0\0\x1c\0\0\0\x12\0\0\0\x23\0\0\0\x9b\x04\0\0\0\0\0\ -\x0e\x25\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x03\0\0\0\0\x1c\0\0\0\x12\0\0\0\x0e\0\0\0\ -\xaf\x04\0\0\0\0\0\x0e\x27\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x03\0\0\0\0\x1c\0\0\0\ -\x12\0\0\0\x20\0\0\0\xc5\x04\0\0\0\0\0\x0e\x29\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x03\ -\0\0\0\0\x1c\0\0\0\x12\0\0\0\x11\0\0\0\xda\x04\0\0\0\0\0\x0e\x2b\0\0\0\0\0\0\0\ -\0\0\0\0\0\0\0\x03\0\0\0\0\x10\0\0\0\x12\0\0\0\x04\0\0\0\xf1\x04\0\0\0\0\0\x0e\ -\x2d\0\0\0\x01\0\0\0\xf9\x04\0\0\x04\0\0\x0f\0\0\0\0\x26\0\0\0\0\0\0\0\x23\0\0\ -\0\x28\0\0\0\x23\0\0\0\x0e\0\0\0\x2a\0\0\0\x31\0\0\0\x20\0\0\0\x2c\0\0\0\x51\0\ -\0\0\x11\0\0\0\x01\x05\0\0\x01\0\0\x0f\0\0\0\0\x2e\0\0\0\0\0\0\0\x04\0\0\0\0\ -\x62\x70\x66\x5f\x69\x74\x65\x72\x5f\x5f\x62\x70\x66\x5f\x6d\x61\x70\0\x6d\x65\ -\x74\x61\0\x6d\x61\x70\0\x63\x74\x78\0\x69\x6e\x74\0\x64\x75\x6d\x70\x5f\x62\ -\x70\x66\x5f\x6d\x61\x70\0\x69\x74\x65\x72\x2f\x62\x70\x66\x5f\x6d\x61\x70\0\ -\x30\x3a\x30\0\x2f\x68\x6f\x6d\x65\x2f\x61\x6c\x72\x75\x61\x2f\x62\x75\x69\x6c\ -\x64\x2f\x6c\x69\x6e\x75\x78\x2f\x6b\x65\x72\x6e\x65\x6c\x2f\x62\x70\x66\x2f\ -\x70\x72\x65\x6c\x6f\x61\x64\x2f\x69\x74\x65\x72\x61\x74\x6f\x72\x73\x2f\x69\ -\x74\x65\x72\x61\x74\x6f\x72\x73\x2e\x62\x70\x66\x2e\x63\0\x09\x73\x74\x72\x75\ -\x63\x74\x20\x73\x65\x71\x5f\x66\x69\x6c\x65\x20\x2a\x73\x65\x71\x20\x3d\x20\ -\x63\x74\x78\x2d\x3e\x6d\x65\x74\x61\x2d\x3e\x73\x65\x71\x3b\0\x62\x70\x66\x5f\ -\x69\x74\x65\x72\x5f\x6d\x65\x74\x61\0\x73\x65\x71\0\x73\x65\x73\x73\x69\x6f\ -\x6e\x5f\x69\x64\0\x73\x65\x71\x5f\x6e\x75\x6d\0\x73\x65\x71\x5f\x66\x69\x6c\ -\x65\0\x5f\x5f\x75\x36\x34\0\x6c\x6f\x6e\x67\x20\x6c\x6f\x6e\x67\x20\x75\x6e\ -\x73\x69\x67\x6e\x65\x64\x20\x69\x6e\x74\0\x30\x3a\x31\0\x09\x73\x74\x72\x75\ -\x63\x74\x20\x62\x70\x66\x5f\x6d\x61\x70\x20\x2a\x6d\x61\x70\x20\x3d\x20\x63\ -\x74\x78\x2d\x3e\x6d\x61\x70\x3b\0\x09\x69\x66\x20\x28\x21\x6d\x61\x70\x29\0\ -\x30\x3a\x32\0\x09\x5f\x5f\x75\x36\x34\x20\x73\x65\x71\x5f\x6e\x75\x6d\x20\x3d\ -\x20\x63\x74\x78\x2d\x3e\x6d\x65\x74\x61\x2d\x3e\x73\x65\x71\x5f\x6e\x75\x6d\ -\x3b\0\x09\x69\x66\x20\x28\x73\x65\x71\x5f\x6e\x75\x6d\x20\x3d\x3d\x20\x30\x29\ -\0\x09\x09\x42\x50\x46\x5f\x53\x45\x51\x5f\x50\x52\x49\x4e\x54\x46\x28\x73\x65\ -\x71\x2c\x20\x22\x20\x20\x69\x64\x20\x6e\x61\x6d\x65\x20\x20\x20\x20\x20\x20\ -\x20\x20\x20\x20\x20\x20\x20\x6d\x61\x78\x5f\x65\x6e\x74\x72\x69\x65\x73\x5c\ -\x6e\x22\x29\x3b\0\x62\x70\x66\x5f\x6d\x61\x70\0\x69\x64\0\x6e\x61\x6d\x65\0\ -\x6d\x61\x78\x5f\x65\x6e\x74\x72\x69\x65\x73\0\x5f\x5f\x75\x33\x32\0\x75\x6e\ -\x73\x69\x67\x6e\x65\x64\x20\x69\x6e\x74\0\x63\x68\x61\x72\0\x5f\x5f\x41\x52\ -\x52\x41\x59\x5f\x53\x49\x5a\x45\x5f\x54\x59\x50\x45\x5f\x5f\0\x09\x42\x50\x46\ -\x5f\x53\x45\x51\x5f\x50\x52\x49\x4e\x54\x46\x28\x73\x65\x71\x2c\x20\x22\x25\ -\x34\x75\x20\x25\x2d\x31\x36\x73\x25\x36\x64\x5c\x6e\x22\x2c\x20\x6d\x61\x70\ -\x2d\x3e\x69\x64\x2c\x20\x6d\x61\x70\x2d\x3e\x6e\x61\x6d\x65\x2c\x20\x6d\x61\ -\x70\x2d\x3e\x6d\x61\x78\x5f\x65\x6e\x74\x72\x69\x65\x73\x29\x3b\0\x7d\0\x62\ -\x70\x66\x5f\x69\x74\x65\x72\x5f\x5f\x62\x70\x66\x5f\x70\x72\x6f\x67\0\x70\x72\ -\x6f\x67\0\x64\x75\x6d\x70\x5f\x62\x70\x66\x5f\x70\x72\x6f\x67\0\x69\x74\x65\ -\x72\x2f\x62\x70\x66\x5f\x70\x72\x6f\x67\0\x09\x73\x74\x72\x75\x63\x74\x20\x62\ -\x70\x66\x5f\x70\x72\x6f\x67\x20\x2a\x70\x72\x6f\x67\x20\x3d\x20\x63\x74\x78\ -\x2d\x3e\x70\x72\x6f\x67\x3b\0\x09\x69\x66\x20\x28\x21\x70\x72\x6f\x67\x29\0\ -\x62\x70\x66\x5f\x70\x72\x6f\x67\0\x61\x75\x78\0\x09\x61\x75\x78\x20\x3d\x20\ -\x70\x72\x6f\x67\x2d\x3e\x61\x75\x78\x3b\0\x09\x09\x42\x50\x46\x5f\x53\x45\x51\ -\x5f\x50\x52\x49\x4e\x54\x46\x28\x73\x65\x71\x2c\x20\x22\x20\x20\x69\x64\x20\ -\x6e\x61\x6d\x65\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x61\x74\ -\x74\x61\x63\x68\x65\x64\x5c\x6e\x22\x29\x3b\0\x62\x70\x66\x5f\x70\x72\x6f\x67\ -\x5f\x61\x75\x78\0\x61\x74\x74\x61\x63\x68\x5f\x66\x75\x6e\x63\x5f\x6e\x61\x6d\ -\x65\0\x64\x73\x74\x5f\x70\x72\x6f\x67\0\x66\x75\x6e\x63\x5f\x69\x6e\x66\x6f\0\ -\x62\x74\x66\0\x09\x42\x50\x46\x5f\x53\x45\x51\x5f\x50\x52\x49\x4e\x54\x46\x28\ -\x73\x65\x71\x2c\x20\x22\x25\x34\x75\x20\x25\x2d\x31\x36\x73\x20\x25\x73\x20\ -\x25\x73\x5c\x6e\x22\x2c\x20\x61\x75\x78\x2d\x3e\x69\x64\x2c\0\x30\x3a\x34\0\ -\x30\x3a\x35\0\x09\x69\x66\x20\x28\x21\x62\x74\x66\x29\0\x62\x70\x66\x5f\x66\ -\x75\x6e\x63\x5f\x69\x6e\x66\x6f\0\x69\x6e\x73\x6e\x5f\x6f\x66\x66\0\x74\x79\ -\x70\x65\x5f\x69\x64\0\x30\0\x73\x74\x72\x69\x6e\x67\x73\0\x74\x79\x70\x65\x73\ -\0\x68\x64\x72\0\x62\x74\x66\x5f\x68\x65\x61\x64\x65\x72\0\x73\x74\x72\x5f\x6c\ -\x65\x6e\0\x09\x74\x79\x70\x65\x73\x20\x3d\x20\x62\x74\x66\x2d\x3e\x74\x79\x70\ -\x65\x73\x3b\0\x09\x62\x70\x66\x5f\x70\x72\x6f\x62\x65\x5f\x72\x65\x61\x64\x5f\ -\x6b\x65\x72\x6e\x65\x6c\x28\x26\x74\x2c\x20\x73\x69\x7a\x65\x6f\x66\x28\x74\ -\x29\x2c\x20\x74\x79\x70\x65\x73\x20\x2b\x20\x62\x74\x66\x5f\x69\x64\x29\x3b\0\ -\x09\x73\x74\x72\x20\x3d\x20\x62\x74\x66\x2d\x3e\x73\x74\x72\x69\x6e\x67\x73\ -\x3b\0\x62\x74\x66\x5f\x74\x79\x70\x65\0\x6e\x61\x6d\x65\x5f\x6f\x66\x66\0\x09\ -\x6e\x61\x6d\x65\x5f\x6f\x66\x66\x20\x3d\x20\x42\x50\x46\x5f\x43\x4f\x52\x45\ -\x5f\x52\x45\x41\x44\x28\x74\x2c\x20\x6e\x61\x6d\x65\x5f\x6f\x66\x66\x29\x3b\0\ -\x30\x3a\x32\x3a\x30\0\x09\x69\x66\x20\x28\x6e\x61\x6d\x65\x5f\x6f\x66\x66\x20\ -\x3e\x3d\x20\x62\x74\x66\x2d\x3e\x68\x64\x72\x2e\x73\x74\x72\x5f\x6c\x65\x6e\ -\x29\0\x09\x72\x65\x74\x75\x72\x6e\x20\x73\x74\x72\x20\x2b\x20\x6e\x61\x6d\x65\ -\x5f\x6f\x66\x66\x3b\0\x30\x3a\x33\0\x64\x75\x6d\x70\x5f\x62\x70\x66\x5f\x6d\ -\x61\x70\x2e\x5f\x5f\x5f\x66\x6d\x74\0\x64\x75\x6d\x70\x5f\x62\x70\x66\x5f\x6d\ -\x61\x70\x2e\x5f\x5f\x5f\x66\x6d\x74\x2e\x31\0\x64\x75\x6d\x70\x5f\x62\x70\x66\ -\x5f\x70\x72\x6f\x67\x2e\x5f\x5f\x5f\x66\x6d\x74\0\x64\x75\x6d\x70\x5f\x62\x70\ -\x66\x5f\x70\x72\x6f\x67\x2e\x5f\x5f\x5f\x66\x6d\x74\x2e\x32\0\x4c\x49\x43\x45\ -\x4e\x53\x45\0\x2e\x72\x6f\x64\x61\x74\x61\0\x6c\x69\x63\x65\x6e\x73\x65\0\x9f\ -\xeb\x01\0\x20\0\0\0\0\0\0\0\x24\0\0\0\x24\0\0\0\x44\x02\0\0\x68\x02\0\0\xa4\ -\x01\0\0\x08\0\0\0\x31\0\0\0\x01\0\0\0\0\0\0\0\x07\0\0\0\x62\x02\0\0\x01\0\0\0\ -\0\0\0\0\x17\0\0\0\x10\0\0\0\x31\0\0\0\x09\0\0\0\0\0\0\0\x42\0\0\0\x87\0\0\0\ -\x1e\x40\x01\0\x08\0\0\0\x42\0\0\0\x87\0\0\0\x24\x40\x01\0\x10\0\0\0\x42\0\0\0\ -\xfe\0\0\0\x1d\x48\x01\0\x18\0\0\0\x42\0\0\0\x1f\x01\0\0\x06\x50\x01\0\x20\0\0\ -\0\x42\0\0\0\x2e\x01\0\0\x1d\x44\x01\0\x28\0\0\0\x42\0\0\0\x53\x01\0\0\x06\x5c\ -\x01\0\x38\0\0\0\x42\0\0\0\x66\x01\0\0\x03\x60\x01\0\x70\0\0\0\x42\0\0\0\xec\ -\x01\0\0\x02\x68\x01\0\xf0\0\0\0\x42\0\0\0\x3a\x02\0\0\x01\x70\x01\0\x62\x02\0\ -\0\x1a\0\0\0\0\0\0\0\x42\0\0\0\x87\0\0\0\x1e\x84\x01\0\x08\0\0\0\x42\0\0\0\x87\ -\0\0\0\x24\x84\x01\0\x10\0\0\0\x42\0\0\0\x70\x02\0\0\x1f\x8c\x01\0\x18\0\0\0\ -\x42\0\0\0\x94\x02\0\0\x06\x98\x01\0\x20\0\0\0\x42\0\0\0\xad\x02\0\0\x0e\xa4\ -\x01\0\x28\0\0\0\x42\0\0\0\x2e\x01\0\0\x1d\x88\x01\0\x30\0\0\0\x42\0\0\0\x53\ -\x01\0\0\x06\xa8\x01\0\x40\0\0\0\x42\0\0\0\xbf\x02\0\0\x03\xac\x01\0\x80\0\0\0\ -\x42\0\0\0\x2f\x03\0\0\x02\xb4\x01\0\xb8\0\0\0\x42\0\0\0\x6a\x03\0\0\x06\x08\ -\x01\0\xd0\0\0\0\x42\0\0\0\0\0\0\0\0\0\0\0\xd8\0\0\0\x42\0\0\0\xbb\x03\0\0\x0f\ -\x14\x01\0\xe0\0\0\0\x42\0\0\0\xd0\x03\0\0\x2d\x18\x01\0\xf0\0\0\0\x42\0\0\0\ -\x07\x04\0\0\x0d\x10\x01\0\0\x01\0\0\x42\0\0\0\0\0\0\0\0\0\0\0\x08\x01\0\0\x42\ -\0\0\0\xd0\x03\0\0\x02\x18\x01\0\x20\x01\0\0\x42\0\0\0\x2e\x04\0\0\x0d\x1c\x01\ -\0\x38\x01\0\0\x42\0\0\0\0\0\0\0\0\0\0\0\x40\x01\0\0\x42\0\0\0\x2e\x04\0\0\x0d\ -\x1c\x01\0\x58\x01\0\0\x42\0\0\0\x2e\x04\0\0\x0d\x1c\x01\0\x60\x01\0\0\x42\0\0\ -\0\x5c\x04\0\0\x1b\x20\x01\0\x68\x01\0\0\x42\0\0\0\x5c\x04\0\0\x06\x20\x01\0\ -\x70\x01\0\0\x42\0\0\0\x7f\x04\0\0\x0d\x28\x01\0\x78\x01\0\0\x42\0\0\0\0\0\0\0\ -\0\0\0\0\x80\x01\0\0\x42\0\0\0\x2f\x03\0\0\x02\xb4\x01\0\xf8\x01\0\0\x42\0\0\0\ -\x3a\x02\0\0\x01\xc4\x01\0\x10\0\0\0\x31\0\0\0\x07\0\0\0\0\0\0\0\x02\0\0\0\x3e\ -\0\0\0\0\0\0\0\x08\0\0\0\x08\0\0\0\x3e\0\0\0\0\0\0\0\x10\0\0\0\x02\0\0\0\xfa\0\ -\0\0\0\0\0\0\x20\0\0\0\x08\0\0\0\x2a\x01\0\0\0\0\0\0\x70\0\0\0\x0d\0\0\0\x3e\0\ -\0\0\0\0\0\0\x80\0\0\0\x0d\0\0\0\xfa\0\0\0\0\0\0\0\xa0\0\0\0\x0d\0\0\0\x2a\x01\ -\0\0\0\0\0\0\x62\x02\0\0\x12\0\0\0\0\0\0\0\x14\0\0\0\x3e\0\0\0\0\0\0\0\x08\0\0\ -\0\x08\0\0\0\x3e\0\0\0\0\0\0\0\x10\0\0\0\x14\0\0\0\xfa\0\0\0\0\0\0\0\x20\0\0\0\ -\x18\0\0\0\x3e\0\0\0\0\0\0\0\x28\0\0\0\x08\0\0\0\x2a\x01\0\0\0\0\0\0\x80\0\0\0\ -\x1a\0\0\0\x3e\0\0\0\0\0\0\0\x90\0\0\0\x1a\0\0\0\xfa\0\0\0\0\0\0\0\xa8\0\0\0\ -\x1a\0\0\0\x62\x03\0\0\0\0\0\0\xb0\0\0\0\x1a\0\0\0\x66\x03\0\0\0\0\0\0\xc0\0\0\ -\0\x1f\0\0\0\x94\x03\0\0\0\0\0\0\xd8\0\0\0\x20\0\0\0\xfa\0\0\0\0\0\0\0\xf0\0\0\ -\0\x20\0\0\0\x3e\0\0\0\0\0\0\0\x18\x01\0\0\x24\0\0\0\x3e\0\0\0\0\0\0\0\x50\x01\ -\0\0\x1a\0\0\0\xfa\0\0\0\0\0\0\0\x60\x01\0\0\x20\0\0\0\x56\x04\0\0\0\0\0\0\x88\ -\x01\0\0\x1a\0\0\0\x2a\x01\0\0\0\0\0\0\x98\x01\0\0\x1a\0\0\0\x97\x04\0\0\0\0\0\ -\0\xa0\x01\0\0\x18\0\0\0\x3e\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\ -\0\0\0\0\0\0\0\x91\0\0\0\x04\0\xf1\xff\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\xe6\0\0\ -\0\0\0\x02\0\x70\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\xd8\0\0\0\0\0\x02\0\xf0\0\0\0\0\ -\0\0\0\0\0\0\0\0\0\0\0\xdf\0\0\0\0\0\x03\0\x78\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\ -\xd1\0\0\0\0\0\x03\0\x80\x01\0\0\0\0\0\0\0\0\0\0\0\0\0\0\xca\0\0\0\0\0\x03\0\ -\xf8\x01\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x14\0\0\0\x01\0\x04\0\0\0\0\0\0\0\0\0\x23\ -\0\0\0\0\0\0\0\x04\x01\0\0\x01\0\x04\0\x23\0\0\0\0\0\0\0\x0e\0\0\0\0\0\0\0\x28\ -\0\0\0\x01\0\x04\0\x31\0\0\0\0\0\0\0\x20\0\0\0\0\0\0\0\xed\0\0\0\x01\0\x04\0\ -\x51\0\0\0\0\0\0\0\x11\0\0\0\0\0\0\0\0\0\0\0\x03\0\x02\0\0\0\0\0\0\0\0\0\0\0\0\ -\0\0\0\0\0\0\0\0\0\x03\0\x03\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x03\0\ -\x04\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\xc2\0\0\0\x11\0\x05\0\0\0\0\0\0\0\0\0\ -\x04\0\0\0\0\0\0\0\x3d\0\0\0\x12\0\x02\0\0\0\0\0\0\0\0\0\0\x01\0\0\0\0\0\0\x5b\ -\0\0\0\x12\0\x03\0\0\0\0\0\0\0\0\0\x08\x02\0\0\0\0\0\0\x48\0\0\0\0\0\0\0\x01\0\ -\0\0\x0d\0\0\0\xc8\0\0\0\0\0\0\0\x01\0\0\0\x0d\0\0\0\x50\0\0\0\0\0\0\0\x01\0\0\ -\0\x0d\0\0\0\xd0\x01\0\0\0\0\0\0\x01\0\0\0\x0d\0\0\0\xf0\x03\0\0\0\0\0\0\x0a\0\ -\0\0\x0d\0\0\0\xfc\x03\0\0\0\0\0\0\x0a\0\0\0\x0d\0\0\0\x08\x04\0\0\0\0\0\0\x0a\ -\0\0\0\x0d\0\0\0\x14\x04\0\0\0\0\0\0\x0a\0\0\0\x0d\0\0\0\x2c\x04\0\0\0\0\0\0\0\ -\0\0\0\x0e\0\0\0\x2c\0\0\0\0\0\0\0\0\0\0\0\x0b\0\0\0\x3c\0\0\0\0\0\0\0\0\0\0\0\ -\x0c\0\0\0\x50\0\0\0\0\0\0\0\0\0\0\0\x0b\0\0\0\x60\0\0\0\0\0\0\0\0\0\0\0\x0b\0\ -\0\0\x70\0\0\0\0\0\0\0\0\0\0\0\x0b\0\0\0\x80\0\0\0\0\0\0\0\0\0\0\0\x0b\0\0\0\ -\x90\0\0\0\0\0\0\0\0\0\0\0\x0b\0\0\0\xa0\0\0\0\0\0\0\0\0\0\0\0\x0b\0\0\0\xb0\0\ -\0\0\0\0\0\0\0\0\0\0\x0b\0\0\0\xc0\0\0\0\0\0\0\0\0\0\0\0\x0b\0\0\0\xd0\0\0\0\0\ -\0\0\0\0\0\0\0\x0b\0\0\0\xe8\0\0\0\0\0\0\0\0\0\0\0\x0c\0\0\0\xf8\0\0\0\0\0\0\0\ -\0\0\0\0\x0c\0\0\0\x08\x01\0\0\0\0\0\0\0\0\0\0\x0c\0\0\0\x18\x01\0\0\0\0\0\0\0\ -\0\0\0\x0c\0\0\0\x28\x01\0\0\0\0\0\0\0\0\0\0\x0c\0\0\0\x38\x01\0\0\0\0\0\0\0\0\ -\0\0\x0c\0\0\0\x48\x01\0\0\0\0\0\0\0\0\0\0\x0c\0\0\0\x58\x01\0\0\0\0\0\0\0\0\0\ -\0\x0c\0\0\0\x68\x01\0\0\0\0\0\0\0\0\0\0\x0c\0\0\0\x78\x01\0\0\0\0\0\0\0\0\0\0\ -\x0c\0\0\0\x88\x01\0\0\0\0\0\0\0\0\0\0\x0c\0\0\0\x98\x01\0\0\0\0\0\0\0\0\0\0\ -\x0c\0\0\0\xa8\x01\0\0\0\0\0\0\0\0\0\0\x0c\0\0\0\xb8\x01\0\0\0\0\0\0\0\0\0\0\ -\x0c\0\0\0\xc8\x01\0\0\0\0\0\0\0\0\0\0\x0c\0\0\0\xd8\x01\0\0\0\0\0\0\0\0\0\0\ -\x0c\0\0\0\xe8\x01\0\0\0\0\0\0\0\0\0\0\x0c\0\0\0\xf8\x01\0\0\0\0\0\0\0\0\0\0\ -\x0c\0\0\0\x08\x02\0\0\0\0\0\0\0\0\0\0\x0c\0\0\0\x18\x02\0\0\0\0\0\0\0\0\0\0\ -\x0c\0\0\0\x28\x02\0\0\0\0\0\0\0\0\0\0\x0c\0\0\0\x38\x02\0\0\0\0\0\0\0\0\0\0\ -\x0c\0\0\0\x48\x02\0\0\0\0\0\0\0\0\0\0\x0c\0\0\0\x58\x02\0\0\0\0\0\0\0\0\0\0\ -\x0c\0\0\0\x68\x02\0\0\0\0\0\0\0\0\0\0\x0c\0\0\0\x78\x02\0\0\0\0\0\0\0\0\0\0\ -\x0c\0\0\0\x94\x02\0\0\0\0\0\0\0\0\0\0\x0b\0\0\0\xa4\x02\0\0\0\0\0\0\0\0\0\0\ -\x0b\0\0\0\xb4\x02\0\0\0\0\0\0\0\0\0\0\x0b\0\0\0\xc4\x02\0\0\0\0\0\0\0\0\0\0\ -\x0b\0\0\0\xd4\x02\0\0\0\0\0\0\0\0\0\0\x0b\0\0\0\xe4\x02\0\0\0\0\0\0\0\0\0\0\ -\x0b\0\0\0\xf4\x02\0\0\0\0\0\0\0\0\0\0\x0b\0\0\0\x0c\x03\0\0\0\0\0\0\0\0\0\0\ -\x0c\0\0\0\x1c\x03\0\0\0\0\0\0\0\0\0\0\x0c\0\0\0\x2c\x03\0\0\0\0\0\0\0\0\0\0\ -\x0c\0\0\0\x3c\x03\0\0\0\0\0\0\0\0\0\0\x0c\0\0\0\x4c\x03\0\0\0\0\0\0\0\0\0\0\ -\x0c\0\0\0\x5c\x03\0\0\0\0\0\0\0\0\0\0\x0c\0\0\0\x6c\x03\0\0\0\0\0\0\0\0\0\0\ -\x0c\0\0\0\x7c\x03\0\0\0\0\0\0\0\0\0\0\x0c\0\0\0\x8c\x03\0\0\0\0\0\0\0\0\0\0\ -\x0c\0\0\0\x9c\x03\0\0\0\0\0\0\0\0\0\0\x0c\0\0\0\xac\x03\0\0\0\0\0\0\0\0\0\0\ -\x0c\0\0\0\xbc\x03\0\0\0\0\0\0\0\0\0\0\x0c\0\0\0\xcc\x03\0\0\0\0\0\0\0\0\0\0\ -\x0c\0\0\0\xdc\x03\0\0\0\0\0\0\0\0\0\0\x0c\0\0\0\xec\x03\0\0\0\0\0\0\0\0\0\0\ -\x0c\0\0\0\xfc\x03\0\0\0\0\0\0\0\0\0\0\x0c\0\0\0\x0c\x04\0\0\0\0\0\0\0\0\0\0\ -\x0c\0\0\0\x1c\x04\0\0\0\0\0\0\0\0\0\0\x0c\0\0\0\x4d\x4e\x40\x41\x42\x43\x4c\0\ -\x2e\x74\x65\x78\x74\0\x2e\x72\x65\x6c\x2e\x42\x54\x46\x2e\x65\x78\x74\0\x64\ -\x75\x6d\x70\x5f\x62\x70\x66\x5f\x6d\x61\x70\x2e\x5f\x5f\x5f\x66\x6d\x74\0\x64\ -\x75\x6d\x70\x5f\x62\x70\x66\x5f\x70\x72\x6f\x67\x2e\x5f\x5f\x5f\x66\x6d\x74\0\ -\x64\x75\x6d\x70\x5f\x62\x70\x66\x5f\x6d\x61\x70\0\x2e\x72\x65\x6c\x69\x74\x65\ -\x72\x2f\x62\x70\x66\x5f\x6d\x61\x70\0\x64\x75\x6d\x70\x5f\x62\x70\x66\x5f\x70\ -\x72\x6f\x67\0\x2e\x72\x65\x6c\x69\x74\x65\x72\x2f\x62\x70\x66\x5f\x70\x72\x6f\ -\x67\0\x2e\x6c\x6c\x76\x6d\x5f\x61\x64\x64\x72\x73\x69\x67\0\x6c\x69\x63\x65\ -\x6e\x73\x65\0\x69\x74\x65\x72\x61\x74\x6f\x72\x73\x2e\x62\x70\x66\x2e\x63\0\ -\x2e\x73\x74\x72\x74\x61\x62\0\x2e\x73\x79\x6d\x74\x61\x62\0\x2e\x72\x6f\x64\ -\x61\x74\x61\0\x2e\x72\x65\x6c\x2e\x42\x54\x46\0\x4c\x49\x43\x45\x4e\x53\x45\0\ -\x4c\x42\x42\x31\x5f\x37\0\x4c\x42\x42\x31\x5f\x36\0\x4c\x42\x42\x30\x5f\x34\0\ -\x4c\x42\x42\x31\x5f\x33\0\x4c\x42\x42\x30\x5f\x33\0\x64\x75\x6d\x70\x5f\x62\ -\x70\x66\x5f\x70\x72\x6f\x67\x2e\x5f\x5f\x5f\x66\x6d\x74\x2e\x32\0\x64\x75\x6d\ -\x70\x5f\x62\x70\x66\x5f\x6d\x61\x70\x2e\x5f\x5f\x5f\x66\x6d\x74\x2e\x31\0\0\0\ -\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\ -\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x01\0\0\0\x01\0\0\ -\0\x06\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x40\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\ -\0\0\0\0\x04\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x4e\0\0\0\x01\0\0\0\x06\0\0\0\0\0\0\ -\0\0\0\0\0\0\0\0\0\x40\0\0\0\0\0\0\0\0\x01\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x08\0\0\ -\0\0\0\0\0\0\0\0\0\0\0\0\0\x6d\0\0\0\x01\0\0\0\x06\0\0\0\0\0\0\0\0\0\0\0\0\0\0\ -\0\x40\x01\0\0\0\0\0\0\x08\x02\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x08\0\0\0\0\0\0\0\0\ -\0\0\0\0\0\0\0\xb1\0\0\0\x01\0\0\0\x02\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x48\x03\0\ -\0\0\0\0\0\x62\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x01\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\ -\x89\0\0\0\x01\0\0\0\x03\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\xaa\x03\0\0\0\0\0\0\x04\ -\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x01\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\xbd\0\0\0\x01\ -\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\xae\x03\0\0\0\0\0\0\x3d\x09\0\0\0\0\0\0\ -\0\0\0\0\0\0\0\0\x01\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x0b\0\0\0\x01\0\0\0\0\0\0\0\ -\0\0\0\0\0\0\0\0\0\0\0\0\xeb\x0c\0\0\0\0\0\0\x2c\x04\0\0\0\0\0\0\0\0\0\0\0\0\0\ -\0\x01\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\xa9\0\0\0\x02\0\0\0\0\0\0\0\0\0\0\0\0\0\0\ -\0\0\0\0\0\x18\x11\0\0\0\0\0\0\x98\x01\0\0\0\0\0\0\x0e\0\0\0\x0e\0\0\0\x08\0\0\ -\0\0\0\0\0\x18\0\0\0\0\0\0\0\x4a\0\0\0\x09\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\ -\0\xb0\x12\0\0\0\0\0\0\x20\0\0\0\0\0\0\0\x08\0\0\0\x02\0\0\0\x08\0\0\0\0\0\0\0\ -\x10\0\0\0\0\0\0\0\x69\0\0\0\x09\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\xd0\x12\ -\0\0\0\0\0\0\x20\0\0\0\0\0\0\0\x08\0\0\0\x03\0\0\0\x08\0\0\0\0\0\0\0\x10\0\0\0\ -\0\0\0\0\xb9\0\0\0\x09\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\xf0\x12\0\0\0\0\0\ -\0\x50\0\0\0\0\0\0\0\x08\0\0\0\x06\0\0\0\x08\0\0\0\0\0\0\0\x10\0\0\0\0\0\0\0\ -\x07\0\0\0\x09\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x40\x13\0\0\0\0\0\0\xe0\ -\x03\0\0\0\0\0\0\x08\0\0\0\x07\0\0\0\x08\0\0\0\0\0\0\0\x10\0\0\0\0\0\0\0\x7b\0\ -\0\0\x03\x4c\xff\x6f\0\0\0\x80\0\0\0\0\0\0\0\0\0\0\0\0\x20\x17\0\0\0\0\0\0\x07\ -\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x01\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\xa1\0\0\0\x03\ -\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x27\x17\0\0\0\0\0\0\x1a\x01\0\0\0\0\0\0\ -\0\0\0\0\0\0\0\0\x01\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"; - - return 0; -err: - bpf_object__destroy_skeleton(s); - return -1; -} - -#endif /* __ITERATORS_BPF_SKEL_H__ */ diff --git a/kernel/bpf/reuseport_array.c b/kernel/bpf/reuseport_array.c index 556a769b5b80..8251243022a2 100644 --- a/kernel/bpf/reuseport_array.c +++ b/kernel/bpf/reuseport_array.c @@ -143,7 +143,7 @@ static void reuseport_array_free(struct bpf_map *map) /* * Once reaching here, all sk->sk_user_data is not - * referenceing this "array". "array" can be freed now. + * referencing this "array". "array" can be freed now. */ bpf_map_area_free(array); } diff --git a/kernel/bpf/ringbuf.c b/kernel/bpf/ringbuf.c index 638d7fd7b375..710ba9de12ce 100644 --- a/kernel/bpf/ringbuf.c +++ b/kernel/bpf/ringbuf.c @@ -104,7 +104,7 @@ static struct bpf_ringbuf *bpf_ringbuf_area_alloc(size_t data_sz, int numa_node) } rb = vmap(pages, nr_meta_pages + 2 * nr_data_pages, - VM_ALLOC | VM_USERMAP, PAGE_KERNEL); + VM_MAP | VM_USERMAP, PAGE_KERNEL); if (rb) { kmemleak_not_leak(pages); rb->pages = pages; diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c index 49e567209c6b..34725bfa1e97 100644 --- a/kernel/bpf/stackmap.c +++ b/kernel/bpf/stackmap.c @@ -132,7 +132,8 @@ static void stack_map_get_build_id_offset(struct bpf_stack_build_id *id_offs, int i; struct mmap_unlock_irq_work *work = NULL; bool irq_work_busy = bpf_mmap_unlock_get_irq_work(&work); - struct vm_area_struct *vma; + struct vm_area_struct *vma, *prev_vma = NULL; + const char *prev_build_id; /* If the irq_work is in use, fall back to report ips. Same * fallback is used for kernel stack (!user) on a stackmap with @@ -150,6 +151,12 @@ static void stack_map_get_build_id_offset(struct bpf_stack_build_id *id_offs, } for (i = 0; i < trace_nr; i++) { + if (range_in_vma(prev_vma, ips[i], ips[i])) { + vma = prev_vma; + memcpy(id_offs[i].build_id, prev_build_id, + BUILD_ID_SIZE_MAX); + goto build_id_valid; + } vma = find_vma(current->mm, ips[i]); if (!vma || build_id_parse(vma, id_offs[i].build_id, NULL)) { /* per entry fall back to ips */ @@ -158,15 +165,18 @@ static void stack_map_get_build_id_offset(struct bpf_stack_build_id *id_offs, memset(id_offs[i].build_id, 0, BUILD_ID_SIZE_MAX); continue; } +build_id_valid: id_offs[i].offset = (vma->vm_pgoff << PAGE_SHIFT) + ips[i] - vma->vm_start; id_offs[i].status = BPF_STACK_BUILD_ID_VALID; + prev_vma = vma; + prev_build_id = id_offs[i].build_id; } bpf_mmap_unlock_mm(work, current->mm); } static struct perf_callchain_entry * -get_callchain_entry_for_task(struct task_struct *task, u32 init_nr) +get_callchain_entry_for_task(struct task_struct *task, u32 max_depth) { #ifdef CONFIG_STACKTRACE struct perf_callchain_entry *entry; @@ -177,9 +187,8 @@ get_callchain_entry_for_task(struct task_struct *task, u32 init_nr) if (!entry) return NULL; - entry->nr = init_nr + - stack_trace_save_tsk(task, (unsigned long *)(entry->ip + init_nr), - sysctl_perf_event_max_stack - init_nr, 0); + entry->nr = stack_trace_save_tsk(task, (unsigned long *)entry->ip, + max_depth, 0); /* stack_trace_save_tsk() works on unsigned long array, while * perf_callchain_entry uses u64 array. For 32-bit systems, it is @@ -191,7 +200,7 @@ get_callchain_entry_for_task(struct task_struct *task, u32 init_nr) int i; /* copy data from the end to avoid using extra buffer */ - for (i = entry->nr - 1; i >= (int)init_nr; i--) + for (i = entry->nr - 1; i >= 0; i--) to[i] = (u64)(from[i]); } @@ -208,27 +217,19 @@ static long __bpf_get_stackid(struct bpf_map *map, { struct bpf_stack_map *smap = container_of(map, struct bpf_stack_map, map); struct stack_map_bucket *bucket, *new_bucket, *old_bucket; - u32 max_depth = map->value_size / stack_map_data_size(map); - /* stack_map_alloc() checks that max_depth <= sysctl_perf_event_max_stack */ - u32 init_nr = sysctl_perf_event_max_stack - max_depth; u32 skip = flags & BPF_F_SKIP_FIELD_MASK; u32 hash, id, trace_nr, trace_len; bool user = flags & BPF_F_USER_STACK; u64 *ips; bool hash_matches; - /* get_perf_callchain() guarantees that trace->nr >= init_nr - * and trace-nr <= sysctl_perf_event_max_stack, so trace_nr <= max_depth - */ - trace_nr = trace->nr - init_nr; - - if (trace_nr <= skip) + if (trace->nr <= skip) /* skipping more than usable stack trace */ return -EFAULT; - trace_nr -= skip; + trace_nr = trace->nr - skip; trace_len = trace_nr * sizeof(u64); - ips = trace->ip + skip + init_nr; + ips = trace->ip + skip; hash = jhash2((u32 *)ips, trace_len / sizeof(u32), 0); id = hash & (smap->n_buckets - 1); bucket = READ_ONCE(smap->buckets[id]); @@ -285,8 +286,7 @@ BPF_CALL_3(bpf_get_stackid, struct pt_regs *, regs, struct bpf_map *, map, u64, flags) { u32 max_depth = map->value_size / stack_map_data_size(map); - /* stack_map_alloc() checks that max_depth <= sysctl_perf_event_max_stack */ - u32 init_nr = sysctl_perf_event_max_stack - max_depth; + u32 skip = flags & BPF_F_SKIP_FIELD_MASK; bool user = flags & BPF_F_USER_STACK; struct perf_callchain_entry *trace; bool kernel = !user; @@ -295,8 +295,12 @@ BPF_CALL_3(bpf_get_stackid, struct pt_regs *, regs, struct bpf_map *, map, BPF_F_FAST_STACK_CMP | BPF_F_REUSE_STACKID))) return -EINVAL; - trace = get_perf_callchain(regs, init_nr, kernel, user, - sysctl_perf_event_max_stack, false, false); + max_depth += skip; + if (max_depth > sysctl_perf_event_max_stack) + max_depth = sysctl_perf_event_max_stack; + + trace = get_perf_callchain(regs, 0, kernel, user, max_depth, + false, false); if (unlikely(!trace)) /* couldn't fetch the stack trace */ @@ -387,7 +391,7 @@ static long __bpf_get_stack(struct pt_regs *regs, struct task_struct *task, struct perf_callchain_entry *trace_in, void *buf, u32 size, u64 flags) { - u32 init_nr, trace_nr, copy_len, elem_size, num_elem; + u32 trace_nr, copy_len, elem_size, num_elem, max_depth; bool user_build_id = flags & BPF_F_USER_BUILD_ID; u32 skip = flags & BPF_F_SKIP_FIELD_MASK; bool user = flags & BPF_F_USER_STACK; @@ -412,30 +416,28 @@ static long __bpf_get_stack(struct pt_regs *regs, struct task_struct *task, goto err_fault; num_elem = size / elem_size; - if (sysctl_perf_event_max_stack < num_elem) - init_nr = 0; - else - init_nr = sysctl_perf_event_max_stack - num_elem; + max_depth = num_elem + skip; + if (sysctl_perf_event_max_stack < max_depth) + max_depth = sysctl_perf_event_max_stack; if (trace_in) trace = trace_in; else if (kernel && task) - trace = get_callchain_entry_for_task(task, init_nr); + trace = get_callchain_entry_for_task(task, max_depth); else - trace = get_perf_callchain(regs, init_nr, kernel, user, - sysctl_perf_event_max_stack, + trace = get_perf_callchain(regs, 0, kernel, user, max_depth, false, false); if (unlikely(!trace)) goto err_fault; - trace_nr = trace->nr - init_nr; - if (trace_nr < skip) + if (trace->nr < skip) goto err_fault; - trace_nr -= skip; + trace_nr = trace->nr - skip; trace_nr = (trace_nr <= num_elem) ? trace_nr : num_elem; copy_len = trace_nr * elem_size; - ips = trace->ip + skip + init_nr; + + ips = trace->ip + skip; if (user && user_build_id) stack_map_get_build_id_offset(buf, ips, trace_nr, user); else @@ -472,13 +474,14 @@ BPF_CALL_4(bpf_get_task_stack, struct task_struct *, task, void *, buf, u32, size, u64, flags) { struct pt_regs *regs; - long res; + long res = -EINVAL; if (!try_get_task_stack(task)) return -EFAULT; regs = task_pt_regs(task); - res = __bpf_get_stack(regs, task, NULL, buf, size, flags); + if (regs) + res = __bpf_get_stack(regs, task, NULL, buf, size, flags); put_task_stack(task); return res; diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index fa4505f9b611..cdaa1152436a 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -32,6 +32,7 @@ #include <linux/bpf-netns.h> #include <linux/rcupdate_trace.h> #include <linux/memcontrol.h> +#include <linux/trace_events.h> #define IS_FD_ARRAY(map) ((map)->map_type == BPF_MAP_TYPE_PERF_EVENT_ARRAY || \ (map)->map_type == BPF_MAP_TYPE_CGROUP_ARRAY || \ @@ -556,16 +557,14 @@ static unsigned long bpf_map_memory_footprint(const struct bpf_map *map) static void bpf_map_show_fdinfo(struct seq_file *m, struct file *filp) { - const struct bpf_map *map = filp->private_data; - const struct bpf_array *array; + struct bpf_map *map = filp->private_data; u32 type = 0, jited = 0; - if (map->map_type == BPF_MAP_TYPE_PROG_ARRAY) { - array = container_of(map, struct bpf_array, map); - spin_lock(&array->aux->owner.lock); - type = array->aux->owner.type; - jited = array->aux->owner.jited; - spin_unlock(&array->aux->owner.lock); + if (map_type_contains_progs(map)) { + spin_lock(&map->owner.lock); + type = map->owner.type; + jited = map->owner.jited; + spin_unlock(&map->owner.lock); } seq_printf(m, @@ -874,6 +873,7 @@ static int map_create(union bpf_attr *attr) atomic64_set(&map->refcnt, 1); atomic64_set(&map->usercnt, 1); mutex_init(&map->freeze_mutex); + spin_lock_init(&map->owner.lock); map->spin_lock_off = -EINVAL; map->timer_off = -EINVAL; @@ -986,6 +986,7 @@ struct bpf_map *bpf_map_get(u32 ufd) return map; } +EXPORT_SYMBOL(bpf_map_get); struct bpf_map *bpf_map_get_with_uref(u32 ufd) { @@ -1352,14 +1353,16 @@ int generic_map_delete_batch(struct bpf_map *map, err = map->ops->map_delete_elem(map, key); rcu_read_unlock(); bpf_enable_instrumentation(); - maybe_wait_bpf_programs(map); if (err) break; + cond_resched(); } if (copy_to_user(&uattr->batch.count, &cp, sizeof(cp))) err = -EFAULT; kvfree(key); + + maybe_wait_bpf_programs(map); return err; } @@ -1412,6 +1415,7 @@ int generic_map_update_batch(struct bpf_map *map, if (err) break; + cond_resched(); } if (copy_to_user(&uattr->batch.count, &cp, sizeof(cp))) @@ -1509,6 +1513,7 @@ int generic_map_lookup_batch(struct bpf_map *map, swap(prev_key, key); retry = MAP_LOOKUP_RETRIES; cp++; + cond_resched(); } if (err == -EFAULT) @@ -2217,7 +2222,8 @@ static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr) BPF_F_ANY_ALIGNMENT | BPF_F_TEST_STATE_FREQ | BPF_F_SLEEPABLE | - BPF_F_TEST_RND_HI32)) + BPF_F_TEST_RND_HI32 | + BPF_F_XDP_HAS_FRAGS)) return -EINVAL; if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && @@ -2303,6 +2309,7 @@ static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr) prog->aux->dst_prog = dst_prog; prog->aux->offload_requested = !!attr->prog_ifindex; prog->aux->sleepable = attr->prog_flags & BPF_F_SLEEPABLE; + prog->aux->xdp_has_frags = attr->prog_flags & BPF_F_XDP_HAS_FRAGS; err = security_bpf_prog_alloc(prog->aux); if (err) @@ -2488,6 +2495,7 @@ void bpf_link_put(struct bpf_link *link) bpf_link_free(link); } } +EXPORT_SYMBOL(bpf_link_put); static int bpf_link_release(struct inode *inode, struct file *filp) { @@ -2559,7 +2567,7 @@ static int bpf_link_alloc_id(struct bpf_link *link) * pre-allocated resources are to be freed with bpf_cleanup() call. All the * transient state is passed around in struct bpf_link_primer. * This is preferred way to create and initialize bpf_link, especially when - * there are complicated and expensive operations inbetween creating bpf_link + * there are complicated and expensive operations in between creating bpf_link * itself and attaching it to BPF hook. By using bpf_link_prime() and * bpf_link_settle() kernel code using bpf_link doesn't have to perform * expensive (and potentially failing) roll back operations in a rare case @@ -2630,6 +2638,7 @@ struct bpf_link *bpf_link_get_from_fd(u32 ufd) return link; } +EXPORT_SYMBOL(bpf_link_get_from_fd); struct bpf_tracing_link { struct bpf_link link; @@ -3014,6 +3023,11 @@ out_put_file: fput(perf_file); return err; } +#else +static int bpf_perf_link_attach(const union bpf_attr *attr, struct bpf_prog *prog) +{ + return -EOPNOTSUPP; +} #endif /* CONFIG_PERF_EVENTS */ #define BPF_RAW_TRACEPOINT_OPEN_LAST_FIELD raw_tracepoint.prog_fd @@ -3318,12 +3332,17 @@ static int bpf_prog_query(const union bpf_attr *attr, case BPF_FLOW_DISSECTOR: case BPF_SK_LOOKUP: return netns_bpf_prog_query(attr, uattr); + case BPF_SK_SKB_STREAM_PARSER: + case BPF_SK_SKB_STREAM_VERDICT: + case BPF_SK_MSG_VERDICT: + case BPF_SK_SKB_VERDICT: + return sock_map_bpf_prog_query(attr, uattr); default: return -EINVAL; } } -#define BPF_PROG_TEST_RUN_LAST_FIELD test.cpu +#define BPF_PROG_TEST_RUN_LAST_FIELD test.batch_size static int bpf_prog_test_run(const union bpf_attr *attr, union bpf_attr __user *uattr) @@ -4242,7 +4261,7 @@ static int tracing_bpf_link_attach(const union bpf_attr *attr, bpfptr_t uattr, return -EINVAL; } -#define BPF_LINK_CREATE_LAST_FIELD link_create.iter_info_len +#define BPF_LINK_CREATE_LAST_FIELD link_create.kprobe_multi.cookies static int link_create(union bpf_attr *attr, bpfptr_t uattr) { enum bpf_prog_type ptype; @@ -4266,7 +4285,6 @@ static int link_create(union bpf_attr *attr, bpfptr_t uattr) ret = tracing_bpf_link_attach(attr, uattr, prog); goto out; case BPF_PROG_TYPE_PERF_EVENT: - case BPF_PROG_TYPE_KPROBE: case BPF_PROG_TYPE_TRACEPOINT: if (attr->link_create.attach_type != BPF_PERF_EVENT) { ret = -EINVAL; @@ -4274,6 +4292,14 @@ static int link_create(union bpf_attr *attr, bpfptr_t uattr) } ptype = prog->type; break; + case BPF_PROG_TYPE_KPROBE: + if (attr->link_create.attach_type != BPF_PERF_EVENT && + attr->link_create.attach_type != BPF_TRACE_KPROBE_MULTI) { + ret = -EINVAL; + goto out; + } + ptype = prog->type; + break; default: ptype = attach_type_to_prog_type(attr->link_create.attach_type); if (ptype == BPF_PROG_TYPE_UNSPEC || ptype != prog->type) { @@ -4305,13 +4331,16 @@ static int link_create(union bpf_attr *attr, bpfptr_t uattr) ret = bpf_xdp_link_attach(attr, prog); break; #endif -#ifdef CONFIG_PERF_EVENTS case BPF_PROG_TYPE_PERF_EVENT: case BPF_PROG_TYPE_TRACEPOINT: - case BPF_PROG_TYPE_KPROBE: ret = bpf_perf_link_attach(attr, prog); break; -#endif + case BPF_PROG_TYPE_KPROBE: + if (attr->link_create.attach_type == BPF_PERF_EVENT) + ret = bpf_perf_link_attach(attr, prog); + else + ret = bpf_kprobe_multi_link_attach(attr, prog); + break; default: ret = -EINVAL; } @@ -4750,23 +4779,52 @@ static bool syscall_prog_is_valid_access(int off, int size, return true; } -BPF_CALL_3(bpf_sys_bpf, int, cmd, void *, attr, u32, attr_size) +BPF_CALL_3(bpf_sys_bpf, int, cmd, union bpf_attr *, attr, u32, attr_size) { + struct bpf_prog * __maybe_unused prog; + switch (cmd) { case BPF_MAP_CREATE: case BPF_MAP_UPDATE_ELEM: case BPF_MAP_FREEZE: case BPF_PROG_LOAD: case BPF_BTF_LOAD: + case BPF_LINK_CREATE: + case BPF_RAW_TRACEPOINT_OPEN: break; - /* case BPF_PROG_TEST_RUN: - * is not part of this list to prevent recursive test_run - */ +#ifdef CONFIG_BPF_JIT /* __bpf_prog_enter_sleepable used by trampoline and JIT */ + case BPF_PROG_TEST_RUN: + if (attr->test.data_in || attr->test.data_out || + attr->test.ctx_out || attr->test.duration || + attr->test.repeat || attr->test.flags) + return -EINVAL; + + prog = bpf_prog_get_type(attr->test.prog_fd, BPF_PROG_TYPE_SYSCALL); + if (IS_ERR(prog)) + return PTR_ERR(prog); + + if (attr->test.ctx_size_in < prog->aux->max_ctx_offset || + attr->test.ctx_size_in > U16_MAX) { + bpf_prog_put(prog); + return -EINVAL; + } + + if (!__bpf_prog_enter_sleepable(prog)) { + /* recursion detected */ + bpf_prog_put(prog); + return -EBUSY; + } + attr->test.retval = bpf_prog_run(prog, (void *) (long) attr->test.ctx_in); + __bpf_prog_exit_sleepable(prog, 0 /* bpf_prog_run does runtime stats */); + bpf_prog_put(prog); + return 0; +#endif default: return -EINVAL; } return __sys_bpf(cmd, KERNEL_BPFPTR(attr), attr_size); } +EXPORT_SYMBOL(bpf_sys_bpf); static const struct bpf_func_proto bpf_sys_bpf_proto = { .func = bpf_sys_bpf, diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c index 4b6974a195c1..ada97751ae1b 100644 --- a/kernel/bpf/trampoline.c +++ b/kernel/bpf/trampoline.c @@ -45,7 +45,7 @@ void *bpf_jit_alloc_exec_page(void) set_vm_flush_reset_perms(image); /* Keep image as writeable. The alternative is to keep flipping ro/rw - * everytime new program is attached or detached. + * every time new program is attached or detached. */ set_memory_x((long)image, 1); return image; @@ -117,18 +117,6 @@ static void bpf_trampoline_module_put(struct bpf_trampoline *tr) tr->mod = NULL; } -static int is_ftrace_location(void *ip) -{ - long addr; - - addr = ftrace_location((long)ip); - if (!addr) - return 0; - if (WARN_ON_ONCE(addr != (long)ip)) - return -EFAULT; - return 1; -} - static int unregister_fentry(struct bpf_trampoline *tr, void *old_addr) { void *ip = tr->func.addr; @@ -160,12 +148,12 @@ static int modify_fentry(struct bpf_trampoline *tr, void *old_addr, void *new_ad static int register_fentry(struct bpf_trampoline *tr, void *new_addr) { void *ip = tr->func.addr; + unsigned long faddr; int ret; - ret = is_ftrace_location(ip); - if (ret < 0) - return ret; - tr->func.ftrace_managed = ret; + faddr = ftrace_location((unsigned long)ip); + if (faddr) + tr->func.ftrace_managed = true; if (bpf_trampoline_module_get(tr)) return -ENOENT; @@ -213,7 +201,7 @@ static void __bpf_tramp_image_put_deferred(struct work_struct *work) im = container_of(work, struct bpf_tramp_image, work); bpf_image_ksym_del(&im->ksym); bpf_jit_free_exec(im->image); - bpf_jit_uncharge_modmem(1); + bpf_jit_uncharge_modmem(PAGE_SIZE); percpu_ref_exit(&im->pcref); kfree_rcu(im, rcu); } @@ -310,7 +298,7 @@ static struct bpf_tramp_image *bpf_tramp_image_alloc(u64 key, u32 idx) if (!im) goto out; - err = bpf_jit_charge_modmem(1); + err = bpf_jit_charge_modmem(PAGE_SIZE); if (err) goto out_free_im; @@ -332,7 +320,7 @@ static struct bpf_tramp_image *bpf_tramp_image_alloc(u64 key, u32 idx) out_free_image: bpf_jit_free_exec(im->image); out_uncharge: - bpf_jit_uncharge_modmem(1); + bpf_jit_uncharge_modmem(PAGE_SIZE); out_free_im: kfree(im); out: @@ -550,11 +538,12 @@ static __always_inline u64 notrace bpf_prog_start_time(void) static void notrace inc_misses_counter(struct bpf_prog *prog) { struct bpf_prog_stats *stats; + unsigned int flags; stats = this_cpu_ptr(prog->stats); - u64_stats_update_begin(&stats->syncp); + flags = u64_stats_update_begin_irqsave(&stats->syncp); u64_stats_inc(&stats->misses); - u64_stats_update_end(&stats->syncp); + u64_stats_update_end_irqrestore(&stats->syncp, flags); } /* The logic is similar to bpf_prog_run(), but with an explicit diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index bfb45381fb3f..d175b70067b3 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -452,7 +452,8 @@ static bool reg_type_may_be_refcounted_or_null(enum bpf_reg_type type) { return base_type(type) == PTR_TO_SOCKET || base_type(type) == PTR_TO_TCP_SOCK || - base_type(type) == PTR_TO_MEM; + base_type(type) == PTR_TO_MEM || + base_type(type) == PTR_TO_BTF_ID; } static bool type_is_rdonly_mem(u32 type) @@ -535,10 +536,10 @@ static bool is_cmpxchg_insn(const struct bpf_insn *insn) static const char *reg_type_str(struct bpf_verifier_env *env, enum bpf_reg_type type) { - char postfix[16] = {0}, prefix[16] = {0}; + char postfix[16] = {0}, prefix[32] = {0}; static const char * const str[] = { [NOT_INIT] = "?", - [SCALAR_VALUE] = "inv", + [SCALAR_VALUE] = "scalar", [PTR_TO_CTX] = "ctx", [CONST_PTR_TO_MAP] = "map_ptr", [PTR_TO_MAP_VALUE] = "map_value", @@ -553,7 +554,6 @@ static const char *reg_type_str(struct bpf_verifier_env *env, [PTR_TO_TP_BUFFER] = "tp_buffer", [PTR_TO_XDP_SOCK] = "xdp_sock", [PTR_TO_BTF_ID] = "ptr_", - [PTR_TO_PERCPU_BTF_ID] = "percpu_ptr_", [PTR_TO_MEM] = "mem", [PTR_TO_BUF] = "buf", [PTR_TO_FUNC] = "func", @@ -561,15 +561,20 @@ static const char *reg_type_str(struct bpf_verifier_env *env, }; if (type & PTR_MAYBE_NULL) { - if (base_type(type) == PTR_TO_BTF_ID || - base_type(type) == PTR_TO_PERCPU_BTF_ID) + if (base_type(type) == PTR_TO_BTF_ID) strncpy(postfix, "or_null_", 16); else strncpy(postfix, "_or_null", 16); } if (type & MEM_RDONLY) - strncpy(prefix, "rdonly_", 16); + strncpy(prefix, "rdonly_", 32); + if (type & MEM_ALLOC) + strncpy(prefix, "alloc_", 32); + if (type & MEM_USER) + strncpy(prefix, "user_", 32); + if (type & MEM_PERCPU) + strncpy(prefix, "percpu_", 32); snprintf(env->type_str_buf, TYPE_STR_BUF_LEN, "%s%s%s", prefix, str[base_type(type)], postfix); @@ -616,7 +621,7 @@ static void mark_reg_scratched(struct bpf_verifier_env *env, u32 regno) static void mark_stack_slot_scratched(struct bpf_verifier_env *env, u32 spi) { - env->scratched_stack_slots |= 1UL << spi; + env->scratched_stack_slots |= 1ULL << spi; } static bool reg_scratched(const struct bpf_verifier_env *env, u32 regno) @@ -637,14 +642,14 @@ static bool verifier_state_scratched(const struct bpf_verifier_env *env) static void mark_verifier_state_clean(struct bpf_verifier_env *env) { env->scratched_regs = 0U; - env->scratched_stack_slots = 0UL; + env->scratched_stack_slots = 0ULL; } /* Used for printing the entire verifier state. */ static void mark_verifier_state_scratched(struct bpf_verifier_env *env) { env->scratched_regs = ~0U; - env->scratched_stack_slots = ~0UL; + env->scratched_stack_slots = ~0ULL; } /* The reg state of a pointer or a bounded scalar was saved when @@ -680,74 +685,79 @@ static void print_verifier_state(struct bpf_verifier_env *env, continue; verbose(env, " R%d", i); print_liveness(env, reg->live); - verbose(env, "=%s", reg_type_str(env, t)); + verbose(env, "="); if (t == SCALAR_VALUE && reg->precise) verbose(env, "P"); if ((t == SCALAR_VALUE || t == PTR_TO_STACK) && tnum_is_const(reg->var_off)) { /* reg->off should be 0 for SCALAR_VALUE */ + verbose(env, "%s", t == SCALAR_VALUE ? "" : reg_type_str(env, t)); verbose(env, "%lld", reg->var_off.value + reg->off); } else { - if (base_type(t) == PTR_TO_BTF_ID || - base_type(t) == PTR_TO_PERCPU_BTF_ID) + const char *sep = ""; + + verbose(env, "%s", reg_type_str(env, t)); + if (base_type(t) == PTR_TO_BTF_ID) verbose(env, "%s", kernel_type_name(reg->btf, reg->btf_id)); - verbose(env, "(id=%d", reg->id); - if (reg_type_may_be_refcounted_or_null(t)) - verbose(env, ",ref_obj_id=%d", reg->ref_obj_id); + verbose(env, "("); +/* + * _a stands for append, was shortened to avoid multiline statements below. + * This macro is used to output a comma separated list of attributes. + */ +#define verbose_a(fmt, ...) ({ verbose(env, "%s" fmt, sep, __VA_ARGS__); sep = ","; }) + + if (reg->id) + verbose_a("id=%d", reg->id); + if (reg_type_may_be_refcounted_or_null(t) && reg->ref_obj_id) + verbose_a("ref_obj_id=%d", reg->ref_obj_id); if (t != SCALAR_VALUE) - verbose(env, ",off=%d", reg->off); + verbose_a("off=%d", reg->off); if (type_is_pkt_pointer(t)) - verbose(env, ",r=%d", reg->range); + verbose_a("r=%d", reg->range); else if (base_type(t) == CONST_PTR_TO_MAP || base_type(t) == PTR_TO_MAP_KEY || base_type(t) == PTR_TO_MAP_VALUE) - verbose(env, ",ks=%d,vs=%d", - reg->map_ptr->key_size, - reg->map_ptr->value_size); + verbose_a("ks=%d,vs=%d", + reg->map_ptr->key_size, + reg->map_ptr->value_size); if (tnum_is_const(reg->var_off)) { /* Typically an immediate SCALAR_VALUE, but * could be a pointer whose offset is too big * for reg->off */ - verbose(env, ",imm=%llx", reg->var_off.value); + verbose_a("imm=%llx", reg->var_off.value); } else { if (reg->smin_value != reg->umin_value && reg->smin_value != S64_MIN) - verbose(env, ",smin_value=%lld", - (long long)reg->smin_value); + verbose_a("smin=%lld", (long long)reg->smin_value); if (reg->smax_value != reg->umax_value && reg->smax_value != S64_MAX) - verbose(env, ",smax_value=%lld", - (long long)reg->smax_value); + verbose_a("smax=%lld", (long long)reg->smax_value); if (reg->umin_value != 0) - verbose(env, ",umin_value=%llu", - (unsigned long long)reg->umin_value); + verbose_a("umin=%llu", (unsigned long long)reg->umin_value); if (reg->umax_value != U64_MAX) - verbose(env, ",umax_value=%llu", - (unsigned long long)reg->umax_value); + verbose_a("umax=%llu", (unsigned long long)reg->umax_value); if (!tnum_is_unknown(reg->var_off)) { char tn_buf[48]; tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off); - verbose(env, ",var_off=%s", tn_buf); + verbose_a("var_off=%s", tn_buf); } if (reg->s32_min_value != reg->smin_value && reg->s32_min_value != S32_MIN) - verbose(env, ",s32_min_value=%d", - (int)(reg->s32_min_value)); + verbose_a("s32_min=%d", (int)(reg->s32_min_value)); if (reg->s32_max_value != reg->smax_value && reg->s32_max_value != S32_MAX) - verbose(env, ",s32_max_value=%d", - (int)(reg->s32_max_value)); + verbose_a("s32_max=%d", (int)(reg->s32_max_value)); if (reg->u32_min_value != reg->umin_value && reg->u32_min_value != U32_MIN) - verbose(env, ",u32_min_value=%d", - (int)(reg->u32_min_value)); + verbose_a("u32_min=%d", (int)(reg->u32_min_value)); if (reg->u32_max_value != reg->umax_value && reg->u32_max_value != U32_MAX) - verbose(env, ",u32_max_value=%d", - (int)(reg->u32_max_value)); + verbose_a("u32_max=%d", (int)(reg->u32_max_value)); } +#undef verbose_a + verbose(env, ")"); } } @@ -772,7 +782,7 @@ static void print_verifier_state(struct bpf_verifier_env *env, if (is_spilled_reg(&state->stack[i])) { reg = &state->stack[i].spilled_ptr; t = reg->type; - verbose(env, "=%s", reg_type_str(env, t)); + verbose(env, "=%s", t == SCALAR_VALUE ? "" : reg_type_str(env, t)); if (t == SCALAR_VALUE && reg->precise) verbose(env, "P"); if (t == SCALAR_VALUE && tnum_is_const(reg->var_off)) @@ -1544,14 +1554,15 @@ static void mark_reg_not_init(struct bpf_verifier_env *env, static void mark_btf_ld_reg(struct bpf_verifier_env *env, struct bpf_reg_state *regs, u32 regno, enum bpf_reg_type reg_type, - struct btf *btf, u32 btf_id) + struct btf *btf, u32 btf_id, + enum bpf_type_flag flag) { if (reg_type == SCALAR_VALUE) { mark_reg_unknown(env, regs, regno); return; } mark_reg_known_zero(env, regs, regno); - regs[regno].type = PTR_TO_BTF_ID; + regs[regno].type = PTR_TO_BTF_ID | flag; regs[regno].btf = btf; regs[regno].btf_id = btf_id; } @@ -1741,7 +1752,7 @@ find_kfunc_desc(const struct bpf_prog *prog, u32 func_id, u16 offset) } static struct btf *__find_kfunc_desc_btf(struct bpf_verifier_env *env, - s16 offset, struct module **btf_modp) + s16 offset) { struct bpf_kfunc_btf kf_btf = { .offset = offset }; struct bpf_kfunc_btf_tab *tab; @@ -1795,8 +1806,6 @@ static struct btf *__find_kfunc_desc_btf(struct bpf_verifier_env *env, sort(tab->descs, tab->nr_descs, sizeof(tab->descs[0]), kfunc_btf_cmp_by_off, NULL); } - if (btf_modp) - *btf_modp = b->module; return b->btf; } @@ -1813,8 +1822,7 @@ void bpf_free_kfunc_btf_tab(struct bpf_kfunc_btf_tab *tab) } static struct btf *find_kfunc_desc_btf(struct bpf_verifier_env *env, - u32 func_id, s16 offset, - struct module **btf_modp) + u32 func_id, s16 offset) { if (offset) { if (offset < 0) { @@ -1825,7 +1833,7 @@ static struct btf *find_kfunc_desc_btf(struct bpf_verifier_env *env, return ERR_PTR(-EINVAL); } - return __find_kfunc_desc_btf(env, offset, btf_modp); + return __find_kfunc_desc_btf(env, offset); } return btf_vmlinux ?: ERR_PTR(-ENOENT); } @@ -1839,6 +1847,7 @@ static int add_kfunc_call(struct bpf_verifier_env *env, u32 func_id, s16 offset) struct bpf_kfunc_desc *desc; const char *func_name; struct btf *desc_btf; + unsigned long call_imm; unsigned long addr; int err; @@ -1888,7 +1897,7 @@ static int add_kfunc_call(struct bpf_verifier_env *env, u32 func_id, s16 offset) prog_aux->kfunc_btf_tab = btf_tab; } - desc_btf = find_kfunc_desc_btf(env, func_id, offset, NULL); + desc_btf = find_kfunc_desc_btf(env, func_id, offset); if (IS_ERR(desc_btf)) { verbose(env, "failed to find BTF for kernel function\n"); return PTR_ERR(desc_btf); @@ -1923,9 +1932,17 @@ static int add_kfunc_call(struct bpf_verifier_env *env, u32 func_id, s16 offset) return -EINVAL; } + call_imm = BPF_CALL_IMM(addr); + /* Check whether or not the relative offset overflows desc->imm */ + if ((unsigned long)(s32)call_imm != call_imm) { + verbose(env, "address of kernel function %s is out of range\n", + func_name); + return -EINVAL; + } + desc = &tab->descs[tab->nr_descs++]; desc->func_id = func_id; - desc->imm = BPF_CALL_IMM(addr); + desc->imm = call_imm; desc->offset = offset; err = btf_distill_func_proto(&env->log, desc_btf, func_proto, func_name, @@ -2349,7 +2366,7 @@ static const char *disasm_kfunc_name(void *data, const struct bpf_insn *insn) if (insn->src_reg != BPF_PSEUDO_KFUNC_CALL) return NULL; - desc_btf = find_kfunc_desc_btf(data, insn->imm, insn->off, NULL); + desc_btf = find_kfunc_desc_btf(data, insn->imm, insn->off); if (IS_ERR(desc_btf)) return "<error>"; @@ -2765,7 +2782,6 @@ static bool is_spillable_regtype(enum bpf_reg_type type) case PTR_TO_XDP_SOCK: case PTR_TO_BTF_ID: case PTR_TO_BUF: - case PTR_TO_PERCPU_BTF_ID: case PTR_TO_MEM: case PTR_TO_FUNC: case PTR_TO_MAP_KEY: @@ -3496,11 +3512,6 @@ static int check_map_access(struct bpf_verifier_env *env, u32 regno, #define MAX_PACKET_OFF 0xffff -static enum bpf_prog_type resolve_prog_type(struct bpf_prog *prog) -{ - return prog->aux->dst_prog ? prog->aux->dst_prog->type : prog->type; -} - static bool may_access_direct_pkt_data(struct bpf_verifier_env *env, const struct bpf_call_arg_meta *meta, enum bpf_access_type t) @@ -3969,16 +3980,23 @@ static int get_callee_stack_depth(struct bpf_verifier_env *env, } #endif -int check_ctx_reg(struct bpf_verifier_env *env, - const struct bpf_reg_state *reg, int regno) +static int __check_ptr_off_reg(struct bpf_verifier_env *env, + const struct bpf_reg_state *reg, int regno, + bool fixed_off_ok) { - /* Access to ctx or passing it to a helper is only allowed in - * its original, unmodified form. + /* Access to this pointer-typed register or passing it to a helper + * is only allowed in its original, unmodified form. */ - if (reg->off) { - verbose(env, "dereference of modified ctx ptr R%d off=%d disallowed\n", - regno, reg->off); + if (reg->off < 0) { + verbose(env, "negative offset %s ptr R%d off=%d disallowed\n", + reg_type_str(env, reg->type), regno, reg->off); + return -EACCES; + } + + if (!fixed_off_ok && reg->off) { + verbose(env, "dereference of modified %s ptr R%d off=%d disallowed\n", + reg_type_str(env, reg->type), regno, reg->off); return -EACCES; } @@ -3986,13 +4004,20 @@ int check_ctx_reg(struct bpf_verifier_env *env, char tn_buf[48]; tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off); - verbose(env, "variable ctx access var_off=%s disallowed\n", tn_buf); + verbose(env, "variable %s access var_off=%s disallowed\n", + reg_type_str(env, reg->type), tn_buf); return -EACCES; } return 0; } +int check_ptr_off_reg(struct bpf_verifier_env *env, + const struct bpf_reg_state *reg, int regno) +{ + return __check_ptr_off_reg(env, reg, regno, false); +} + static int __check_buffer_access(struct bpf_verifier_env *env, const char *buf_info, const struct bpf_reg_state *reg, @@ -4037,9 +4062,9 @@ static int check_buffer_access(struct bpf_verifier_env *env, const struct bpf_reg_state *reg, int regno, int off, int size, bool zero_size_allowed, - const char *buf_info, u32 *max_access) { + const char *buf_info = type_is_rdonly_mem(reg->type) ? "rdonly" : "rdwr"; int err; err = __check_buffer_access(env, buf_info, reg, regno, off, size); @@ -4149,6 +4174,7 @@ static int check_ptr_to_btf_access(struct bpf_verifier_env *env, struct bpf_reg_state *reg = regs + regno; const struct btf_type *t = btf_type_by_id(reg->btf, reg->btf_id); const char *tname = btf_name_by_offset(reg->btf, t->name_off); + enum bpf_type_flag flag = 0; u32 btf_id; int ret; @@ -4168,9 +4194,23 @@ static int check_ptr_to_btf_access(struct bpf_verifier_env *env, return -EACCES; } + if (reg->type & MEM_USER) { + verbose(env, + "R%d is ptr_%s access user memory: off=%d\n", + regno, tname, off); + return -EACCES; + } + + if (reg->type & MEM_PERCPU) { + verbose(env, + "R%d is ptr_%s access percpu memory: off=%d\n", + regno, tname, off); + return -EACCES; + } + if (env->ops->btf_struct_access) { ret = env->ops->btf_struct_access(&env->log, reg->btf, t, - off, size, atype, &btf_id); + off, size, atype, &btf_id, &flag); } else { if (atype != BPF_READ) { verbose(env, "only read is supported\n"); @@ -4178,14 +4218,14 @@ static int check_ptr_to_btf_access(struct bpf_verifier_env *env, } ret = btf_struct_access(&env->log, reg->btf, t, off, size, - atype, &btf_id); + atype, &btf_id, &flag); } if (ret < 0) return ret; if (atype == BPF_READ && value_regno >= 0) - mark_btf_ld_reg(env, regs, value_regno, ret, reg->btf, btf_id); + mark_btf_ld_reg(env, regs, value_regno, ret, reg->btf, btf_id, flag); return 0; } @@ -4198,6 +4238,7 @@ static int check_ptr_to_map_access(struct bpf_verifier_env *env, { struct bpf_reg_state *reg = regs + regno; struct bpf_map *map = reg->map_ptr; + enum bpf_type_flag flag = 0; const struct btf_type *t; const char *tname; u32 btf_id; @@ -4235,12 +4276,12 @@ static int check_ptr_to_map_access(struct bpf_verifier_env *env, return -EACCES; } - ret = btf_struct_access(&env->log, btf_vmlinux, t, off, size, atype, &btf_id); + ret = btf_struct_access(&env->log, btf_vmlinux, t, off, size, atype, &btf_id, &flag); if (ret < 0) return ret; if (value_regno >= 0) - mark_btf_ld_reg(env, regs, value_regno, ret, btf_vmlinux, btf_id); + mark_btf_ld_reg(env, regs, value_regno, ret, btf_vmlinux, btf_id, flag); return 0; } @@ -4437,11 +4478,12 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn return -EACCES; } - err = check_ctx_reg(env, reg, regno); + err = check_ptr_off_reg(env, reg, regno); if (err < 0) return err; - err = check_ctx_access(env, insn_idx, off, size, t, ®_type, &btf, &btf_id); + err = check_ctx_access(env, insn_idx, off, size, t, ®_type, &btf, + &btf_id); if (err) verbose_linfo(env, insn_idx, "; "); if (!err && t == BPF_READ && value_regno >= 0) { @@ -4525,7 +4567,8 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn err = check_tp_buffer_access(env, reg, regno, off, size); if (!err && t == BPF_READ && value_regno >= 0) mark_reg_unknown(env, regs, value_regno); - } else if (reg->type == PTR_TO_BTF_ID) { + } else if (base_type(reg->type) == PTR_TO_BTF_ID && + !type_may_be_null(reg->type)) { err = check_ptr_to_btf_access(env, regs, regno, off, size, t, value_regno); } else if (reg->type == CONST_PTR_TO_MAP) { @@ -4533,7 +4576,6 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn value_regno); } else if (base_type(reg->type) == PTR_TO_BUF) { bool rdonly_mem = type_is_rdonly_mem(reg->type); - const char *buf_info; u32 *max_access; if (rdonly_mem) { @@ -4542,15 +4584,13 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn regno, reg_type_str(env, reg->type)); return -EACCES; } - buf_info = "rdonly"; max_access = &env->prog->aux->max_rdonly_access; } else { - buf_info = "rdwr"; max_access = &env->prog->aux->max_rdwr_access; } err = check_buffer_access(env, reg, regno, off, size, false, - buf_info, max_access); + max_access); if (!err && value_regno >= 0 && (rdonly_mem || t == BPF_READ)) mark_reg_unknown(env, regs, value_regno); @@ -4771,7 +4811,7 @@ static int check_stack_range_initialized( } if (is_spilled_reg(&state->stack[spi]) && - state->stack[spi].spilled_ptr.type == PTR_TO_BTF_ID) + base_type(state->stack[spi].spilled_ptr.type) == PTR_TO_BTF_ID) goto mark; if (is_spilled_reg(&state->stack[spi]) && @@ -4813,7 +4853,6 @@ static int check_helper_mem_access(struct bpf_verifier_env *env, int regno, struct bpf_call_arg_meta *meta) { struct bpf_reg_state *regs = cur_regs(env), *reg = ®s[regno]; - const char *buf_info; u32 *max_access; switch (base_type(reg->type)) { @@ -4840,15 +4879,13 @@ static int check_helper_mem_access(struct bpf_verifier_env *env, int regno, if (meta && meta->raw_mode) return -EACCES; - buf_info = "rdonly"; max_access = &env->prog->aux->max_rdonly_access; } else { - buf_info = "rdwr"; max_access = &env->prog->aux->max_rdwr_access; } return check_buffer_access(env, reg, regno, reg->off, access_size, zero_size_allowed, - buf_info, max_access); + max_access); case PTR_TO_STACK: return check_stack_range_initialized( env, @@ -4867,6 +4904,62 @@ static int check_helper_mem_access(struct bpf_verifier_env *env, int regno, } } +static int check_mem_size_reg(struct bpf_verifier_env *env, + struct bpf_reg_state *reg, u32 regno, + bool zero_size_allowed, + struct bpf_call_arg_meta *meta) +{ + int err; + + /* This is used to refine r0 return value bounds for helpers + * that enforce this value as an upper bound on return values. + * See do_refine_retval_range() for helpers that can refine + * the return value. C type of helper is u32 so we pull register + * bound from umax_value however, if negative verifier errors + * out. Only upper bounds can be learned because retval is an + * int type and negative retvals are allowed. + */ + if (meta) + meta->msize_max_value = reg->umax_value; + + /* The register is SCALAR_VALUE; the access check + * happens using its boundaries. + */ + if (!tnum_is_const(reg->var_off)) + /* For unprivileged variable accesses, disable raw + * mode so that the program is required to + * initialize all the memory that the helper could + * just partially fill up. + */ + meta = NULL; + + if (reg->smin_value < 0) { + verbose(env, "R%d min value is negative, either use unsigned or 'var &= const'\n", + regno); + return -EACCES; + } + + if (reg->umin_value == 0) { + err = check_helper_mem_access(env, regno - 1, 0, + zero_size_allowed, + meta); + if (err) + return err; + } + + if (reg->umax_value >= BPF_MAX_VAR_SIZ) { + verbose(env, "R%d unbounded memory access, use 'var &= const' or 'if (var < const)'\n", + regno); + return -EACCES; + } + err = check_helper_mem_access(env, regno - 1, + reg->umax_value, + zero_size_allowed, meta); + if (!err) + err = mark_chain_precision(env, regno); + return err; +} + int check_mem_reg(struct bpf_verifier_env *env, struct bpf_reg_state *reg, u32 regno, u32 mem_size) { @@ -4890,6 +4983,28 @@ int check_mem_reg(struct bpf_verifier_env *env, struct bpf_reg_state *reg, return check_helper_mem_access(env, regno, mem_size, true, NULL); } +int check_kfunc_mem_size_reg(struct bpf_verifier_env *env, struct bpf_reg_state *reg, + u32 regno) +{ + struct bpf_reg_state *mem_reg = &cur_regs(env)[regno - 1]; + bool may_be_null = type_may_be_null(mem_reg->type); + struct bpf_reg_state saved_reg; + int err; + + WARN_ON_ONCE(regno < BPF_REG_2 || regno > BPF_REG_5); + + if (may_be_null) { + saved_reg = *mem_reg; + mark_ptr_not_null_reg(mem_reg); + } + + err = check_mem_size_reg(env, reg, regno, true, NULL); + + if (may_be_null) + *mem_reg = saved_reg; + return err; +} + /* Implementation details: * bpf_map_lookup returns PTR_TO_MAP_VALUE_OR_NULL * Two bpf_map_lookups (even with the same key) will have different reg->id. @@ -5127,6 +5242,7 @@ static const struct bpf_reg_types mem_types = { PTR_TO_MAP_KEY, PTR_TO_MAP_VALUE, PTR_TO_MEM, + PTR_TO_MEM | MEM_ALLOC, PTR_TO_BUF, }, }; @@ -5144,11 +5260,11 @@ static const struct bpf_reg_types int_ptr_types = { static const struct bpf_reg_types fullsock_types = { .types = { PTR_TO_SOCKET } }; static const struct bpf_reg_types scalar_types = { .types = { SCALAR_VALUE } }; static const struct bpf_reg_types context_types = { .types = { PTR_TO_CTX } }; -static const struct bpf_reg_types alloc_mem_types = { .types = { PTR_TO_MEM } }; +static const struct bpf_reg_types alloc_mem_types = { .types = { PTR_TO_MEM | MEM_ALLOC } }; static const struct bpf_reg_types const_map_ptr_types = { .types = { CONST_PTR_TO_MAP } }; static const struct bpf_reg_types btf_ptr_types = { .types = { PTR_TO_BTF_ID } }; static const struct bpf_reg_types spin_lock_types = { .types = { PTR_TO_MAP_VALUE } }; -static const struct bpf_reg_types percpu_btf_ptr_types = { .types = { PTR_TO_PERCPU_BTF_ID } }; +static const struct bpf_reg_types percpu_btf_ptr_types = { .types = { PTR_TO_BTF_ID | MEM_PERCPU } }; static const struct bpf_reg_types func_ptr_types = { .types = { PTR_TO_FUNC } }; static const struct bpf_reg_types stack_ptr_types = { .types = { PTR_TO_STACK } }; static const struct bpf_reg_types const_str_ptr_types = { .types = { PTR_TO_MAP_VALUE } }; @@ -5244,15 +5360,63 @@ found: kernel_type_name(btf_vmlinux, *arg_btf_id)); return -EACCES; } + } - if (!tnum_is_const(reg->var_off) || reg->var_off.value) { - verbose(env, "R%d is a pointer to in-kernel struct with non-zero offset\n", + return 0; +} + +int check_func_arg_reg_off(struct bpf_verifier_env *env, + const struct bpf_reg_state *reg, int regno, + enum bpf_arg_type arg_type, + bool is_release_func) +{ + bool fixed_off_ok = false, release_reg; + enum bpf_reg_type type = reg->type; + + switch ((u32)type) { + case SCALAR_VALUE: + /* Pointer types where reg offset is explicitly allowed: */ + case PTR_TO_PACKET: + case PTR_TO_PACKET_META: + case PTR_TO_MAP_KEY: + case PTR_TO_MAP_VALUE: + case PTR_TO_MEM: + case PTR_TO_MEM | MEM_RDONLY: + case PTR_TO_MEM | MEM_ALLOC: + case PTR_TO_BUF: + case PTR_TO_BUF | MEM_RDONLY: + case PTR_TO_STACK: + /* Some of the argument types nevertheless require a + * zero register offset. + */ + if (arg_type != ARG_PTR_TO_ALLOC_MEM) + return 0; + break; + /* All the rest must be rejected, except PTR_TO_BTF_ID which allows + * fixed offset. + */ + case PTR_TO_BTF_ID: + /* When referenced PTR_TO_BTF_ID is passed to release function, + * it's fixed offset must be 0. We rely on the property that + * only one referenced register can be passed to BPF helpers and + * kfuncs. In the other cases, fixed offset can be non-zero. + */ + release_reg = is_release_func && reg->ref_obj_id; + if (release_reg && reg->off) { + verbose(env, "R%d must have zero offset when passed to release func\n", regno); - return -EACCES; + return -EINVAL; } + /* For release_reg == true, fixed_off_ok must be false, but we + * already checked and rejected reg->off != 0 above, so set to + * true to allow fixed offset for all other cases. + */ + fixed_off_ok = true; + break; + default: + break; } - - return 0; + return __check_ptr_off_reg(env, reg, regno, fixed_off_ok); } static int check_func_arg(struct bpf_verifier_env *env, u32 arg, @@ -5304,13 +5468,14 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 arg, if (err) return err; - if (type == PTR_TO_CTX) { - err = check_ctx_reg(env, reg, regno); - if (err < 0) - return err; - } + err = check_func_arg_reg_off(env, reg, regno, arg_type, is_release_function(meta->func_id)); + if (err) + return err; skip_type_check: + /* check_func_arg_reg_off relies on only one referenced register being + * allowed for BPF helpers. + */ if (reg->ref_obj_id) { if (meta->ref_obj_id) { verbose(env, "verifier internal error: more than one arg with ref_obj_id R%d %u %u\n", @@ -5411,51 +5576,7 @@ skip_type_check: } else if (arg_type_is_mem_size(arg_type)) { bool zero_size_allowed = (arg_type == ARG_CONST_SIZE_OR_ZERO); - /* This is used to refine r0 return value bounds for helpers - * that enforce this value as an upper bound on return values. - * See do_refine_retval_range() for helpers that can refine - * the return value. C type of helper is u32 so we pull register - * bound from umax_value however, if negative verifier errors - * out. Only upper bounds can be learned because retval is an - * int type and negative retvals are allowed. - */ - meta->msize_max_value = reg->umax_value; - - /* The register is SCALAR_VALUE; the access check - * happens using its boundaries. - */ - if (!tnum_is_const(reg->var_off)) - /* For unprivileged variable accesses, disable raw - * mode so that the program is required to - * initialize all the memory that the helper could - * just partially fill up. - */ - meta = NULL; - - if (reg->smin_value < 0) { - verbose(env, "R%d min value is negative, either use unsigned or 'var &= const'\n", - regno); - return -EACCES; - } - - if (reg->umin_value == 0) { - err = check_helper_mem_access(env, regno - 1, 0, - zero_size_allowed, - meta); - if (err) - return err; - } - - if (reg->umax_value >= BPF_MAX_VAR_SIZ) { - verbose(env, "R%d unbounded memory access, use 'var &= const' or 'if (var < const)'\n", - regno); - return -EACCES; - } - err = check_helper_mem_access(env, regno - 1, - reg->umax_value, - zero_size_allowed, meta); - if (!err) - err = mark_chain_precision(env, regno); + err = check_mem_size_reg(env, reg, regno, zero_size_allowed, meta); } else if (arg_type_is_alloc_size(arg_type)) { if (!tnum_is_const(reg->var_off)) { verbose(env, "R%d is not a known constant'\n", @@ -6814,22 +6935,23 @@ static void mark_btf_func_reg_size(struct bpf_verifier_env *env, u32 regno, } } -static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn) +static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, + int *insn_idx_p) { const struct btf_type *t, *func, *func_proto, *ptr_type; struct bpf_reg_state *regs = cur_regs(env); const char *func_name, *ptr_type_name; u32 i, nargs, func_id, ptr_type_id; - struct module *btf_mod = NULL; + int err, insn_idx = *insn_idx_p; const struct btf_param *args; struct btf *desc_btf; - int err; + bool acq; /* skip for now, but return error when we find this in fixup_kfunc_call */ if (!insn->imm) return 0; - desc_btf = find_kfunc_desc_btf(env, insn->imm, insn->off, &btf_mod); + desc_btf = find_kfunc_desc_btf(env, insn->imm, insn->off); if (IS_ERR(desc_btf)) return PTR_ERR(desc_btf); @@ -6838,23 +6960,43 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn) func_name = btf_name_by_offset(desc_btf, func->name_off); func_proto = btf_type_by_id(desc_btf, func->type); - if (!env->ops->check_kfunc_call || - !env->ops->check_kfunc_call(func_id, btf_mod)) { + if (!btf_kfunc_id_set_contains(desc_btf, resolve_prog_type(env->prog), + BTF_KFUNC_TYPE_CHECK, func_id)) { verbose(env, "calling kernel function %s is not allowed\n", func_name); return -EACCES; } + acq = btf_kfunc_id_set_contains(desc_btf, resolve_prog_type(env->prog), + BTF_KFUNC_TYPE_ACQUIRE, func_id); + /* Check the arguments */ err = btf_check_kfunc_arg_match(env, desc_btf, func_id, regs); - if (err) + if (err < 0) return err; + /* In case of release function, we get register number of refcounted + * PTR_TO_BTF_ID back from btf_check_kfunc_arg_match, do the release now + */ + if (err) { + err = release_reference(env, regs[err].ref_obj_id); + if (err) { + verbose(env, "kfunc %s#%d reference has not been acquired before\n", + func_name, func_id); + return err; + } + } for (i = 0; i < CALLER_SAVED_REGS; i++) mark_reg_not_init(env, regs, caller_saved[i]); /* Check return type */ t = btf_type_skip_modifiers(desc_btf, func_proto->type, NULL); + + if (acq && !btf_type_is_ptr(t)) { + verbose(env, "acquire kernel function does not return PTR_TO_BTF_ID\n"); + return -EINVAL; + } + if (btf_type_is_scalar(t)) { mark_reg_unknown(env, regs, BPF_REG_0); mark_btf_func_reg_size(env, BPF_REG_0, t->size); @@ -6873,7 +7015,21 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn) regs[BPF_REG_0].btf = desc_btf; regs[BPF_REG_0].type = PTR_TO_BTF_ID; regs[BPF_REG_0].btf_id = ptr_type_id; + if (btf_kfunc_id_set_contains(desc_btf, resolve_prog_type(env->prog), + BTF_KFUNC_TYPE_RET_NULL, func_id)) { + regs[BPF_REG_0].type |= PTR_MAYBE_NULL; + /* For mark_ptr_or_null_reg, see 93c230e3f5bd6 */ + regs[BPF_REG_0].id = ++env->id_gen; + } mark_btf_func_reg_size(env, BPF_REG_0, sizeof(void *)); + if (acq) { + int id = acquire_reference_state(env, insn_idx); + + if (id < 0) + return id; + regs[BPF_REG_0].id = id; + regs[BPF_REG_0].ref_obj_id = id; + } } /* else { add_kfunc_call() ensures it is btf_type_is_void(t) } */ nargs = btf_type_vlen(func_proto); @@ -9507,16 +9663,19 @@ static int check_ld_imm(struct bpf_verifier_env *env, struct bpf_insn *insn) return 0; } - if (insn->src_reg == BPF_PSEUDO_BTF_ID) { - mark_reg_known_zero(env, regs, insn->dst_reg); + /* All special src_reg cases are listed below. From this point onwards + * we either succeed and assign a corresponding dst_reg->type after + * zeroing the offset, or fail and reject the program. + */ + mark_reg_known_zero(env, regs, insn->dst_reg); + if (insn->src_reg == BPF_PSEUDO_BTF_ID) { dst_reg->type = aux->btf_var.reg_type; switch (base_type(dst_reg->type)) { case PTR_TO_MEM: dst_reg->mem_size = aux->btf_var.mem_size; break; case PTR_TO_BTF_ID: - case PTR_TO_PERCPU_BTF_ID: dst_reg->btf = aux->btf_var.btf; dst_reg->btf_id = aux->btf_var.btf_id; break; @@ -9547,7 +9706,6 @@ static int check_ld_imm(struct bpf_verifier_env *env, struct bpf_insn *insn) } map = env->used_maps[aux->map_index]; - mark_reg_known_zero(env, regs, insn->dst_reg); dst_reg->map_ptr = map; if (insn->src_reg == BPF_PSEUDO_MAP_VALUE || @@ -9651,7 +9809,7 @@ static int check_ld_abs(struct bpf_verifier_env *env, struct bpf_insn *insn) return err; } - err = check_ctx_reg(env, ®s[ctx_reg], ctx_reg); + err = check_ptr_off_reg(env, ®s[ctx_reg], ctx_reg); if (err < 0) return err; @@ -10242,8 +10400,7 @@ static void adjust_btf_func(struct bpf_verifier_env *env) aux->func_info[i].insn_off = env->subprog_info[i].start; } -#define MIN_BPF_LINEINFO_SIZE (offsetof(struct bpf_line_info, line_col) + \ - sizeof(((struct bpf_line_info *)(0))->line_col)) +#define MIN_BPF_LINEINFO_SIZE offsetofend(struct bpf_line_info, line_col) #define MAX_LINEINFO_REC_SIZE MAX_FUNCINFO_REC_SIZE static int check_btf_line(struct bpf_verifier_env *env, @@ -11518,7 +11675,7 @@ static int do_check(struct bpf_verifier_env *env) if (insn->src_reg == BPF_PSEUDO_CALL) err = check_func_call(env, insn, &env->insn_idx); else if (insn->src_reg == BPF_PSEUDO_KFUNC_CALL) - err = check_kfunc_call(env, insn); + err = check_kfunc_call(env, insn, &env->insn_idx); else err = check_helper_call(env, insn, &env->insn_idx); if (err) @@ -11717,7 +11874,7 @@ static int check_pseudo_btf_id(struct bpf_verifier_env *env, type = t->type; t = btf_type_skip_modifiers(btf, type, NULL); if (percpu) { - aux->btf_var.reg_type = PTR_TO_PERCPU_BTF_ID; + aux->btf_var.reg_type = PTR_TO_BTF_ID | MEM_PERCPU; aux->btf_var.btf = btf; aux->btf_var.btf_id = type; } else if (!btf_type_is_struct(t)) { @@ -12866,6 +13023,7 @@ static int jit_subprogs(struct bpf_verifier_env *env) func[i]->aux->name[0] = 'F'; func[i]->aux->stack_depth = env->subprog_info[i].stack_depth; func[i]->jit_requested = 1; + func[i]->blinding_requested = prog->blinding_requested; func[i]->aux->kfunc_tab = prog->aux->kfunc_tab; func[i]->aux->kfunc_btf_tab = prog->aux->kfunc_btf_tab; func[i]->aux->linfo = prog->aux->linfo; @@ -12961,6 +13119,7 @@ static int jit_subprogs(struct bpf_verifier_env *env) prog->jited = 1; prog->bpf_func = func[0]->bpf_func; + prog->jited_len = func[0]->jited_len; prog->aux->func = func; prog->aux->func_cnt = env->subprog_cnt; bpf_prog_jit_attempt_done(prog); @@ -12988,6 +13147,7 @@ out_free: out_undo_insn: /* cleanup main prog to be interpreted */ prog->jit_requested = 0; + prog->blinding_requested = 0; for (i = 0, insn = prog->insnsi; i < prog->len; i++, insn++) { if (!bpf_pseudo_call(insn)) continue; @@ -13081,7 +13241,6 @@ static int do_misc_fixups(struct bpf_verifier_env *env) { struct bpf_prog *prog = env->prog; enum bpf_attach_type eatype = prog->expected_attach_type; - bool expect_blinding = bpf_jit_blinding_enabled(prog); enum bpf_prog_type prog_type = resolve_prog_type(prog); struct bpf_insn *insn = prog->insnsi; const struct bpf_func_proto *fn; @@ -13245,7 +13404,7 @@ static int do_misc_fixups(struct bpf_verifier_env *env) insn->code = BPF_JMP | BPF_TAIL_CALL; aux = &env->insn_aux_data[i + delta]; - if (env->bpf_capable && !expect_blinding && + if (env->bpf_capable && !prog->blinding_requested && prog->jit_requested && !bpf_map_key_poisoned(aux) && !bpf_map_ptr_poisoned(aux) && @@ -13333,6 +13492,26 @@ static int do_misc_fixups(struct bpf_verifier_env *env) goto patch_call_imm; } + if (insn->imm == BPF_FUNC_task_storage_get || + insn->imm == BPF_FUNC_sk_storage_get || + insn->imm == BPF_FUNC_inode_storage_get) { + if (env->prog->aux->sleepable) + insn_buf[0] = BPF_MOV64_IMM(BPF_REG_5, (__force __s32)GFP_KERNEL); + else + insn_buf[0] = BPF_MOV64_IMM(BPF_REG_5, (__force __s32)GFP_ATOMIC); + insn_buf[1] = *insn; + cnt = 2; + + new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt); + if (!new_prog) + return -ENOMEM; + + delta += cnt - 1; + env->prog = prog = new_prog; + insn = new_prog->insnsi + i + delta; + goto patch_call_imm; + } + /* BPF_EMIT_CALL() assumptions in some of the map_gen_lookup * and other inlining handlers are currently limited to 64 bit * only. diff --git a/kernel/capability.c b/kernel/capability.c index 46a361dde042..765194f5d678 100644 --- a/kernel/capability.c +++ b/kernel/capability.c @@ -360,6 +360,7 @@ bool has_capability_noaudit(struct task_struct *t, int cap) { return has_ns_capability_noaudit(t, &init_user_ns, cap); } +EXPORT_SYMBOL(has_capability_noaudit); static bool ns_capable_common(struct user_namespace *ns, int cap, diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c index 41e0837a5a0b..afc6c0e9c966 100644 --- a/kernel/cgroup/cgroup-v1.c +++ b/kernel/cgroup/cgroup-v1.c @@ -546,9 +546,19 @@ static ssize_t cgroup_release_agent_write(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { struct cgroup *cgrp; + struct cgroup_file_ctx *ctx; BUILD_BUG_ON(sizeof(cgrp->root->release_agent_path) < PATH_MAX); + /* + * Release agent gets called with all capabilities, + * require capabilities to set release agent. + */ + ctx = of->priv; + if ((ctx->ns->user_ns != &init_user_ns) || + !file_ns_capable(of->file, &init_user_ns, CAP_SYS_ADMIN)) + return -EPERM; + cgrp = cgroup_kn_lock_live(of->kn, false); if (!cgrp) return -ENODEV; @@ -954,6 +964,12 @@ int cgroup1_parse_param(struct fs_context *fc, struct fs_parameter *param) /* Specifying two release agents is forbidden */ if (ctx->release_agent) return invalfc(fc, "release_agent respecified"); + /* + * Release agent gets called with all capabilities, + * require capabilities to set release agent. + */ + if ((fc->user_ns != &init_user_ns) || !capable(CAP_SYS_ADMIN)) + return invalfc(fc, "Setting release_agent not allowed"); ctx->release_agent = param->string; param->string = NULL; break; diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index b31e1465868a..adb820e98f24 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -1302,7 +1302,7 @@ static struct css_set *find_css_set(struct css_set *old_cset, struct cgroup_root *cgroup_root_from_kf(struct kernfs_root *kf_root) { - struct cgroup *root_cgrp = kf_root->kn->priv; + struct cgroup *root_cgrp = kernfs_root_to_node(kf_root)->priv; return root_cgrp->root; } @@ -2025,7 +2025,7 @@ int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask) ret = PTR_ERR(root->kf_root); goto exit_root_id; } - root_cgrp->kn = root->kf_root->kn; + root_cgrp->kn = kernfs_root_to_node(root->kf_root); WARN_ON_ONCE(cgroup_ino(root_cgrp) != 1); root_cgrp->ancestor_ids[0] = cgroup_id(root_cgrp); @@ -3643,6 +3643,12 @@ static ssize_t cgroup_pressure_write(struct kernfs_open_file *of, char *buf, cgroup_get(cgrp); cgroup_kn_unlock(of->kn); + /* Allow only one trigger per file descriptor */ + if (ctx->psi.trigger) { + cgroup_put(cgrp); + return -EBUSY; + } + psi = cgroup_ino(cgrp) == 1 ? &psi_system : &cgrp->psi; new = psi_trigger_create(psi, buf, nbytes, res); if (IS_ERR(new)) { @@ -3650,8 +3656,7 @@ static ssize_t cgroup_pressure_write(struct kernfs_open_file *of, char *buf, return PTR_ERR(new); } - psi_trigger_replace(&ctx->psi.trigger, new); - + smp_store_release(&ctx->psi.trigger, new); cgroup_put(cgrp); return nbytes; @@ -3690,7 +3695,7 @@ static void cgroup_pressure_release(struct kernfs_open_file *of) { struct cgroup_file_ctx *ctx = of->priv; - psi_trigger_replace(&ctx->psi.trigger, NULL); + psi_trigger_destroy(ctx->psi.trigger); } bool cgroup_psi_enabled(void) @@ -6161,6 +6166,20 @@ static int cgroup_css_set_fork(struct kernel_clone_args *kargs) if (ret) goto err; + /* + * Spawning a task directly into a cgroup works by passing a file + * descriptor to the target cgroup directory. This can even be an O_PATH + * file descriptor. But it can never be a cgroup.procs file descriptor. + * This was done on purpose so spawning into a cgroup could be + * conceptualized as an atomic + * + * fd = openat(dfd_cgroup, "cgroup.procs", ...); + * write(fd, <child-pid>, ...); + * + * sequence, i.e. it's a shorthand for the caller opening and writing + * cgroup.procs of the cgroup indicated by @dfd_cgroup. This allows us + * to always use the caller's credentials. + */ ret = cgroup_attach_permissions(cset->dfl_cgrp, dst_cgrp, sb, !(kargs->flags & CLONE_THREAD), current->nsproxy->cgroup_ns); @@ -6224,6 +6243,7 @@ static void cgroup_css_set_put_fork(struct kernel_clone_args *kargs) /** * cgroup_can_fork - called on a new task before the process is exposed * @child: the child process + * @kargs: the arguments passed to create the child process * * This prepares a new css_set for the child process which the child will * be attached to in cgroup_post_fork(). @@ -6286,6 +6306,7 @@ void cgroup_cancel_fork(struct task_struct *child, /** * cgroup_post_fork - finalize cgroup setup for the child process * @child: the child process + * @kargs: the arguments passed to create the child process * * Attach the child process to its css_set calling the subsystem fork() * callbacks. diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index dc653ab26e50..9390bfd9f1cd 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -71,7 +71,7 @@ DEFINE_STATIC_KEY_FALSE(cpusets_enabled_key); /* * There could be abnormal cpuset configurations for cpu or memory - * node binding, add this key to provide a quick low-cost judgement + * node binding, add this key to provide a quick low-cost judgment * of the situation. */ DEFINE_STATIC_KEY_FALSE(cpusets_insane_config_key); @@ -591,6 +591,35 @@ static inline void free_cpuset(struct cpuset *cs) } /* + * validate_change_legacy() - Validate conditions specific to legacy (v1) + * behavior. + */ +static int validate_change_legacy(struct cpuset *cur, struct cpuset *trial) +{ + struct cgroup_subsys_state *css; + struct cpuset *c, *par; + int ret; + + WARN_ON_ONCE(!rcu_read_lock_held()); + + /* Each of our child cpusets must be a subset of us */ + ret = -EBUSY; + cpuset_for_each_child(c, css, cur) + if (!is_cpuset_subset(c, trial)) + goto out; + + /* On legacy hierarchy, we must be a subset of our parent cpuset. */ + ret = -EACCES; + par = parent_cs(cur); + if (par && !is_cpuset_subset(trial, par)) + goto out; + + ret = 0; +out: + return ret; +} + +/* * validate_change() - Used to validate that any proposed cpuset change * follows the structural rules for cpusets. * @@ -614,20 +643,21 @@ static int validate_change(struct cpuset *cur, struct cpuset *trial) { struct cgroup_subsys_state *css; struct cpuset *c, *par; - int ret; - - /* The checks don't apply to root cpuset */ - if (cur == &top_cpuset) - return 0; + int ret = 0; rcu_read_lock(); - par = parent_cs(cur); - /* On legacy hierarchy, we must be a subset of our parent cpuset. */ - ret = -EACCES; - if (!is_in_v2_mode() && !is_cpuset_subset(trial, par)) + if (!is_in_v2_mode()) + ret = validate_change_legacy(cur, trial); + if (ret) + goto out; + + /* Remaining checks don't apply to root cpuset */ + if (cur == &top_cpuset) goto out; + par = parent_cs(cur); + /* * If either I or some sibling (!= me) is exclusive, we can't * overlap @@ -803,7 +833,7 @@ static int generate_sched_domains(cpumask_var_t **domains, update_domain_attr_tree(dattr, &top_cpuset); } cpumask_and(doms[0], top_cpuset.effective_cpus, - housekeeping_cpumask(HK_FLAG_DOMAIN)); + housekeeping_cpumask(HK_TYPE_DOMAIN)); goto done; } @@ -833,7 +863,7 @@ static int generate_sched_domains(cpumask_var_t **domains, if (!cpumask_empty(cp->cpus_allowed) && !(is_sched_load_balance(cp) && cpumask_intersects(cp->cpus_allowed, - housekeeping_cpumask(HK_FLAG_DOMAIN)))) + housekeeping_cpumask(HK_TYPE_DOMAIN)))) continue; if (root_load_balance && @@ -922,7 +952,7 @@ restart: if (apn == b->pn) { cpumask_or(dp, dp, b->effective_cpus); - cpumask_and(dp, dp, housekeeping_cpumask(HK_FLAG_DOMAIN)); + cpumask_and(dp, dp, housekeeping_cpumask(HK_TYPE_DOMAIN)); if (dattr) update_domain_attr_tree(dattr + nslot, b); @@ -1151,7 +1181,7 @@ enum subparts_cmd { * effective_cpus. The function will return 0 if all the CPUs listed in * cpus_allowed can be granted or an error code will be returned. * - * For partcmd_disable, the cpuset is being transofrmed from a partition + * For partcmd_disable, the cpuset is being transformed from a partition * root back to a non-partition root. Any CPUs in cpus_allowed that are in * parent's subparts_cpus will be taken away from that cpumask and put back * into parent's effective_cpus. 0 should always be returned. @@ -1175,9 +1205,7 @@ enum subparts_cmd { * * Because of the implicit cpu exclusive nature of a partition root, * cpumask changes that violates the cpu exclusivity rule will not be - * permitted when checked by validate_change(). The validate_change() - * function will also prevent any changes to the cpu list if it is not - * a superset of children's cpu lists. + * permitted when checked by validate_change(). */ static int update_parent_subparts_cpumask(struct cpuset *cpuset, int cmd, struct cpumask *newmask, @@ -1522,10 +1550,15 @@ static void update_sibling_cpumasks(struct cpuset *parent, struct cpuset *cs, struct cpuset *sibling; struct cgroup_subsys_state *pos_css; + percpu_rwsem_assert_held(&cpuset_rwsem); + /* * Check all its siblings and call update_cpumasks_hier() * if their use_parent_ecpus flag is set in order for them * to use the right effective_cpus value. + * + * The update_cpumasks_hier() function may sleep. So we have to + * release the RCU read lock before calling it. */ rcu_read_lock(); cpuset_for_each_child(sibling, pos_css, parent) { @@ -1533,8 +1566,13 @@ static void update_sibling_cpumasks(struct cpuset *parent, struct cpuset *cs, continue; if (!sibling->use_parent_ecpus) continue; + if (!css_tryget_online(&sibling->css)) + continue; + rcu_read_unlock(); update_cpumasks_hier(sibling, tmp); + rcu_read_lock(); + css_put(&sibling->css); } rcu_read_unlock(); } @@ -1607,8 +1645,7 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs, * Make sure that subparts_cpus is a subset of cpus_allowed. */ if (cs->nr_subparts_cpus) { - cpumask_andnot(cs->subparts_cpus, cs->subparts_cpus, - cs->cpus_allowed); + cpumask_and(cs->subparts_cpus, cs->subparts_cpus, cs->cpus_allowed); cs->nr_subparts_cpus = cpumask_weight(cs->subparts_cpus); } spin_unlock_irq(&callback_lock); @@ -1990,7 +2027,7 @@ out: } /* - * update_prstate - update partititon_root_state + * update_prstate - update partition_root_state * cs: the cpuset to update * new_prs: new partition root state * @@ -2252,6 +2289,7 @@ static void cpuset_attach(struct cgroup_taskset *tset) cgroup_taskset_first(tset, &css); cs = css_cs(css); + cpus_read_lock(); percpu_down_write(&cpuset_rwsem); guarantee_online_mems(cs, &cpuset_attach_nodemask_to); @@ -2305,6 +2343,7 @@ static void cpuset_attach(struct cgroup_taskset *tset) wake_up(&cpuset_attach_wq); percpu_up_write(&cpuset_rwsem); + cpus_read_unlock(); } /* The various types of files and directories in a cpuset file system */ @@ -2840,7 +2879,7 @@ static int cpuset_css_online(struct cgroup_subsys_state *css) /* * Clone @parent's configuration if CGRP_CPUSET_CLONE_CHILDREN is * set. This flag handling is implemented in cgroup core for - * histrical reasons - the flag may be specified during mount. + * historical reasons - the flag may be specified during mount. * * Currently, if any sibling cpusets have exclusive cpus or mem, we * refuse to clone the configuration - thereby refusing the task to @@ -3037,7 +3076,7 @@ hotplug_update_tasks_legacy(struct cpuset *cs, /* * Don't call update_tasks_cpumask() if the cpuset becomes empty, - * as the tasks will be migratecd to an ancestor. + * as the tasks will be migrated to an ancestor. */ if (cpus_updated && !cpumask_empty(cs->cpus_allowed)) update_tasks_cpumask(cs); @@ -3485,8 +3524,8 @@ static struct cpuset *nearest_hardwall_ancestor(struct cpuset *cs) return cs; } -/** - * cpuset_node_allowed - Can we allocate on a memory node? +/* + * __cpuset_node_allowed - Can we allocate on a memory node? * @node: is this an allowed node? * @gfp_mask: memory allocation flags * @@ -3657,8 +3696,8 @@ void cpuset_print_current_mems_allowed(void) int cpuset_memory_pressure_enabled __read_mostly; -/** - * cpuset_memory_pressure_bump - keep stats of per-cpuset reclaims. +/* + * __cpuset_memory_pressure_bump - keep stats of per-cpuset reclaims. * * Keep a running average of the rate of synchronous (direct) * page reclaim efforts initiated by tasks in each cpuset. @@ -3673,7 +3712,7 @@ int cpuset_memory_pressure_enabled __read_mostly; * "memory_pressure". Value displayed is an integer * representing the recent rate of entry into the synchronous * (direct) page reclaim by any task attached to the cpuset. - **/ + */ void __cpuset_memory_pressure_bump(void) { diff --git a/kernel/cgroup/freezer.c b/kernel/cgroup/freezer.c index 3984dd6b8ddb..617861a54793 100644 --- a/kernel/cgroup/freezer.c +++ b/kernel/cgroup/freezer.c @@ -1,4 +1,4 @@ -//SPDX-License-Identifier: GPL-2.0 +// SPDX-License-Identifier: GPL-2.0 #include <linux/cgroup.h> #include <linux/sched.h> #include <linux/sched/task.h> diff --git a/kernel/cgroup/rstat.c b/kernel/cgroup/rstat.c index 9d331ba44870..24b5c2ab5598 100644 --- a/kernel/cgroup/rstat.c +++ b/kernel/cgroup/rstat.c @@ -153,8 +153,17 @@ static void cgroup_rstat_flush_locked(struct cgroup *cgrp, bool may_sleep) raw_spinlock_t *cpu_lock = per_cpu_ptr(&cgroup_rstat_cpu_lock, cpu); struct cgroup *pos = NULL; + unsigned long flags; - raw_spin_lock(cpu_lock); + /* + * The _irqsave() is needed because cgroup_rstat_lock is + * spinlock_t which is a sleeping lock on PREEMPT_RT. Acquiring + * this lock with the _irq() suffix only disables interrupts on + * a non-PREEMPT_RT kernel. The raw_spinlock_t below disables + * interrupts on both configurations. The _irqsave() ensures + * that interrupts are always disabled and later restored. + */ + raw_spin_lock_irqsave(cpu_lock, flags); while ((pos = cgroup_rstat_cpu_pop_updated(pos, cgrp, cpu))) { struct cgroup_subsys_state *css; @@ -166,7 +175,7 @@ static void cgroup_rstat_flush_locked(struct cgroup *cgrp, bool may_sleep) css->ss->css_rstat_flush(css, cpu); rcu_read_unlock(); } - raw_spin_unlock(cpu_lock); + raw_spin_unlock_irqrestore(cpu_lock, flags); /* if @may_sleep, play nice and yield if necessary */ if (may_sleep && (need_resched() || @@ -315,7 +324,7 @@ static void cgroup_base_stat_flush(struct cgroup *cgrp, int cpu) { struct cgroup_rstat_cpu *rstatc = cgroup_rstat_cpu(cgrp, cpu); struct cgroup *parent = cgroup_parent(cgrp); - struct cgroup_base_stat cur, delta; + struct cgroup_base_stat delta; unsigned seq; /* Root-level stats are sourced from system-wide CPU stats */ @@ -325,11 +334,10 @@ static void cgroup_base_stat_flush(struct cgroup *cgrp, int cpu) /* fetch the current per-cpu values */ do { seq = __u64_stats_fetch_begin(&rstatc->bsync); - cur.cputime = rstatc->bstat.cputime; + delta = rstatc->bstat; } while (__u64_stats_fetch_retry(&rstatc->bsync, seq)); /* propagate percpu delta to global */ - delta = cur; cgroup_base_stat_sub(&delta, &rstatc->last_bstat); cgroup_base_stat_add(&cgrp->bstat, &delta); cgroup_base_stat_add(&rstatc->last_bstat, &delta); diff --git a/kernel/configs/android-recommended.config b/kernel/configs/android-recommended.config index eb0029c9a6a6..e400fbbc8aba 100644 --- a/kernel/configs/android-recommended.config +++ b/kernel/configs/android-recommended.config @@ -1,5 +1,5 @@ # KEEP ALPHABETICALLY SORTED -# CONFIG_AIO is not set +# CONFIG_BPF_UNPRIV_DEFAULT_OFF is not set # CONFIG_CORE_DUMP_DEFAULT_ELF_HEADERS is not set # CONFIG_INPUT_MOUSE is not set # CONFIG_LEGACY_PTYS is not set diff --git a/kernel/configs/debug.config b/kernel/configs/debug.config new file mode 100644 index 000000000000..e8db8d938661 --- /dev/null +++ b/kernel/configs/debug.config @@ -0,0 +1,106 @@ +# The config is based on running daily CI for enterprise Linux distros to +# seek regressions on linux-next builds on different bare-metal and virtual +# platforms. It can be used for example, +# +# $ make ARCH=arm64 defconfig debug.config +# +# Keep alphabetically sorted inside each section. +# +# printk and dmesg options +# +CONFIG_DEBUG_BUGVERBOSE=y +CONFIG_DYNAMIC_DEBUG=y +CONFIG_PRINTK_CALLER=y +CONFIG_PRINTK_TIME=y +CONFIG_SYMBOLIC_ERRNAME=y +# +# Compile-time checks and compiler options +# +CONFIG_DEBUG_INFO=y +CONFIG_DEBUG_INFO_DWARF_TOOLCHAIN_DEFAULT=y +CONFIG_DEBUG_SECTION_MISMATCH=y +CONFIG_FRAME_WARN=2048 +CONFIG_SECTION_MISMATCH_WARN_ONLY=y +# +# Generic Kernel Debugging Instruments +# +# CONFIG_UBSAN_ALIGNMENT is not set +# CONFIG_UBSAN_DIV_ZERO is not set +# CONFIG_UBSAN_TRAP is not set +# CONFIG_WARN_ALL_UNSEEDED_RANDOM is not set +CONFIG_DEBUG_FS=y +CONFIG_DEBUG_FS_ALLOW_ALL=y +CONFIG_DEBUG_IRQFLAGS=y +CONFIG_UBSAN=y +CONFIG_UBSAN_BOOL=y +CONFIG_UBSAN_BOUNDS=y +CONFIG_UBSAN_ENUM=y +CONFIG_UBSAN_SHIFT=y +CONFIG_UBSAN_UNREACHABLE=y +# +# Memory Debugging +# +# CONFIG_DEBUG_PAGEALLOC is not set +# CONFIG_DEBUG_KMEMLEAK_DEFAULT_OFF is not set +# CONFIG_DEBUG_RODATA_TEST is not set +# CONFIG_DEBUG_WX is not set +# CONFIG_KFENCE is not set +# CONFIG_PAGE_POISONING is not set +# CONFIG_SLUB_STATS is not set +CONFIG_PAGE_EXTENSION=y +CONFIG_PAGE_OWNER=y +CONFIG_DEBUG_KMEMLEAK=y +CONFIG_DEBUG_KMEMLEAK_AUTO_SCAN=y +CONFIG_DEBUG_OBJECTS=y +CONFIG_DEBUG_OBJECTS_ENABLE_DEFAULT=1 +CONFIG_DEBUG_OBJECTS_FREE=y +CONFIG_DEBUG_OBJECTS_PERCPU_COUNTER=y +CONFIG_DEBUG_OBJECTS_RCU_HEAD=y +CONFIG_DEBUG_OBJECTS_TIMERS=y +CONFIG_DEBUG_OBJECTS_WORK=y +CONFIG_DEBUG_PER_CPU_MAPS=y +CONFIG_DEBUG_STACK_USAGE=y +CONFIG_DEBUG_VIRTUAL=y +CONFIG_DEBUG_VM=y +CONFIG_DEBUG_VM_PGFLAGS=y +CONFIG_DEBUG_VM_RB=y +CONFIG_DEBUG_VM_VMACACHE=y +CONFIG_GENERIC_PTDUMP=y +CONFIG_KASAN=y +CONFIG_KASAN_GENERIC=y +CONFIG_KASAN_INLINE=y +CONFIG_KASAN_VMALLOC=y +CONFIG_PTDUMP_DEBUGFS=y +CONFIG_SCHED_STACK_END_CHECK=y +CONFIG_SLUB_DEBUG_ON=y +# +# Debug Oops, Lockups and Hangs +# +# CONFIG_BOOTPARAM_HUNG_TASK_PANIC is not set +# CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC is not set +CONFIG_DEBUG_ATOMIC_SLEEP=y +CONFIG_DETECT_HUNG_TASK=y +CONFIG_PANIC_ON_OOPS=y +CONFIG_PANIC_TIMEOUT=0 +CONFIG_SOFTLOCKUP_DETECTOR=y +# +# Lock Debugging (spinlocks, mutexes, etc...) +# +# CONFIG_PROVE_RAW_LOCK_NESTING is not set +CONFIG_PROVE_LOCKING=y +# +# Debug kernel data structures +# +CONFIG_BUG_ON_DATA_CORRUPTION=y +# +# RCU Debugging +# +CONFIG_PROVE_RCU=y +CONFIG_PROVE_RCU_LIST=y +# +# Tracers +# +CONFIG_BRANCH_PROFILE_NONE=y +CONFIG_DYNAMIC_FTRACE=y +CONFIG_FTRACE=y +CONFIG_FUNCTION_TRACER=y diff --git a/kernel/cpu.c b/kernel/cpu.c index 407a2568f35e..5797c2a7a93f 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -34,6 +34,7 @@ #include <linux/scs.h> #include <linux/percpu-rwsem.h> #include <linux/cpuset.h> +#include <linux/random.h> #include <trace/events/power.h> #define CREATE_TRACE_POINTS @@ -1488,8 +1489,8 @@ int freeze_secondary_cpus(int primary) cpu_maps_update_begin(); if (primary == -1) { primary = cpumask_first(cpu_online_mask); - if (!housekeeping_cpu(primary, HK_FLAG_TIMER)) - primary = housekeeping_any_cpu(HK_FLAG_TIMER); + if (!housekeeping_cpu(primary, HK_TYPE_TIMER)) + primary = housekeeping_any_cpu(HK_TYPE_TIMER); } else { if (!cpu_online(primary)) primary = cpumask_first(cpu_online_mask); @@ -1659,6 +1660,11 @@ static struct cpuhp_step cpuhp_hp_states[] = { .startup.single = perf_event_init_cpu, .teardown.single = perf_event_exit_cpu, }, + [CPUHP_RANDOM_PREPARE] = { + .name = "random:prepare", + .startup.single = random_prepare_cpu, + .teardown.single = NULL, + }, [CPUHP_WORKQUEUE_PREP] = { .name = "workqueue:prepare", .startup.single = workqueue_prepare_cpu, @@ -1782,6 +1788,11 @@ static struct cpuhp_step cpuhp_hp_states[] = { .startup.single = workqueue_online_cpu, .teardown.single = workqueue_offline_cpu, }, + [CPUHP_AP_RANDOM_ONLINE] = { + .name = "random:online", + .startup.single = random_online_cpu, + .teardown.single = NULL, + }, [CPUHP_AP_RCUTREE_ONLINE] = { .name = "RCU/tree:online", .startup.single = rcutree_online_cpu, diff --git a/kernel/cred.c b/kernel/cred.c index 473d17c431f3..e10c15f51c1f 100644 --- a/kernel/cred.c +++ b/kernel/cred.c @@ -665,21 +665,16 @@ EXPORT_SYMBOL(cred_fscmp); int set_cred_ucounts(struct cred *new) { - struct task_struct *task = current; - const struct cred *old = task->real_cred; struct ucounts *new_ucounts, *old_ucounts = new->ucounts; - if (new->user == old->user && new->user_ns == old->user_ns) - return 0; - /* * This optimization is needed because alloc_ucounts() uses locks * for table lookups. */ - if (old_ucounts->ns == new->user_ns && uid_eq(old_ucounts->uid, new->euid)) + if (old_ucounts->ns == new->user_ns && uid_eq(old_ucounts->uid, new->uid)) return 0; - if (!(new_ucounts = alloc_ucounts(new->user_ns, new->euid))) + if (!(new_ucounts = alloc_ucounts(new->user_ns, new->uid))) return -EAGAIN; new->ucounts = new_ucounts; @@ -875,7 +870,7 @@ static void dump_invalid_creds(const struct cred *cred, const char *label, /* * report use of invalid credentials */ -void __invalid_creds(const struct cred *cred, const char *file, unsigned line) +void __noreturn __invalid_creds(const struct cred *cred, const char *file, unsigned line) { printk(KERN_ERR "CRED: Invalid credentials\n"); printk(KERN_ERR "CRED: At %s:%u\n", file, line); diff --git a/kernel/delayacct.c b/kernel/delayacct.c index 51530d5b15a8..c5e8cea9e05f 100644 --- a/kernel/delayacct.c +++ b/kernel/delayacct.c @@ -100,19 +100,10 @@ void __delayacct_blkio_start(void) */ void __delayacct_blkio_end(struct task_struct *p) { - struct task_delay_info *delays = p->delays; - u64 *total; - u32 *count; - - if (p->delays->flags & DELAYACCT_PF_SWAPIN) { - total = &delays->swapin_delay; - count = &delays->swapin_count; - } else { - total = &delays->blkio_delay; - count = &delays->blkio_count; - } - - delayacct_end(&delays->lock, &delays->blkio_start, total, count); + delayacct_end(&p->delays->lock, + &p->delays->blkio_start, + &p->delays->blkio_delay, + &p->delays->blkio_count); } int delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk) @@ -164,10 +155,13 @@ int delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk) d->freepages_delay_total = (tmp < d->freepages_delay_total) ? 0 : tmp; tmp = d->thrashing_delay_total + tsk->delays->thrashing_delay; d->thrashing_delay_total = (tmp < d->thrashing_delay_total) ? 0 : tmp; + tmp = d->compact_delay_total + tsk->delays->compact_delay; + d->compact_delay_total = (tmp < d->compact_delay_total) ? 0 : tmp; d->blkio_count += tsk->delays->blkio_count; d->swapin_count += tsk->delays->swapin_count; d->freepages_count += tsk->delays->freepages_count; d->thrashing_count += tsk->delays->thrashing_count; + d->compact_count += tsk->delays->compact_count; raw_spin_unlock_irqrestore(&tsk->delays->lock, flags); return 0; @@ -179,8 +173,7 @@ __u64 __delayacct_blkio_ticks(struct task_struct *tsk) unsigned long flags; raw_spin_lock_irqsave(&tsk->delays->lock, flags); - ret = nsec_to_clock_t(tsk->delays->blkio_delay + - tsk->delays->swapin_delay); + ret = nsec_to_clock_t(tsk->delays->blkio_delay); raw_spin_unlock_irqrestore(&tsk->delays->lock, flags); return ret; } @@ -210,3 +203,29 @@ void __delayacct_thrashing_end(void) ¤t->delays->thrashing_delay, ¤t->delays->thrashing_count); } + +void __delayacct_swapin_start(void) +{ + current->delays->swapin_start = local_clock(); +} + +void __delayacct_swapin_end(void) +{ + delayacct_end(¤t->delays->lock, + ¤t->delays->swapin_start, + ¤t->delays->swapin_delay, + ¤t->delays->swapin_count); +} + +void __delayacct_compact_start(void) +{ + current->delays->compact_start = local_clock(); +} + +void __delayacct_compact_end(void) +{ + delayacct_end(¤t->delays->lock, + ¤t->delays->compact_start, + ¤t->delays->compact_delay, + ¤t->delays->compact_count); +} diff --git a/kernel/dma/contiguous.c b/kernel/dma/contiguous.c index 3d63d91cba5c..6ea80ae42622 100644 --- a/kernel/dma/contiguous.c +++ b/kernel/dma/contiguous.c @@ -399,8 +399,6 @@ static const struct reserved_mem_ops rmem_cma_ops = { static int __init rmem_cma_setup(struct reserved_mem *rmem) { - phys_addr_t align = PAGE_SIZE << max(MAX_ORDER - 1, pageblock_order); - phys_addr_t mask = align - 1; unsigned long node = rmem->fdt_node; bool default_cma = of_get_flat_dt_prop(node, "linux,cma-default", NULL); struct cma *cma; @@ -416,7 +414,7 @@ static int __init rmem_cma_setup(struct reserved_mem *rmem) of_get_flat_dt_prop(node, "no-map", NULL)) return -EINVAL; - if ((rmem->base & mask) || (rmem->size & mask)) { + if (!IS_ALIGNED(rmem->base | rmem->size, CMA_MIN_ALIGNMENT_BYTES)) { pr_err("Reserved memory: incorrect alignment of CMA region\n"); return -EINVAL; } diff --git a/kernel/dma/mapping.c b/kernel/dma/mapping.c index 9478eccd1c8e..559461a826ba 100644 --- a/kernel/dma/mapping.c +++ b/kernel/dma/mapping.c @@ -745,7 +745,6 @@ int dma_set_mask(struct device *dev, u64 mask) } EXPORT_SYMBOL(dma_set_mask); -#ifndef CONFIG_ARCH_HAS_DMA_SET_COHERENT_MASK int dma_set_coherent_mask(struct device *dev, u64 mask) { /* @@ -761,7 +760,6 @@ int dma_set_coherent_mask(struct device *dev, u64 mask) return 0; } EXPORT_SYMBOL(dma_set_coherent_mask); -#endif size_t dma_max_mapping_size(struct device *dev) { diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c index 8e840fbbed7c..6c350555e5a1 100644 --- a/kernel/dma/swiotlb.c +++ b/kernel/dma/swiotlb.c @@ -50,6 +50,7 @@ #include <asm/io.h> #include <asm/dma.h> +#include <linux/io.h> #include <linux/init.h> #include <linux/memblock.h> #include <linux/iommu-helper.h> @@ -72,6 +73,8 @@ enum swiotlb_force swiotlb_force; struct io_tlb_mem io_tlb_default_mem; +phys_addr_t swiotlb_unencrypted_base; + /* * Max segment that we can provide which (if pages are contingous) will * not be bounced (unless SWIOTLB_FORCE is set). @@ -156,6 +159,34 @@ static inline unsigned long nr_slots(u64 val) } /* + * Remap swioltb memory in the unencrypted physical address space + * when swiotlb_unencrypted_base is set. (e.g. for Hyper-V AMD SEV-SNP + * Isolation VMs). + */ +#ifdef CONFIG_HAS_IOMEM +static void *swiotlb_mem_remap(struct io_tlb_mem *mem, unsigned long bytes) +{ + void *vaddr = NULL; + + if (swiotlb_unencrypted_base) { + phys_addr_t paddr = mem->start + swiotlb_unencrypted_base; + + vaddr = memremap(paddr, bytes, MEMREMAP_WB); + if (!vaddr) + pr_err("Failed to map the unencrypted memory %pa size %lx.\n", + &paddr, bytes); + } + + return vaddr; +} +#else +static void *swiotlb_mem_remap(struct io_tlb_mem *mem, unsigned long bytes) +{ + return NULL; +} +#endif + +/* * Early SWIOTLB allocation may be too early to allow an architecture to * perform the desired operations. This function allows the architecture to * call SWIOTLB when the operations are possible. It needs to be called @@ -172,7 +203,12 @@ void __init swiotlb_update_mem_attributes(void) vaddr = phys_to_virt(mem->start); bytes = PAGE_ALIGN(mem->nslabs << IO_TLB_SHIFT); set_memory_decrypted((unsigned long)vaddr, bytes >> PAGE_SHIFT); - memset(vaddr, 0, bytes); + + mem->vaddr = swiotlb_mem_remap(mem, bytes); + if (!mem->vaddr) + mem->vaddr = vaddr; + + memset(mem->vaddr, 0, bytes); } static void swiotlb_init_io_tlb_mem(struct io_tlb_mem *mem, phys_addr_t start, @@ -196,7 +232,17 @@ static void swiotlb_init_io_tlb_mem(struct io_tlb_mem *mem, phys_addr_t start, mem->slots[i].orig_addr = INVALID_PHYS_ADDR; mem->slots[i].alloc_size = 0; } + + /* + * If swiotlb_unencrypted_base is set, the bounce buffer memory will + * be remapped and cleared in swiotlb_update_mem_attributes. + */ + if (swiotlb_unencrypted_base) + return; + memset(vaddr, 0, bytes); + mem->vaddr = vaddr; + return; } int __init swiotlb_init_with_tbl(char *tlb, unsigned long nslabs, int verbose) @@ -371,7 +417,7 @@ static void swiotlb_bounce(struct device *dev, phys_addr_t tlb_addr, size_t size phys_addr_t orig_addr = mem->slots[index].orig_addr; size_t alloc_size = mem->slots[index].alloc_size; unsigned long pfn = PFN_DOWN(orig_addr); - unsigned char *vaddr = phys_to_virt(tlb_addr); + unsigned char *vaddr = mem->vaddr + tlb_addr - mem->start; unsigned int tlb_offset, orig_addr_offset; if (orig_addr == INVALID_PHYS_ADDR) @@ -581,9 +627,14 @@ phys_addr_t swiotlb_tbl_map_single(struct device *dev, phys_addr_t orig_addr, for (i = 0; i < nr_slots(alloc_size + offset); i++) mem->slots[index + i].orig_addr = slot_addr(orig_addr, i); tlb_addr = slot_addr(mem->start, index) + offset; - if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC) && - (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL)) - swiotlb_bounce(dev, tlb_addr, mapping_size, DMA_TO_DEVICE); + /* + * When dir == DMA_FROM_DEVICE we could omit the copy from the orig + * to the tlb buffer, if we knew for sure the device will + * overwirte the entire current content. But we don't. Thus + * unconditional bounce may prevent leaking swiotlb content (i.e. + * kernel memory) to user-space. + */ + swiotlb_bounce(dev, tlb_addr, mapping_size, DMA_TO_DEVICE); return tlb_addr; } diff --git a/kernel/entry/common.c b/kernel/entry/common.c index bad713684c2e..ed10a95a6b1d 100644 --- a/kernel/entry/common.c +++ b/kernel/entry/common.c @@ -3,6 +3,7 @@ #include <linux/context_tracking.h> #include <linux/entry-common.h> #include <linux/highmem.h> +#include <linux/jump_label.h> #include <linux/livepatch.h> #include <linux/audit.h> #include <linux/tick.h> @@ -148,6 +149,18 @@ static void handle_signal_work(struct pt_regs *regs, unsigned long ti_work) arch_do_signal_or_restart(regs, ti_work & _TIF_SIGPENDING); } +#ifdef CONFIG_RT_DELAYED_SIGNALS +static inline void raise_delayed_signal(void) +{ + if (unlikely(current->forced_info.si_signo)) { + force_sig_info(¤t->forced_info); + current->forced_info.si_signo = 0; + } +} +#else +static inline void raise_delayed_signal(void) { } +#endif + static unsigned long exit_to_user_mode_loop(struct pt_regs *regs, unsigned long ti_work) { @@ -162,6 +175,8 @@ static unsigned long exit_to_user_mode_loop(struct pt_regs *regs, if (ti_work & _TIF_NEED_RESCHED) schedule(); + raise_delayed_signal(); + if (ti_work & _TIF_UPROBE) uprobe_notify_resume(regs); @@ -380,7 +395,7 @@ noinstr irqentry_state_t irqentry_enter(struct pt_regs *regs) return ret; } -void irqentry_exit_cond_resched(void) +void raw_irqentry_exit_cond_resched(void) { if (!preempt_count()) { /* Sanity check RCU and thread stack */ @@ -392,7 +407,17 @@ void irqentry_exit_cond_resched(void) } } #ifdef CONFIG_PREEMPT_DYNAMIC -DEFINE_STATIC_CALL(irqentry_exit_cond_resched, irqentry_exit_cond_resched); +#if defined(CONFIG_HAVE_PREEMPT_DYNAMIC_CALL) +DEFINE_STATIC_CALL(irqentry_exit_cond_resched, raw_irqentry_exit_cond_resched); +#elif defined(CONFIG_HAVE_PREEMPT_DYNAMIC_KEY) +DEFINE_STATIC_KEY_TRUE(sk_dynamic_irqentry_exit_cond_resched); +void dynamic_irqentry_exit_cond_resched(void) +{ + if (!static_key_unlikely(&sk_dynamic_irqentry_exit_cond_resched)) + return; + raw_irqentry_exit_cond_resched(); +} +#endif #endif noinstr void irqentry_exit(struct pt_regs *regs, irqentry_state_t state) @@ -420,13 +445,9 @@ noinstr void irqentry_exit(struct pt_regs *regs, irqentry_state_t state) } instrumentation_begin(); - if (IS_ENABLED(CONFIG_PREEMPTION)) { -#ifdef CONFIG_PREEMPT_DYNAMIC - static_call(irqentry_exit_cond_resched)(); -#else + if (IS_ENABLED(CONFIG_PREEMPTION)) irqentry_exit_cond_resched(); -#endif - } + /* Covers both tracing and lockdep */ trace_hardirqs_on(); instrumentation_end(); diff --git a/kernel/events/callchain.c b/kernel/events/callchain.c index 58cbe357fb2b..1273be84392c 100644 --- a/kernel/events/callchain.c +++ b/kernel/events/callchain.c @@ -209,17 +209,13 @@ get_perf_callchain(struct pt_regs *regs, u32 init_nr, bool kernel, bool user, } if (regs) { - mm_segment_t fs; - if (crosstask) goto exit_put; if (add_mark) perf_callchain_store_context(&ctx, PERF_CONTEXT_USER); - fs = force_uaccess_begin(); perf_callchain_user(&ctx, regs); - force_uaccess_end(fs); } } diff --git a/kernel/events/core.c b/kernel/events/core.c index fc18664f49b0..cfde994ce61c 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -674,6 +674,23 @@ perf_event_set_state(struct perf_event *event, enum perf_event_state state) WRITE_ONCE(event->state, state); } +/* + * UP store-release, load-acquire + */ + +#define __store_release(ptr, val) \ +do { \ + barrier(); \ + WRITE_ONCE(*(ptr), (val)); \ +} while (0) + +#define __load_acquire(ptr) \ +({ \ + __unqual_scalar_typeof(*(ptr)) ___p = READ_ONCE(*(ptr)); \ + barrier(); \ + ___p; \ +}) + #ifdef CONFIG_CGROUP_PERF static inline bool @@ -719,34 +736,51 @@ static inline u64 perf_cgroup_event_time(struct perf_event *event) return t->time; } -static inline void __update_cgrp_time(struct perf_cgroup *cgrp) +static inline u64 perf_cgroup_event_time_now(struct perf_event *event, u64 now) { - struct perf_cgroup_info *info; - u64 now; - - now = perf_clock(); + struct perf_cgroup_info *t; - info = this_cpu_ptr(cgrp->info); + t = per_cpu_ptr(event->cgrp->info, event->cpu); + if (!__load_acquire(&t->active)) + return t->time; + now += READ_ONCE(t->timeoffset); + return now; +} - info->time += now - info->timestamp; +static inline void __update_cgrp_time(struct perf_cgroup_info *info, u64 now, bool adv) +{ + if (adv) + info->time += now - info->timestamp; info->timestamp = now; + /* + * see update_context_time() + */ + WRITE_ONCE(info->timeoffset, info->time - info->timestamp); } -static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx) +static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx, bool final) { struct perf_cgroup *cgrp = cpuctx->cgrp; struct cgroup_subsys_state *css; + struct perf_cgroup_info *info; if (cgrp) { + u64 now = perf_clock(); + for (css = &cgrp->css; css; css = css->parent) { cgrp = container_of(css, struct perf_cgroup, css); - __update_cgrp_time(cgrp); + info = this_cpu_ptr(cgrp->info); + + __update_cgrp_time(info, now, true); + if (final) + __store_release(&info->active, 0); } } } static inline void update_cgrp_time_from_event(struct perf_event *event) { + struct perf_cgroup_info *info; struct perf_cgroup *cgrp; /* @@ -760,8 +794,10 @@ static inline void update_cgrp_time_from_event(struct perf_event *event) /* * Do not update time when cgroup is not active */ - if (cgroup_is_descendant(cgrp->css.cgroup, event->cgrp->css.cgroup)) - __update_cgrp_time(event->cgrp); + if (cgroup_is_descendant(cgrp->css.cgroup, event->cgrp->css.cgroup)) { + info = this_cpu_ptr(event->cgrp->info); + __update_cgrp_time(info, perf_clock(), true); + } } static inline void @@ -785,7 +821,8 @@ perf_cgroup_set_timestamp(struct task_struct *task, for (css = &cgrp->css; css; css = css->parent) { cgrp = container_of(css, struct perf_cgroup, css); info = this_cpu_ptr(cgrp->info); - info->timestamp = ctx->timestamp; + __update_cgrp_time(info, ctx->timestamp, false); + __store_release(&info->active, 1); } } @@ -802,7 +839,7 @@ static DEFINE_PER_CPU(struct list_head, cgrp_cpuctx_list); */ static void perf_cgroup_switch(struct task_struct *task, int mode) { - struct perf_cpu_context *cpuctx; + struct perf_cpu_context *cpuctx, *tmp; struct list_head *list; unsigned long flags; @@ -813,7 +850,7 @@ static void perf_cgroup_switch(struct task_struct *task, int mode) local_irq_save(flags); list = this_cpu_ptr(&cgrp_cpuctx_list); - list_for_each_entry(cpuctx, list, cgrp_cpuctx_entry) { + list_for_each_entry_safe(cpuctx, tmp, list, cgrp_cpuctx_entry) { WARN_ON_ONCE(cpuctx->ctx.nr_cgroups == 0); perf_ctx_lock(cpuctx, cpuctx->task_ctx); @@ -982,14 +1019,6 @@ out: } static inline void -perf_cgroup_set_shadow_time(struct perf_event *event, u64 now) -{ - struct perf_cgroup_info *t; - t = per_cpu_ptr(event->cgrp->info, event->cpu); - event->shadow_ctx_time = now - t->timestamp; -} - -static inline void perf_cgroup_event_enable(struct perf_event *event, struct perf_event_context *ctx) { struct perf_cpu_context *cpuctx; @@ -1066,7 +1095,8 @@ static inline void update_cgrp_time_from_event(struct perf_event *event) { } -static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx) +static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx, + bool final) { } @@ -1098,12 +1128,12 @@ perf_cgroup_switch(struct task_struct *task, struct task_struct *next) { } -static inline void -perf_cgroup_set_shadow_time(struct perf_event *event, u64 now) +static inline u64 perf_cgroup_event_time(struct perf_event *event) { + return 0; } -static inline u64 perf_cgroup_event_time(struct perf_event *event) +static inline u64 perf_cgroup_event_time_now(struct perf_event *event, u64 now) { return 0; } @@ -1525,22 +1555,59 @@ static void perf_unpin_context(struct perf_event_context *ctx) /* * Update the record of the current time in a context. */ -static void update_context_time(struct perf_event_context *ctx) +static void __update_context_time(struct perf_event_context *ctx, bool adv) { u64 now = perf_clock(); - ctx->time += now - ctx->timestamp; + if (adv) + ctx->time += now - ctx->timestamp; ctx->timestamp = now; + + /* + * The above: time' = time + (now - timestamp), can be re-arranged + * into: time` = now + (time - timestamp), which gives a single value + * offset to compute future time without locks on. + * + * See perf_event_time_now(), which can be used from NMI context where + * it's (obviously) not possible to acquire ctx->lock in order to read + * both the above values in a consistent manner. + */ + WRITE_ONCE(ctx->timeoffset, ctx->time - ctx->timestamp); +} + +static void update_context_time(struct perf_event_context *ctx) +{ + __update_context_time(ctx, true); } static u64 perf_event_time(struct perf_event *event) { struct perf_event_context *ctx = event->ctx; + if (unlikely(!ctx)) + return 0; + if (is_cgroup_event(event)) return perf_cgroup_event_time(event); - return ctx ? ctx->time : 0; + return ctx->time; +} + +static u64 perf_event_time_now(struct perf_event *event, u64 now) +{ + struct perf_event_context *ctx = event->ctx; + + if (unlikely(!ctx)) + return 0; + + if (is_cgroup_event(event)) + return perf_cgroup_event_time_now(event, now); + + if (!(__load_acquire(&ctx->is_active) & EVENT_TIME)) + return ctx->time; + + now += READ_ONCE(ctx->timeoffset); + return now; } static enum event_type_t get_event_type(struct perf_event *event) @@ -2350,7 +2417,7 @@ __perf_remove_from_context(struct perf_event *event, if (ctx->is_active & EVENT_TIME) { update_context_time(ctx); - update_cgrp_time_from_cpuctx(cpuctx); + update_cgrp_time_from_cpuctx(cpuctx, false); } event_sched_out(event, cpuctx, ctx); @@ -2361,6 +2428,9 @@ __perf_remove_from_context(struct perf_event *event, list_del_event(event, ctx); if (!ctx->nr_events && ctx->is_active) { + if (ctx == &cpuctx->ctx) + update_cgrp_time_from_cpuctx(cpuctx, true); + ctx->is_active = 0; ctx->rotate_necessary = 0; if (ctx->task) { @@ -2392,7 +2462,11 @@ static void perf_remove_from_context(struct perf_event *event, unsigned long fla * event_function_call() user. */ raw_spin_lock_irq(&ctx->lock); - if (!ctx->is_active) { + /* + * Cgroup events are per-cpu events, and must IPI because of + * cgrp_cpuctx_list. + */ + if (!ctx->is_active && !is_cgroup_event(event)) { __perf_remove_from_context(event, __get_cpu_context(ctx), ctx, (void *)flags); raw_spin_unlock_irq(&ctx->lock); @@ -2482,40 +2556,6 @@ void perf_event_disable_inatomic(struct perf_event *event) irq_work_queue(&event->pending); } -static void perf_set_shadow_time(struct perf_event *event, - struct perf_event_context *ctx) -{ - /* - * use the correct time source for the time snapshot - * - * We could get by without this by leveraging the - * fact that to get to this function, the caller - * has most likely already called update_context_time() - * and update_cgrp_time_xx() and thus both timestamp - * are identical (or very close). Given that tstamp is, - * already adjusted for cgroup, we could say that: - * tstamp - ctx->timestamp - * is equivalent to - * tstamp - cgrp->timestamp. - * - * Then, in perf_output_read(), the calculation would - * work with no changes because: - * - event is guaranteed scheduled in - * - no scheduled out in between - * - thus the timestamp would be the same - * - * But this is a bit hairy. - * - * So instead, we have an explicit cgroup call to remain - * within the time source all along. We believe it - * is cleaner and simpler to understand. - */ - if (is_cgroup_event(event)) - perf_cgroup_set_shadow_time(event, event->tstamp); - else - event->shadow_ctx_time = event->tstamp - ctx->timestamp; -} - #define MAX_INTERRUPTS (~0ULL) static void perf_log_throttle(struct perf_event *event, int enable); @@ -2556,8 +2596,6 @@ event_sched_in(struct perf_event *event, perf_pmu_disable(event->pmu); - perf_set_shadow_time(event, ctx); - perf_log_itrace_start(event); if (event->pmu->add(event, PERF_EF_START)) { @@ -2861,11 +2899,14 @@ perf_install_in_context(struct perf_event_context *ctx, * perf_event_attr::disabled events will not run and can be initialized * without IPI. Except when this is the first event for the context, in * that case we need the magic of the IPI to set ctx->is_active. + * Similarly, cgroup events for the context also needs the IPI to + * manipulate the cgrp_cpuctx_list. * * The IOC_ENABLE that is sure to follow the creation of a disabled * event will issue the IPI and reprogram the hardware. */ - if (__perf_effective_state(event) == PERF_EVENT_STATE_OFF && ctx->nr_events) { + if (__perf_effective_state(event) == PERF_EVENT_STATE_OFF && + ctx->nr_events && !is_cgroup_event(event)) { raw_spin_lock_irq(&ctx->lock); if (ctx->task == TASK_TOMBSTONE) { raw_spin_unlock_irq(&ctx->lock); @@ -3197,6 +3238,15 @@ static int perf_event_modify_breakpoint(struct perf_event *bp, return err; } +/* + * Copy event-type-independent attributes that may be modified. + */ +static void perf_event_modify_copy_attr(struct perf_event_attr *to, + const struct perf_event_attr *from) +{ + to->sig_data = from->sig_data; +} + static int perf_event_modify_attr(struct perf_event *event, struct perf_event_attr *attr) { @@ -3219,10 +3269,17 @@ static int perf_event_modify_attr(struct perf_event *event, WARN_ON_ONCE(event->ctx->parent_ctx); mutex_lock(&event->child_mutex); + /* + * Event-type-independent attributes must be copied before event-type + * modification, which will validate that final attributes match the + * source attributes after all relevant attributes have been copied. + */ + perf_event_modify_copy_attr(&event->attr, attr); err = func(event, attr); if (err) goto out; list_for_each_entry(child, &event->child_list, child_list) { + perf_event_modify_copy_attr(&child->attr, attr); err = func(child, attr); if (err) goto out; @@ -3251,16 +3308,6 @@ static void ctx_sched_out(struct perf_event_context *ctx, return; } - ctx->is_active &= ~event_type; - if (!(ctx->is_active & EVENT_ALL)) - ctx->is_active = 0; - - if (ctx->task) { - WARN_ON_ONCE(cpuctx->task_ctx != ctx); - if (!ctx->is_active) - cpuctx->task_ctx = NULL; - } - /* * Always update time if it was set; not only when it changes. * Otherwise we can 'forget' to update time for any but the last @@ -3274,7 +3321,22 @@ static void ctx_sched_out(struct perf_event_context *ctx, if (is_active & EVENT_TIME) { /* update (and stop) ctx time */ update_context_time(ctx); - update_cgrp_time_from_cpuctx(cpuctx); + update_cgrp_time_from_cpuctx(cpuctx, ctx == &cpuctx->ctx); + /* + * CPU-release for the below ->is_active store, + * see __load_acquire() in perf_event_time_now() + */ + barrier(); + } + + ctx->is_active &= ~event_type; + if (!(ctx->is_active & EVENT_ALL)) + ctx->is_active = 0; + + if (ctx->task) { + WARN_ON_ONCE(cpuctx->task_ctx != ctx); + if (!ctx->is_active) + cpuctx->task_ctx = NULL; } is_active ^= ctx->is_active; /* changed bits */ @@ -3711,13 +3773,19 @@ static noinline int visit_groups_merge(struct perf_cpu_context *cpuctx, return 0; } +/* + * Because the userpage is strictly per-event (there is no concept of context, + * so there cannot be a context indirection), every userpage must be updated + * when context time starts :-( + * + * IOW, we must not miss EVENT_TIME edges. + */ static inline bool event_update_userpage(struct perf_event *event) { if (likely(!atomic_read(&event->mmap_count))) return false; perf_event_update_time(event); - perf_set_shadow_time(event, event->ctx); perf_event_update_userpage(event); return true; @@ -3801,13 +3869,23 @@ ctx_sched_in(struct perf_event_context *ctx, struct task_struct *task) { int is_active = ctx->is_active; - u64 now; lockdep_assert_held(&ctx->lock); if (likely(!ctx->nr_events)) return; + if (is_active ^ EVENT_TIME) { + /* start ctx time */ + __update_context_time(ctx, false); + perf_cgroup_set_timestamp(task, ctx); + /* + * CPU-release for the below ->is_active store, + * see __load_acquire() in perf_event_time_now() + */ + barrier(); + } + ctx->is_active |= (event_type | EVENT_TIME); if (ctx->task) { if (!is_active) @@ -3818,13 +3896,6 @@ ctx_sched_in(struct perf_event_context *ctx, is_active ^= ctx->is_active; /* changed bits */ - if (is_active & EVENT_TIME) { - /* start ctx time */ - now = perf_clock(); - ctx->timestamp = now; - perf_cgroup_set_timestamp(task, ctx); - } - /* * First go through the list and put on any pinned groups * in order to give them the best chance of going on. @@ -4418,6 +4489,18 @@ static inline u64 perf_event_count(struct perf_event *event) return local64_read(&event->count) + atomic64_read(&event->child_count); } +static void calc_timer_values(struct perf_event *event, + u64 *now, + u64 *enabled, + u64 *running) +{ + u64 ctx_time; + + *now = perf_clock(); + ctx_time = perf_event_time_now(event, *now); + __perf_update_times(event, ctx_time, enabled, running); +} + /* * NMI-safe method to read a local event, that is an event that * is: @@ -4477,10 +4560,9 @@ int perf_event_read_local(struct perf_event *event, u64 *value, *value = local64_read(&event->count); if (enabled || running) { - u64 now = event->shadow_ctx_time + perf_clock(); - u64 __enabled, __running; + u64 __enabled, __running, __now;; - __perf_update_times(event, now, &__enabled, &__running); + calc_timer_values(event, &__now, &__enabled, &__running); if (enabled) *enabled = __enabled; if (running) @@ -5802,18 +5884,6 @@ static int perf_event_index(struct perf_event *event) return event->pmu->event_idx(event); } -static void calc_timer_values(struct perf_event *event, - u64 *now, - u64 *enabled, - u64 *running) -{ - u64 ctx_time; - - *now = perf_clock(); - ctx_time = event->shadow_ctx_time + *now; - __perf_update_times(event, ctx_time, enabled, running); -} - static void perf_event_init_userpage(struct perf_event *event) { struct perf_event_mmap_page *userpg; @@ -5938,6 +6008,8 @@ static void ring_buffer_attach(struct perf_event *event, struct perf_buffer *old_rb = NULL; unsigned long flags; + WARN_ON_ONCE(event->parent); + if (event->rb) { /* * Should be impossible, we set this when removing @@ -5995,6 +6067,9 @@ static void ring_buffer_wakeup(struct perf_event *event) { struct perf_buffer *rb; + if (event->parent) + event = event->parent; + rcu_read_lock(); rb = rcu_dereference(event->rb); if (rb) { @@ -6008,6 +6083,9 @@ struct perf_buffer *ring_buffer_get(struct perf_event *event) { struct perf_buffer *rb; + if (event->parent) + event = event->parent; + rcu_read_lock(); rb = rcu_dereference(event->rb); if (rb) { @@ -6353,7 +6431,6 @@ accounting: ring_buffer_attach(event, rb); perf_event_update_time(event); - perf_set_shadow_time(event, event->ctx); perf_event_init_userpage(event); perf_event_update_userpage(event); } else { @@ -6669,7 +6746,6 @@ perf_output_sample_ustack(struct perf_output_handle *handle, u64 dump_size, unsigned long sp; unsigned int rem; u64 dyn_size; - mm_segment_t fs; /* * We dump: @@ -6687,9 +6763,7 @@ perf_output_sample_ustack(struct perf_output_handle *handle, u64 dump_size, /* Data. */ sp = perf_user_stack_pointer(regs); - fs = force_uaccess_begin(); rem = __output_copy_user(handle, (void *) sp, dump_size); - force_uaccess_end(fs); dyn_size = dump_size - rem; perf_output_skip(handle, rem); @@ -6717,7 +6791,7 @@ static unsigned long perf_prepare_sample_aux(struct perf_event *event, if (WARN_ON_ONCE(READ_ONCE(sampler->oncpu) != smp_processor_id())) goto out; - rb = ring_buffer_get(sampler->parent ? sampler->parent : sampler); + rb = ring_buffer_get(sampler); if (!rb) goto out; @@ -6783,7 +6857,7 @@ static void perf_aux_sample_output(struct perf_event *event, if (WARN_ON_ONCE(!sampler || !data->aux_size)) return; - rb = ring_buffer_get(sampler->parent ? sampler->parent : sampler); + rb = ring_buffer_get(sampler); if (!rb) return; @@ -10454,8 +10528,6 @@ perf_event_parse_addr_filter(struct perf_event *event, char *fstr, */ if (state == IF_STATE_END) { ret = -EINVAL; - if (kernel && event->attr.exclude_kernel) - goto fail; /* * ACTION "filter" must have a non-zero length region @@ -10497,8 +10569,11 @@ perf_event_parse_addr_filter(struct perf_event *event, char *fstr, } /* ready to consume more filters */ + kfree(filename); + filename = NULL; state = IF_STATE_ACTION; filter = NULL; + kernel = 0; } } diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 6357c3580d07..6418083901d4 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -155,11 +155,7 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr, struct page *old_page, struct page *new_page) { struct mm_struct *mm = vma->vm_mm; - struct page_vma_mapped_walk pvmw = { - .page = compound_head(old_page), - .vma = vma, - .address = addr, - }; + DEFINE_FOLIO_VMA_WALK(pvmw, page_folio(old_page), vma, addr, 0); int err; struct mmu_notifier_range range; @@ -173,7 +169,7 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr, return err; } - /* For try_to_free_swap() and munlock_vma_page() below */ + /* For try_to_free_swap() below */ lock_page(old_page); mmu_notifier_invalidate_range_start(&range); @@ -201,13 +197,10 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr, set_pte_at_notify(mm, addr, pvmw.pte, mk_pte(new_page, vma->vm_page_prot)); - page_remove_rmap(old_page, false); + page_remove_rmap(old_page, vma, false); if (!page_mapped(old_page)) try_to_free_swap(old_page); page_vma_mapped_walk_done(&pvmw); - - if ((vma->vm_flags & VM_LOCKED) && !PageCompound(old_page)) - munlock_vma_page(old_page); put_page(old_page); err = 0; diff --git a/kernel/exit.c b/kernel/exit.c index f702a6a63686..c8ce55541a25 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -64,6 +64,7 @@ #include <linux/compat.h> #include <linux/io_uring.h> #include <linux/kprobes.h> +#include <linux/rethook.h> #include <linux/uaccess.h> #include <asm/unistd.h> @@ -116,7 +117,7 @@ static void __exit_signal(struct task_struct *tsk) * then notify it: */ if (sig->notify_count > 0 && !--sig->notify_count) - wake_up_process(sig->group_exit_task); + wake_up_process(sig->group_exec_task); if (tsk == sig->curr_target) sig->curr_target = next_thread(tsk); @@ -169,6 +170,7 @@ static void delayed_put_task_struct(struct rcu_head *rhp) struct task_struct *tsk = container_of(rhp, struct task_struct, rcu); kprobe_flush_task(tsk); + rethook_flush_task(tsk); perf_event_delayed_put(tsk); trace_sched_process_free(tsk); put_task_struct(tsk); @@ -697,7 +699,7 @@ static void exit_notify(struct task_struct *tsk, int group_dead) /* mt-exec, de_thread() is waiting for group leader */ if (unlikely(tsk->signal->notify_count < 0)) - wake_up_process(tsk->signal->group_exit_task); + wake_up_process(tsk->signal->group_exec_task); write_unlock_irq(&tasklist_lock); list_for_each_entry_safe(p, n, &dead, ptrace_entry) { @@ -735,37 +737,8 @@ void __noreturn do_exit(long code) struct task_struct *tsk = current; int group_dead; - /* - * We can get here from a kernel oops, sometimes with preemption off. - * Start by checking for critical errors. - * Then fix up important state like USER_DS and preemption. - * Then do everything else. - */ - - WARN_ON(blk_needs_flush_plug(tsk)); - - if (unlikely(in_interrupt())) - panic("Aiee, killing interrupt handler!"); - if (unlikely(!tsk->pid)) - panic("Attempted to kill the idle task!"); - - /* - * If do_exit is called because this processes oopsed, it's possible - * that get_fs() was left as KERNEL_DS, so reset it to USER_DS before - * continuing. Amongst other possible reasons, this is to prevent - * mm_release()->clear_child_tid() from writing to a user-controlled - * kernel address. - */ - force_uaccess_begin(); + WARN_ON(tsk->plug); - if (unlikely(in_atomic())) { - pr_info("note: %s[%d] exited with preempt_count %d\n", - current->comm, task_pid_nr(current), - preempt_count()); - preempt_count_set(PREEMPT_ENABLED); - } - - profile_task_exit(tsk); kcov_task_exit(tsk); coredump_task_exit(tsk); @@ -773,17 +746,6 @@ void __noreturn do_exit(long code) validate_creds_for_do_exit(tsk); - /* - * We're taking recursive faults here in do_exit. Safest is to just - * leave this task alone and wait for reboot. - */ - if (unlikely(tsk->flags & PF_EXITING)) { - pr_alert("Fixing recursive fault but reboot is needed!\n"); - futex_exit_recursive(tsk); - set_current_state(TASK_UNINTERRUPTIBLE); - schedule(); - } - io_uring_files_cancel(); exit_signals(tsk); /* sets PF_EXITING */ @@ -871,6 +833,7 @@ void __noreturn do_exit(long code) put_page(tsk->task_frag.page); validate_creds_for_do_exit(tsk); + exit_task_stack_account(tsk); check_stack_usage(); preempt_disable(); @@ -882,16 +845,46 @@ void __noreturn do_exit(long code) lockdep_free_task(tsk); do_task_dead(); } -EXPORT_SYMBOL_GPL(do_exit); -void complete_and_exit(struct completion *comp, long code) +void __noreturn make_task_dead(int signr) { - if (comp) - complete(comp); + /* + * Take the task off the cpu after something catastrophic has + * happened. + * + * We can get here from a kernel oops, sometimes with preemption off. + * Start by checking for critical errors. + * Then fix up important state like USER_DS and preemption. + * Then do everything else. + */ + struct task_struct *tsk = current; + + if (unlikely(in_interrupt())) + panic("Aiee, killing interrupt handler!"); + if (unlikely(!tsk->pid)) + panic("Attempted to kill the idle task!"); + + if (unlikely(in_atomic())) { + pr_info("note: %s[%d] exited with preempt_count %d\n", + current->comm, task_pid_nr(current), + preempt_count()); + preempt_count_set(PREEMPT_ENABLED); + } - do_exit(code); + /* + * We're taking recursive faults here in make_task_dead. Safest is to just + * leave this task alone and wait for reboot. + */ + if (unlikely(tsk->flags & PF_EXITING)) { + pr_alert("Fixing recursive fault but reboot is needed!\n"); + futex_exit_recursive(tsk); + tsk->exit_state = EXIT_DEAD; + refcount_inc(&tsk->rcu_users); + do_task_dead(); + } + + do_exit(signr); } -EXPORT_SYMBOL(complete_and_exit); SYSCALL_DEFINE1(exit, int, error_code) { @@ -902,22 +895,24 @@ SYSCALL_DEFINE1(exit, int, error_code) * Take down every thread in the group. This is called by fatal signals * as well as by sys_exit_group (below). */ -void +void __noreturn do_group_exit(int exit_code) { struct signal_struct *sig = current->signal; - BUG_ON(exit_code & 0x80); /* core dumps don't get here */ - - if (signal_group_exit(sig)) + if (sig->flags & SIGNAL_GROUP_EXIT) exit_code = sig->group_exit_code; + else if (sig->group_exec_task) + exit_code = 0; else if (!thread_group_empty(current)) { struct sighand_struct *const sighand = current->sighand; spin_lock_irq(&sighand->siglock); - if (signal_group_exit(sig)) + if (sig->flags & SIGNAL_GROUP_EXIT) /* Another thread got here before we took the lock. */ exit_code = sig->group_exit_code; + else if (sig->group_exec_task) + exit_code = 0; else { sig->group_exit_code = exit_code; sig->flags = SIGNAL_GROUP_EXIT; @@ -1012,7 +1007,8 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p) return 0; if (unlikely(wo->wo_flags & WNOWAIT)) { - status = p->exit_code; + status = (p->signal->flags & SIGNAL_GROUP_EXIT) + ? p->signal->group_exit_code : p->exit_code; get_task_struct(p); read_unlock(&tasklist_lock); sched_annotate_sleep(); diff --git a/kernel/extable.c b/kernel/extable.c index b6f330f0fe74..bda5e9761541 100644 --- a/kernel/extable.c +++ b/kernel/extable.c @@ -3,6 +3,7 @@ Copyright (C) 2001 Rusty Russell, 2002 Rusty Russell IBM. */ +#include <linux/elf.h> #include <linux/ftrace.h> #include <linux/memory.h> #include <linux/extable.h> @@ -132,12 +133,33 @@ out: } /* - * On some architectures (PPC64, IA64) function pointers + * On some architectures (PPC64, IA64, PARISC) function pointers * are actually only tokens to some data that then holds the * real function address. As a result, to find if a function * pointer is part of the kernel text, we need to do some * special dereferencing first. */ +#ifdef CONFIG_HAVE_FUNCTION_DESCRIPTORS +void *dereference_function_descriptor(void *ptr) +{ + func_desc_t *desc = ptr; + void *p; + + if (!get_kernel_nofault(p, (void *)&desc->addr)) + ptr = p; + return ptr; +} +EXPORT_SYMBOL_GPL(dereference_function_descriptor); + +void *dereference_kernel_function_descriptor(void *ptr) +{ + if (ptr < (void *)__start_opd || ptr >= (void *)__end_opd) + return ptr; + + return dereference_function_descriptor(ptr); +} +#endif + int func_ptr_is_kernel_text(void *ptr) { unsigned long addr; diff --git a/kernel/fork.c b/kernel/fork.c index 1c989cc4208a..9796897560ab 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -97,6 +97,7 @@ #include <linux/scs.h> #include <linux/io_uring.h> #include <linux/bpf.h> +#include <linux/sched/mm.h> #include <asm/pgalloc.h> #include <linux/uaccess.h> @@ -185,7 +186,7 @@ static inline void free_task_struct(struct task_struct *tsk) */ # if THREAD_SIZE >= PAGE_SIZE || defined(CONFIG_VMAP_STACK) -#ifdef CONFIG_VMAP_STACK +# ifdef CONFIG_VMAP_STACK /* * vmalloc() is a bit slow, and calling vfree() enough times will force a TLB * flush. Try to minimize the number of calls by caching stacks. @@ -193,6 +194,41 @@ static inline void free_task_struct(struct task_struct *tsk) #define NR_CACHED_STACKS 2 static DEFINE_PER_CPU(struct vm_struct *, cached_stacks[NR_CACHED_STACKS]); +struct vm_stack { + struct rcu_head rcu; + struct vm_struct *stack_vm_area; +}; + +static bool try_release_thread_stack_to_cache(struct vm_struct *vm) +{ + unsigned int i; + + for (i = 0; i < NR_CACHED_STACKS; i++) { + if (this_cpu_cmpxchg(cached_stacks[i], NULL, vm) != NULL) + continue; + return true; + } + return false; +} + +static void thread_stack_free_rcu(struct rcu_head *rh) +{ + struct vm_stack *vm_stack = container_of(rh, struct vm_stack, rcu); + + if (try_release_thread_stack_to_cache(vm_stack->stack_vm_area)) + return; + + vfree(vm_stack); +} + +static void thread_stack_delayed_free(struct task_struct *tsk) +{ + struct vm_stack *vm_stack = tsk->stack; + + vm_stack->stack_vm_area = tsk->stack_vm_area; + call_rcu(&vm_stack->rcu, thread_stack_free_rcu); +} + static int free_vm_stack_cache(unsigned int cpu) { struct vm_struct **cached_vm_stacks = per_cpu_ptr(cached_stacks, cpu); @@ -210,11 +246,35 @@ static int free_vm_stack_cache(unsigned int cpu) return 0; } -#endif -static unsigned long *alloc_thread_stack_node(struct task_struct *tsk, int node) +static int memcg_charge_kernel_stack(struct vm_struct *vm) { -#ifdef CONFIG_VMAP_STACK + int i; + int ret; + + BUILD_BUG_ON(IS_ENABLED(CONFIG_VMAP_STACK) && PAGE_SIZE % 1024 != 0); + BUG_ON(vm->nr_pages != THREAD_SIZE / PAGE_SIZE); + + for (i = 0; i < THREAD_SIZE / PAGE_SIZE; i++) { + ret = memcg_kmem_charge_page(vm->pages[i], GFP_KERNEL, 0); + if (ret) + goto err; + } + return 0; +err: + /* + * If memcg_kmem_charge_page() fails, page's memory cgroup pointer is + * NULL, and memcg_kmem_uncharge_page() in free_thread_stack() will + * ignore this page. + */ + for (i = 0; i < THREAD_SIZE / PAGE_SIZE; i++) + memcg_kmem_uncharge_page(vm->pages[i], 0); + return ret; +} + +static int alloc_thread_stack_node(struct task_struct *tsk, int node) +{ + struct vm_struct *vm; void *stack; int i; @@ -226,15 +286,22 @@ static unsigned long *alloc_thread_stack_node(struct task_struct *tsk, int node) if (!s) continue; - /* Mark stack accessible for KASAN. */ + /* Reset stack metadata. */ kasan_unpoison_range(s->addr, THREAD_SIZE); + stack = kasan_reset_tag(s->addr); + /* Clear stale pointers from reused stack. */ - memset(s->addr, 0, THREAD_SIZE); + memset(stack, 0, THREAD_SIZE); + + if (memcg_charge_kernel_stack(s)) { + vfree(s->addr); + return -ENOMEM; + } tsk->stack_vm_area = s; - tsk->stack = s->addr; - return s->addr; + tsk->stack = stack; + return 0; } /* @@ -247,71 +314,96 @@ static unsigned long *alloc_thread_stack_node(struct task_struct *tsk, int node) THREADINFO_GFP & ~__GFP_ACCOUNT, PAGE_KERNEL, 0, node, __builtin_return_address(0)); + if (!stack) + return -ENOMEM; + vm = find_vm_area(stack); + if (memcg_charge_kernel_stack(vm)) { + vfree(stack); + return -ENOMEM; + } /* * We can't call find_vm_area() in interrupt context, and * free_thread_stack() can be called in interrupt context, * so cache the vm_struct. */ - if (stack) { - tsk->stack_vm_area = find_vm_area(stack); - tsk->stack = stack; - } - return stack; -#else + tsk->stack_vm_area = vm; + stack = kasan_reset_tag(stack); + tsk->stack = stack; + return 0; +} + +static void free_thread_stack(struct task_struct *tsk) +{ + if (!try_release_thread_stack_to_cache(tsk->stack_vm_area)) + thread_stack_delayed_free(tsk); + + tsk->stack = NULL; + tsk->stack_vm_area = NULL; +} + +# else /* !CONFIG_VMAP_STACK */ + +static void thread_stack_free_rcu(struct rcu_head *rh) +{ + __free_pages(virt_to_page(rh), THREAD_SIZE_ORDER); +} + +static void thread_stack_delayed_free(struct task_struct *tsk) +{ + struct rcu_head *rh = tsk->stack; + + call_rcu(rh, thread_stack_free_rcu); +} + +static int alloc_thread_stack_node(struct task_struct *tsk, int node) +{ struct page *page = alloc_pages_node(node, THREADINFO_GFP, THREAD_SIZE_ORDER); if (likely(page)) { tsk->stack = kasan_reset_tag(page_address(page)); - return tsk->stack; + return 0; } - return NULL; -#endif + return -ENOMEM; } -static inline void free_thread_stack(struct task_struct *tsk) +static void free_thread_stack(struct task_struct *tsk) { -#ifdef CONFIG_VMAP_STACK - struct vm_struct *vm = task_stack_vm_area(tsk); - - if (vm) { - int i; + thread_stack_delayed_free(tsk); + tsk->stack = NULL; +} - for (i = 0; i < THREAD_SIZE / PAGE_SIZE; i++) - memcg_kmem_uncharge_page(vm->pages[i], 0); +# endif /* CONFIG_VMAP_STACK */ +# else /* !(THREAD_SIZE >= PAGE_SIZE || defined(CONFIG_VMAP_STACK)) */ - for (i = 0; i < NR_CACHED_STACKS; i++) { - if (this_cpu_cmpxchg(cached_stacks[i], - NULL, tsk->stack_vm_area) != NULL) - continue; +static struct kmem_cache *thread_stack_cache; - return; - } +static void thread_stack_free_rcu(struct rcu_head *rh) +{ + kmem_cache_free(thread_stack_cache, rh); +} - vfree_atomic(tsk->stack); - return; - } -#endif +static void thread_stack_delayed_free(struct task_struct *tsk) +{ + struct rcu_head *rh = tsk->stack; - __free_pages(virt_to_page(tsk->stack), THREAD_SIZE_ORDER); + call_rcu(rh, thread_stack_free_rcu); } -# else -static struct kmem_cache *thread_stack_cache; -static unsigned long *alloc_thread_stack_node(struct task_struct *tsk, - int node) +static int alloc_thread_stack_node(struct task_struct *tsk, int node) { unsigned long *stack; stack = kmem_cache_alloc_node(thread_stack_cache, THREADINFO_GFP, node); stack = kasan_reset_tag(stack); tsk->stack = stack; - return stack; + return stack ? 0 : -ENOMEM; } static void free_thread_stack(struct task_struct *tsk) { - kmem_cache_free(thread_stack_cache, tsk->stack); + thread_stack_delayed_free(tsk); + tsk->stack = NULL; } void thread_stack_cache_init(void) @@ -321,8 +413,26 @@ void thread_stack_cache_init(void) THREAD_SIZE, NULL); BUG_ON(thread_stack_cache == NULL); } -# endif -#endif + +# endif /* THREAD_SIZE >= PAGE_SIZE || defined(CONFIG_VMAP_STACK) */ +#else /* CONFIG_ARCH_THREAD_STACK_ALLOCATOR */ + +static int alloc_thread_stack_node(struct task_struct *tsk, int node) +{ + unsigned long *stack; + + stack = arch_alloc_thread_stack_node(tsk, node); + tsk->stack = stack; + return stack ? 0 : -ENOMEM; +} + +static void free_thread_stack(struct task_struct *tsk) +{ + arch_free_thread_stack(tsk); + tsk->stack = NULL; +} + +#endif /* !CONFIG_ARCH_THREAD_STACK_ALLOCATOR */ /* SLAB cache for signal_struct structures (tsk->signal) */ static struct kmem_cache *signal_cachep; @@ -366,63 +476,47 @@ struct vm_area_struct *vm_area_dup(struct vm_area_struct *orig) *new = data_race(*orig); INIT_LIST_HEAD(&new->anon_vma_chain); new->vm_next = new->vm_prev = NULL; - dup_vma_anon_name(orig, new); + dup_anon_vma_name(orig, new); } return new; } void vm_area_free(struct vm_area_struct *vma) { - free_vma_anon_name(vma); + free_anon_vma_name(vma); kmem_cache_free(vm_area_cachep, vma); } static void account_kernel_stack(struct task_struct *tsk, int account) { - void *stack = task_stack_page(tsk); - struct vm_struct *vm = task_stack_vm_area(tsk); - - if (vm) { + if (IS_ENABLED(CONFIG_VMAP_STACK)) { + struct vm_struct *vm = task_stack_vm_area(tsk); int i; for (i = 0; i < THREAD_SIZE / PAGE_SIZE; i++) mod_lruvec_page_state(vm->pages[i], NR_KERNEL_STACK_KB, account * (PAGE_SIZE / 1024)); } else { + void *stack = task_stack_page(tsk); + /* All stack pages are in the same node. */ mod_lruvec_kmem_state(stack, NR_KERNEL_STACK_KB, account * (THREAD_SIZE / 1024)); } } -static int memcg_charge_kernel_stack(struct task_struct *tsk) +void exit_task_stack_account(struct task_struct *tsk) { -#ifdef CONFIG_VMAP_STACK - struct vm_struct *vm = task_stack_vm_area(tsk); - int ret; - - BUILD_BUG_ON(IS_ENABLED(CONFIG_VMAP_STACK) && PAGE_SIZE % 1024 != 0); + account_kernel_stack(tsk, -1); - if (vm) { + if (IS_ENABLED(CONFIG_VMAP_STACK)) { + struct vm_struct *vm; int i; - BUG_ON(vm->nr_pages != THREAD_SIZE / PAGE_SIZE); - - for (i = 0; i < THREAD_SIZE / PAGE_SIZE; i++) { - /* - * If memcg_kmem_charge_page() fails, page's - * memory cgroup pointer is NULL, and - * memcg_kmem_uncharge_page() in free_thread_stack() - * will ignore this page. - */ - ret = memcg_kmem_charge_page(vm->pages[i], GFP_KERNEL, - 0); - if (ret) - return ret; - } + vm = task_stack_vm_area(tsk); + for (i = 0; i < THREAD_SIZE / PAGE_SIZE; i++) + memcg_kmem_uncharge_page(vm->pages[i], 0); } -#endif - return 0; } static void release_task_stack(struct task_struct *tsk) @@ -430,12 +524,7 @@ static void release_task_stack(struct task_struct *tsk) if (WARN_ON(READ_ONCE(tsk->__state) != TASK_DEAD)) return; /* Better to leak the stack than to free prematurely */ - account_kernel_stack(tsk, -1); free_thread_stack(tsk); - tsk->stack = NULL; -#ifdef CONFIG_VMAP_STACK - tsk->stack_vm_area = NULL; -#endif } #ifdef CONFIG_THREAD_INFO_IN_TASK @@ -757,9 +846,7 @@ void __put_task_struct(struct task_struct *tsk) delayacct_tsk_free(tsk); put_signal_struct(tsk->signal); sched_core_free(tsk); - - if (!profile_handoff_task(tsk)) - free_task(tsk); + free_task(tsk); } EXPORT_SYMBOL_GPL(__put_task_struct); @@ -876,8 +963,6 @@ void set_task_stack_end_magic(struct task_struct *tsk) static struct task_struct *dup_task_struct(struct task_struct *orig, int node) { struct task_struct *tsk; - unsigned long *stack; - struct vm_struct *stack_vm_area __maybe_unused; int err; if (node == NUMA_NO_NODE) @@ -886,32 +971,18 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node) if (!tsk) return NULL; - stack = alloc_thread_stack_node(tsk, node); - if (!stack) + err = arch_dup_task_struct(tsk, orig); + if (err) goto free_tsk; - if (memcg_charge_kernel_stack(tsk)) - goto free_stack; - - stack_vm_area = task_stack_vm_area(tsk); - - err = arch_dup_task_struct(tsk, orig); + err = alloc_thread_stack_node(tsk, node); + if (err) + goto free_tsk; - /* - * arch_dup_task_struct() clobbers the stack-related fields. Make - * sure they're properly initialized before using any stack-related - * functions again. - */ - tsk->stack = stack; -#ifdef CONFIG_VMAP_STACK - tsk->stack_vm_area = stack_vm_area; -#endif #ifdef CONFIG_THREAD_INFO_IN_TASK refcount_set(&tsk->stack_refcount, 1); #endif - - if (err) - goto free_stack; + account_kernel_stack(tsk, 1); err = scs_prepare(tsk, node); if (err) @@ -953,9 +1024,7 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node) tsk->splice_pipe = NULL; tsk->task_frag.page = NULL; tsk->wake_q.next = NULL; - tsk->pf_io_worker = NULL; - - account_kernel_stack(tsk, 1); + tsk->worker_private = NULL; kcov_task_init(tsk); kmap_local_fork(tsk); @@ -969,12 +1038,17 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node) tsk->use_memdelay = 0; #endif +#ifdef CONFIG_IOMMU_SVA + tsk->pasid_activated = 0; +#endif + #ifdef CONFIG_MEMCG tsk->active_memcg = NULL; #endif return tsk; free_stack: + exit_task_stack_account(tsk); free_thread_stack(tsk); free_tsk: free_task_struct(tsk); @@ -1021,13 +1095,6 @@ static void mm_init_owner(struct mm_struct *mm, struct task_struct *p) #endif } -static void mm_init_pasid(struct mm_struct *mm) -{ -#ifdef CONFIG_IOMMU_SUPPORT - mm->pasid = INIT_PASID; -#endif -} - static void mm_init_uprobes_state(struct mm_struct *mm) { #ifdef CONFIG_UPROBES @@ -1056,7 +1123,7 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p, mm_init_cpumask(mm); mm_init_aio(mm); mm_init_owner(mm, p); - mm_init_pasid(mm); + mm_pasid_init(mm); RCU_INIT_POINTER(mm->exe_file, NULL); mmu_notifier_subscriptions_init(mm); init_tlb_flush_pending(mm); @@ -1123,6 +1190,7 @@ static inline void __mmput(struct mm_struct *mm) } if (mm->binfmt) module_put(mm->binfmt->module); + mm_pasid_drop(mm); mmdrop(mm); } @@ -2009,12 +2077,6 @@ static __latent_entropy struct task_struct *copy_process( siginitsetinv(&p->blocked, sigmask(SIGKILL)|sigmask(SIGSTOP)); } - /* - * This _must_ happen before we call free_task(), i.e. before we jump - * to any of the bad_fork_* labels. This is to avoid freeing - * p->set_child_tid which is (ab)used as a kthread's data pointer for - * kernel threads (PF_KTHREAD). - */ p->set_child_tid = (clone_flags & CLONE_CHILD_SETTID) ? args->child_tid : NULL; /* * Clear TID on mm_release()? @@ -2029,18 +2091,18 @@ static __latent_entropy struct task_struct *copy_process( #ifdef CONFIG_PROVE_LOCKING DEBUG_LOCKS_WARN_ON(!p->softirqs_enabled); #endif + retval = copy_creds(p, clone_flags); + if (retval < 0) + goto bad_fork_free; + retval = -EAGAIN; if (is_ucounts_overlimit(task_ucounts(p), UCOUNT_RLIMIT_NPROC, rlimit(RLIMIT_NPROC))) { if (p->real_cred->user != INIT_USER && !capable(CAP_SYS_RESOURCE) && !capable(CAP_SYS_ADMIN)) - goto bad_fork_free; + goto bad_fork_cleanup_count; } current->flags &= ~PF_NPROC_EXCEEDED; - retval = copy_creds(p, clone_flags); - if (retval < 0) - goto bad_fork_free; - /* * If multiple threads are within copy_process(), then this check * triggers too late. This doesn't hurt, the check is only there @@ -2095,12 +2157,16 @@ static __latent_entropy struct task_struct *copy_process( p->io_context = NULL; audit_set_context(p, NULL); cgroup_fork(p); + if (p->flags & PF_KTHREAD) { + if (!set_kthread_struct(p)) + goto bad_fork_cleanup_delayacct; + } #ifdef CONFIG_NUMA p->mempolicy = mpol_dup(p->mempolicy); if (IS_ERR(p->mempolicy)) { retval = PTR_ERR(p->mempolicy); p->mempolicy = NULL; - goto bad_fork_cleanup_threadgroup_lock; + goto bad_fork_cleanup_delayacct; } #endif #ifdef CONFIG_CPUSETS @@ -2259,6 +2325,9 @@ static __latent_entropy struct task_struct *copy_process( #ifdef CONFIG_KRETPROBES p->kretprobe_instances.first = NULL; #endif +#ifdef CONFIG_RETHOOK + p->rethooks.first = NULL; +#endif /* * Ensure that the cgroup subsystem policies allow the new process to be @@ -2271,6 +2340,17 @@ static __latent_entropy struct task_struct *copy_process( goto bad_fork_put_pidfd; /* + * Now that the cgroups are pinned, re-clone the parent cgroup and put + * the new task on the correct runqueue. All this *before* the task + * becomes visible. + * + * This isn't part of ->can_fork() because while the re-cloning is + * cgroup specific, it unconditionally needs to place the task on a + * runqueue. + */ + sched_cgroup_fork(p, args); + + /* * From this point on we must avoid any synchronous user-space * communication until we take the tasklist-lock. In particular, we do * not want user-space to be able to predict the process start-time by @@ -2327,10 +2407,6 @@ static __latent_entropy struct task_struct *copy_process( goto bad_fork_cancel_cgroup; } - /* past the last point of failure */ - if (pidfile) - fd_install(pidfd, pidfile); - init_task_pid_links(p); if (likely(p->pid)) { ptrace_init_task(p, (clone_flags & CLONE_PTRACE) || trace); @@ -2379,8 +2455,11 @@ static __latent_entropy struct task_struct *copy_process( syscall_tracepoint_update(p); write_unlock_irq(&tasklist_lock); + if (pidfile) + fd_install(pidfd, pidfile); + proc_fork_connector(p); - sched_post_fork(p, args); + sched_post_fork(p); cgroup_post_fork(p, args); perf_event_fork(p); @@ -2437,14 +2516,15 @@ bad_fork_cleanup_policy: lockdep_free_task(p); #ifdef CONFIG_NUMA mpol_put(p->mempolicy); -bad_fork_cleanup_threadgroup_lock: #endif +bad_fork_cleanup_delayacct: delayacct_tsk_free(p); bad_fork_cleanup_count: dec_rlimit_ucounts(task_ucounts(p), UCOUNT_RLIMIT_NPROC, 1); exit_creds(p); bad_fork_free: WRITE_ONCE(p->__state, TASK_DEAD); + exit_task_stack_account(p); put_task_stack(p); delayed_free_task(p); fork_out: diff --git a/kernel/futex/core.c b/kernel/futex/core.c index 926c2bb752bc..b22ef1efe751 100644 --- a/kernel/futex/core.c +++ b/kernel/futex/core.c @@ -302,7 +302,7 @@ again: * found it, but truncated or holepunched or subjected to * invalidate_complete_page2 before we got the page lock (also * cases which we are happy to fail). And we hold a reference, - * so refcount care in invalidate_complete_page's remove_mapping + * so refcount care in invalidate_inode_page's remove_mapping * prevents drop_caches from setting mapping to NULL beneath us. * * The case we do have to guard against is when memory pressure made @@ -1031,7 +1031,7 @@ static void futex_cleanup(struct task_struct *tsk) * actually finished the futex cleanup. The worst case for this is that the * waiter runs through the wait loop until the state becomes visible. * - * This is called from the recursive fault handling path in do_exit(). + * This is called from the recursive fault handling path in make_task_dead(). * * This is best effort. Either the futex exit code has run already or * not. If the OWNER_DIED bit has been set on the futex then the waiter can diff --git a/kernel/gcov/Kconfig b/kernel/gcov/Kconfig index 053447183ac5..04f4ebdc3cf5 100644 --- a/kernel/gcov/Kconfig +++ b/kernel/gcov/Kconfig @@ -4,7 +4,6 @@ menu "GCOV-based kernel profiling" config GCOV_KERNEL bool "Enable gcov-based kernel profiling" depends on DEBUG_FS - depends on !CC_IS_CLANG || CLANG_VERSION >= 110000 depends on !ARCH_WANTS_NO_INSTR || CC_HAS_NO_PROFILE_FN_ATTR select CONSTRUCTORS default n diff --git a/kernel/hung_task.c b/kernel/hung_task.c index 9888e2bc8c76..52501e5f7655 100644 --- a/kernel/hung_task.c +++ b/kernel/hung_task.c @@ -63,7 +63,9 @@ static struct task_struct *watchdog_task; * Should we dump all CPUs backtraces in a hung task event? * Defaults to 0, can be changed via sysctl. */ -unsigned int __read_mostly sysctl_hung_task_all_cpu_backtrace; +static unsigned int __read_mostly sysctl_hung_task_all_cpu_backtrace; +#else +#define sysctl_hung_task_all_cpu_backtrace 0 #endif /* CONFIG_SMP */ /* @@ -222,11 +224,13 @@ static long hung_timeout_jiffies(unsigned long last_checked, MAX_SCHEDULE_TIMEOUT; } +#ifdef CONFIG_SYSCTL /* * Process updating of timeout sysctl */ -int proc_dohung_task_timeout_secs(struct ctl_table *table, int write, - void *buffer, size_t *lenp, loff_t *ppos) +static int proc_dohung_task_timeout_secs(struct ctl_table *table, int write, + void __user *buffer, + size_t *lenp, loff_t *ppos) { int ret; @@ -241,6 +245,76 @@ int proc_dohung_task_timeout_secs(struct ctl_table *table, int write, return ret; } +/* + * This is needed for proc_doulongvec_minmax of sysctl_hung_task_timeout_secs + * and hung_task_check_interval_secs + */ +static const unsigned long hung_task_timeout_max = (LONG_MAX / HZ); +static struct ctl_table hung_task_sysctls[] = { +#ifdef CONFIG_SMP + { + .procname = "hung_task_all_cpu_backtrace", + .data = &sysctl_hung_task_all_cpu_backtrace, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, +#endif /* CONFIG_SMP */ + { + .procname = "hung_task_panic", + .data = &sysctl_hung_task_panic, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, + { + .procname = "hung_task_check_count", + .data = &sysctl_hung_task_check_count, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + }, + { + .procname = "hung_task_timeout_secs", + .data = &sysctl_hung_task_timeout_secs, + .maxlen = sizeof(unsigned long), + .mode = 0644, + .proc_handler = proc_dohung_task_timeout_secs, + .extra2 = (void *)&hung_task_timeout_max, + }, + { + .procname = "hung_task_check_interval_secs", + .data = &sysctl_hung_task_check_interval_secs, + .maxlen = sizeof(unsigned long), + .mode = 0644, + .proc_handler = proc_dohung_task_timeout_secs, + .extra2 = (void *)&hung_task_timeout_max, + }, + { + .procname = "hung_task_warnings", + .data = &sysctl_hung_task_warnings, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_NEG_ONE, + }, + {} +}; + +static void __init hung_task_sysctl_init(void) +{ + register_sysctl_init("kernel", hung_task_sysctls); +} +#else +#define hung_task_sysctl_init() do { } while (0) +#endif /* CONFIG_SYSCTL */ + + static atomic_t reset_hung_task = ATOMIC_INIT(0); void reset_hung_task_detector(void) @@ -310,6 +384,7 @@ static int __init hung_task_init(void) pm_notifier(hungtask_pm_notify, 0); watchdog_task = kthread_run(watchdog, NULL, "khungtaskd"); + hung_task_sysctl_init(); return 0; } diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index c09324663088..54af0deb239b 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -38,7 +38,7 @@ struct irqaction chained_action = { * @irq: irq number * @chip: pointer to irq chip description structure */ -int irq_set_chip(unsigned int irq, struct irq_chip *chip) +int irq_set_chip(unsigned int irq, const struct irq_chip *chip) { unsigned long flags; struct irq_desc *desc = irq_get_desc_lock(irq, &flags, 0); @@ -46,10 +46,7 @@ int irq_set_chip(unsigned int irq, struct irq_chip *chip) if (!desc) return -EINVAL; - if (!chip) - chip = &no_irq_chip; - - desc->irq_data.chip = chip; + desc->irq_data.chip = (struct irq_chip *)(chip ?: &no_irq_chip); irq_put_desc_unlock(desc, flags); /* * For !CONFIG_SPARSE_IRQ make the irq show up in @@ -1073,7 +1070,7 @@ irq_set_chained_handler_and_data(unsigned int irq, irq_flow_handler_t handle, EXPORT_SYMBOL_GPL(irq_set_chained_handler_and_data); void -irq_set_chip_and_handler_name(unsigned int irq, struct irq_chip *chip, +irq_set_chip_and_handler_name(unsigned int irq, const struct irq_chip *chip, irq_flow_handler_t handle, const char *name) { irq_set_chip(irq, chip); @@ -1558,6 +1555,14 @@ int irq_chip_compose_msi_msg(struct irq_data *data, struct msi_msg *msg) return 0; } +static struct device *irq_get_parent_device(struct irq_data *data) +{ + if (data->domain) + return data->domain->dev; + + return NULL; +} + /** * irq_chip_pm_get - Enable power for an IRQ chip * @data: Pointer to interrupt specific data @@ -1567,12 +1572,13 @@ int irq_chip_compose_msi_msg(struct irq_data *data, struct msi_msg *msg) */ int irq_chip_pm_get(struct irq_data *data) { + struct device *dev = irq_get_parent_device(data); int retval; - if (IS_ENABLED(CONFIG_PM) && data->chip->parent_device) { - retval = pm_runtime_get_sync(data->chip->parent_device); + if (IS_ENABLED(CONFIG_PM) && dev) { + retval = pm_runtime_get_sync(dev); if (retval < 0) { - pm_runtime_put_noidle(data->chip->parent_device); + pm_runtime_put_noidle(dev); return retval; } } @@ -1590,10 +1596,11 @@ int irq_chip_pm_get(struct irq_data *data) */ int irq_chip_pm_put(struct irq_data *data) { + struct device *dev = irq_get_parent_device(data); int retval = 0; - if (IS_ENABLED(CONFIG_PM) && data->chip->parent_device) - retval = pm_runtime_put(data->chip->parent_device); + if (IS_ENABLED(CONFIG_PM) && dev) + retval = pm_runtime_put(dev); return (retval < 0) ? retval : 0; } diff --git a/kernel/irq/cpuhotplug.c b/kernel/irq/cpuhotplug.c index 39a41c56ad4f..1ed2b1739363 100644 --- a/kernel/irq/cpuhotplug.c +++ b/kernel/irq/cpuhotplug.c @@ -176,10 +176,10 @@ static bool hk_should_isolate(struct irq_data *data, unsigned int cpu) { const struct cpumask *hk_mask; - if (!housekeeping_enabled(HK_FLAG_MANAGED_IRQ)) + if (!housekeeping_enabled(HK_TYPE_MANAGED_IRQ)) return false; - hk_mask = housekeeping_cpumask(HK_FLAG_MANAGED_IRQ); + hk_mask = housekeeping_cpumask(HK_TYPE_MANAGED_IRQ); if (cpumask_subset(irq_data_get_effective_affinity_mask(data), hk_mask)) return false; diff --git a/kernel/irq/debugfs.c b/kernel/irq/debugfs.c index e4cff358b437..2b43f5f5033d 100644 --- a/kernel/irq/debugfs.c +++ b/kernel/irq/debugfs.c @@ -69,8 +69,12 @@ irq_debug_show_chip(struct seq_file *m, struct irq_data *data, int ind) seq_printf(m, "chip: None\n"); return; } - seq_printf(m, "%*schip: %s\n", ind, "", chip->name); - seq_printf(m, "%*sflags: 0x%lx\n", ind + 1, "", chip->flags); + seq_printf(m, "%*schip: ", ind, ""); + if (chip->irq_print_chip) + chip->irq_print_chip(data, m); + else + seq_printf(m, "%s", chip->name); + seq_printf(m, "\n%*sflags: 0x%lx\n", ind + 1, "", chip->flags); irq_debug_show_bits(m, ind, chip->flags, irqchip_flags, ARRAY_SIZE(irqchip_flags)); } diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c index 2267e6527db3..939d21cd55c3 100644 --- a/kernel/irq/irqdesc.c +++ b/kernel/irq/irqdesc.c @@ -640,7 +640,7 @@ int handle_irq_desc(struct irq_desc *desc) return -EINVAL; data = irq_desc_get_irq_data(desc); - if (WARN_ON_ONCE(!in_irq() && handle_enforce_irqctx(data))) + if (WARN_ON_ONCE(!in_hardirq() && handle_enforce_irqctx(data))) return -EPERM; generic_handle_irq_desc(desc); @@ -662,6 +662,29 @@ int generic_handle_irq(unsigned int irq) } EXPORT_SYMBOL_GPL(generic_handle_irq); +/** + * generic_handle_irq_safe - Invoke the handler for a particular irq from any + * context. + * @irq: The irq number to handle + * + * Returns: 0 on success, a negative value on error. + * + * This function can be called from any context (IRQ or process context). It + * will report an error if not invoked from IRQ context and the irq has been + * marked to enforce IRQ-context only. + */ +int generic_handle_irq_safe(unsigned int irq) +{ + unsigned long flags; + int ret; + + local_irq_save(flags); + ret = handle_irq_desc(irq_to_desc(irq)); + local_irq_restore(flags); + return ret; +} +EXPORT_SYMBOL_GPL(generic_handle_irq_safe); + #ifdef CONFIG_IRQ_DOMAIN /** * generic_handle_domain_irq - Invoke the handler for a HW irq belonging @@ -676,7 +699,7 @@ EXPORT_SYMBOL_GPL(generic_handle_irq); */ int generic_handle_domain_irq(struct irq_domain *domain, unsigned int hwirq) { - WARN_ON_ONCE(!in_irq()); + WARN_ON_ONCE(!in_hardirq()); return handle_irq_desc(irq_resolve_mapping(domain, hwirq)); } EXPORT_SYMBOL_GPL(generic_handle_domain_irq); diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index bf38c546aa25..d5ce96510549 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -1319,7 +1319,8 @@ EXPORT_SYMBOL_GPL(irq_domain_get_irq_data); * @chip_data: The associated chip data */ int irq_domain_set_hwirq_and_chip(struct irq_domain *domain, unsigned int virq, - irq_hw_number_t hwirq, struct irq_chip *chip, + irq_hw_number_t hwirq, + const struct irq_chip *chip, void *chip_data) { struct irq_data *irq_data = irq_domain_get_irq_data(domain, virq); @@ -1328,7 +1329,7 @@ int irq_domain_set_hwirq_and_chip(struct irq_domain *domain, unsigned int virq, return -ENOENT; irq_data->hwirq = hwirq; - irq_data->chip = chip ? chip : &no_irq_chip; + irq_data->chip = (struct irq_chip *)(chip ? chip : &no_irq_chip); irq_data->chip_data = chip_data; return 0; @@ -1347,7 +1348,7 @@ EXPORT_SYMBOL_GPL(irq_domain_set_hwirq_and_chip); * @handler_name: The interrupt handler name */ void irq_domain_set_info(struct irq_domain *domain, unsigned int virq, - irq_hw_number_t hwirq, struct irq_chip *chip, + irq_hw_number_t hwirq, const struct irq_chip *chip, void *chip_data, irq_flow_handler_t handler, void *handler_data, const char *handler_name) { @@ -1853,7 +1854,7 @@ EXPORT_SYMBOL_GPL(irq_domain_get_irq_data); * @handler_name: The interrupt handler name */ void irq_domain_set_info(struct irq_domain *domain, unsigned int virq, - irq_hw_number_t hwirq, struct irq_chip *chip, + irq_hw_number_t hwirq, const struct irq_chip *chip, void *chip_data, irq_flow_handler_t handler, void *handler_data, const char *handler_name) { diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index f23ffd30385b..c03f71d5ec10 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -247,13 +247,13 @@ int irq_do_set_affinity(struct irq_data *data, const struct cpumask *mask, * online. */ if (irqd_affinity_is_managed(data) && - housekeeping_enabled(HK_FLAG_MANAGED_IRQ)) { + housekeeping_enabled(HK_TYPE_MANAGED_IRQ)) { const struct cpumask *hk_mask, *prog_mask; static DEFINE_RAW_SPINLOCK(tmp_mask_lock); static struct cpumask tmp_mask; - hk_mask = housekeeping_cpumask(HK_FLAG_MANAGED_IRQ); + hk_mask = housekeeping_cpumask(HK_TYPE_MANAGED_IRQ); raw_spin_lock(&tmp_mask_lock); cpumask_and(&tmp_mask, mask, hk_mask); diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c index ee595ec09778..623b8136e9af 100644 --- a/kernel/irq/proc.c +++ b/kernel/irq/proc.c @@ -137,7 +137,7 @@ static inline int irq_select_affinity_usr(unsigned int irq) static ssize_t write_irq_affinity(int type, struct file *file, const char __user *buffer, size_t count, loff_t *pos) { - unsigned int irq = (int)(long)PDE_DATA(file_inode(file)); + unsigned int irq = (int)(long)pde_data(file_inode(file)); cpumask_var_t new_value; int err; @@ -190,12 +190,12 @@ static ssize_t irq_affinity_list_proc_write(struct file *file, static int irq_affinity_proc_open(struct inode *inode, struct file *file) { - return single_open(file, irq_affinity_proc_show, PDE_DATA(inode)); + return single_open(file, irq_affinity_proc_show, pde_data(inode)); } static int irq_affinity_list_proc_open(struct inode *inode, struct file *file) { - return single_open(file, irq_affinity_list_proc_show, PDE_DATA(inode)); + return single_open(file, irq_affinity_list_proc_show, pde_data(inode)); } static const struct proc_ops irq_affinity_proc_ops = { @@ -265,7 +265,7 @@ out: static int default_affinity_open(struct inode *inode, struct file *file) { - return single_open(file, default_affinity_show, PDE_DATA(inode)); + return single_open(file, default_affinity_show, pde_data(inode)); } static const struct proc_ops default_affinity_proc_ops = { diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c index 951c93216fc4..79f2eb617a62 100644 --- a/kernel/kallsyms.c +++ b/kernel/kallsyms.c @@ -212,6 +212,10 @@ unsigned long kallsyms_lookup_name(const char *name) unsigned long i; unsigned int off; + /* Skip the search for empty string. */ + if (!*name) + return 0; + for (i = 0, off = 0; i < kallsyms_num_syms; i++) { off = kallsyms_expand_symbol(off, namebuf, ARRAY_SIZE(namebuf)); diff --git a/kernel/kcov.c b/kernel/kcov.c index 36ca640c4f8e..475524bd900a 100644 --- a/kernel/kcov.c +++ b/kernel/kcov.c @@ -459,37 +459,28 @@ void kcov_task_exit(struct task_struct *t) static int kcov_mmap(struct file *filep, struct vm_area_struct *vma) { int res = 0; - void *area; struct kcov *kcov = vma->vm_file->private_data; unsigned long size, off; struct page *page; unsigned long flags; - area = vmalloc_user(vma->vm_end - vma->vm_start); - if (!area) - return -ENOMEM; - spin_lock_irqsave(&kcov->lock, flags); size = kcov->size * sizeof(unsigned long); - if (kcov->mode != KCOV_MODE_INIT || vma->vm_pgoff != 0 || + if (kcov->area == NULL || vma->vm_pgoff != 0 || vma->vm_end - vma->vm_start != size) { res = -EINVAL; goto exit; } - if (!kcov->area) { - kcov->area = area; - vma->vm_flags |= VM_DONTEXPAND; - spin_unlock_irqrestore(&kcov->lock, flags); - for (off = 0; off < size; off += PAGE_SIZE) { - page = vmalloc_to_page(kcov->area + off); - if (vm_insert_page(vma, vma->vm_start + off, page)) - WARN_ONCE(1, "vm_insert_page() failed"); - } - return 0; + spin_unlock_irqrestore(&kcov->lock, flags); + vma->vm_flags |= VM_DONTEXPAND; + for (off = 0; off < size; off += PAGE_SIZE) { + page = vmalloc_to_page(kcov->area + off); + if (vm_insert_page(vma, vma->vm_start + off, page)) + WARN_ONCE(1, "vm_insert_page() failed"); } + return 0; exit: spin_unlock_irqrestore(&kcov->lock, flags); - vfree(area); return res; } @@ -564,31 +555,12 @@ static int kcov_ioctl_locked(struct kcov *kcov, unsigned int cmd, unsigned long arg) { struct task_struct *t; - unsigned long size, unused; + unsigned long flags, unused; int mode, i; struct kcov_remote_arg *remote_arg; struct kcov_remote *remote; - unsigned long flags; switch (cmd) { - case KCOV_INIT_TRACE: - /* - * Enable kcov in trace mode and setup buffer size. - * Must happen before anything else. - */ - if (kcov->mode != KCOV_MODE_DISABLED) - return -EBUSY; - /* - * Size must be at least 2 to hold current position and one PC. - * Later we allocate size * sizeof(unsigned long) memory, - * that must not overflow. - */ - size = arg; - if (size < 2 || size > INT_MAX / sizeof(unsigned long)) - return -EINVAL; - kcov->size = size; - kcov->mode = KCOV_MODE_INIT; - return 0; case KCOV_ENABLE: /* * Enable coverage for the current task. @@ -692,9 +664,37 @@ static long kcov_ioctl(struct file *filep, unsigned int cmd, unsigned long arg) struct kcov_remote_arg *remote_arg = NULL; unsigned int remote_num_handles; unsigned long remote_arg_size; - unsigned long flags; + unsigned long size, flags; + void *area; - if (cmd == KCOV_REMOTE_ENABLE) { + kcov = filep->private_data; + switch (cmd) { + case KCOV_INIT_TRACE: + /* + * Enable kcov in trace mode and setup buffer size. + * Must happen before anything else. + * + * First check the size argument - it must be at least 2 + * to hold the current position and one PC. + */ + size = arg; + if (size < 2 || size > INT_MAX / sizeof(unsigned long)) + return -EINVAL; + area = vmalloc_user(size * sizeof(unsigned long)); + if (area == NULL) + return -ENOMEM; + spin_lock_irqsave(&kcov->lock, flags); + if (kcov->mode != KCOV_MODE_DISABLED) { + spin_unlock_irqrestore(&kcov->lock, flags); + vfree(area); + return -EBUSY; + } + kcov->area = area; + kcov->size = size; + kcov->mode = KCOV_MODE_INIT; + spin_unlock_irqrestore(&kcov->lock, flags); + return 0; + case KCOV_REMOTE_ENABLE: if (get_user(remote_num_handles, (unsigned __user *)(arg + offsetof(struct kcov_remote_arg, num_handles)))) return -EFAULT; @@ -710,16 +710,18 @@ static long kcov_ioctl(struct file *filep, unsigned int cmd, unsigned long arg) return -EINVAL; } arg = (unsigned long)remote_arg; + fallthrough; + default: + /* + * All other commands can be normally executed under a spin lock, so we + * obtain and release it here in order to simplify kcov_ioctl_locked(). + */ + spin_lock_irqsave(&kcov->lock, flags); + res = kcov_ioctl_locked(kcov, cmd, arg); + spin_unlock_irqrestore(&kcov->lock, flags); + kfree(remote_arg); + return res; } - - kcov = filep->private_data; - spin_lock_irqsave(&kcov->lock, flags); - res = kcov_ioctl_locked(kcov, cmd, arg); - spin_unlock_irqrestore(&kcov->lock, flags); - - kfree(remote_arg); - - return res; } static const struct file_operations kcov_fops = { diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c index 5a5d192a89ac..68480f731192 100644 --- a/kernel/kexec_core.c +++ b/kernel/kexec_core.c @@ -81,7 +81,7 @@ int kexec_should_crash(struct task_struct *p) if (crash_kexec_post_notifiers) return 0; /* - * There are 4 panic() calls in do_exit() path, each of which + * There are 4 panic() calls in make_task_dead() path, each of which * corresponds to each of these 4 conditions. */ if (in_interrupt() || !p->pid || is_global_init(p) || panic_on_oops) diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 21eccc961bba..185badc780b7 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -48,6 +48,9 @@ #define KPROBE_HASH_BITS 6 #define KPROBE_TABLE_SIZE (1 << KPROBE_HASH_BITS) +#if !defined(CONFIG_OPTPROBES) || !defined(CONFIG_SYSCTL) +#define kprobe_sysctls_init() do { } while (0) +#endif static int kprobes_initialized; /* kprobe_table can be accessed by @@ -938,10 +941,10 @@ static void unoptimize_all_kprobes(void) } static DEFINE_MUTEX(kprobe_sysctl_mutex); -int sysctl_kprobes_optimization; -int proc_kprobes_optimization_handler(struct ctl_table *table, int write, - void *buffer, size_t *length, - loff_t *ppos) +static int sysctl_kprobes_optimization; +static int proc_kprobes_optimization_handler(struct ctl_table *table, + int write, void *buffer, + size_t *length, loff_t *ppos) { int ret; @@ -957,6 +960,24 @@ int proc_kprobes_optimization_handler(struct ctl_table *table, int write, return ret; } + +static struct ctl_table kprobe_sysctls[] = { + { + .procname = "kprobes-optimization", + .data = &sysctl_kprobes_optimization, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_kprobes_optimization_handler, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, + {} +}; + +static void __init kprobe_sysctls_init(void) +{ + register_sysctl_init("debug", kprobe_sysctls); +} #endif /* CONFIG_SYSCTL */ /* Put a breakpoint for a probe. */ @@ -1468,24 +1489,68 @@ bool within_kprobe_blacklist(unsigned long addr) } /* + * arch_adjust_kprobe_addr - adjust the address + * @addr: symbol base address + * @offset: offset within the symbol + * @on_func_entry: was this @addr+@offset on the function entry + * + * Typically returns @addr + @offset, except for special cases where the + * function might be prefixed by a CFI landing pad, in that case any offset + * inside the landing pad is mapped to the first 'real' instruction of the + * symbol. + * + * Specifically, for things like IBT/BTI, skip the resp. ENDBR/BTI.C + * instruction at +0. + */ +kprobe_opcode_t *__weak arch_adjust_kprobe_addr(unsigned long addr, + unsigned long offset, + bool *on_func_entry) +{ + *on_func_entry = !offset; + return (kprobe_opcode_t *)(addr + offset); +} + +/* * If 'symbol_name' is specified, look it up and add the 'offset' * to it. This way, we can specify a relative address to a symbol. * This returns encoded errors if it fails to look up symbol or invalid * combination of parameters. */ -static kprobe_opcode_t *_kprobe_addr(kprobe_opcode_t *addr, - const char *symbol_name, unsigned int offset) +static kprobe_opcode_t * +_kprobe_addr(kprobe_opcode_t *addr, const char *symbol_name, + unsigned long offset, bool *on_func_entry) { if ((symbol_name && addr) || (!symbol_name && !addr)) goto invalid; if (symbol_name) { + /* + * Input: @sym + @offset + * Output: @addr + @offset + * + * NOTE: kprobe_lookup_name() does *NOT* fold the offset + * argument into it's output! + */ addr = kprobe_lookup_name(symbol_name, offset); if (!addr) return ERR_PTR(-ENOENT); } - addr = (kprobe_opcode_t *)(((char *)addr) + offset); + /* + * So here we have @addr + @offset, displace it into a new + * @addr' + @offset' where @addr' is the symbol start address. + */ + addr = (void *)addr + offset; + if (!kallsyms_lookup_size_offset((unsigned long)addr, NULL, &offset)) + return ERR_PTR(-ENOENT); + addr = (void *)addr - offset; + + /* + * Then ask the architecture to re-combine them, taking care of + * magical function entry details while telling us if this was indeed + * at the start of the function. + */ + addr = arch_adjust_kprobe_addr((unsigned long)addr, offset, on_func_entry); if (addr) return addr; @@ -1495,7 +1560,8 @@ invalid: static kprobe_opcode_t *kprobe_addr(struct kprobe *p) { - return _kprobe_addr(p->addr, p->symbol_name, p->offset); + bool on_func_entry; + return _kprobe_addr(p->addr, p->symbol_name, p->offset, &on_func_entry); } /* @@ -1541,14 +1607,10 @@ static inline int warn_kprobe_rereg(struct kprobe *p) static int check_ftrace_location(struct kprobe *p) { - unsigned long ftrace_addr; + unsigned long addr = (unsigned long)p->addr; - ftrace_addr = ftrace_location((unsigned long)p->addr); - if (ftrace_addr) { + if (ftrace_location(addr) == addr) { #ifdef CONFIG_KPROBES_ON_FTRACE - /* Given address is not on the instruction boundary */ - if ((unsigned long)p->addr != ftrace_addr) - return -EILSEQ; p->flags |= KPROBE_FLAG_FTRACE; #else /* !CONFIG_KPROBES_ON_FTRACE */ return -EINVAL; @@ -2026,11 +2088,6 @@ static int pre_handler_kretprobe(struct kprobe *p, struct pt_regs *regs) } NOKPROBE_SYMBOL(pre_handler_kretprobe); -bool __weak arch_kprobe_on_func_entry(unsigned long offset) -{ - return !offset; -} - /** * kprobe_on_func_entry() -- check whether given address is function entry * @addr: Target address @@ -2046,15 +2103,13 @@ bool __weak arch_kprobe_on_func_entry(unsigned long offset) */ int kprobe_on_func_entry(kprobe_opcode_t *addr, const char *sym, unsigned long offset) { - kprobe_opcode_t *kp_addr = _kprobe_addr(addr, sym, offset); + bool on_func_entry; + kprobe_opcode_t *kp_addr = _kprobe_addr(addr, sym, offset, &on_func_entry); if (IS_ERR(kp_addr)) return PTR_ERR(kp_addr); - if (!kallsyms_lookup_size_offset((unsigned long)kp_addr, NULL, &offset)) - return -ENOENT; - - if (!arch_kprobe_on_func_entry(offset)) + if (!on_func_entry) return -EINVAL; return 0; @@ -2584,6 +2639,7 @@ static int __init init_kprobes(void) err = register_module_notifier(&kprobe_module_nb); kprobes_initialized = (err == 0); + kprobe_sysctls_init(); return err; } early_initcall(init_kprobes); diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c index 35859da8bd4f..b1292a57c2a5 100644 --- a/kernel/ksysfs.c +++ b/kernel/ksysfs.c @@ -24,8 +24,7 @@ static struct kobj_attribute _name##_attr = __ATTR_RO(_name) #define KERNEL_ATTR_RW(_name) \ -static struct kobj_attribute _name##_attr = \ - __ATTR(_name, 0644, _name##_show, _name##_store) +static struct kobj_attribute _name##_attr = __ATTR_RW(_name) /* current uevent sequence number */ static ssize_t uevent_seqnum_show(struct kobject *kobj, diff --git a/kernel/kthread.c b/kernel/kthread.c index 4ed9e7bce9e8..50265f69a135 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -52,14 +52,16 @@ struct kthread_create_info struct kthread { unsigned long flags; unsigned int cpu; + int result; int (*threadfn)(void *); void *data; - mm_segment_t oldfs; struct completion parked; struct completion exited; #ifdef CONFIG_BLK_CGROUP struct cgroup_subsys_state *blkcg_css; #endif + /* To store the full name if task comm is truncated. */ + char *full_name; }; enum KTHREAD_BITS { @@ -71,7 +73,7 @@ enum KTHREAD_BITS { static inline struct kthread *to_kthread(struct task_struct *k) { WARN_ON(!(k->flags & PF_KTHREAD)); - return (__force void *)k->set_child_tid; + return k->worker_private; } /* @@ -79,7 +81,7 @@ static inline struct kthread *to_kthread(struct task_struct *k) * * Per construction; when: * - * (p->flags & PF_KTHREAD) && p->set_child_tid + * (p->flags & PF_KTHREAD) && p->worker_private * * the task is both a kthread and struct kthread is persistent. However * PF_KTHREAD on it's own is not, kernel_thread() can exec() (See umh.c and @@ -87,26 +89,41 @@ static inline struct kthread *to_kthread(struct task_struct *k) */ static inline struct kthread *__to_kthread(struct task_struct *p) { - void *kthread = (__force void *)p->set_child_tid; + void *kthread = p->worker_private; if (kthread && !(p->flags & PF_KTHREAD)) kthread = NULL; return kthread; } -void set_kthread_struct(struct task_struct *p) +void get_kthread_comm(char *buf, size_t buf_size, struct task_struct *tsk) { - struct kthread *kthread; + struct kthread *kthread = to_kthread(tsk); - if (__to_kthread(p)) + if (!kthread || !kthread->full_name) { + __get_task_comm(buf, buf_size, tsk); return; + } + + strscpy_pad(buf, kthread->full_name, buf_size); +} + +bool set_kthread_struct(struct task_struct *p) +{ + struct kthread *kthread; + + if (WARN_ON_ONCE(to_kthread(p))) + return false; kthread = kzalloc(sizeof(*kthread), GFP_KERNEL); - /* - * We abuse ->set_child_tid to avoid the new member and because it - * can't be wrongly copied by copy_process(). We also rely on fact - * that the caller can't exec, so PF_KTHREAD can't be cleared. - */ - p->set_child_tid = (__force void __user *)kthread; + if (!kthread) + return false; + + init_completion(&kthread->exited); + init_completion(&kthread->parked); + p->vfork_done = &kthread->exited; + + p->worker_private = kthread; + return true; } void free_kthread_struct(struct task_struct *k) @@ -114,13 +131,17 @@ void free_kthread_struct(struct task_struct *k) struct kthread *kthread; /* - * Can be NULL if this kthread was created by kernel_thread() - * or if kmalloc() in kthread() failed. + * Can be NULL if kmalloc() in set_kthread_struct() failed. */ kthread = to_kthread(k); + if (!kthread) + return; + #ifdef CONFIG_BLK_CGROUP - WARN_ON_ONCE(kthread && kthread->blkcg_css); + WARN_ON_ONCE(kthread->blkcg_css); #endif + k->worker_private = NULL; + kfree(kthread->full_name); kfree(kthread); } @@ -268,6 +289,44 @@ void kthread_parkme(void) } EXPORT_SYMBOL_GPL(kthread_parkme); +/** + * kthread_exit - Cause the current kthread return @result to kthread_stop(). + * @result: The integer value to return to kthread_stop(). + * + * While kthread_exit can be called directly, it exists so that + * functions which do some additional work in non-modular code such as + * module_put_and_kthread_exit can be implemented. + * + * Does not return. + */ +void __noreturn kthread_exit(long result) +{ + struct kthread *kthread = to_kthread(current); + kthread->result = result; + do_exit(0); +} + +/** + * kthread_complete_and_exit - Exit the current kthread. + * @comp: Completion to complete + * @code: The integer value to return to kthread_stop(). + * + * If present complete @comp and the reuturn code to kthread_stop(). + * + * A kernel thread whose module may be removed after the completion of + * @comp can use this function exit safely. + * + * Does not return. + */ +void __noreturn kthread_complete_and_exit(struct completion *comp, long code) +{ + if (comp) + complete(comp); + + kthread_exit(code); +} +EXPORT_SYMBOL(kthread_complete_and_exit); + static int kthread(void *_create) { static const struct sched_param param = { .sched_priority = 0 }; @@ -279,34 +338,24 @@ static int kthread(void *_create) struct kthread *self; int ret; - set_kthread_struct(current); self = to_kthread(current); /* If user was SIGKILLed, I release the structure. */ done = xchg(&create->done, NULL); if (!done) { kfree(create); - do_exit(-EINTR); - } - - if (!self) { - create->result = ERR_PTR(-ENOMEM); - complete(done); - do_exit(-ENOMEM); + kthread_exit(-EINTR); } self->threadfn = threadfn; self->data = data; - init_completion(&self->exited); - init_completion(&self->parked); - current->vfork_done = &self->exited; /* * The new thread inherited kthreadd's priority and CPU mask. Reset * back to default in case they have been changed. */ sched_setscheduler_nocheck(current, SCHED_NORMAL, ¶m); - set_cpus_allowed_ptr(current, housekeeping_cpumask(HK_FLAG_KTHREAD)); + set_cpus_allowed_ptr(current, housekeeping_cpumask(HK_TYPE_KTHREAD)); /* OK, tell user we're spawned, wait for stop or wakeup */ __set_current_state(TASK_UNINTERRUPTIBLE); @@ -326,7 +375,7 @@ static int kthread(void *_create) __kthread_parkme(self); ret = threadfn(data); } - do_exit(ret); + kthread_exit(ret); } /* called from kernel_clone() to get node information for about to be created task */ @@ -406,12 +455,22 @@ struct task_struct *__kthread_create_on_node(int (*threadfn)(void *data), task = create->result; if (!IS_ERR(task)) { char name[TASK_COMM_LEN]; + va_list aq; + int len; /* * task is already visible to other tasks, so updating * COMM must be protected. */ - vsnprintf(name, sizeof(name), namefmt, args); + va_copy(aq, args); + len = vsnprintf(name, sizeof(name), namefmt, aq); + va_end(aq); + if (len >= TASK_COMM_LEN) { + struct kthread *kthread = to_kthread(task); + + /* leave it truncated when out of memory. */ + kthread->full_name = kvasprintf(GFP_KERNEL, namefmt, args); + } set_task_comm(task, name); } kfree(create); @@ -628,7 +687,7 @@ EXPORT_SYMBOL_GPL(kthread_park); * instead of calling wake_up_process(): the thread will exit without * calling threadfn(). * - * If threadfn() may call do_exit() itself, the caller must ensure + * If threadfn() may call kthread_exit() itself, the caller must ensure * task_struct can't go away. * * Returns the result of threadfn(), or %-EINTR if wake_up_process() @@ -647,7 +706,7 @@ int kthread_stop(struct task_struct *k) kthread_unpark(k); wake_up_process(k); wait_for_completion(&kthread->exited); - ret = k->exit_code; + ret = kthread->result; put_task_struct(k); trace_sched_kthread_stop_ret(ret); @@ -662,7 +721,7 @@ int kthreadd(void *unused) /* Setup a clean context for our children to inherit. */ set_task_comm(tsk, "kthreadd"); ignore_signals(tsk); - set_cpus_allowed_ptr(tsk, housekeeping_cpumask(HK_FLAG_KTHREAD)); + set_cpus_allowed_ptr(tsk, housekeeping_cpumask(HK_TYPE_KTHREAD)); set_mems_allowed(node_states[N_MEMORY]); current->flags |= PF_NOFREEZE; @@ -1381,8 +1440,6 @@ void kthread_use_mm(struct mm_struct *mm) mmdrop(active_mm); else smp_mb(); - - to_kthread(tsk)->oldfs = force_uaccess_begin(); } EXPORT_SYMBOL_GPL(kthread_use_mm); @@ -1397,8 +1454,6 @@ void kthread_unuse_mm(struct mm_struct *mm) WARN_ON_ONCE(!(tsk->flags & PF_KTHREAD)); WARN_ON_ONCE(!tsk->mm); - force_uaccess_end(to_kthread(tsk)->oldfs); - task_lock(tsk); /* * When a kthread stops operating on an address space, the loop diff --git a/kernel/livepatch/core.c b/kernel/livepatch/core.c index 585494ec464f..bc475e62279d 100644 --- a/kernel/livepatch/core.c +++ b/kernel/livepatch/core.c @@ -190,7 +190,7 @@ static int klp_find_object_symbol(const char *objname, const char *name, return -EINVAL; } -static int klp_resolve_symbols(Elf64_Shdr *sechdrs, const char *strtab, +static int klp_resolve_symbols(Elf_Shdr *sechdrs, const char *strtab, unsigned int symndx, Elf_Shdr *relasec, const char *sec_objname) { @@ -218,7 +218,7 @@ static int klp_resolve_symbols(Elf64_Shdr *sechdrs, const char *strtab, relas = (Elf_Rela *) relasec->sh_addr; /* For each rela in this klp relocation section */ for (i = 0; i < relasec->sh_size / sizeof(Elf_Rela); i++) { - sym = (Elf64_Sym *)sechdrs[symndx].sh_addr + ELF_R_SYM(relas[i].r_info); + sym = (Elf_Sym *)sechdrs[symndx].sh_addr + ELF_R_SYM(relas[i].r_info); if (sym->st_shndx != SHN_LIVEPATCH) { pr_err("symbol %s is not marked as a livepatch symbol\n", strtab + sym->st_name); diff --git a/kernel/livepatch/patch.c b/kernel/livepatch/patch.c index fe316c021d73..c172bf92b576 100644 --- a/kernel/livepatch/patch.c +++ b/kernel/livepatch/patch.c @@ -124,19 +124,6 @@ unlock: ftrace_test_recursion_unlock(bit); } -/* - * Convert a function address into the appropriate ftrace location. - * - * Usually this is just the address of the function, but on some architectures - * it's more complicated so allow them to provide a custom behaviour. - */ -#ifndef klp_get_ftrace_location -static unsigned long klp_get_ftrace_location(unsigned long faddr) -{ - return faddr; -} -#endif - static void klp_unpatch_func(struct klp_func *func) { struct klp_ops *ops; @@ -153,8 +140,7 @@ static void klp_unpatch_func(struct klp_func *func) if (list_is_singular(&ops->func_stack)) { unsigned long ftrace_loc; - ftrace_loc = - klp_get_ftrace_location((unsigned long)func->old_func); + ftrace_loc = ftrace_location((unsigned long)func->old_func); if (WARN_ON(!ftrace_loc)) return; @@ -186,8 +172,7 @@ static int klp_patch_func(struct klp_func *func) if (!ops) { unsigned long ftrace_loc; - ftrace_loc = - klp_get_ftrace_location((unsigned long)func->old_func); + ftrace_loc = ftrace_location((unsigned long)func->old_func); if (!ftrace_loc) { pr_err("failed to find location for function '%s'\n", func->old_name); diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index 4a882f83aeb9..c06cab6546ed 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -183,11 +183,9 @@ static DECLARE_BITMAP(list_entries_in_use, MAX_LOCKDEP_ENTRIES); static struct hlist_head lock_keys_hash[KEYHASH_SIZE]; unsigned long nr_lock_classes; unsigned long nr_zapped_classes; -#ifndef CONFIG_DEBUG_LOCKDEP -static -#endif +unsigned long max_lock_class_idx; struct lock_class lock_classes[MAX_LOCKDEP_KEYS]; -static DECLARE_BITMAP(lock_classes_in_use, MAX_LOCKDEP_KEYS); +DECLARE_BITMAP(lock_classes_in_use, MAX_LOCKDEP_KEYS); static inline struct lock_class *hlock_class(struct held_lock *hlock) { @@ -338,7 +336,7 @@ static inline void lock_release_holdtime(struct held_lock *hlock) * elements. These elements are linked together by the lock_entry member in * struct lock_class. */ -LIST_HEAD(all_lock_classes); +static LIST_HEAD(all_lock_classes); static LIST_HEAD(free_lock_classes); /** @@ -1252,6 +1250,7 @@ register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force) struct lockdep_subclass_key *key; struct hlist_head *hash_head; struct lock_class *class; + int idx; DEBUG_LOCKS_WARN_ON(!irqs_disabled()); @@ -1317,6 +1316,9 @@ register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force) * of classes. */ list_move_tail(&class->lock_entry, &all_lock_classes); + idx = class - lock_classes; + if (idx > max_lock_class_idx) + max_lock_class_idx = idx; if (verbose(class)) { graph_unlock(); @@ -3462,7 +3464,7 @@ struct lock_class *lock_chain_get_class(struct lock_chain *chain, int i) u16 chain_hlock = chain_hlocks[chain->base + i]; unsigned int class_idx = chain_hlock_class_idx(chain_hlock); - return lock_classes + class_idx - 1; + return lock_classes + class_idx; } /* @@ -3530,7 +3532,7 @@ static void print_chain_keys_chain(struct lock_chain *chain) hlock_id = chain_hlocks[chain->base + i]; chain_key = print_chain_key_iteration(hlock_id, chain_key); - print_lock_name(lock_classes + chain_hlock_class_idx(hlock_id) - 1); + print_lock_name(lock_classes + chain_hlock_class_idx(hlock_id)); printk("\n"); } } @@ -6000,6 +6002,8 @@ static void zap_class(struct pending_free *pf, struct lock_class *class) WRITE_ONCE(class->name, NULL); nr_lock_classes--; __clear_bit(class - lock_classes, lock_classes_in_use); + if (class - lock_classes == max_lock_class_idx) + max_lock_class_idx--; } else { WARN_ONCE(true, "%s() failed for class %s\n", __func__, class->name); @@ -6011,13 +6015,10 @@ static void zap_class(struct pending_free *pf, struct lock_class *class) static void reinit_class(struct lock_class *class) { - void *const p = class; - const unsigned int offset = offsetof(struct lock_class, key); - WARN_ON_ONCE(!class->lock_entry.next); WARN_ON_ONCE(!list_empty(&class->locks_after)); WARN_ON_ONCE(!list_empty(&class->locks_before)); - memset(p + offset, 0, sizeof(*class) - offset); + memset_startat(class, 0, key); WARN_ON_ONCE(!class->lock_entry.next); WARN_ON_ONCE(!list_empty(&class->locks_after)); WARN_ON_ONCE(!list_empty(&class->locks_before)); @@ -6290,7 +6291,13 @@ void lockdep_reset_lock(struct lockdep_map *lock) lockdep_reset_lock_reg(lock); } -/* Unregister a dynamically allocated key. */ +/* + * Unregister a dynamically allocated key. + * + * Unlike lockdep_register_key(), a search is always done to find a matching + * key irrespective of debug_locks to avoid potential invalid access to freed + * memory in lock_class entry. + */ void lockdep_unregister_key(struct lock_class_key *key) { struct hlist_head *hash_head = keyhashentry(key); @@ -6305,10 +6312,8 @@ void lockdep_unregister_key(struct lock_class_key *key) return; raw_local_irq_save(flags); - if (!graph_lock()) - goto out_irq; + lockdep_lock(); - pf = get_pending_free(); hlist_for_each_entry_rcu(k, hash_head, hash_entry) { if (k == key) { hlist_del_rcu(&k->hash_entry); @@ -6316,11 +6321,13 @@ void lockdep_unregister_key(struct lock_class_key *key) break; } } - WARN_ON_ONCE(!found); - __lockdep_free_key_range(pf, key, 1); - call_rcu_zapped(pf); - graph_unlock(); -out_irq: + WARN_ON_ONCE(!found && debug_locks); + if (found) { + pf = get_pending_free(); + __lockdep_free_key_range(pf, key, 1); + call_rcu_zapped(pf); + } + lockdep_unlock(); raw_local_irq_restore(flags); /* Wait until is_dynamic_key() has finished accessing k->hash_entry. */ diff --git a/kernel/locking/lockdep_internals.h b/kernel/locking/lockdep_internals.h index ecb8662e7a4e..bbe9000260d0 100644 --- a/kernel/locking/lockdep_internals.h +++ b/kernel/locking/lockdep_internals.h @@ -121,7 +121,6 @@ static const unsigned long LOCKF_USED_IN_IRQ_READ = #define MAX_LOCKDEP_CHAIN_HLOCKS (MAX_LOCKDEP_CHAINS*5) -extern struct list_head all_lock_classes; extern struct lock_chain lock_chains[]; #define LOCK_USAGE_CHARS (2*XXX_LOCK_USAGE_STATES + 1) @@ -151,6 +150,10 @@ extern unsigned int nr_large_chain_blocks; extern unsigned int max_lockdep_depth; extern unsigned int max_bfs_queue_depth; +extern unsigned long max_lock_class_idx; + +extern struct lock_class lock_classes[MAX_LOCKDEP_KEYS]; +extern unsigned long lock_classes_in_use[]; #ifdef CONFIG_PROVE_LOCKING extern unsigned long lockdep_count_forward_deps(struct lock_class *); @@ -205,7 +208,6 @@ struct lockdep_stats { }; DECLARE_PER_CPU(struct lockdep_stats, lockdep_stats); -extern struct lock_class lock_classes[MAX_LOCKDEP_KEYS]; #define __debug_atomic_inc(ptr) \ this_cpu_inc(lockdep_stats.ptr); diff --git a/kernel/locking/lockdep_proc.c b/kernel/locking/lockdep_proc.c index b8d9a050c337..15fdc7fa5c68 100644 --- a/kernel/locking/lockdep_proc.c +++ b/kernel/locking/lockdep_proc.c @@ -24,14 +24,33 @@ #include "lockdep_internals.h" +/* + * Since iteration of lock_classes is done without holding the lockdep lock, + * it is not safe to iterate all_lock_classes list directly as the iteration + * may branch off to free_lock_classes or the zapped list. Iteration is done + * directly on the lock_classes array by checking the lock_classes_in_use + * bitmap and max_lock_class_idx. + */ +#define iterate_lock_classes(idx, class) \ + for (idx = 0, class = lock_classes; idx <= max_lock_class_idx; \ + idx++, class++) + static void *l_next(struct seq_file *m, void *v, loff_t *pos) { - return seq_list_next(v, &all_lock_classes, pos); + struct lock_class *class = v; + + ++class; + *pos = class - lock_classes; + return (*pos > max_lock_class_idx) ? NULL : class; } static void *l_start(struct seq_file *m, loff_t *pos) { - return seq_list_start_head(&all_lock_classes, *pos); + unsigned long idx = *pos; + + if (idx > max_lock_class_idx) + return NULL; + return lock_classes + idx; } static void l_stop(struct seq_file *m, void *v) @@ -57,14 +76,16 @@ static void print_name(struct seq_file *m, struct lock_class *class) static int l_show(struct seq_file *m, void *v) { - struct lock_class *class = list_entry(v, struct lock_class, lock_entry); + struct lock_class *class = v; struct lock_list *entry; char usage[LOCK_USAGE_CHARS]; + int idx = class - lock_classes; - if (v == &all_lock_classes) { + if (v == lock_classes) seq_printf(m, "all lock classes:\n"); + + if (!test_bit(idx, lock_classes_in_use)) return 0; - } seq_printf(m, "%p", class->key); #ifdef CONFIG_DEBUG_LOCKDEP @@ -220,8 +241,11 @@ static int lockdep_stats_show(struct seq_file *m, void *v) #ifdef CONFIG_PROVE_LOCKING struct lock_class *class; + unsigned long idx; - list_for_each_entry(class, &all_lock_classes, lock_entry) { + iterate_lock_classes(idx, class) { + if (!test_bit(idx, lock_classes_in_use)) + continue; if (class->usage_mask == 0) nr_unused++; @@ -254,6 +278,7 @@ static int lockdep_stats_show(struct seq_file *m, void *v) sum_forward_deps += lockdep_count_forward_deps(class); } + #ifdef CONFIG_DEBUG_LOCKDEP DEBUG_LOCKS_WARN_ON(debug_atomic_read(nr_unused_locks) != nr_unused); #endif @@ -345,6 +370,8 @@ static int lockdep_stats_show(struct seq_file *m, void *v) seq_printf(m, " max bfs queue depth: %11u\n", max_bfs_queue_depth); #endif + seq_printf(m, " max lock class index: %11lu\n", + max_lock_class_idx); lockdep_stats_debug_show(m); seq_printf(m, " debug_locks: %11u\n", debug_locks); @@ -622,12 +649,16 @@ static int lock_stat_open(struct inode *inode, struct file *file) if (!res) { struct lock_stat_data *iter = data->stats; struct seq_file *m = file->private_data; + unsigned long idx; - list_for_each_entry(class, &all_lock_classes, lock_entry) { + iterate_lock_classes(idx, class) { + if (!test_bit(idx, lock_classes_in_use)) + continue; iter->class = class; iter->stats = lock_stats(class); iter++; } + data->iter_end = iter; sort(data->stats, data->iter_end - data->stats, @@ -645,6 +676,7 @@ static ssize_t lock_stat_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos) { struct lock_class *class; + unsigned long idx; char c; if (count) { @@ -654,8 +686,11 @@ static ssize_t lock_stat_write(struct file *file, const char __user *buf, if (c != '0') return count; - list_for_each_entry(class, &all_lock_classes, lock_entry) + iterate_lock_classes(idx, class) { + if (!test_bit(idx, lock_classes_in_use)) + continue; clear_lock_stats(class); + } } return count; } diff --git a/kernel/locking/percpu-rwsem.c b/kernel/locking/percpu-rwsem.c index 70a32a576f3f..c9fdae94e098 100644 --- a/kernel/locking/percpu-rwsem.c +++ b/kernel/locking/percpu-rwsem.c @@ -7,6 +7,7 @@ #include <linux/rcupdate.h> #include <linux/sched.h> #include <linux/sched/task.h> +#include <linux/sched/debug.h> #include <linux/errno.h> int __percpu_init_rwsem(struct percpu_rw_semaphore *sem, @@ -162,7 +163,7 @@ static void percpu_rwsem_wait(struct percpu_rw_semaphore *sem, bool reader) __set_current_state(TASK_RUNNING); } -bool __percpu_down_read(struct percpu_rw_semaphore *sem, bool try) +bool __sched __percpu_down_read(struct percpu_rw_semaphore *sem, bool try) { if (__percpu_down_read_trylock(sem)) return true; @@ -211,7 +212,7 @@ static bool readers_active_check(struct percpu_rw_semaphore *sem) return true; } -void percpu_down_write(struct percpu_rw_semaphore *sem) +void __sched percpu_down_write(struct percpu_rw_semaphore *sem) { might_sleep(); rwsem_acquire(&sem->dep_map, 0, 0, _RET_IP_); diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c index 69aba4abe104..acde5d6f1254 100644 --- a/kernel/locking/rwsem.c +++ b/kernel/locking/rwsem.c @@ -1048,7 +1048,7 @@ out_nolock: /* * Wait until we successfully acquire the write lock */ -static struct rw_semaphore * +static struct rw_semaphore __sched * rwsem_down_write_slowpath(struct rw_semaphore *sem, int state) { long count; diff --git a/kernel/locking/spinlock.c b/kernel/locking/spinlock.c index b562f9289372..7f49baaa4979 100644 --- a/kernel/locking/spinlock.c +++ b/kernel/locking/spinlock.c @@ -300,6 +300,16 @@ void __lockfunc _raw_write_lock(rwlock_t *lock) __raw_write_lock(lock); } EXPORT_SYMBOL(_raw_write_lock); + +#ifndef CONFIG_DEBUG_LOCK_ALLOC +#define __raw_write_lock_nested(lock, subclass) __raw_write_lock(((void)(subclass), (lock))) +#endif + +void __lockfunc _raw_write_lock_nested(rwlock_t *lock, int subclass) +{ + __raw_write_lock_nested(lock, subclass); +} +EXPORT_SYMBOL(_raw_write_lock_nested); #endif #ifndef CONFIG_INLINE_WRITE_LOCK_IRQSAVE diff --git a/kernel/locking/spinlock_rt.c b/kernel/locking/spinlock_rt.c index 9e396a09fe0f..48a19ed8486d 100644 --- a/kernel/locking/spinlock_rt.c +++ b/kernel/locking/spinlock_rt.c @@ -239,6 +239,18 @@ void __sched rt_write_lock(rwlock_t *rwlock) } EXPORT_SYMBOL(rt_write_lock); +#ifdef CONFIG_DEBUG_LOCK_ALLOC +void __sched rt_write_lock_nested(rwlock_t *rwlock, int subclass) +{ + rtlock_might_resched(); + rwlock_acquire(&rwlock->dep_map, subclass, 0, _RET_IP_); + rwbase_write_lock(&rwlock->rwbase, TASK_RTLOCK_WAIT); + rcu_read_lock(); + migrate_disable(); +} +EXPORT_SYMBOL(rt_write_lock_nested); +#endif + void __sched rt_read_unlock(rwlock_t *rwlock) { rwlock_release(&rwlock->dep_map, _RET_IP_); diff --git a/kernel/module-internal.h b/kernel/module-internal.h index 33783abc377b..8c381c99062f 100644 --- a/kernel/module-internal.h +++ b/kernel/module-internal.h @@ -23,9 +23,28 @@ struct load_info { #ifdef CONFIG_KALLSYMS unsigned long mod_kallsyms_init_off; #endif +#ifdef CONFIG_MODULE_DECOMPRESS + struct page **pages; + unsigned int max_pages; + unsigned int used_pages; +#endif struct { unsigned int sym, str, mod, vers, info, pcpu; } index; }; extern int mod_verify_sig(const void *mod, struct load_info *info); + +#ifdef CONFIG_MODULE_DECOMPRESS +int module_decompress(struct load_info *info, const void *buf, size_t size); +void module_decompress_cleanup(struct load_info *info); +#else +static inline int module_decompress(struct load_info *info, + const void *buf, size_t size) +{ + return -EOPNOTSUPP; +} +static inline void module_decompress_cleanup(struct load_info *info) +{ +} +#endif diff --git a/kernel/module.c b/kernel/module.c index 24fc305afe95..6cea788fd965 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -335,14 +335,14 @@ static inline void add_taint_module(struct module *mod, unsigned flag, /* * A thread that wants to hold a reference to a module only while it - * is running can call this to safely exit. nfsd and lockd use this. + * is running can call this to safely exit. */ -void __noreturn __module_put_and_exit(struct module *mod, long code) +void __noreturn __module_put_and_kthread_exit(struct module *mod, long code) { module_put(mod); - do_exit(code); + kthread_exit(code); } -EXPORT_SYMBOL(__module_put_and_exit); +EXPORT_SYMBOL(__module_put_and_kthread_exit); /* Find a module section: 0 means not found. */ static unsigned int find_sec(const struct load_info *info, const char *name) @@ -958,7 +958,6 @@ SYSCALL_DEFINE2(delete_module, const char __user *, name_user, } } - /* Stop the machine so refcounts can't move and disable module. */ ret = try_stop_module(mod, flags, &forced); if (ret != 0) goto out; @@ -2884,12 +2883,13 @@ static int module_sig_check(struct load_info *info, int flags) const unsigned long markerlen = sizeof(MODULE_SIG_STRING) - 1; const char *reason; const void *mod = info->hdr; - + bool mangled_module = flags & (MODULE_INIT_IGNORE_MODVERSIONS | + MODULE_INIT_IGNORE_VERMAGIC); /* - * Require flags == 0, as a module with version information - * removed is no longer the module that was signed + * Do not allow mangled modules as a module with version information + * removed is no longer the module that was signed. */ - if (flags == 0 && + if (!mangled_module && info->len > markerlen && memcmp(mod + info->len - markerlen, MODULE_SIG_STRING, markerlen) == 0) { /* We truncate the module to discard the signature */ @@ -3174,9 +3174,12 @@ out: return err; } -static void free_copy(struct load_info *info) +static void free_copy(struct load_info *info, int flags) { - vfree(info->hdr); + if (flags & MODULE_INIT_COMPRESSED_FILE) + module_decompress_cleanup(info); + else + vfree(info->hdr); } static int rewrite_section_headers(struct load_info *info, int flags) @@ -3722,12 +3725,6 @@ static noinline int do_init_module(struct module *mod) } freeinit->module_init = mod->init_layout.base; - /* - * We want to find out whether @mod uses async during init. Clear - * PF_USED_ASYNC. async_schedule*() will set it. - */ - current->flags &= ~PF_USED_ASYNC; - do_mod_ctors(mod); /* Start the module */ if (mod->init != NULL) @@ -3753,22 +3750,13 @@ static noinline int do_init_module(struct module *mod) /* * We need to finish all async code before the module init sequence - * is done. This has potential to deadlock. For example, a newly - * detected block device can trigger request_module() of the - * default iosched from async probing task. Once userland helper - * reaches here, async_synchronize_full() will wait on the async - * task waiting on request_module() and deadlock. - * - * This deadlock is avoided by perfomring async_synchronize_full() - * iff module init queued any async jobs. This isn't a full - * solution as it will deadlock the same if module loading from - * async jobs nests more than once; however, due to the various - * constraints, this hack seems to be the best option for now. - * Please refer to the following thread for details. + * is done. This has potential to deadlock if synchronous module + * loading is requested from async (which is not allowed!). * - * http://thread.gmane.org/gmane.linux.kernel/1420814 + * See commit 0fdff3ec6d87 ("async, kmod: warn on synchronous + * request_module() from async workers") for more details. */ - if (!mod->async_probe_requested && (current->flags & PF_USED_ASYNC)) + if (!mod->async_probe_requested) async_synchronize_full(); ftrace_free_mem(mod, mod->init_layout.base, mod->init_layout.base + @@ -4125,7 +4113,7 @@ static int load_module(struct load_info *info, const char __user *uargs, } /* Get rid of temporary copy. */ - free_copy(info); + free_copy(info, flags); /* Done! */ trace_module_load(mod); @@ -4174,7 +4162,7 @@ static int load_module(struct load_info *info, const char __user *uargs, module_deallocate(mod, info); free_copy: - free_copy(info); + free_copy(info, flags); return err; } @@ -4201,7 +4189,8 @@ SYSCALL_DEFINE3(init_module, void __user *, umod, SYSCALL_DEFINE3(finit_module, int, fd, const char __user *, uargs, int, flags) { struct load_info info = { }; - void *hdr = NULL; + void *buf = NULL; + int len; int err; err = may_init_module(); @@ -4211,15 +4200,24 @@ SYSCALL_DEFINE3(finit_module, int, fd, const char __user *, uargs, int, flags) pr_debug("finit_module: fd=%d, uargs=%p, flags=%i\n", fd, uargs, flags); if (flags & ~(MODULE_INIT_IGNORE_MODVERSIONS - |MODULE_INIT_IGNORE_VERMAGIC)) + |MODULE_INIT_IGNORE_VERMAGIC + |MODULE_INIT_COMPRESSED_FILE)) return -EINVAL; - err = kernel_read_file_from_fd(fd, 0, &hdr, INT_MAX, NULL, + len = kernel_read_file_from_fd(fd, 0, &buf, INT_MAX, NULL, READING_MODULE); - if (err < 0) - return err; - info.hdr = hdr; - info.len = err; + if (len < 0) + return len; + + if (flags & MODULE_INIT_COMPRESSED_FILE) { + err = module_decompress(&info, buf, len); + vfree(buf); /* compressed data is no longer needed */ + if (err) + return err; + } else { + info.hdr = buf; + info.len = len; + } return load_module(&info, uargs, flags); } diff --git a/kernel/module_decompress.c b/kernel/module_decompress.c new file mode 100644 index 000000000000..ffef98a20320 --- /dev/null +++ b/kernel/module_decompress.c @@ -0,0 +1,273 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright 2021 Google LLC. + */ + +#include <linux/init.h> +#include <linux/highmem.h> +#include <linux/kobject.h> +#include <linux/mm.h> +#include <linux/module.h> +#include <linux/slab.h> +#include <linux/sysfs.h> +#include <linux/vmalloc.h> + +#include "module-internal.h" + +static int module_extend_max_pages(struct load_info *info, unsigned int extent) +{ + struct page **new_pages; + + new_pages = kvmalloc_array(info->max_pages + extent, + sizeof(info->pages), GFP_KERNEL); + if (!new_pages) + return -ENOMEM; + + memcpy(new_pages, info->pages, info->max_pages * sizeof(info->pages)); + kvfree(info->pages); + info->pages = new_pages; + info->max_pages += extent; + + return 0; +} + +static struct page *module_get_next_page(struct load_info *info) +{ + struct page *page; + int error; + + if (info->max_pages == info->used_pages) { + error = module_extend_max_pages(info, info->used_pages); + if (error) + return ERR_PTR(error); + } + + page = alloc_page(GFP_KERNEL | __GFP_HIGHMEM); + if (!page) + return ERR_PTR(-ENOMEM); + + info->pages[info->used_pages++] = page; + return page; +} + +#ifdef CONFIG_MODULE_COMPRESS_GZIP +#include <linux/zlib.h> +#define MODULE_COMPRESSION gzip +#define MODULE_DECOMPRESS_FN module_gzip_decompress + +/* + * Calculate length of the header which consists of signature, header + * flags, time stamp and operating system ID (10 bytes total), plus + * an optional filename. + */ +static size_t module_gzip_header_len(const u8 *buf, size_t size) +{ + const u8 signature[] = { 0x1f, 0x8b, 0x08 }; + size_t len = 10; + + if (size < len || memcmp(buf, signature, sizeof(signature))) + return 0; + + if (buf[3] & 0x08) { + do { + /* + * If we can't find the end of the file name we must + * be dealing with a corrupted file. + */ + if (len == size) + return 0; + } while (buf[len++] != '\0'); + } + + return len; +} + +static ssize_t module_gzip_decompress(struct load_info *info, + const void *buf, size_t size) +{ + struct z_stream_s s = { 0 }; + size_t new_size = 0; + size_t gzip_hdr_len; + ssize_t retval; + int rc; + + gzip_hdr_len = module_gzip_header_len(buf, size); + if (!gzip_hdr_len) { + pr_err("not a gzip compressed module\n"); + return -EINVAL; + } + + s.next_in = buf + gzip_hdr_len; + s.avail_in = size - gzip_hdr_len; + + s.workspace = kmalloc(zlib_inflate_workspacesize(), GFP_KERNEL); + if (!s.workspace) + return -ENOMEM; + + rc = zlib_inflateInit2(&s, -MAX_WBITS); + if (rc != Z_OK) { + pr_err("failed to initialize decompressor: %d\n", rc); + retval = -EINVAL; + goto out; + } + + do { + struct page *page = module_get_next_page(info); + if (!page) { + retval = -ENOMEM; + goto out_inflate_end; + } + + s.next_out = kmap(page); + s.avail_out = PAGE_SIZE; + rc = zlib_inflate(&s, 0); + kunmap(page); + + new_size += PAGE_SIZE - s.avail_out; + } while (rc == Z_OK); + + if (rc != Z_STREAM_END) { + pr_err("decompression failed with status %d\n", rc); + retval = -EINVAL; + goto out_inflate_end; + } + + retval = new_size; + +out_inflate_end: + zlib_inflateEnd(&s); +out: + kfree(s.workspace); + return retval; +} +#elif CONFIG_MODULE_COMPRESS_XZ +#include <linux/xz.h> +#define MODULE_COMPRESSION xz +#define MODULE_DECOMPRESS_FN module_xz_decompress + +static ssize_t module_xz_decompress(struct load_info *info, + const void *buf, size_t size) +{ + static const u8 signature[] = { 0xfd, '7', 'z', 'X', 'Z', 0 }; + struct xz_dec *xz_dec; + struct xz_buf xz_buf; + enum xz_ret xz_ret; + size_t new_size = 0; + ssize_t retval; + + if (size < sizeof(signature) || + memcmp(buf, signature, sizeof(signature))) { + pr_err("not an xz compressed module\n"); + return -EINVAL; + } + + xz_dec = xz_dec_init(XZ_DYNALLOC, (u32)-1); + if (!xz_dec) + return -ENOMEM; + + xz_buf.in_size = size; + xz_buf.in = buf; + xz_buf.in_pos = 0; + + do { + struct page *page = module_get_next_page(info); + if (!page) { + retval = -ENOMEM; + goto out; + } + + xz_buf.out = kmap(page); + xz_buf.out_pos = 0; + xz_buf.out_size = PAGE_SIZE; + xz_ret = xz_dec_run(xz_dec, &xz_buf); + kunmap(page); + + new_size += xz_buf.out_pos; + } while (xz_buf.out_pos == PAGE_SIZE && xz_ret == XZ_OK); + + if (xz_ret != XZ_STREAM_END) { + pr_err("decompression failed with status %d\n", xz_ret); + retval = -EINVAL; + goto out; + } + + retval = new_size; + + out: + xz_dec_end(xz_dec); + return retval; +} +#else +#error "Unexpected configuration for CONFIG_MODULE_DECOMPRESS" +#endif + +int module_decompress(struct load_info *info, const void *buf, size_t size) +{ + unsigned int n_pages; + ssize_t data_size; + int error; + + /* + * Start with number of pages twice as big as needed for + * compressed data. + */ + n_pages = DIV_ROUND_UP(size, PAGE_SIZE) * 2; + error = module_extend_max_pages(info, n_pages); + + data_size = MODULE_DECOMPRESS_FN(info, buf, size); + if (data_size < 0) { + error = data_size; + goto err; + } + + info->hdr = vmap(info->pages, info->used_pages, VM_MAP, PAGE_KERNEL); + if (!info->hdr) { + error = -ENOMEM; + goto err; + } + + info->len = data_size; + return 0; + +err: + module_decompress_cleanup(info); + return error; +} + +void module_decompress_cleanup(struct load_info *info) +{ + int i; + + if (info->hdr) + vunmap(info->hdr); + + for (i = 0; i < info->used_pages; i++) + __free_page(info->pages[i]); + + kvfree(info->pages); + + info->pages = NULL; + info->max_pages = info->used_pages = 0; +} + +#ifdef CONFIG_SYSFS +static ssize_t compression_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return sysfs_emit(buf, "%s\n", __stringify(MODULE_COMPRESSION)); +} +static struct kobj_attribute module_compression_attr = __ATTR_RO(compression); + +static int __init module_decompress_sysfs_init(void) +{ + int error; + + error = sysfs_create_file(&module_kset->kobj, + &module_compression_attr.attr); + if (error) + pr_warn("Failed to create 'compression' attribute"); + + return 0; +} +late_initcall(module_decompress_sysfs_init); +#endif diff --git a/kernel/padata.c b/kernel/padata.c index 18d3a5c699d8..e5819bb8bd1d 100644 --- a/kernel/padata.c +++ b/kernel/padata.c @@ -181,7 +181,7 @@ int padata_do_parallel(struct padata_shell *ps, goto out; if (!cpumask_test_cpu(*cb_cpu, pd->cpumask.cbcpu)) { - if (!cpumask_weight(pd->cpumask.cbcpu)) + if (cpumask_empty(pd->cpumask.cbcpu)) goto out; /* Select an alternate fallback CPU and notify the caller. */ diff --git a/kernel/panic.c b/kernel/panic.c index cefd7d82366f..eb4dfb932c85 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -32,6 +32,7 @@ #include <linux/bug.h> #include <linux/ratelimit.h> #include <linux/debugfs.h> +#include <trace/events/error_report.h> #include <asm/sections.h> #define PANIC_TIMER_STEP 100 @@ -65,6 +66,7 @@ EXPORT_SYMBOL_GPL(panic_timeout); #define PANIC_PRINT_LOCK_INFO 0x00000008 #define PANIC_PRINT_FTRACE_INFO 0x00000010 #define PANIC_PRINT_ALL_PRINTK_MSG 0x00000020 +#define PANIC_PRINT_ALL_CPU_BT 0x00000040 unsigned long panic_print; ATOMIC_NOTIFIER_HEAD(panic_notifier_list); @@ -146,10 +148,16 @@ void nmi_panic(struct pt_regs *regs, const char *msg) } EXPORT_SYMBOL(nmi_panic); -static void panic_print_sys_info(void) +static void panic_print_sys_info(bool console_flush) { - if (panic_print & PANIC_PRINT_ALL_PRINTK_MSG) - console_flush_on_panic(CONSOLE_REPLAY_ALL); + if (console_flush) { + if (panic_print & PANIC_PRINT_ALL_PRINTK_MSG) + console_flush_on_panic(CONSOLE_REPLAY_ALL); + return; + } + + if (panic_print & PANIC_PRINT_ALL_CPU_BT) + trigger_all_cpu_backtrace(); if (panic_print & PANIC_PRINT_TASK_INFO) show_state(); @@ -184,6 +192,16 @@ void panic(const char *fmt, ...) int old_cpu, this_cpu; bool _crash_kexec_post_notifiers = crash_kexec_post_notifiers; + if (panic_on_warn) { + /* + * This thread may hit another WARN() in the panic path. + * Resetting this prevents additional WARN() from panicking the + * system on this thread. Other threads are blocked by the + * panic_mutex in panic(). + */ + panic_on_warn = 0; + } + /* * Disable local interrupts. This will prevent panic_smp_self_stop * from deadlocking the first cpu that invokes the panic, since @@ -271,6 +289,8 @@ void panic(const char *fmt, ...) */ atomic_notifier_call_chain(&panic_notifier_list, 0, buf); + panic_print_sys_info(false); + kmsg_dump(KMSG_DUMP_PANIC); /* @@ -301,7 +321,7 @@ void panic(const char *fmt, ...) debug_locks_off(); console_flush_on_panic(CONSOLE_FLUSH_PENDING); - panic_print_sys_info(); + panic_print_sys_info(true); if (!panic_blink) panic_blink = no_blink; @@ -533,26 +553,9 @@ void oops_enter(void) trigger_all_cpu_backtrace(); } -/* - * 64-bit random ID for oopses: - */ -static u64 oops_id; - -static int init_oops_id(void) -{ - if (!oops_id) - get_random_bytes(&oops_id, sizeof(oops_id)); - else - oops_id++; - - return 0; -} -late_initcall(init_oops_id); - static void print_oops_end_marker(void) { - init_oops_id(); - pr_warn("---[ end trace %016llx ]---\n", (unsigned long long)oops_id); + pr_warn("---[ end trace %016llx ]---\n", 0ULL); } /* @@ -592,16 +595,8 @@ void __warn(const char *file, int line, void *caller, unsigned taint, if (regs) show_regs(regs); - if (panic_on_warn) { - /* - * This thread may hit another WARN() in the panic path. - * Resetting this prevents additional WARN() from panicking the - * system on this thread. Other threads are blocked by the - * panic_mutex in panic(). - */ - panic_on_warn = 0; + if (panic_on_warn) panic("panic_on_warn set ...\n"); - } if (!regs) dump_stack(); @@ -609,6 +604,7 @@ void __warn(const char *file, int line, void *caller, unsigned taint, print_irqtrace_events(current); print_oops_end_marker(); + trace_error_report_end(ERROR_DETECTOR_WARN, (unsigned long)caller); /* Just a warning, don't kill lockdep. */ add_taint(taint, LOCKDEP_STILL_OK); diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index e6af502c2fd7..938d5c78b421 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c @@ -28,7 +28,6 @@ #include <linux/gfp.h> #include <linux/syscore_ops.h> #include <linux/ctype.h> -#include <linux/genhd.h> #include <linux/ktime.h> #include <linux/security.h> #include <linux/secretmem.h> @@ -689,8 +688,10 @@ static int load_image_and_restore(void) lock_device_hotplug(); error = create_basic_memory_bitmaps(); - if (error) + if (error) { + swsusp_close(FMODE_READ | FMODE_EXCL); goto Unlock; + } error = swsusp_read(&flags); swsusp_close(FMODE_READ | FMODE_EXCL); @@ -1328,7 +1329,7 @@ static int __init resumedelay_setup(char *str) int rc = kstrtouint(str, 0, &resume_delay); if (rc) - return rc; + pr_warn("resumedelay: bad option string '%s'\n", str); return 1; } diff --git a/kernel/power/main.c b/kernel/power/main.c index 44169f3081fd..7e646079fbeb 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c @@ -504,7 +504,10 @@ static ssize_t pm_wakeup_irq_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { - return pm_wakeup_irq ? sprintf(buf, "%u\n", pm_wakeup_irq) : -ENODATA; + if (!pm_wakeup_irq()) + return -ENODATA; + + return sprintf(buf, "%u\n", pm_wakeup_irq()); } power_attr_ro(pm_wakeup_irq); diff --git a/kernel/power/process.c b/kernel/power/process.c index b7e7798637b8..11b570fcf049 100644 --- a/kernel/power/process.c +++ b/kernel/power/process.c @@ -134,7 +134,7 @@ int freeze_processes(void) if (!pm_freezing) atomic_inc(&system_freezing_cnt); - pm_wakeup_clear(true); + pm_wakeup_clear(0); pr_info("Freezing user space processes ... "); pm_freezing = true; error = try_to_freeze_tasks(true); diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index f7a986078213..330d49937692 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -978,8 +978,7 @@ static void memory_bm_recycle(struct memory_bitmap *bm) * Register a range of page frames the contents of which should not be saved * during hibernation (to be used in the early initialization code). */ -void __init __register_nosave_region(unsigned long start_pfn, - unsigned long end_pfn, int use_kmalloc) +void __init register_nosave_region(unsigned long start_pfn, unsigned long end_pfn) { struct nosave_region *region; @@ -995,18 +994,12 @@ void __init __register_nosave_region(unsigned long start_pfn, goto Report; } } - if (use_kmalloc) { - /* During init, this shouldn't fail */ - region = kmalloc(sizeof(struct nosave_region), GFP_KERNEL); - BUG_ON(!region); - } else { - /* This allocation cannot fail */ - region = memblock_alloc(sizeof(struct nosave_region), - SMP_CACHE_BYTES); - if (!region) - panic("%s: Failed to allocate %zu bytes\n", __func__, - sizeof(struct nosave_region)); - } + /* This allocation cannot fail */ + region = memblock_alloc(sizeof(struct nosave_region), + SMP_CACHE_BYTES); + if (!region) + panic("%s: Failed to allocate %zu bytes\n", __func__, + sizeof(struct nosave_region)); region->start_pfn = start_pfn; region->end_pfn = end_pfn; list_add_tail(®ion->list, &nosave_regions); diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index 80cc1f0f502b..6fcdee7e87a5 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c @@ -136,8 +136,6 @@ static void s2idle_loop(void) break; } - pm_wakeup_clear(false); - s2idle_enter(); } diff --git a/kernel/power/suspend_test.c b/kernel/power/suspend_test.c index d20526c5be15..b663a97f5867 100644 --- a/kernel/power/suspend_test.c +++ b/kernel/power/suspend_test.c @@ -157,22 +157,22 @@ static int __init setup_test_suspend(char *value) value++; suspend_type = strsep(&value, ","); if (!suspend_type) - return 0; + return 1; repeat = strsep(&value, ","); if (repeat) { if (kstrtou32(repeat, 0, &test_repeat_count_max)) - return 0; + return 1; } for (i = PM_SUSPEND_MIN; i < PM_SUSPEND_MAX; i++) if (!strcmp(pm_labels[i], suspend_type)) { test_state_label = pm_labels[i]; - return 0; + return 1; } printk(warn_bad_state, suspend_type); - return 0; + return 1; } __setup("test_suspend", setup_test_suspend); diff --git a/kernel/power/swap.c b/kernel/power/swap.c index ad10359030a4..91fffdd2c7fb 100644 --- a/kernel/power/swap.c +++ b/kernel/power/swap.c @@ -16,7 +16,6 @@ #include <linux/file.h> #include <linux/delay.h> #include <linux/bitops.h> -#include <linux/genhd.h> #include <linux/device.h> #include <linux/bio.h> #include <linux/blkdev.h> @@ -89,7 +88,7 @@ struct swap_map_page_list { struct swap_map_page_list *next; }; -/** +/* * The swap_map_handle structure is used for handling swap in * a file-alike way */ @@ -117,7 +116,7 @@ struct swsusp_header { static struct swsusp_header *swsusp_header; -/** +/* * The following functions are used for tracing the allocated * swap pages, so that they can be freed in case of an error. */ @@ -171,7 +170,7 @@ static int swsusp_extents_insert(unsigned long swap_offset) return 0; } -/** +/* * alloc_swapdev_block - allocate a swap page and register that it has * been allocated, so that it can be freed in case of an error. */ @@ -190,7 +189,7 @@ sector_t alloc_swapdev_block(int swap) return 0; } -/** +/* * free_all_swap_pages - free swap pages allocated for saving image data. * It also frees the extents used to register which swap entries had been * allocated. @@ -277,10 +276,9 @@ static int hib_submit_io(int op, int op_flags, pgoff_t page_off, void *addr, struct bio *bio; int error = 0; - bio = bio_alloc(GFP_NOIO | __GFP_HIGH, 1); + bio = bio_alloc(hib_resume_bdev, 1, op | op_flags, + GFP_NOIO | __GFP_HIGH); bio->bi_iter.bi_sector = page_off * (PAGE_SIZE >> 9); - bio_set_dev(bio, hib_resume_bdev); - bio_set_op_attrs(bio, op, op_flags); if (bio_add_page(bio, page, PAGE_SIZE, 0) < PAGE_SIZE) { pr_err("Adding page to bio failed at %llu\n", diff --git a/kernel/power/wakelock.c b/kernel/power/wakelock.c index 105df4dfc783..52571dcad768 100644 --- a/kernel/power/wakelock.c +++ b/kernel/power/wakelock.c @@ -39,23 +39,20 @@ ssize_t pm_show_wakelocks(char *buf, bool show_active) { struct rb_node *node; struct wakelock *wl; - char *str = buf; - char *end = buf + PAGE_SIZE; + int len = 0; mutex_lock(&wakelocks_lock); for (node = rb_first(&wakelocks_tree); node; node = rb_next(node)) { wl = rb_entry(node, struct wakelock, node); if (wl->ws->active == show_active) - str += scnprintf(str, end - str, "%s ", wl->name); + len += sysfs_emit_at(buf, len, "%s ", wl->name); } - if (str > buf) - str--; - str += scnprintf(str, end - str, "\n"); + len += sysfs_emit_at(buf, len, "\n"); mutex_unlock(&wakelocks_lock); - return (str - buf); + return len; } #if CONFIG_PM_WAKELOCKS_LIMIT > 0 diff --git a/kernel/printk/Makefile b/kernel/printk/Makefile index d118739874c0..f5b388e810b9 100644 --- a/kernel/printk/Makefile +++ b/kernel/printk/Makefile @@ -2,5 +2,8 @@ obj-y = printk.o obj-$(CONFIG_PRINTK) += printk_safe.o obj-$(CONFIG_A11Y_BRAILLE_CONSOLE) += braille.o -obj-$(CONFIG_PRINTK) += printk_ringbuffer.o obj-$(CONFIG_PRINTK_INDEX) += index.o + +obj-$(CONFIG_PRINTK) += printk_support.o +printk_support-y := printk_ringbuffer.o +printk_support-$(CONFIG_SYSCTL) += sysctl.o diff --git a/kernel/printk/internal.h b/kernel/printk/internal.h index 9f3ed2fdb721..d947ca6c84f9 100644 --- a/kernel/printk/internal.h +++ b/kernel/printk/internal.h @@ -4,6 +4,14 @@ */ #include <linux/percpu.h> +#if defined(CONFIG_PRINTK) && defined(CONFIG_SYSCTL) +void __init printk_sysctl_init(void); +int devkmsg_sysctl_set_loglvl(struct ctl_table *table, int write, + void *buffer, size_t *lenp, loff_t *ppos); +#else +#define printk_sysctl_init() do { } while (0) +#endif + #ifdef CONFIG_PRINTK /* Flags for a single printk record. */ diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 155229f0cf0f..da03c15ecc89 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -93,6 +93,12 @@ EXPORT_SYMBOL_GPL(console_drivers); */ int __read_mostly suppress_printk; +/* + * During panic, heavy printk by other CPUs can delay the + * panic and risk deadlock on console resources. + */ +static int __read_mostly suppress_panic_printk; + #ifdef CONFIG_LOCKDEP static struct lockdep_map console_lock_dep_map = { .name = "console_lock" @@ -146,8 +152,10 @@ static int __control_devkmsg(char *str) static int __init control_devkmsg(char *str) { - if (__control_devkmsg(str) < 0) + if (__control_devkmsg(str) < 0) { + pr_warn("printk.devkmsg: bad option string '%s'\n", str); return 1; + } /* * Set sysctl string accordingly: @@ -166,12 +174,12 @@ static int __init control_devkmsg(char *str) */ devkmsg_log |= DEVKMSG_LOG_MASK_LOCK; - return 0; + return 1; } __setup("printk.devkmsg=", control_devkmsg); char devkmsg_log_str[DEVKMSG_STR_MAX_SIZE] = "ratelimit"; - +#if defined(CONFIG_PRINTK) && defined(CONFIG_SYSCTL) int devkmsg_sysctl_set_loglvl(struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { @@ -210,6 +218,7 @@ int devkmsg_sysctl_set_loglvl(struct ctl_table *table, int write, return 0; } +#endif /* CONFIG_PRINTK && CONFIG_SYSCTL */ /* Number of registered extended console drivers. */ static int nr_ext_console_drivers; @@ -256,6 +265,11 @@ static void __up_console_sem(unsigned long ip) } #define up_console_sem() __up_console_sem(_RET_IP_) +static bool panic_in_progress(void) +{ + return unlikely(atomic_read(&panic_cpu) != PANIC_CPU_INVALID); +} + /* * This is used for debugging the mess that is the VT code by * keeping track if we have the console semaphore held. It's @@ -1842,6 +1856,16 @@ static int console_trylock_spinning(void) if (console_trylock()) return 1; + /* + * It's unsafe to spin once a panic has begun. If we are the + * panic CPU, we may have already halted the owner of the + * console_sem. If we are not the panic CPU, then we should + * avoid taking console_sem, so the panic CPU has a better + * chance of cleanly acquiring it later. + */ + if (panic_in_progress()) + return 0; + printk_safe_enter_irqsave(flags); raw_spin_lock(&console_owner_lock); @@ -2217,6 +2241,10 @@ asmlinkage int vprintk_emit(int facility, int level, if (unlikely(suppress_printk)) return 0; + if (unlikely(suppress_panic_printk) && + atomic_read(&panic_cpu) != raw_smp_processor_id()) + return 0; + if (level == LOGLEVEL_SCHED) { level = LOGLEVEL_DEFAULT; in_sched = true; @@ -2323,6 +2351,20 @@ asmlinkage __visible void early_printk(const char *fmt, ...) } #endif +static void set_user_specified(struct console_cmdline *c, bool user_specified) +{ + if (!user_specified) + return; + + /* + * @c console was defined by the user on the command line. + * Do not clear when added twice also by SPCR or the device tree. + */ + c->user_specified = true; + /* At least one console defined by the user on the command line. */ + console_set_on_cmdline = 1; +} + static int __add_preferred_console(char *name, int idx, char *options, char *brl_options, bool user_specified) { @@ -2339,8 +2381,7 @@ static int __add_preferred_console(char *name, int idx, char *options, if (strcmp(c->name, name) == 0 && c->index == idx) { if (!brl_options) preferred_console = i; - if (user_specified) - c->user_specified = true; + set_user_specified(c, user_specified); return 0; } } @@ -2350,7 +2391,7 @@ static int __add_preferred_console(char *name, int idx, char *options, preferred_console = i; strlcpy(c->name, name, sizeof(c->name)); c->options = options; - c->user_specified = user_specified; + set_user_specified(c, user_specified); braille_set_options(c, brl_options); c->index = idx; @@ -2416,7 +2457,6 @@ static int __init console_setup(char *str) *s = 0; __add_preferred_console(buf, idx, options, brl_options, true); - console_set_on_cmdline = 1; return 1; } __setup("console=", console_setup); @@ -2573,6 +2613,25 @@ static int have_callable_console(void) } /* + * Return true when this CPU should unlock console_sem without pushing all + * messages to the console. This reduces the chance that the console is + * locked when the panic CPU tries to use it. + */ +static bool abandon_console_lock_in_panic(void) +{ + if (!panic_in_progress()) + return false; + + /* + * We can use raw_smp_processor_id() here because it is impossible for + * the task to be migrated to the panic_cpu, or away from it. If + * panic_cpu has already been set, and we're not currently executing on + * that CPU, then we never will be. + */ + return atomic_read(&panic_cpu) != raw_smp_processor_id(); +} + +/* * Can we actually use the console at this time on this cpu? * * Console drivers may assume that per-cpu resources have been allocated. So @@ -2602,6 +2661,7 @@ void console_unlock(void) { static char ext_text[CONSOLE_EXT_LOG_MAX]; static char text[CONSOLE_LOG_MAX]; + static int panic_console_dropped; unsigned long flags; bool do_cond_resched, retry; struct printk_info info; @@ -2656,6 +2716,10 @@ skip: if (console_seq != r.info->seq) { console_dropped += r.info->seq - console_seq; console_seq = r.info->seq; + if (panic_in_progress() && panic_console_dropped++ > 10) { + suppress_panic_printk = 1; + pr_warn_once("Too many dropped messages. Suppress messages on non-panic CPUs to prevent livelock.\n"); + } } if (suppress_message_printing(r.info->level)) { @@ -2715,6 +2779,10 @@ skip: if (handover) return; + /* Allow panic_cpu to take over the consoles safely */ + if (abandon_console_lock_in_panic()) + break; + if (do_cond_resched) cond_resched(); } @@ -2732,7 +2800,7 @@ skip: * flush, no worries. */ retry = prb_read_valid(prb, next_seq, NULL); - if (retry && console_trylock()) + if (retry && !abandon_console_lock_in_panic() && console_trylock()) goto again; } EXPORT_SYMBOL(console_unlock); @@ -3211,6 +3279,7 @@ static int __init printk_late_init(void) ret = cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN, "printk:online", console_cpu_notify, NULL); WARN_ON(ret < 0); + printk_sysctl_init(); return 0; } late_initcall(printk_late_init); @@ -3226,7 +3295,7 @@ static DEFINE_PER_CPU(int, printk_pending); static void wake_up_klogd_work_func(struct irq_work *irq_work) { - int pending = __this_cpu_xchg(printk_pending, 0); + int pending = this_cpu_xchg(printk_pending, 0); if (pending & PRINTK_PENDING_OUTPUT) { /* If trylock fails, someone else is doing the printing */ @@ -3260,7 +3329,7 @@ void defer_console_output(void) return; preempt_disable(); - __this_cpu_or(printk_pending, PRINTK_PENDING_OUTPUT); + this_cpu_or(printk_pending, PRINTK_PENDING_OUTPUT); irq_work_queue(this_cpu_ptr(&wake_up_klogd_work)); preempt_enable(); } diff --git a/kernel/printk/printk_ringbuffer.c b/kernel/printk/printk_ringbuffer.c index 8a7b7362c0dd..2b7b6ddab4f7 100644 --- a/kernel/printk/printk_ringbuffer.c +++ b/kernel/printk/printk_ringbuffer.c @@ -474,8 +474,10 @@ static enum desc_state desc_read(struct prb_desc_ring *desc_ring, * state has been re-checked. A memcpy() for all of @desc * cannot be used because of the atomic_t @state_var field. */ - memcpy(&desc_out->text_blk_lpos, &desc->text_blk_lpos, - sizeof(desc_out->text_blk_lpos)); /* LMM(desc_read:C) */ + if (desc_out) { + memcpy(&desc_out->text_blk_lpos, &desc->text_blk_lpos, + sizeof(desc_out->text_blk_lpos)); /* LMM(desc_read:C) */ + } if (seq_out) *seq_out = info->seq; /* also part of desc_read:C */ if (caller_id_out) @@ -528,7 +530,8 @@ static enum desc_state desc_read(struct prb_desc_ring *desc_ring, state_val = atomic_long_read(state_var); /* LMM(desc_read:E) */ d_state = get_desc_state(id, state_val); out: - atomic_long_set(&desc_out->state_var, state_val); + if (desc_out) + atomic_long_set(&desc_out->state_var, state_val); return d_state; } @@ -1449,6 +1452,9 @@ static void desc_make_final(struct prb_desc_ring *desc_ring, unsigned long id) atomic_long_cmpxchg_relaxed(&d->state_var, prev_state_val, DESC_SV(id, desc_finalized)); /* LMM(desc_make_final:A) */ + + /* Best effort to remember the last finalized @id. */ + atomic_long_set(&desc_ring->last_finalized_id, id); } /** @@ -1657,7 +1663,12 @@ void prb_commit(struct prb_reserved_entry *e) */ void prb_final_commit(struct prb_reserved_entry *e) { + struct prb_desc_ring *desc_ring = &e->rb->desc_ring; + _prb_commit(e, desc_finalized); + + /* Best effort to remember the last finalized @id. */ + atomic_long_set(&desc_ring->last_finalized_id, e->id); } /* @@ -2005,9 +2016,39 @@ u64 prb_first_valid_seq(struct printk_ringbuffer *rb) */ u64 prb_next_seq(struct printk_ringbuffer *rb) { - u64 seq = 0; + struct prb_desc_ring *desc_ring = &rb->desc_ring; + enum desc_state d_state; + unsigned long id; + u64 seq; + + /* Check if the cached @id still points to a valid @seq. */ + id = atomic_long_read(&desc_ring->last_finalized_id); + d_state = desc_read(desc_ring, id, NULL, &seq, NULL); - /* Search forward from the oldest descriptor. */ + if (d_state == desc_finalized || d_state == desc_reusable) { + /* + * Begin searching after the last finalized record. + * + * On 0, the search must begin at 0 because of hack#2 + * of the bootstrapping phase it is not known if a + * record at index 0 exists. + */ + if (seq != 0) + seq++; + } else { + /* + * The information about the last finalized sequence number + * has gone. It should happen only when there is a flood of + * new messages and the ringbuffer is rapidly recycled. + * Give up and start from the beginning. + */ + seq = 0; + } + + /* + * The information about the last finalized @seq might be inaccurate. + * Search forward to find the current one. + */ while (_prb_read_valid(rb, &seq, NULL, NULL)) seq++; @@ -2044,6 +2085,7 @@ void prb_init(struct printk_ringbuffer *rb, rb->desc_ring.infos = infos; atomic_long_set(&rb->desc_ring.head_id, DESC0_ID(descbits)); atomic_long_set(&rb->desc_ring.tail_id, DESC0_ID(descbits)); + atomic_long_set(&rb->desc_ring.last_finalized_id, DESC0_ID(descbits)); rb->text_data_ring.size_bits = textbits; rb->text_data_ring.data = text_buf; diff --git a/kernel/printk/printk_ringbuffer.h b/kernel/printk/printk_ringbuffer.h index 73cc80e01cef..18cd25e489b8 100644 --- a/kernel/printk/printk_ringbuffer.h +++ b/kernel/printk/printk_ringbuffer.h @@ -75,6 +75,7 @@ struct prb_desc_ring { struct printk_info *infos; atomic_long_t head_id; atomic_long_t tail_id; + atomic_long_t last_finalized_id; }; /* @@ -258,6 +259,7 @@ static struct printk_ringbuffer name = { \ .infos = &_##name##_infos[0], \ .head_id = ATOMIC_INIT(DESC0_ID(descbits)), \ .tail_id = ATOMIC_INIT(DESC0_ID(descbits)), \ + .last_finalized_id = ATOMIC_INIT(DESC0_ID(descbits)), \ }, \ .text_data_ring = { \ .size_bits = (avgtextbits) + (descbits), \ diff --git a/kernel/printk/sysctl.c b/kernel/printk/sysctl.c new file mode 100644 index 000000000000..c228343eeb97 --- /dev/null +++ b/kernel/printk/sysctl.c @@ -0,0 +1,85 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * sysctl.c: General linux system control interface + */ + +#include <linux/sysctl.h> +#include <linux/printk.h> +#include <linux/capability.h> +#include <linux/ratelimit.h> +#include "internal.h" + +static const int ten_thousand = 10000; + +static int proc_dointvec_minmax_sysadmin(struct ctl_table *table, int write, + void *buffer, size_t *lenp, loff_t *ppos) +{ + if (write && !capable(CAP_SYS_ADMIN)) + return -EPERM; + + return proc_dointvec_minmax(table, write, buffer, lenp, ppos); +} + +static struct ctl_table printk_sysctls[] = { + { + .procname = "printk", + .data = &console_loglevel, + .maxlen = 4*sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "printk_ratelimit", + .data = &printk_ratelimit_state.interval, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_jiffies, + }, + { + .procname = "printk_ratelimit_burst", + .data = &printk_ratelimit_state.burst, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "printk_delay", + .data = &printk_delay_msec, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = (void *)&ten_thousand, + }, + { + .procname = "printk_devkmsg", + .data = devkmsg_log_str, + .maxlen = DEVKMSG_STR_MAX_SIZE, + .mode = 0644, + .proc_handler = devkmsg_sysctl_set_loglvl, + }, + { + .procname = "dmesg_restrict", + .data = &dmesg_restrict, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax_sysadmin, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, + { + .procname = "kptr_restrict", + .data = &kptr_restrict, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax_sysadmin, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_TWO, + }, + {} +}; + +void __init printk_sysctl_init(void) +{ + register_sysctl_init("kernel", printk_sysctls); +} diff --git a/kernel/profile.c b/kernel/profile.c index eb9c7f0f5ac5..37640a0bd8a3 100644 --- a/kernel/profile.c +++ b/kernel/profile.c @@ -133,79 +133,6 @@ int __ref profile_init(void) return -ENOMEM; } -/* Profile event notifications */ - -static BLOCKING_NOTIFIER_HEAD(task_exit_notifier); -static ATOMIC_NOTIFIER_HEAD(task_free_notifier); -static BLOCKING_NOTIFIER_HEAD(munmap_notifier); - -void profile_task_exit(struct task_struct *task) -{ - blocking_notifier_call_chain(&task_exit_notifier, 0, task); -} - -int profile_handoff_task(struct task_struct *task) -{ - int ret; - ret = atomic_notifier_call_chain(&task_free_notifier, 0, task); - return (ret == NOTIFY_OK) ? 1 : 0; -} - -void profile_munmap(unsigned long addr) -{ - blocking_notifier_call_chain(&munmap_notifier, 0, (void *)addr); -} - -int task_handoff_register(struct notifier_block *n) -{ - return atomic_notifier_chain_register(&task_free_notifier, n); -} -EXPORT_SYMBOL_GPL(task_handoff_register); - -int task_handoff_unregister(struct notifier_block *n) -{ - return atomic_notifier_chain_unregister(&task_free_notifier, n); -} -EXPORT_SYMBOL_GPL(task_handoff_unregister); - -int profile_event_register(enum profile_type type, struct notifier_block *n) -{ - int err = -EINVAL; - - switch (type) { - case PROFILE_TASK_EXIT: - err = blocking_notifier_chain_register( - &task_exit_notifier, n); - break; - case PROFILE_MUNMAP: - err = blocking_notifier_chain_register( - &munmap_notifier, n); - break; - } - - return err; -} -EXPORT_SYMBOL_GPL(profile_event_register); - -int profile_event_unregister(enum profile_type type, struct notifier_block *n) -{ - int err = -EINVAL; - - switch (type) { - case PROFILE_TASK_EXIT: - err = blocking_notifier_chain_unregister( - &task_exit_notifier, n); - break; - case PROFILE_MUNMAP: - err = blocking_notifier_chain_unregister( - &munmap_notifier, n); - break; - } - - return err; -} -EXPORT_SYMBOL_GPL(profile_event_unregister); - #if defined(CONFIG_SMP) && defined(CONFIG_PROC_FS) /* * Each cpu has a pair of open-addressed hashtables for pending diff --git a/kernel/ptrace.c b/kernel/ptrace.c index f8589bf8d7dc..eea265082e97 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -419,8 +419,6 @@ static int ptrace_attach(struct task_struct *task, long request, if (task->ptrace) goto unlock_tasklist; - if (seize) - flags |= PT_SEIZED; task->ptrace = flags; ptrace_link(task, current); diff --git a/kernel/rcu/rcu_segcblist.h b/kernel/rcu/rcu_segcblist.h index e373fbe44da5..431cee212467 100644 --- a/kernel/rcu/rcu_segcblist.h +++ b/kernel/rcu/rcu_segcblist.h @@ -56,13 +56,13 @@ static inline long rcu_segcblist_n_cbs(struct rcu_segcblist *rsclp) static inline void rcu_segcblist_set_flags(struct rcu_segcblist *rsclp, int flags) { - rsclp->flags |= flags; + WRITE_ONCE(rsclp->flags, rsclp->flags | flags); } static inline void rcu_segcblist_clear_flags(struct rcu_segcblist *rsclp, int flags) { - rsclp->flags &= ~flags; + WRITE_ONCE(rsclp->flags, rsclp->flags & ~flags); } static inline bool rcu_segcblist_test_flags(struct rcu_segcblist *rsclp, diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 422f7e4cc08d..55d049c39608 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -284,7 +284,7 @@ static atomic_t barrier_cbs_invoked; /* Barrier callbacks invoked. */ static wait_queue_head_t *barrier_cbs_wq; /* Coordinate barrier testing. */ static DECLARE_WAIT_QUEUE_HEAD(barrier_wq); -static bool rcu_fwd_cb_nodelay; /* Short rcu_torture_delay() delays. */ +static atomic_t rcu_fwd_cb_nodelay; /* Short rcu_torture_delay() delays. */ /* * Allocate an element from the rcu_tortures pool. @@ -387,7 +387,7 @@ rcu_read_delay(struct torture_random_state *rrsp, struct rt_read_seg *rtrsp) * period, and we want a long delay occasionally to trigger * force_quiescent_state. */ - if (!READ_ONCE(rcu_fwd_cb_nodelay) && + if (!atomic_read(&rcu_fwd_cb_nodelay) && !(torture_random(rrsp) % (nrealreaders * 2000 * longdelay_ms))) { started = cur_ops->get_gp_seq(); ts = rcu_trace_clock_local(); @@ -674,6 +674,7 @@ static struct rcu_torture_ops srcu_ops = { .call = srcu_torture_call, .cb_barrier = srcu_torture_barrier, .stats = srcu_torture_stats, + .cbflood_max = 50000, .irq_capable = 1, .no_pi_lock = IS_ENABLED(CONFIG_TINY_SRCU), .name = "srcu" @@ -708,6 +709,7 @@ static struct rcu_torture_ops srcud_ops = { .call = srcu_torture_call, .cb_barrier = srcu_torture_barrier, .stats = srcu_torture_stats, + .cbflood_max = 50000, .irq_capable = 1, .no_pi_lock = IS_ENABLED(CONFIG_TINY_SRCU), .name = "srcud" @@ -997,7 +999,7 @@ static int rcu_torture_boost(void *arg) goto checkwait; /* Wait for the next test interval. */ - oldstarttime = boost_starttime; + oldstarttime = READ_ONCE(boost_starttime); while (time_before(jiffies, oldstarttime)) { schedule_timeout_interruptible(oldstarttime - jiffies); if (stutter_wait("rcu_torture_boost")) @@ -1041,10 +1043,11 @@ static int rcu_torture_boost(void *arg) * interval. Besides, we are running at RT priority, * so delays should be relatively rare. */ - while (oldstarttime == boost_starttime && !kthread_should_stop()) { + while (oldstarttime == READ_ONCE(boost_starttime) && !kthread_should_stop()) { if (mutex_trylock(&boost_mutex)) { if (oldstarttime == boost_starttime) { - boost_starttime = jiffies + test_boost_interval * HZ; + WRITE_ONCE(boost_starttime, + jiffies + test_boost_interval * HZ); n_rcu_torture_boosts++; } mutex_unlock(&boost_mutex); @@ -1276,7 +1279,7 @@ rcu_torture_writer(void *arg) boot_ended = rcu_inkernel_boot_has_ended(); stutter_waited = stutter_wait("rcu_torture_writer"); if (stutter_waited && - !READ_ONCE(rcu_fwd_cb_nodelay) && + !atomic_read(&rcu_fwd_cb_nodelay) && !cur_ops->slow_gps && !torture_must_stop() && boot_ended) @@ -2180,7 +2183,6 @@ static void rcu_torture_fwd_cb_hist(struct rcu_fwd *rfp) for (i = ARRAY_SIZE(rfp->n_launders_hist) - 1; i > 0; i--) if (rfp->n_launders_hist[i].n_launders > 0) break; - mutex_lock(&rcu_fwd_mutex); // Serialize histograms. pr_alert("%s: Callback-invocation histogram %d (duration %lu jiffies):", __func__, rfp->rcu_fwd_id, jiffies - rfp->rcu_fwd_startat); gps_old = rfp->rcu_launder_gp_seq_start; @@ -2193,7 +2195,6 @@ static void rcu_torture_fwd_cb_hist(struct rcu_fwd *rfp) gps_old = gps; } pr_cont("\n"); - mutex_unlock(&rcu_fwd_mutex); } /* Callback function for continuous-flood RCU callbacks. */ @@ -2281,6 +2282,7 @@ static void rcu_torture_fwd_prog_nr(struct rcu_fwd *rfp, unsigned long stopat; static DEFINE_TORTURE_RANDOM(trs); + pr_alert("%s: Starting forward-progress test %d\n", __func__, rfp->rcu_fwd_id); if (!cur_ops->sync) return; // Cannot do need_resched() forward progress testing without ->sync. if (cur_ops->call && cur_ops->cb_barrier) { @@ -2289,7 +2291,7 @@ static void rcu_torture_fwd_prog_nr(struct rcu_fwd *rfp, } /* Tight loop containing cond_resched(). */ - WRITE_ONCE(rcu_fwd_cb_nodelay, true); + atomic_inc(&rcu_fwd_cb_nodelay); cur_ops->sync(); /* Later readers see above write. */ if (selfpropcb) { WRITE_ONCE(fcs.stop, 0); @@ -2325,6 +2327,7 @@ static void rcu_torture_fwd_prog_nr(struct rcu_fwd *rfp, if (selfpropcb) { WRITE_ONCE(fcs.stop, 1); cur_ops->sync(); /* Wait for running CB to complete. */ + pr_alert("%s: Waiting for CBs: %pS() %d\n", __func__, cur_ops->cb_barrier, rfp->rcu_fwd_id); cur_ops->cb_barrier(); /* Wait for queued callbacks. */ } @@ -2333,7 +2336,7 @@ static void rcu_torture_fwd_prog_nr(struct rcu_fwd *rfp, destroy_rcu_head_on_stack(&fcs.rh); } schedule_timeout_uninterruptible(HZ / 10); /* Let kthreads recover. */ - WRITE_ONCE(rcu_fwd_cb_nodelay, false); + atomic_dec(&rcu_fwd_cb_nodelay); } /* Carry out call_rcu() forward-progress testing. */ @@ -2353,13 +2356,14 @@ static void rcu_torture_fwd_prog_cr(struct rcu_fwd *rfp) unsigned long stopat; unsigned long stoppedat; + pr_alert("%s: Starting forward-progress test %d\n", __func__, rfp->rcu_fwd_id); if (READ_ONCE(rcu_fwd_emergency_stop)) return; /* Get out of the way quickly, no GP wait! */ if (!cur_ops->call) return; /* Can't do call_rcu() fwd prog without ->call. */ /* Loop continuously posting RCU callbacks. */ - WRITE_ONCE(rcu_fwd_cb_nodelay, true); + atomic_inc(&rcu_fwd_cb_nodelay); cur_ops->sync(); /* Later readers see above write. */ WRITE_ONCE(rfp->rcu_fwd_startat, jiffies); stopat = rfp->rcu_fwd_startat + MAX_FWD_CB_JIFFIES; @@ -2414,6 +2418,7 @@ static void rcu_torture_fwd_prog_cr(struct rcu_fwd *rfp) n_launders_cb_snap = READ_ONCE(rfp->n_launders_cb); cver = READ_ONCE(rcu_torture_current_version) - cver; gps = rcutorture_seq_diff(cur_ops->get_gp_seq(), gps); + pr_alert("%s: Waiting for CBs: %pS() %d\n", __func__, cur_ops->cb_barrier, rfp->rcu_fwd_id); cur_ops->cb_barrier(); /* Wait for callbacks to be invoked. */ (void)rcu_torture_fwd_prog_cbfree(rfp); @@ -2427,11 +2432,13 @@ static void rcu_torture_fwd_prog_cr(struct rcu_fwd *rfp) n_launders, n_launders_sa, n_max_gps, n_max_cbs, cver, gps); atomic_long_add(n_max_cbs, &rcu_fwd_max_cbs); + mutex_lock(&rcu_fwd_mutex); // Serialize histograms. rcu_torture_fwd_cb_hist(rfp); + mutex_unlock(&rcu_fwd_mutex); } schedule_timeout_uninterruptible(HZ); /* Let CBs drain. */ tick_dep_clear_task(current, TICK_DEP_BIT_RCU); - WRITE_ONCE(rcu_fwd_cb_nodelay, false); + atomic_dec(&rcu_fwd_cb_nodelay); } @@ -2511,7 +2518,7 @@ static int rcu_torture_fwd_prog(void *args) firsttime = false; WRITE_ONCE(rcu_fwd_seq, rcu_fwd_seq + 1); } else { - while (READ_ONCE(rcu_fwd_seq) == oldseq) + while (READ_ONCE(rcu_fwd_seq) == oldseq && !torture_must_stop()) schedule_timeout_interruptible(1); oldseq = READ_ONCE(rcu_fwd_seq); } @@ -2905,8 +2912,10 @@ rcu_torture_cleanup(void) int i; if (torture_cleanup_begin()) { - if (cur_ops->cb_barrier != NULL) + if (cur_ops->cb_barrier != NULL) { + pr_info("%s: Invoking %pS().\n", __func__, cur_ops->cb_barrier); cur_ops->cb_barrier(); + } return; } if (!cur_ops) { @@ -2961,8 +2970,10 @@ rcu_torture_cleanup(void) * Wait for all RCU callbacks to fire, then do torture-type-specific * cleanup operations. */ - if (cur_ops->cb_barrier != NULL) + if (cur_ops->cb_barrier != NULL) { + pr_info("%s: Invoking %pS().\n", __func__, cur_ops->cb_barrier); cur_ops->cb_barrier(); + } if (cur_ops->cleanup != NULL) cur_ops->cleanup(); diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h index 84f1d91604cc..99cf3a13954c 100644 --- a/kernel/rcu/tasks.h +++ b/kernel/rcu/tasks.h @@ -123,7 +123,7 @@ static struct rcu_tasks rt_name = \ .call_func = call, \ .rtpcpu = &rt_name ## __percpu, \ .name = n, \ - .percpu_enqueue_shift = ilog2(CONFIG_NR_CPUS), \ + .percpu_enqueue_shift = order_base_2(CONFIG_NR_CPUS), \ .percpu_enqueue_lim = 1, \ .percpu_dequeue_lim = 1, \ .barrier_q_mutex = __MUTEX_INITIALIZER(rt_name.barrier_q_mutex), \ @@ -216,6 +216,7 @@ static void cblist_init_generic(struct rcu_tasks *rtp) int cpu; unsigned long flags; int lim; + int shift; raw_spin_lock_irqsave(&rtp->cbs_gbl_lock, flags); if (rcu_task_enqueue_lim < 0) { @@ -229,7 +230,10 @@ static void cblist_init_generic(struct rcu_tasks *rtp) if (lim > nr_cpu_ids) lim = nr_cpu_ids; - WRITE_ONCE(rtp->percpu_enqueue_shift, ilog2(nr_cpu_ids / lim)); + shift = ilog2(nr_cpu_ids / lim); + if (((nr_cpu_ids - 1) >> shift) >= lim) + shift++; + WRITE_ONCE(rtp->percpu_enqueue_shift, shift); WRITE_ONCE(rtp->percpu_dequeue_lim, lim); smp_store_release(&rtp->percpu_enqueue_lim, lim); for_each_possible_cpu(cpu) { @@ -298,7 +302,7 @@ static void call_rcu_tasks_generic(struct rcu_head *rhp, rcu_callback_t func, if (unlikely(needadjust)) { raw_spin_lock_irqsave(&rtp->cbs_gbl_lock, flags); if (rtp->percpu_enqueue_lim != nr_cpu_ids) { - WRITE_ONCE(rtp->percpu_enqueue_shift, ilog2(nr_cpu_ids)); + WRITE_ONCE(rtp->percpu_enqueue_shift, 0); WRITE_ONCE(rtp->percpu_dequeue_lim, nr_cpu_ids); smp_store_release(&rtp->percpu_enqueue_lim, nr_cpu_ids); pr_info("Switching %s to per-CPU callback queuing.\n", rtp->name); @@ -413,7 +417,7 @@ static int rcu_tasks_need_gpcb(struct rcu_tasks *rtp) if (rcu_task_cb_adjust && ncbs <= rcu_task_collapse_lim) { raw_spin_lock_irqsave(&rtp->cbs_gbl_lock, flags); if (rtp->percpu_enqueue_lim > 1) { - WRITE_ONCE(rtp->percpu_enqueue_shift, ilog2(nr_cpu_ids)); + WRITE_ONCE(rtp->percpu_enqueue_shift, order_base_2(nr_cpu_ids)); smp_store_release(&rtp->percpu_enqueue_lim, 1); rtp->percpu_dequeue_gpseq = get_state_synchronize_rcu(); pr_info("Starting switch %s to CPU-0 callback queuing.\n", rtp->name); @@ -492,7 +496,7 @@ static int __noreturn rcu_tasks_kthread(void *arg) struct rcu_tasks *rtp = arg; /* Run on housekeeping CPUs by default. Sysadm can move if desired. */ - housekeeping_affine(current, HK_FLAG_RCU); + housekeeping_affine(current, HK_TYPE_RCU); WRITE_ONCE(rtp->kthread_ptr, current); // Let GPs start! /* diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index a4c25a6283b0..a4b8189455d5 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -87,11 +87,12 @@ static struct rcu_state rcu_state = { .gp_state = RCU_GP_IDLE, .gp_seq = (0UL - 300UL) << RCU_SEQ_CTR_SHIFT, .barrier_mutex = __MUTEX_INITIALIZER(rcu_state.barrier_mutex), + .barrier_lock = __RAW_SPIN_LOCK_UNLOCKED(rcu_state.barrier_lock), .name = RCU_NAME, .abbr = RCU_ABBR, .exp_mutex = __MUTEX_INITIALIZER(rcu_state.exp_mutex), .exp_wake_mutex = __MUTEX_INITIALIZER(rcu_state.exp_wake_mutex), - .ofl_lock = __RAW_SPIN_LOCK_UNLOCKED(rcu_state.ofl_lock), + .ofl_lock = __ARCH_SPIN_LOCK_UNLOCKED, }; /* Dump rcu_node combining tree at boot to verify correct setup. */ @@ -153,7 +154,7 @@ static void sync_sched_exp_online_cleanup(int cpu); static void check_cb_ovld_locked(struct rcu_data *rdp, struct rcu_node *rnp); static bool rcu_rdp_is_offloaded(struct rcu_data *rdp); -/* rcuc/rcub kthread realtime priority */ +/* rcuc/rcub/rcuop kthread realtime priority */ static int kthread_prio = IS_ENABLED(CONFIG_RCU_BOOST) ? 1 : 0; module_param(kthread_prio, int, 0444); @@ -222,6 +223,16 @@ static unsigned long rcu_rnp_online_cpus(struct rcu_node *rnp) } /* + * Is the CPU corresponding to the specified rcu_data structure online + * from RCU's perspective? This perspective is given by that structure's + * ->qsmaskinitnext field rather than by the global cpu_online_mask. + */ +static bool rcu_rdp_cpu_online(struct rcu_data *rdp) +{ + return !!(rdp->grpmask & rcu_rnp_online_cpus(rdp->mynode)); +} + +/* * Return true if an RCU grace period is in progress. The READ_ONCE()s * permit this function to be invoked without holding the root rcu_node * structure's ->lock, but of course results can be subject to change. @@ -1086,9 +1097,8 @@ void rcu_irq_enter_irqson(void) * Just check whether or not this CPU has non-offloaded RCU callbacks * queued. */ -int rcu_needs_cpu(u64 basemono, u64 *nextevt) +int rcu_needs_cpu(void) { - *nextevt = KTIME_MAX; return !rcu_segcblist_empty(&this_cpu_ptr(&rcu_data)->cblist) && !rcu_rdp_is_offloaded(this_cpu_ptr(&rcu_data)); } @@ -1167,15 +1177,20 @@ void rcu_request_urgent_qs_task(struct task_struct *t) bool rcu_lockdep_current_cpu_online(void) { struct rcu_data *rdp; - struct rcu_node *rnp; bool ret = false; if (in_nmi() || !rcu_scheduler_fully_active) return true; preempt_disable_notrace(); rdp = this_cpu_ptr(&rcu_data); - rnp = rdp->mynode; - if (rdp->grpmask & rcu_rnp_online_cpus(rnp) || READ_ONCE(rnp->ofl_seq) & 0x1) + /* + * Strictly, we care here about the case where the current CPU is + * in rcu_cpu_starting() and thus has an excuse for rdp->grpmask + * not being up to date. So arch_spin_is_locked() might have a + * false positive if it's held by some *other* CPU, but that's + * OK because that just means a false *negative* on the warning. + */ + if (rcu_rdp_cpu_online(rdp) || arch_spin_is_locked(&rcu_state.ofl_lock)) ret = true; preempt_enable_notrace(); return ret; @@ -1260,8 +1275,7 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp) * For more detail, please refer to the "Hotplug CPU" section * of RCU's Requirements documentation. */ - if (WARN_ON_ONCE(!(rdp->grpmask & rcu_rnp_online_cpus(rnp)))) { - bool onl; + if (WARN_ON_ONCE(!rcu_rdp_cpu_online(rdp))) { struct rcu_node *rnp1; pr_info("%s: grp: %d-%d level: %d ->gp_seq %ld ->completedqs %ld\n", @@ -1270,9 +1284,8 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp) for (rnp1 = rnp; rnp1; rnp1 = rnp1->parent) pr_info("%s: %d:%d ->qsmask %#lx ->qsmaskinit %#lx ->qsmaskinitnext %#lx ->rcu_gp_init_mask %#lx\n", __func__, rnp1->grplo, rnp1->grphi, rnp1->qsmask, rnp1->qsmaskinit, rnp1->qsmaskinitnext, rnp1->rcu_gp_init_mask); - onl = !!(rdp->grpmask & rcu_rnp_online_cpus(rnp)); pr_info("%s %d: %c online: %ld(%d) offline: %ld(%d)\n", - __func__, rdp->cpu, ".o"[onl], + __func__, rdp->cpu, ".o"[rcu_rdp_cpu_online(rdp)], (long)rdp->rcu_onl_gp_seq, rdp->rcu_onl_gp_flags, (long)rdp->rcu_ofl_gp_seq, rdp->rcu_ofl_gp_flags); return 1; /* Break things loose after complaining. */ @@ -1739,7 +1752,6 @@ static void rcu_strict_gp_boundary(void *unused) */ static noinline_for_stack bool rcu_gp_init(void) { - unsigned long firstseq; unsigned long flags; unsigned long oldmask; unsigned long mask; @@ -1782,22 +1794,17 @@ static noinline_for_stack bool rcu_gp_init(void) * of RCU's Requirements documentation. */ WRITE_ONCE(rcu_state.gp_state, RCU_GP_ONOFF); + /* Exclude CPU hotplug operations. */ rcu_for_each_leaf_node(rnp) { - // Wait for CPU-hotplug operations that might have - // started before this grace period did. - smp_mb(); // Pair with barriers used when updating ->ofl_seq to odd values. - firstseq = READ_ONCE(rnp->ofl_seq); - if (firstseq & 0x1) - while (firstseq == READ_ONCE(rnp->ofl_seq)) - schedule_timeout_idle(1); // Can't wake unless RCU is watching. - smp_mb(); // Pair with barriers used when updating ->ofl_seq to even values. - raw_spin_lock(&rcu_state.ofl_lock); - raw_spin_lock_irq_rcu_node(rnp); + local_irq_save(flags); + arch_spin_lock(&rcu_state.ofl_lock); + raw_spin_lock_rcu_node(rnp); if (rnp->qsmaskinit == rnp->qsmaskinitnext && !rnp->wait_blkd_tasks) { /* Nothing to do on this leaf rcu_node structure. */ - raw_spin_unlock_irq_rcu_node(rnp); - raw_spin_unlock(&rcu_state.ofl_lock); + raw_spin_unlock_rcu_node(rnp); + arch_spin_unlock(&rcu_state.ofl_lock); + local_irq_restore(flags); continue; } @@ -1832,8 +1839,9 @@ static noinline_for_stack bool rcu_gp_init(void) rcu_cleanup_dead_rnp(rnp); } - raw_spin_unlock_irq_rcu_node(rnp); - raw_spin_unlock(&rcu_state.ofl_lock); + raw_spin_unlock_rcu_node(rnp); + arch_spin_unlock(&rcu_state.ofl_lock); + local_irq_restore(flags); } rcu_gp_slow(gp_preinit_delay); /* Races with CPU hotplug. */ @@ -2850,10 +2858,12 @@ static void rcu_cpu_kthread(unsigned int cpu) { unsigned int *statusp = this_cpu_ptr(&rcu_data.rcu_cpu_kthread_status); char work, *workp = this_cpu_ptr(&rcu_data.rcu_cpu_has_work); + unsigned long *j = this_cpu_ptr(&rcu_data.rcuc_activity); int spincnt; trace_rcu_utilization(TPS("Start CPU kthread@rcu_run")); for (spincnt = 0; spincnt < 10; spincnt++) { + WRITE_ONCE(*j, jiffies); local_bh_disable(); *statusp = RCU_KTHREAD_RUNNING; local_irq_disable(); @@ -2874,6 +2884,7 @@ static void rcu_cpu_kthread(unsigned int cpu) schedule_timeout_idle(2); trace_rcu_utilization(TPS("End CPU kthread@rcu_yield")); *statusp = RCU_KTHREAD_WAITING; + WRITE_ONCE(*j, jiffies); } static struct smp_hotplug_thread rcu_cpu_thread_spec = { @@ -2894,7 +2905,7 @@ static int __init rcu_spawn_core_kthreads(void) for_each_possible_cpu(cpu) per_cpu(rcu_data.rcu_cpu_has_work, cpu) = 0; - if (!IS_ENABLED(CONFIG_RCU_BOOST) && use_softirq) + if (use_softirq) return 0; WARN_ONCE(smpboot_register_percpu_thread(&rcu_cpu_thread_spec), "%s: Could not start rcuc kthread, OOM is now expected behavior\n", __func__); @@ -2995,9 +3006,47 @@ static void check_cb_ovld(struct rcu_data *rdp) raw_spin_unlock_rcu_node(rnp); } -/* Helper function for call_rcu() and friends. */ -static void -__call_rcu(struct rcu_head *head, rcu_callback_t func) +/** + * call_rcu() - Queue an RCU callback for invocation after a grace period. + * @head: structure to be used for queueing the RCU updates. + * @func: actual callback function to be invoked after the grace period + * + * The callback function will be invoked some time after a full grace + * period elapses, in other words after all pre-existing RCU read-side + * critical sections have completed. However, the callback function + * might well execute concurrently with RCU read-side critical sections + * that started after call_rcu() was invoked. + * + * RCU read-side critical sections are delimited by rcu_read_lock() + * and rcu_read_unlock(), and may be nested. In addition, but only in + * v5.0 and later, regions of code across which interrupts, preemption, + * or softirqs have been disabled also serve as RCU read-side critical + * sections. This includes hardware interrupt handlers, softirq handlers, + * and NMI handlers. + * + * Note that all CPUs must agree that the grace period extended beyond + * all pre-existing RCU read-side critical section. On systems with more + * than one CPU, this means that when "func()" is invoked, each CPU is + * guaranteed to have executed a full memory barrier since the end of its + * last RCU read-side critical section whose beginning preceded the call + * to call_rcu(). It also means that each CPU executing an RCU read-side + * critical section that continues beyond the start of "func()" must have + * executed a memory barrier after the call_rcu() but before the beginning + * of that RCU read-side critical section. Note that these guarantees + * include CPUs that are offline, idle, or executing in user mode, as + * well as CPUs that are executing in the kernel. + * + * Furthermore, if CPU A invoked call_rcu() and CPU B invoked the + * resulting RCU callback function "func()", then both CPU A and CPU B are + * guaranteed to execute a full memory barrier during the time interval + * between the call to call_rcu() and the invocation of "func()" -- even + * if CPU A and CPU B are the same CPU (but again only if the system has + * more than one CPU). + * + * Implementation of these memory-ordering guarantees is described here: + * Documentation/RCU/Design/Memory-Ordering/Tree-RCU-Memory-Ordering.rst. + */ +void call_rcu(struct rcu_head *head, rcu_callback_t func) { static atomic_t doublefrees; unsigned long flags; @@ -3011,7 +3060,7 @@ __call_rcu(struct rcu_head *head, rcu_callback_t func) /* * Probable double call_rcu(), so leak the callback. * Use rcu:rcu_callback trace event to find the previous - * time callback was passed to __call_rcu(). + * time callback was passed to call_rcu(). */ if (atomic_inc_return(&doublefrees) < 4) { pr_err("%s(): Double-freed CB %p->%pS()!!! ", __func__, head, head->func); @@ -3022,8 +3071,8 @@ __call_rcu(struct rcu_head *head, rcu_callback_t func) } head->func = func; head->next = NULL; - local_irq_save(flags); kasan_record_aux_stack_noalloc(head); + local_irq_save(flags); rdp = this_cpu_ptr(&rcu_data); /* Add the callback to our list. */ @@ -3060,51 +3109,6 @@ __call_rcu(struct rcu_head *head, rcu_callback_t func) local_irq_restore(flags); } } - -/** - * call_rcu() - Queue an RCU callback for invocation after a grace period. - * @head: structure to be used for queueing the RCU updates. - * @func: actual callback function to be invoked after the grace period - * - * The callback function will be invoked some time after a full grace - * period elapses, in other words after all pre-existing RCU read-side - * critical sections have completed. However, the callback function - * might well execute concurrently with RCU read-side critical sections - * that started after call_rcu() was invoked. - * - * RCU read-side critical sections are delimited by rcu_read_lock() - * and rcu_read_unlock(), and may be nested. In addition, but only in - * v5.0 and later, regions of code across which interrupts, preemption, - * or softirqs have been disabled also serve as RCU read-side critical - * sections. This includes hardware interrupt handlers, softirq handlers, - * and NMI handlers. - * - * Note that all CPUs must agree that the grace period extended beyond - * all pre-existing RCU read-side critical section. On systems with more - * than one CPU, this means that when "func()" is invoked, each CPU is - * guaranteed to have executed a full memory barrier since the end of its - * last RCU read-side critical section whose beginning preceded the call - * to call_rcu(). It also means that each CPU executing an RCU read-side - * critical section that continues beyond the start of "func()" must have - * executed a memory barrier after the call_rcu() but before the beginning - * of that RCU read-side critical section. Note that these guarantees - * include CPUs that are offline, idle, or executing in user mode, as - * well as CPUs that are executing in the kernel. - * - * Furthermore, if CPU A invoked call_rcu() and CPU B invoked the - * resulting RCU callback function "func()", then both CPU A and CPU B are - * guaranteed to execute a full memory barrier during the time interval - * between the call to call_rcu() and the invocation of "func()" -- even - * if CPU A and CPU B are the same CPU (but again only if the system has - * more than one CPU). - * - * Implementation of these memory-ordering guarantees is described here: - * Documentation/RCU/Design/Memory-Ordering/Tree-RCU-Memory-Ordering.rst. - */ -void call_rcu(struct rcu_head *head, rcu_callback_t func) -{ - __call_rcu(head, func); -} EXPORT_SYMBOL_GPL(call_rcu); @@ -3984,13 +3988,16 @@ static void rcu_barrier_callback(struct rcu_head *rhp) } /* - * Called with preemption disabled, and from cross-cpu IRQ context. + * If needed, entrain an rcu_barrier() callback on rdp->cblist. */ -static void rcu_barrier_func(void *cpu_in) +static void rcu_barrier_entrain(struct rcu_data *rdp) { - uintptr_t cpu = (uintptr_t)cpu_in; - struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu); + unsigned long gseq = READ_ONCE(rcu_state.barrier_sequence); + unsigned long lseq = READ_ONCE(rdp->barrier_seq_snap); + lockdep_assert_held(&rcu_state.barrier_lock); + if (rcu_seq_state(lseq) || !rcu_seq_state(gseq) || rcu_seq_ctr(lseq) != rcu_seq_ctr(gseq)) + return; rcu_barrier_trace(TPS("IRQ"), -1, rcu_state.barrier_sequence); rdp->barrier_head.func = rcu_barrier_callback; debug_rcu_head_queue(&rdp->barrier_head); @@ -4000,10 +4007,26 @@ static void rcu_barrier_func(void *cpu_in) atomic_inc(&rcu_state.barrier_cpu_count); } else { debug_rcu_head_unqueue(&rdp->barrier_head); - rcu_barrier_trace(TPS("IRQNQ"), -1, - rcu_state.barrier_sequence); + rcu_barrier_trace(TPS("IRQNQ"), -1, rcu_state.barrier_sequence); } rcu_nocb_unlock(rdp); + smp_store_release(&rdp->barrier_seq_snap, gseq); +} + +/* + * Called with preemption disabled, and from cross-cpu IRQ context. + */ +static void rcu_barrier_handler(void *cpu_in) +{ + uintptr_t cpu = (uintptr_t)cpu_in; + struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu); + + lockdep_assert_irqs_disabled(); + WARN_ON_ONCE(cpu != rdp->cpu); + WARN_ON_ONCE(cpu != smp_processor_id()); + raw_spin_lock(&rcu_state.barrier_lock); + rcu_barrier_entrain(rdp); + raw_spin_unlock(&rcu_state.barrier_lock); } /** @@ -4017,6 +4040,8 @@ static void rcu_barrier_func(void *cpu_in) void rcu_barrier(void) { uintptr_t cpu; + unsigned long flags; + unsigned long gseq; struct rcu_data *rdp; unsigned long s = rcu_seq_snap(&rcu_state.barrier_sequence); @@ -4027,15 +4052,16 @@ void rcu_barrier(void) /* Did someone else do our work for us? */ if (rcu_seq_done(&rcu_state.barrier_sequence, s)) { - rcu_barrier_trace(TPS("EarlyExit"), -1, - rcu_state.barrier_sequence); + rcu_barrier_trace(TPS("EarlyExit"), -1, rcu_state.barrier_sequence); smp_mb(); /* caller's subsequent code after above check. */ mutex_unlock(&rcu_state.barrier_mutex); return; } /* Mark the start of the barrier operation. */ + raw_spin_lock_irqsave(&rcu_state.barrier_lock, flags); rcu_seq_start(&rcu_state.barrier_sequence); + gseq = rcu_state.barrier_sequence; rcu_barrier_trace(TPS("Inc1"), -1, rcu_state.barrier_sequence); /* @@ -4047,7 +4073,7 @@ void rcu_barrier(void) */ init_completion(&rcu_state.barrier_completion); atomic_set(&rcu_state.barrier_cpu_count, 2); - cpus_read_lock(); + raw_spin_unlock_irqrestore(&rcu_state.barrier_lock, flags); /* * Force each CPU with callbacks to register a new callback. @@ -4056,29 +4082,31 @@ void rcu_barrier(void) */ for_each_possible_cpu(cpu) { rdp = per_cpu_ptr(&rcu_data, cpu); - if (cpu_is_offline(cpu) && - !rcu_rdp_is_offloaded(rdp)) +retry: + if (smp_load_acquire(&rdp->barrier_seq_snap) == gseq) continue; - if (rcu_segcblist_n_cbs(&rdp->cblist) && cpu_online(cpu)) { - rcu_barrier_trace(TPS("OnlineQ"), cpu, - rcu_state.barrier_sequence); - smp_call_function_single(cpu, rcu_barrier_func, (void *)cpu, 1); - } else if (rcu_segcblist_n_cbs(&rdp->cblist) && - cpu_is_offline(cpu)) { - rcu_barrier_trace(TPS("OfflineNoCBQ"), cpu, - rcu_state.barrier_sequence); - local_irq_disable(); - rcu_barrier_func((void *)cpu); - local_irq_enable(); - } else if (cpu_is_offline(cpu)) { - rcu_barrier_trace(TPS("OfflineNoCBNoQ"), cpu, - rcu_state.barrier_sequence); - } else { - rcu_barrier_trace(TPS("OnlineNQ"), cpu, - rcu_state.barrier_sequence); + raw_spin_lock_irqsave(&rcu_state.barrier_lock, flags); + if (!rcu_segcblist_n_cbs(&rdp->cblist)) { + WRITE_ONCE(rdp->barrier_seq_snap, gseq); + raw_spin_unlock_irqrestore(&rcu_state.barrier_lock, flags); + rcu_barrier_trace(TPS("NQ"), cpu, rcu_state.barrier_sequence); + continue; + } + if (!rcu_rdp_cpu_online(rdp)) { + rcu_barrier_entrain(rdp); + WARN_ON_ONCE(READ_ONCE(rdp->barrier_seq_snap) != gseq); + raw_spin_unlock_irqrestore(&rcu_state.barrier_lock, flags); + rcu_barrier_trace(TPS("OfflineNoCBQ"), cpu, rcu_state.barrier_sequence); + continue; + } + raw_spin_unlock_irqrestore(&rcu_state.barrier_lock, flags); + if (smp_call_function_single(cpu, rcu_barrier_handler, (void *)cpu, 1)) { + schedule_timeout_uninterruptible(1); + goto retry; } + WARN_ON_ONCE(READ_ONCE(rdp->barrier_seq_snap) != gseq); + rcu_barrier_trace(TPS("OnlineQ"), cpu, rcu_state.barrier_sequence); } - cpus_read_unlock(); /* * Now that we have an rcu_barrier_callback() callback on each @@ -4093,6 +4121,12 @@ void rcu_barrier(void) /* Mark the end of the barrier operation. */ rcu_barrier_trace(TPS("Inc2"), -1, rcu_state.barrier_sequence); rcu_seq_end(&rcu_state.barrier_sequence); + gseq = rcu_state.barrier_sequence; + for_each_possible_cpu(cpu) { + rdp = per_cpu_ptr(&rcu_data, cpu); + + WRITE_ONCE(rdp->barrier_seq_snap, gseq); + } /* Other rcu_barrier() invocations can now safely proceed. */ mutex_unlock(&rcu_state.barrier_mutex); @@ -4140,6 +4174,7 @@ rcu_boot_init_percpu_data(int cpu) INIT_WORK(&rdp->strict_work, strict_work_handler); WARN_ON_ONCE(rdp->dynticks_nesting != 1); WARN_ON_ONCE(rcu_dynticks_in_eqs(rcu_dynticks_snap(rdp))); + rdp->barrier_seq_snap = rcu_state.barrier_sequence; rdp->rcu_ofl_gp_seq = rcu_state.gp_seq; rdp->rcu_ofl_gp_flags = RCU_GP_CLEANED; rdp->rcu_onl_gp_seq = rcu_state.gp_seq; @@ -4287,12 +4322,13 @@ void rcu_cpu_starting(unsigned int cpu) rnp = rdp->mynode; mask = rdp->grpmask; - WRITE_ONCE(rnp->ofl_seq, rnp->ofl_seq + 1); - WARN_ON_ONCE(!(rnp->ofl_seq & 0x1)); + local_irq_save(flags); + arch_spin_lock(&rcu_state.ofl_lock); rcu_dynticks_eqs_online(); - smp_mb(); // Pair with rcu_gp_cleanup()'s ->ofl_seq barrier(). - raw_spin_lock_irqsave_rcu_node(rnp, flags); + raw_spin_lock(&rcu_state.barrier_lock); + raw_spin_lock_rcu_node(rnp); WRITE_ONCE(rnp->qsmaskinitnext, rnp->qsmaskinitnext | mask); + raw_spin_unlock(&rcu_state.barrier_lock); newcpu = !(rnp->expmaskinitnext & mask); rnp->expmaskinitnext |= mask; /* Allow lockless access for expedited grace periods. */ @@ -4304,15 +4340,18 @@ void rcu_cpu_starting(unsigned int cpu) /* An incoming CPU should never be blocking a grace period. */ if (WARN_ON_ONCE(rnp->qsmask & mask)) { /* RCU waiting on incoming CPU? */ + /* rcu_report_qs_rnp() *really* wants some flags to restore */ + unsigned long flags2; + + local_irq_save(flags2); rcu_disable_urgency_upon_qs(rdp); /* Report QS -after- changing ->qsmaskinitnext! */ - rcu_report_qs_rnp(mask, rnp, rnp->gp_seq, flags); + rcu_report_qs_rnp(mask, rnp, rnp->gp_seq, flags2); } else { - raw_spin_unlock_irqrestore_rcu_node(rnp, flags); + raw_spin_unlock_rcu_node(rnp); } - smp_mb(); // Pair with rcu_gp_cleanup()'s ->ofl_seq barrier(). - WRITE_ONCE(rnp->ofl_seq, rnp->ofl_seq + 1); - WARN_ON_ONCE(rnp->ofl_seq & 0x1); + arch_spin_unlock(&rcu_state.ofl_lock); + local_irq_restore(flags); smp_mb(); /* Ensure RCU read-side usage follows above initialization. */ } @@ -4326,7 +4365,7 @@ void rcu_cpu_starting(unsigned int cpu) */ void rcu_report_dead(unsigned int cpu) { - unsigned long flags; + unsigned long flags, seq_flags; unsigned long mask; struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu); struct rcu_node *rnp = rdp->mynode; /* Outgoing CPU's rdp & rnp. */ @@ -4340,10 +4379,8 @@ void rcu_report_dead(unsigned int cpu) /* Remove outgoing CPU from mask in the leaf rcu_node structure. */ mask = rdp->grpmask; - WRITE_ONCE(rnp->ofl_seq, rnp->ofl_seq + 1); - WARN_ON_ONCE(!(rnp->ofl_seq & 0x1)); - smp_mb(); // Pair with rcu_gp_cleanup()'s ->ofl_seq barrier(). - raw_spin_lock(&rcu_state.ofl_lock); + local_irq_save(seq_flags); + arch_spin_lock(&rcu_state.ofl_lock); raw_spin_lock_irqsave_rcu_node(rnp, flags); /* Enforce GP memory-order guarantee. */ rdp->rcu_ofl_gp_seq = READ_ONCE(rcu_state.gp_seq); rdp->rcu_ofl_gp_flags = READ_ONCE(rcu_state.gp_flags); @@ -4354,10 +4391,8 @@ void rcu_report_dead(unsigned int cpu) } WRITE_ONCE(rnp->qsmaskinitnext, rnp->qsmaskinitnext & ~mask); raw_spin_unlock_irqrestore_rcu_node(rnp, flags); - raw_spin_unlock(&rcu_state.ofl_lock); - smp_mb(); // Pair with rcu_gp_cleanup()'s ->ofl_seq barrier(). - WRITE_ONCE(rnp->ofl_seq, rnp->ofl_seq + 1); - WARN_ON_ONCE(rnp->ofl_seq & 0x1); + arch_spin_unlock(&rcu_state.ofl_lock); + local_irq_restore(seq_flags); rdp->cpu_started = false; } @@ -4380,7 +4415,9 @@ void rcutree_migrate_callbacks(int cpu) rcu_segcblist_empty(&rdp->cblist)) return; /* No callbacks to migrate. */ - local_irq_save(flags); + raw_spin_lock_irqsave(&rcu_state.barrier_lock, flags); + WARN_ON_ONCE(rcu_rdp_cpu_online(rdp)); + rcu_barrier_entrain(rdp); my_rdp = this_cpu_ptr(&rcu_data); my_rnp = my_rdp->mynode; rcu_nocb_lock(my_rdp); /* irqs already disabled. */ @@ -4390,10 +4427,10 @@ void rcutree_migrate_callbacks(int cpu) needwake = rcu_advance_cbs(my_rnp, rdp) || rcu_advance_cbs(my_rnp, my_rdp); rcu_segcblist_merge(&my_rdp->cblist, &rdp->cblist); + raw_spin_unlock(&rcu_state.barrier_lock); /* irqs remain disabled. */ needwake = needwake || rcu_advance_cbs(my_rnp, my_rdp); rcu_segcblist_disable(&rdp->cblist); - WARN_ON_ONCE(rcu_segcblist_empty(&my_rdp->cblist) != - !rcu_segcblist_n_cbs(&my_rdp->cblist)); + WARN_ON_ONCE(rcu_segcblist_empty(&my_rdp->cblist) != !rcu_segcblist_n_cbs(&my_rdp->cblist)); if (rcu_rdp_is_offloaded(my_rdp)) { raw_spin_unlock_rcu_node(my_rnp); /* irqs remain disabled. */ __call_rcu_nocb_wake(my_rdp, true, flags); @@ -4440,26 +4477,10 @@ static int rcu_pm_notify(struct notifier_block *self, static int __init rcu_spawn_gp_kthread(void) { unsigned long flags; - int kthread_prio_in = kthread_prio; struct rcu_node *rnp; struct sched_param sp; struct task_struct *t; - /* Force priority into range. */ - if (IS_ENABLED(CONFIG_RCU_BOOST) && kthread_prio < 2 - && IS_BUILTIN(CONFIG_RCU_TORTURE_TEST)) - kthread_prio = 2; - else if (IS_ENABLED(CONFIG_RCU_BOOST) && kthread_prio < 1) - kthread_prio = 1; - else if (kthread_prio < 0) - kthread_prio = 0; - else if (kthread_prio > 99) - kthread_prio = 99; - - if (kthread_prio != kthread_prio_in) - pr_alert("rcu_spawn_gp_kthread(): Limited prio to %d from %d\n", - kthread_prio, kthread_prio_in); - rcu_scheduler_fully_active = 1; t = kthread_create(rcu_gp_kthread, NULL, "%s", rcu_state.name); if (WARN_ONCE(IS_ERR(t), "%s: Could not start grace-period kthread, OOM is now expected behavior\n", __func__)) @@ -4570,6 +4591,7 @@ static void __init rcu_init_one(void) init_waitqueue_head(&rnp->exp_wq[2]); init_waitqueue_head(&rnp->exp_wq[3]); spin_lock_init(&rnp->exp_lock); + mutex_init(&rnp->boost_kthread_mutex); } } @@ -4585,6 +4607,28 @@ static void __init rcu_init_one(void) } /* + * Force priority from the kernel command-line into range. + */ +static void __init sanitize_kthread_prio(void) +{ + int kthread_prio_in = kthread_prio; + + if (IS_ENABLED(CONFIG_RCU_BOOST) && kthread_prio < 2 + && IS_BUILTIN(CONFIG_RCU_TORTURE_TEST)) + kthread_prio = 2; + else if (IS_ENABLED(CONFIG_RCU_BOOST) && kthread_prio < 1) + kthread_prio = 1; + else if (kthread_prio < 0) + kthread_prio = 0; + else if (kthread_prio > 99) + kthread_prio = 99; + + if (kthread_prio != kthread_prio_in) + pr_alert("%s: Limited prio to %d from %d\n", + __func__, kthread_prio, kthread_prio_in); +} + +/* * Compute the rcu_node tree geometry from kernel parameters. This cannot * replace the definitions in tree.h because those are needed to size * the ->node array in the rcu_state structure. @@ -4744,6 +4788,7 @@ void __init rcu_init(void) kfree_rcu_batch_init(); rcu_bootup_announce(); + sanitize_kthread_prio(); rcu_init_geometry(); rcu_init_one(); if (dump_tree) diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index 486fc901bd08..926673ebe355 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -56,8 +56,6 @@ struct rcu_node { /* Initialized from ->qsmaskinitnext at the */ /* beginning of each grace period. */ unsigned long qsmaskinitnext; - unsigned long ofl_seq; /* CPU-hotplug operation sequence count. */ - /* Online CPUs for next grace period. */ unsigned long expmask; /* CPUs or groups that need to check in */ /* to allow the current expedited GP */ /* to complete. */ @@ -110,6 +108,9 @@ struct rcu_node { /* side effect, not as a lock. */ unsigned long boost_time; /* When to start boosting (jiffies). */ + struct mutex boost_kthread_mutex; + /* Exclusion for thread spawning and affinity */ + /* manipulation. */ struct task_struct *boost_kthread_task; /* kthread that takes care of priority */ /* boosting for this rcu_node structure. */ @@ -190,6 +191,7 @@ struct rcu_data { bool rcu_forced_tick_exp; /* ... provide QS to expedited GP. */ /* 4) rcu_barrier(), OOM callbacks, and expediting. */ + unsigned long barrier_seq_snap; /* Snap of rcu_state.barrier_sequence. */ struct rcu_head barrier_head; int exp_dynticks_snap; /* Double-check need for IPI. */ @@ -203,6 +205,8 @@ struct rcu_data { int nocb_defer_wakeup; /* Defer wakeup of nocb_kthread. */ struct timer_list nocb_timer; /* Enforce finite deferral. */ unsigned long nocb_gp_adv_time; /* Last call_rcu() CB adv (jiffies). */ + struct mutex nocb_gp_kthread_mutex; /* Exclusion for nocb gp kthread */ + /* spawning */ /* The following fields are used by call_rcu, hence own cacheline. */ raw_spinlock_t nocb_bypass_lock ____cacheline_internodealigned_in_smp; @@ -237,6 +241,7 @@ struct rcu_data { /* rcuc per-CPU kthread or NULL. */ unsigned int rcu_cpu_kthread_status; char rcu_cpu_has_work; + unsigned long rcuc_activity; /* 7) Diagnostic data, including RCU CPU stall warnings. */ unsigned int softirq_snap; /* Snapshot of softirq activity. */ @@ -302,9 +307,8 @@ struct rcu_state { /* The following fields are guarded by the root rcu_node's lock. */ - u8 boost ____cacheline_internodealigned_in_smp; - /* Subject to priority boost. */ - unsigned long gp_seq; /* Grace-period sequence #. */ + unsigned long gp_seq ____cacheline_internodealigned_in_smp; + /* Grace-period sequence #. */ unsigned long gp_max; /* Maximum GP duration in */ /* jiffies. */ struct task_struct *gp_kthread; /* Task for grace periods. */ @@ -323,6 +327,8 @@ struct rcu_state { /* rcu_barrier(). */ /* End of fields guarded by barrier_mutex. */ + raw_spinlock_t barrier_lock; /* Protects ->barrier_seq_snap. */ + struct mutex exp_mutex; /* Serialize expedited GP. */ struct mutex exp_wake_mutex; /* Serialize wakeup. */ unsigned long expedited_sequence; /* Take a ticket. */ @@ -355,7 +361,7 @@ struct rcu_state { const char *name; /* Name of structure. */ char abbr; /* Abbreviated name. */ - raw_spinlock_t ofl_lock ____cacheline_internodealigned_in_smp; + arch_spinlock_t ofl_lock ____cacheline_internodealigned_in_smp; /* Synchronize offline with */ /* GP pre-initialization. */ }; diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h index 237a79989aba..60197ea24ceb 100644 --- a/kernel/rcu/tree_exp.h +++ b/kernel/rcu/tree_exp.h @@ -502,7 +502,8 @@ static void synchronize_rcu_expedited_wait(void) if (synchronize_rcu_expedited_wait_once(1)) return; rcu_for_each_leaf_node(rnp) { - for_each_leaf_node_cpu_mask(rnp, cpu, rnp->expmask) { + mask = READ_ONCE(rnp->expmask); + for_each_leaf_node_cpu_mask(rnp, cpu, mask) { rdp = per_cpu_ptr(&rcu_data, cpu); if (rdp->rcu_forced_tick_exp) continue; @@ -656,7 +657,7 @@ static void rcu_exp_handler(void *unused) */ if (!depth) { if (!(preempt_count() & (PREEMPT_MASK | SOFTIRQ_MASK)) || - rcu_dynticks_curr_cpu_in_eqs()) { + rcu_is_cpu_rrupt_from_idle()) { rcu_report_exp_rdp(rdp); } else { WRITE_ONCE(rdp->cpu_no_qs.b.exp, true); diff --git a/kernel/rcu/tree_nocb.h b/kernel/rcu/tree_nocb.h index eeafb546a7a0..636d0546a4e9 100644 --- a/kernel/rcu/tree_nocb.h +++ b/kernel/rcu/tree_nocb.h @@ -1169,7 +1169,7 @@ void __init rcu_init_nohz(void) struct rcu_data *rdp; #if defined(CONFIG_NO_HZ_FULL) - if (tick_nohz_full_running && cpumask_weight(tick_nohz_full_mask)) + if (tick_nohz_full_running && !cpumask_empty(tick_nohz_full_mask)) need_rcu_nocb_mask = true; #endif /* #if defined(CONFIG_NO_HZ_FULL) */ @@ -1226,6 +1226,7 @@ static void __init rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp) raw_spin_lock_init(&rdp->nocb_gp_lock); timer_setup(&rdp->nocb_timer, do_nocb_deferred_wakeup_timer, 0); rcu_cblist_init(&rdp->nocb_bypass); + mutex_init(&rdp->nocb_gp_kthread_mutex); } /* @@ -1238,6 +1239,7 @@ static void rcu_spawn_cpu_nocb_kthread(int cpu) struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu); struct rcu_data *rdp_gp; struct task_struct *t; + struct sched_param sp; if (!rcu_scheduler_fully_active || !rcu_nocb_is_setup) return; @@ -1247,20 +1249,30 @@ static void rcu_spawn_cpu_nocb_kthread(int cpu) return; /* If we didn't spawn the GP kthread first, reorganize! */ + sp.sched_priority = kthread_prio; rdp_gp = rdp->nocb_gp_rdp; + mutex_lock(&rdp_gp->nocb_gp_kthread_mutex); if (!rdp_gp->nocb_gp_kthread) { t = kthread_run(rcu_nocb_gp_kthread, rdp_gp, "rcuog/%d", rdp_gp->cpu); - if (WARN_ONCE(IS_ERR(t), "%s: Could not start rcuo GP kthread, OOM is now expected behavior\n", __func__)) + if (WARN_ONCE(IS_ERR(t), "%s: Could not start rcuo GP kthread, OOM is now expected behavior\n", __func__)) { + mutex_unlock(&rdp_gp->nocb_gp_kthread_mutex); return; + } WRITE_ONCE(rdp_gp->nocb_gp_kthread, t); + if (kthread_prio) + sched_setscheduler_nocheck(t, SCHED_FIFO, &sp); } + mutex_unlock(&rdp_gp->nocb_gp_kthread_mutex); /* Spawn the kthread for this CPU. */ t = kthread_run(rcu_nocb_cb_kthread, rdp, "rcuo%c/%d", rcu_state.abbr, cpu); if (WARN_ONCE(IS_ERR(t), "%s: Could not start rcuo CB kthread, OOM is now expected behavior\n", __func__)) return; + + if (kthread_prio) + sched_setscheduler_nocheck(t, SCHED_FIFO, &sp); WRITE_ONCE(rdp->nocb_cb_kthread, t); WRITE_ONCE(rdp->nocb_gp_kthread, rdp_gp->nocb_gp_kthread); } @@ -1348,7 +1360,7 @@ static void __init rcu_organize_nocb_kthreads(void) */ void rcu_bind_current_to_nocb(void) { - if (cpumask_available(rcu_nocb_mask) && cpumask_weight(rcu_nocb_mask)) + if (cpumask_available(rcu_nocb_mask) && !cpumask_empty(rcu_nocb_mask)) WARN_ON(sched_setaffinity(current->pid, rcu_nocb_mask)); } EXPORT_SYMBOL_GPL(rcu_bind_current_to_nocb); diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index c5b45c2f68a1..8360d86db1c0 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -330,7 +330,7 @@ void rcu_note_context_switch(bool preempt) * then queue the task as required based on the states * of any ongoing and expedited grace periods. */ - WARN_ON_ONCE((rdp->grpmask & rcu_rnp_online_cpus(rnp)) == 0); + WARN_ON_ONCE(!rcu_rdp_cpu_online(rdp)); WARN_ON_ONCE(!list_empty(&t->rcu_node_entry)); trace_rcu_preempt_task(rcu_state.name, t->pid, @@ -556,16 +556,16 @@ rcu_preempt_deferred_qs_irqrestore(struct task_struct *t, unsigned long flags) raw_spin_unlock_irqrestore_rcu_node(rnp, flags); } - /* Unboost if we were boosted. */ - if (IS_ENABLED(CONFIG_RCU_BOOST) && drop_boost_mutex) - rt_mutex_futex_unlock(&rnp->boost_mtx.rtmutex); - /* * If this was the last task on the expedited lists, * then we need to report up the rcu_node hierarchy. */ if (!empty_exp && empty_exp_now) rcu_report_exp_rnp(rnp, true); + + /* Unboost if we were boosted. */ + if (IS_ENABLED(CONFIG_RCU_BOOST) && drop_boost_mutex) + rt_mutex_futex_unlock(&rnp->boost_mtx.rtmutex); } else { local_irq_restore(flags); } @@ -773,7 +773,6 @@ dump_blkd_tasks(struct rcu_node *rnp, int ncheck) int cpu; int i; struct list_head *lhp; - bool onl; struct rcu_data *rdp; struct rcu_node *rnp1; @@ -797,9 +796,8 @@ dump_blkd_tasks(struct rcu_node *rnp, int ncheck) pr_cont("\n"); for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++) { rdp = per_cpu_ptr(&rcu_data, cpu); - onl = !!(rdp->grpmask & rcu_rnp_online_cpus(rnp)); pr_info("\t%d: %c online: %ld(%d) offline: %ld(%d)\n", - cpu, ".o"[onl], + cpu, ".o"[rcu_rdp_cpu_online(rdp)], (long)rdp->rcu_onl_gp_seq, rdp->rcu_onl_gp_flags, (long)rdp->rcu_ofl_gp_seq, rdp->rcu_ofl_gp_flags); } @@ -996,12 +994,15 @@ dump_blkd_tasks(struct rcu_node *rnp, int ncheck) */ static void rcu_cpu_kthread_setup(unsigned int cpu) { + struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu); #ifdef CONFIG_RCU_BOOST struct sched_param sp; sp.sched_priority = kthread_prio; sched_setscheduler_nocheck(current, SCHED_FIFO, &sp); #endif /* #ifdef CONFIG_RCU_BOOST */ + + WRITE_ONCE(rdp->rcuc_activity, jiffies); } #ifdef CONFIG_RCU_BOOST @@ -1172,15 +1173,14 @@ static void rcu_spawn_one_boost_kthread(struct rcu_node *rnp) struct sched_param sp; struct task_struct *t; + mutex_lock(&rnp->boost_kthread_mutex); if (rnp->boost_kthread_task || !rcu_scheduler_fully_active) - return; - - rcu_state.boost = 1; + goto out; t = kthread_create(rcu_boost_kthread, (void *)rnp, "rcub/%d", rnp_index); if (WARN_ON_ONCE(IS_ERR(t))) - return; + goto out; raw_spin_lock_irqsave_rcu_node(rnp, flags); rnp->boost_kthread_task = t; @@ -1188,6 +1188,9 @@ static void rcu_spawn_one_boost_kthread(struct rcu_node *rnp) sp.sched_priority = kthread_prio; sched_setscheduler_nocheck(t, SCHED_FIFO, &sp); wake_up_process(t); /* get to TASK_INTERRUPTIBLE quickly. */ + + out: + mutex_unlock(&rnp->boost_kthread_mutex); } /* @@ -1210,14 +1213,16 @@ static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu) return; if (!zalloc_cpumask_var(&cm, GFP_KERNEL)) return; + mutex_lock(&rnp->boost_kthread_mutex); for_each_leaf_node_possible_cpu(rnp, cpu) if ((mask & leaf_node_cpu_bit(rnp, cpu)) && cpu != outgoingcpu) cpumask_set_cpu(cpu, cm); - cpumask_and(cm, cm, housekeeping_cpumask(HK_FLAG_RCU)); - if (cpumask_weight(cm) == 0) - cpumask_copy(cm, housekeeping_cpumask(HK_FLAG_RCU)); + cpumask_and(cm, cm, housekeeping_cpumask(HK_TYPE_RCU)); + if (cpumask_empty(cm)) + cpumask_copy(cm, housekeeping_cpumask(HK_TYPE_RCU)); set_cpus_allowed_ptr(t, cm); + mutex_unlock(&rnp->boost_kthread_mutex); free_cpumask_var(cm); } @@ -1291,7 +1296,7 @@ static void rcu_bind_gp_kthread(void) { if (!tick_nohz_full_enabled()) return; - housekeeping_affine(current, HK_FLAG_RCU); + housekeeping_affine(current, HK_TYPE_RCU); } /* Record the current task on dyntick-idle entry. */ diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h index 21bebf7c9030..0c5d8516516a 100644 --- a/kernel/rcu/tree_stall.h +++ b/kernel/rcu/tree_stall.h @@ -379,6 +379,15 @@ static bool rcu_is_gp_kthread_starving(unsigned long *jp) return j > 2 * HZ; } +static bool rcu_is_rcuc_kthread_starving(struct rcu_data *rdp, unsigned long *jp) +{ + unsigned long j = jiffies - READ_ONCE(rdp->rcuc_activity); + + if (jp) + *jp = j; + return j > 2 * HZ; +} + /* * Print out diagnostic information for the specified stalled CPU. * @@ -430,6 +439,29 @@ static void print_cpu_stall_info(int cpu) falsepositive ? " (false positive?)" : ""); } +static void rcuc_kthread_dump(struct rcu_data *rdp) +{ + int cpu; + unsigned long j; + struct task_struct *rcuc; + + rcuc = rdp->rcu_cpu_kthread_task; + if (!rcuc) + return; + + cpu = task_cpu(rcuc); + if (cpu_is_offline(cpu) || idle_cpu(cpu)) + return; + + if (!rcu_is_rcuc_kthread_starving(rdp, &j)) + return; + + pr_err("%s kthread starved for %ld jiffies\n", rcuc->comm, j); + sched_show_task(rcuc); + if (!trigger_single_cpu_backtrace(cpu)) + dump_cpu_task(cpu); +} + /* Complain about starvation of grace-period kthread. */ static void rcu_check_gp_kthread_starvation(void) { @@ -601,6 +633,9 @@ static void print_cpu_stall(unsigned long gps) rcu_check_gp_kthread_expired_fqs_timer(); rcu_check_gp_kthread_starvation(); + if (!use_softirq) + rcuc_kthread_dump(rdp); + rcu_dump_cpu_stacks(); raw_spin_lock_irqsave_rcu_node(rnp, flags); diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c index 156892c22bb5..180ff9c41fa8 100644 --- a/kernel/rcu/update.c +++ b/kernel/rcu/update.c @@ -407,6 +407,13 @@ void __wait_rcu_gp(bool checktiny, int n, call_rcu_func_t *crcu_array, } EXPORT_SYMBOL_GPL(__wait_rcu_gp); +void finish_rcuwait(struct rcuwait *w) +{ + rcu_assign_pointer(w->task, NULL); + __set_current_state(TASK_RUNNING); +} +EXPORT_SYMBOL_GPL(finish_rcuwait); + #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD void init_rcu_head(struct rcu_head *head) { diff --git a/kernel/resource.c b/kernel/resource.c index 5ad3eba619ba..34eaee179689 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -56,14 +56,6 @@ struct resource_constraint { static DEFINE_RWLOCK(resource_lock); -/* - * For memory hotplug, there is no way to free resource entries allocated - * by boot mem after the system is up. So for reusing the resource entry - * we need to remember the resource. - */ -static struct resource *bootmem_resource_free; -static DEFINE_SPINLOCK(bootmem_resource_lock); - static struct resource *next_resource(struct resource *p) { if (p->child) @@ -99,7 +91,7 @@ enum { MAX_IORES_LEVEL = 5 }; static void *r_start(struct seq_file *m, loff_t *pos) __acquires(resource_lock) { - struct resource *p = PDE_DATA(file_inode(m->file)); + struct resource *p = pde_data(file_inode(m->file)); loff_t l = 0; read_lock(&resource_lock); for (p = p->child; p && l < *pos; p = r_next(m, p, &l)) @@ -115,7 +107,7 @@ static void r_stop(struct seq_file *m, void *v) static int r_show(struct seq_file *m, void *v) { - struct resource *root = PDE_DATA(file_inode(m->file)); + struct resource *root = pde_data(file_inode(m->file)); struct resource *r = v, *p; unsigned long long start, end; int width = root->end < 0x10000 ? 4 : 8; @@ -160,36 +152,19 @@ __initcall(ioresources_init); static void free_resource(struct resource *res) { - if (!res) - return; - - if (!PageSlab(virt_to_head_page(res))) { - spin_lock(&bootmem_resource_lock); - res->sibling = bootmem_resource_free; - bootmem_resource_free = res; - spin_unlock(&bootmem_resource_lock); - } else { + /** + * If the resource was allocated using memblock early during boot + * we'll leak it here: we can only return full pages back to the + * buddy and trying to be smart and reusing them eventually in + * alloc_resource() overcomplicates resource handling. + */ + if (res && PageSlab(virt_to_head_page(res))) kfree(res); - } } static struct resource *alloc_resource(gfp_t flags) { - struct resource *res = NULL; - - spin_lock(&bootmem_resource_lock); - if (bootmem_resource_free) { - res = bootmem_resource_free; - bootmem_resource_free = res->sibling; - } - spin_unlock(&bootmem_resource_lock); - - if (res) - memset(res, 0, sizeof(struct resource)); - else - res = kzalloc(sizeof(struct resource), flags); - - return res; + return kzalloc(sizeof(struct resource), flags); } /* Return the conflict entry if you can't request it */ diff --git a/kernel/rseq.c b/kernel/rseq.c index 6d45ac3dae7f..97ac20b4f738 100644 --- a/kernel/rseq.c +++ b/kernel/rseq.c @@ -128,10 +128,10 @@ static int rseq_get_rseq_cs(struct task_struct *t, struct rseq_cs *rseq_cs) int ret; #ifdef CONFIG_64BIT - if (get_user(ptr, &t->rseq->rseq_cs.ptr64)) + if (get_user(ptr, &t->rseq->rseq_cs)) return -EFAULT; #else - if (copy_from_user(&ptr, &t->rseq->rseq_cs.ptr64, sizeof(ptr))) + if (copy_from_user(&ptr, &t->rseq->rseq_cs, sizeof(ptr))) return -EFAULT; #endif if (!ptr) { @@ -217,9 +217,9 @@ static int clear_rseq_cs(struct task_struct *t) * Set rseq_cs to NULL. */ #ifdef CONFIG_64BIT - return put_user(0UL, &t->rseq->rseq_cs.ptr64); + return put_user(0UL, &t->rseq->rseq_cs); #else - if (clear_user(&t->rseq->rseq_cs.ptr64, sizeof(t->rseq->rseq_cs.ptr64))) + if (clear_user(&t->rseq->rseq_cs, sizeof(t->rseq->rseq_cs))) return -EFAULT; return 0; #endif diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile index c83b37af155b..976092b7bd45 100644 --- a/kernel/sched/Makefile +++ b/kernel/sched/Makefile @@ -1,7 +1,4 @@ # SPDX-License-Identifier: GPL-2.0 -ifdef CONFIG_FUNCTION_TRACER -CFLAGS_REMOVE_clock.o = $(CC_FLAGS_FTRACE) -endif # The compilers are complaining about unused variables inside an if(0) scope # block. This is daft, shut them up. @@ -25,18 +22,13 @@ ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y) CFLAGS_core.o := $(PROFILING) -fno-omit-frame-pointer endif -obj-y += core.o loadavg.o clock.o cputime.o -obj-y += idle.o fair.o rt.o deadline.o -obj-y += wait.o wait_bit.o swait.o completion.o - -obj-$(CONFIG_SMP) += cpupri.o cpudeadline.o topology.o stop_task.o pelt.o -obj-$(CONFIG_SCHED_AUTOGROUP) += autogroup.o -obj-$(CONFIG_SCHEDSTATS) += stats.o -obj-$(CONFIG_SCHED_DEBUG) += debug.o -obj-$(CONFIG_CGROUP_CPUACCT) += cpuacct.o -obj-$(CONFIG_CPU_FREQ) += cpufreq.o -obj-$(CONFIG_CPU_FREQ_GOV_SCHEDUTIL) += cpufreq_schedutil.o -obj-$(CONFIG_MEMBARRIER) += membarrier.o -obj-$(CONFIG_CPU_ISOLATION) += isolation.o -obj-$(CONFIG_PSI) += psi.o -obj-$(CONFIG_SCHED_CORE) += core_sched.o +# +# Build efficiency: +# +# These compilation units have roughly the same size and complexity - so their +# build parallelizes well and finishes roughly at once: +# +obj-y += core.o +obj-y += fair.o +obj-y += build_policy.o +obj-y += build_utility.o diff --git a/kernel/sched/autogroup.c b/kernel/sched/autogroup.c index 8629b37d118e..16092b49ff6a 100644 --- a/kernel/sched/autogroup.c +++ b/kernel/sched/autogroup.c @@ -1,14 +1,35 @@ // SPDX-License-Identifier: GPL-2.0 + /* * Auto-group scheduling implementation: */ -#include <linux/nospec.h> -#include "sched.h" unsigned int __read_mostly sysctl_sched_autogroup_enabled = 1; static struct autogroup autogroup_default; static atomic_t autogroup_seq_nr; +#ifdef CONFIG_SYSCTL +static struct ctl_table sched_autogroup_sysctls[] = { + { + .procname = "sched_autogroup_enabled", + .data = &sysctl_sched_autogroup_enabled, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, + {} +}; + +static void __init sched_autogroup_sysctl_init(void) +{ + register_sysctl_init("kernel", sched_autogroup_sysctls); +} +#else +#define sched_autogroup_sysctl_init() do { } while (0) +#endif + void __init autogroup_init(struct task_struct *init_task) { autogroup_default.tg = &root_task_group; @@ -198,6 +219,7 @@ void sched_autogroup_exit(struct signal_struct *sig) static int __init setup_autogroup(char *str) { sysctl_sched_autogroup_enabled = 0; + sched_autogroup_sysctl_init(); return 1; } diff --git a/kernel/sched/autogroup.h b/kernel/sched/autogroup.h index b96419974a1f..90d69f2c5eaf 100644 --- a/kernel/sched/autogroup.h +++ b/kernel/sched/autogroup.h @@ -1,4 +1,7 @@ /* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _KERNEL_SCHED_AUTOGROUP_H +#define _KERNEL_SCHED_AUTOGROUP_H + #ifdef CONFIG_SCHED_AUTOGROUP struct autogroup { @@ -27,6 +30,7 @@ extern bool task_wants_autogroup(struct task_struct *p, struct task_group *tg); static inline struct task_group * autogroup_task_group(struct task_struct *p, struct task_group *tg) { + extern unsigned int sysctl_sched_autogroup_enabled; int enabled = READ_ONCE(sysctl_sched_autogroup_enabled); if (enabled && task_wants_autogroup(p, tg)) @@ -58,3 +62,5 @@ static inline int autogroup_path(struct task_group *tg, char *buf, int buflen) } #endif /* CONFIG_SCHED_AUTOGROUP */ + +#endif /* _KERNEL_SCHED_AUTOGROUP_H */ diff --git a/kernel/sched/build_policy.c b/kernel/sched/build_policy.c new file mode 100644 index 000000000000..e0104b45029a --- /dev/null +++ b/kernel/sched/build_policy.c @@ -0,0 +1,52 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * These are the scheduling policy related scheduler files, built + * in a single compilation unit for build efficiency reasons. + * + * ( Incidentally, the size of the compilation unit is roughly + * comparable to core.c and fair.c, the other two big + * compilation units. This helps balance build time, while + * coalescing source files to amortize header inclusion + * cost. ) + * + * core.c and fair.c are built separately. + */ + +/* Headers: */ +#include <linux/sched/clock.h> +#include <linux/sched/cputime.h> +#include <linux/sched/posix-timers.h> +#include <linux/sched/rt.h> + +#include <linux/cpuidle.h> +#include <linux/jiffies.h> +#include <linux/livepatch.h> +#include <linux/psi.h> +#include <linux/seqlock_api.h> +#include <linux/slab.h> +#include <linux/suspend.h> +#include <linux/tsacct_kern.h> +#include <linux/vtime.h> + +#include <uapi/linux/sched/types.h> + +#include "sched.h" + +#include "autogroup.h" +#include "stats.h" +#include "pelt.h" + +/* Source code modules: */ + +#include "idle.c" + +#include "rt.c" + +#ifdef CONFIG_SMP +# include "cpudeadline.c" +# include "pelt.c" +#endif + +#include "cputime.c" +#include "deadline.c" + diff --git a/kernel/sched/build_utility.c b/kernel/sched/build_utility.c new file mode 100644 index 000000000000..eec0849b2aae --- /dev/null +++ b/kernel/sched/build_utility.c @@ -0,0 +1,109 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * These are various utility functions of the scheduler, + * built in a single compilation unit for build efficiency reasons. + * + * ( Incidentally, the size of the compilation unit is roughly + * comparable to core.c, fair.c, smp.c and policy.c, the other + * big compilation units. This helps balance build time, while + * coalescing source files to amortize header inclusion + * cost. ) + */ +#include <linux/sched/clock.h> +#include <linux/sched/cputime.h> +#include <linux/sched/debug.h> +#include <linux/sched/isolation.h> +#include <linux/sched/loadavg.h> +#include <linux/sched/mm.h> +#include <linux/sched/rseq_api.h> +#include <linux/sched/task_stack.h> + +#include <linux/cpufreq.h> +#include <linux/cpumask_api.h> +#include <linux/cpuset.h> +#include <linux/ctype.h> +#include <linux/debugfs.h> +#include <linux/energy_model.h> +#include <linux/hashtable_api.h> +#include <linux/irq.h> +#include <linux/kobject_api.h> +#include <linux/membarrier.h> +#include <linux/mempolicy.h> +#include <linux/nmi.h> +#include <linux/nospec.h> +#include <linux/proc_fs.h> +#include <linux/psi.h> +#include <linux/psi.h> +#include <linux/ptrace_api.h> +#include <linux/sched_clock.h> +#include <linux/security.h> +#include <linux/spinlock_api.h> +#include <linux/swait_api.h> +#include <linux/timex.h> +#include <linux/utsname.h> +#include <linux/wait_api.h> +#include <linux/workqueue_api.h> + +#include <uapi/linux/prctl.h> +#include <uapi/linux/sched/types.h> + +#include <asm/switch_to.h> + +#include "sched.h" +#include "sched-pelt.h" +#include "stats.h" +#include "autogroup.h" + +#include "clock.c" + +#ifdef CONFIG_CGROUP_CPUACCT +# include "cpuacct.c" +#endif + +#ifdef CONFIG_CPU_FREQ +# include "cpufreq.c" +#endif + +#ifdef CONFIG_CPU_FREQ_GOV_SCHEDUTIL +# include "cpufreq_schedutil.c" +#endif + +#ifdef CONFIG_SCHED_DEBUG +# include "debug.c" +#endif + +#ifdef CONFIG_SCHEDSTATS +# include "stats.c" +#endif + +#include "loadavg.c" +#include "completion.c" +#include "swait.c" +#include "wait_bit.c" +#include "wait.c" + +#ifdef CONFIG_SMP +# include "cpupri.c" +# include "stop_task.c" +# include "topology.c" +#endif + +#ifdef CONFIG_SCHED_CORE +# include "core_sched.c" +#endif + +#ifdef CONFIG_PSI +# include "psi.c" +#endif + +#ifdef CONFIG_MEMBARRIER +# include "membarrier.c" +#endif + +#ifdef CONFIG_CPU_ISOLATION +# include "isolation.c" +#endif + +#ifdef CONFIG_SCHED_AUTOGROUP +# include "autogroup.c" +#endif diff --git a/kernel/sched/clock.c b/kernel/sched/clock.c index c2b2859ddd82..d9272d9061a3 100644 --- a/kernel/sched/clock.c +++ b/kernel/sched/clock.c @@ -53,15 +53,13 @@ * that is otherwise invisible (TSC gets stopped). * */ -#include "sched.h" -#include <linux/sched_clock.h> /* * Scheduler clock - returns current time in nanosec units. * This is default implementation. * Architectures and sub-architectures can override this. */ -unsigned long long __weak sched_clock(void) +notrace unsigned long long __weak sched_clock(void) { return (unsigned long long)(jiffies - INITIAL_JIFFIES) * (NSEC_PER_SEC / HZ); @@ -95,28 +93,28 @@ struct sched_clock_data { static DEFINE_PER_CPU_SHARED_ALIGNED(struct sched_clock_data, sched_clock_data); -static inline struct sched_clock_data *this_scd(void) +notrace static inline struct sched_clock_data *this_scd(void) { return this_cpu_ptr(&sched_clock_data); } -static inline struct sched_clock_data *cpu_sdc(int cpu) +notrace static inline struct sched_clock_data *cpu_sdc(int cpu) { return &per_cpu(sched_clock_data, cpu); } -int sched_clock_stable(void) +notrace int sched_clock_stable(void) { return static_branch_likely(&__sched_clock_stable); } -static void __scd_stamp(struct sched_clock_data *scd) +notrace static void __scd_stamp(struct sched_clock_data *scd) { scd->tick_gtod = ktime_get_ns(); scd->tick_raw = sched_clock(); } -static void __set_sched_clock_stable(void) +notrace static void __set_sched_clock_stable(void) { struct sched_clock_data *scd; @@ -151,7 +149,7 @@ static void __set_sched_clock_stable(void) * The only way to fully avoid random clock jumps is to boot with: * "tsc=unstable". */ -static void __sched_clock_work(struct work_struct *work) +notrace static void __sched_clock_work(struct work_struct *work) { struct sched_clock_data *scd; int cpu; @@ -177,7 +175,7 @@ static void __sched_clock_work(struct work_struct *work) static DECLARE_WORK(sched_clock_work, __sched_clock_work); -static void __clear_sched_clock_stable(void) +notrace static void __clear_sched_clock_stable(void) { if (!sched_clock_stable()) return; @@ -186,7 +184,7 @@ static void __clear_sched_clock_stable(void) schedule_work(&sched_clock_work); } -void clear_sched_clock_stable(void) +notrace void clear_sched_clock_stable(void) { __sched_clock_stable_early = 0; @@ -196,7 +194,7 @@ void clear_sched_clock_stable(void) __clear_sched_clock_stable(); } -static void __sched_clock_gtod_offset(void) +notrace static void __sched_clock_gtod_offset(void) { struct sched_clock_data *scd = this_scd(); @@ -246,12 +244,12 @@ late_initcall(sched_clock_init_late); * min, max except they take wrapping into account */ -static inline u64 wrap_min(u64 x, u64 y) +notrace static inline u64 wrap_min(u64 x, u64 y) { return (s64)(x - y) < 0 ? x : y; } -static inline u64 wrap_max(u64 x, u64 y) +notrace static inline u64 wrap_max(u64 x, u64 y) { return (s64)(x - y) > 0 ? x : y; } @@ -262,7 +260,7 @@ static inline u64 wrap_max(u64 x, u64 y) * - filter out backward motion * - use the GTOD tick value to create a window to filter crazy TSC values */ -static u64 sched_clock_local(struct sched_clock_data *scd) +notrace static u64 sched_clock_local(struct sched_clock_data *scd) { u64 now, clock, old_clock, min_clock, max_clock, gtod; s64 delta; @@ -295,7 +293,7 @@ again: return clock; } -static u64 sched_clock_remote(struct sched_clock_data *scd) +notrace static u64 sched_clock_remote(struct sched_clock_data *scd) { struct sched_clock_data *my_scd = this_scd(); u64 this_clock, remote_clock; @@ -362,7 +360,7 @@ again: * * See cpu_clock(). */ -u64 sched_clock_cpu(int cpu) +notrace u64 sched_clock_cpu(int cpu) { struct sched_clock_data *scd; u64 clock; @@ -386,7 +384,7 @@ u64 sched_clock_cpu(int cpu) } EXPORT_SYMBOL_GPL(sched_clock_cpu); -void sched_clock_tick(void) +notrace void sched_clock_tick(void) { struct sched_clock_data *scd; @@ -403,7 +401,7 @@ void sched_clock_tick(void) sched_clock_local(scd); } -void sched_clock_tick_stable(void) +notrace void sched_clock_tick_stable(void) { if (!sched_clock_stable()) return; @@ -423,7 +421,7 @@ void sched_clock_tick_stable(void) /* * We are going deep-idle (irqs are disabled): */ -void sched_clock_idle_sleep_event(void) +notrace void sched_clock_idle_sleep_event(void) { sched_clock_cpu(smp_processor_id()); } @@ -432,7 +430,7 @@ EXPORT_SYMBOL_GPL(sched_clock_idle_sleep_event); /* * We just idled; resync with ktime. */ -void sched_clock_idle_wakeup_event(void) +notrace void sched_clock_idle_wakeup_event(void) { unsigned long flags; @@ -458,7 +456,7 @@ void __init sched_clock_init(void) local_irq_enable(); } -u64 sched_clock_cpu(int cpu) +notrace u64 sched_clock_cpu(int cpu) { if (!static_branch_likely(&sched_clock_running)) return 0; @@ -476,7 +474,7 @@ u64 sched_clock_cpu(int cpu) * On bare metal this function should return the same as local_clock. * Architectures and sub-architectures can override this. */ -u64 __weak running_clock(void) +notrace u64 __weak running_clock(void) { return local_clock(); } diff --git a/kernel/sched/completion.c b/kernel/sched/completion.c index a778554f9dad..35f15c26ed54 100644 --- a/kernel/sched/completion.c +++ b/kernel/sched/completion.c @@ -1,4 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 + /* * Generic wait-for-completion handler; * @@ -11,7 +12,6 @@ * typically be used for exclusion which gives rise to priority inversion. * Waiting for completion is a typically sync point, but not an exclusion point. */ -#include "sched.h" /** * complete: - signals a single thread waiting on this completion diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 83872f95a1ea..d575b4914925 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -6,26 +6,90 @@ * * Copyright (C) 1991-2002 Linus Torvalds */ -#define CREATE_TRACE_POINTS -#include <trace/events/sched.h> -#undef CREATE_TRACE_POINTS +#include <linux/highmem.h> +#include <linux/hrtimer_api.h> +#include <linux/ktime_api.h> +#include <linux/sched/signal.h> +#include <linux/syscalls_api.h> +#include <linux/debug_locks.h> +#include <linux/prefetch.h> +#include <linux/capability.h> +#include <linux/pgtable_api.h> +#include <linux/wait_bit.h> +#include <linux/jiffies.h> +#include <linux/spinlock_api.h> +#include <linux/cpumask_api.h> +#include <linux/lockdep_api.h> +#include <linux/hardirq.h> +#include <linux/softirq.h> +#include <linux/refcount_api.h> +#include <linux/topology.h> +#include <linux/sched/clock.h> +#include <linux/sched/cond_resched.h> +#include <linux/sched/debug.h> +#include <linux/sched/isolation.h> +#include <linux/sched/loadavg.h> +#include <linux/sched/mm.h> +#include <linux/sched/nohz.h> +#include <linux/sched/rseq_api.h> +#include <linux/sched/rt.h> -#include "sched.h" - -#include <linux/nospec.h> #include <linux/blkdev.h> +#include <linux/context_tracking.h> +#include <linux/cpuset.h> +#include <linux/delayacct.h> +#include <linux/init_task.h> +#include <linux/interrupt.h> +#include <linux/ioprio.h> +#include <linux/kallsyms.h> #include <linux/kcov.h> +#include <linux/kprobes.h> +#include <linux/llist_api.h> +#include <linux/mmu_context.h> +#include <linux/mmzone.h> +#include <linux/mutex_api.h> +#include <linux/nmi.h> +#include <linux/nospec.h> +#include <linux/perf_event_api.h> +#include <linux/profile.h> +#include <linux/psi.h> +#include <linux/rcuwait_api.h> +#include <linux/sched/wake_q.h> #include <linux/scs.h> +#include <linux/slab.h> +#include <linux/syscalls.h> +#include <linux/vtime.h> +#include <linux/wait_api.h> +#include <linux/workqueue_api.h> + +#ifdef CONFIG_PREEMPT_DYNAMIC +# ifdef CONFIG_GENERIC_ENTRY +# include <linux/entry-common.h> +# endif +#endif + +#include <uapi/linux/sched/types.h> #include <asm/switch_to.h> #include <asm/tlb.h> -#include "../workqueue_internal.h" -#include "../../fs/io-wq.h" -#include "../smpboot.h" +#define CREATE_TRACE_POINTS +#include <linux/sched/rseq_api.h> +#include <trace/events/sched.h> +#undef CREATE_TRACE_POINTS + +#include "sched.h" +#include "stats.h" +#include "autogroup.h" +#include "autogroup.h" #include "pelt.h" #include "smp.h" +#include "stats.h" + +#include "../workqueue_internal.h" +#include "../../fs/io-wq.h" +#include "../smpboot.h" /* * Export tracepoints that act as a bare tracehook (ie: have no trace event @@ -36,6 +100,7 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_rt_tp); EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_dl_tp); EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_irq_tp); EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_se_tp); +EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_thermal_tp); EXPORT_TRACEPOINT_SYMBOL_GPL(sched_cpu_capacity_tp); EXPORT_TRACEPOINT_SYMBOL_GPL(sched_overutilized_tp); EXPORT_TRACEPOINT_SYMBOL_GPL(sched_util_est_cfs_tp); @@ -1024,13 +1089,13 @@ int get_nohz_timer_target(void) struct sched_domain *sd; const struct cpumask *hk_mask; - if (housekeeping_cpu(cpu, HK_FLAG_TIMER)) { + if (housekeeping_cpu(cpu, HK_TYPE_TIMER)) { if (!idle_cpu(cpu)) return cpu; default_cpu = cpu; } - hk_mask = housekeeping_cpumask(HK_FLAG_TIMER); + hk_mask = housekeeping_cpumask(HK_TYPE_TIMER); rcu_read_lock(); for_each_domain(cpu, sd) { @@ -1046,7 +1111,7 @@ int get_nohz_timer_target(void) } if (default_cpu == -1) - default_cpu = housekeeping_any_cpu(HK_FLAG_TIMER); + default_cpu = housekeeping_any_cpu(HK_TYPE_TIMER); cpu = default_cpu; unlock: rcu_read_unlock(); @@ -4279,7 +4344,9 @@ DEFINE_STATIC_KEY_FALSE(sched_numa_balancing); #ifdef CONFIG_NUMA_BALANCING -void set_numabalancing_state(bool enabled) +int sysctl_numa_balancing_mode; + +static void __set_numabalancing_state(bool enabled) { if (enabled) static_branch_enable(&sched_numa_balancing); @@ -4287,13 +4354,22 @@ void set_numabalancing_state(bool enabled) static_branch_disable(&sched_numa_balancing); } +void set_numabalancing_state(bool enabled) +{ + if (enabled) + sysctl_numa_balancing_mode = NUMA_BALANCING_NORMAL; + else + sysctl_numa_balancing_mode = NUMA_BALANCING_DISABLED; + __set_numabalancing_state(enabled); +} + #ifdef CONFIG_PROC_SYSCTL int sysctl_numa_balancing(struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { struct ctl_table t; int err; - int state = static_branch_likely(&sched_numa_balancing); + int state = sysctl_numa_balancing_mode; if (write && !capable(CAP_SYS_ADMIN)) return -EPERM; @@ -4303,8 +4379,10 @@ int sysctl_numa_balancing(struct ctl_table *table, int write, err = proc_dointvec_minmax(&t, write, buffer, lenp, ppos); if (err < 0) return err; - if (write) - set_numabalancing_state(state); + if (write) { + sysctl_numa_balancing_mode = state; + __set_numabalancing_state(state); + } return err; } #endif @@ -4424,6 +4502,7 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p) init_entity_runnable_average(&p->se); + #ifdef CONFIG_SCHED_INFO if (likely(sched_info_on())) memset(&p->sched_info, 0, sizeof(p->sched_info)); @@ -4439,18 +4518,23 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p) return 0; } -void sched_post_fork(struct task_struct *p, struct kernel_clone_args *kargs) +void sched_cgroup_fork(struct task_struct *p, struct kernel_clone_args *kargs) { unsigned long flags; -#ifdef CONFIG_CGROUP_SCHED - struct task_group *tg; -#endif + /* + * Because we're not yet on the pid-hash, p->pi_lock isn't strictly + * required yet, but lockdep gets upset if rules are violated. + */ raw_spin_lock_irqsave(&p->pi_lock, flags); #ifdef CONFIG_CGROUP_SCHED - tg = container_of(kargs->cset->subsys[cpu_cgrp_id], - struct task_group, css); - p->sched_task_group = autogroup_task_group(p, tg); + if (1) { + struct task_group *tg; + tg = container_of(kargs->cset->subsys[cpu_cgrp_id], + struct task_group, css); + tg = autogroup_task_group(p, tg); + p->sched_task_group = tg; + } #endif rseq_migrate(p); /* @@ -4461,7 +4545,10 @@ void sched_post_fork(struct task_struct *p, struct kernel_clone_args *kargs) if (p->sched_class->task_fork) p->sched_class->task_fork(p); raw_spin_unlock_irqrestore(&p->pi_lock, flags); +} +void sched_post_fork(struct task_struct *p) +{ uclamp_post_fork(p); } @@ -4825,7 +4912,7 @@ static struct rq *finish_task_switch(struct task_struct *prev) { struct rq *rq = this_rq(); struct mm_struct *mm = rq->prev_mm; - long prev_state; + unsigned int prev_state; /* * The previous task will have left us with a preempt_count of 2 @@ -5370,7 +5457,7 @@ static void sched_tick_start(int cpu) int os; struct tick_work *twork; - if (housekeeping_cpu(cpu, HK_FLAG_TICK)) + if (housekeeping_cpu(cpu, HK_TYPE_TICK)) return; WARN_ON_ONCE(!tick_work_cpu); @@ -5391,7 +5478,7 @@ static void sched_tick_stop(int cpu) struct tick_work *twork; int os; - if (housekeeping_cpu(cpu, HK_FLAG_TICK)) + if (housekeeping_cpu(cpu, HK_TYPE_TICK)) return; WARN_ON_ONCE(!tick_work_cpu); @@ -5822,8 +5909,7 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) } if (schedstat_enabled() && rq->core->core_forceidle_count) { - if (cookie) - rq->core->core_forceidle_start = rq_clock(rq->core); + rq->core->core_forceidle_start = rq_clock(rq->core); rq->core->core_forceidle_occupation = occ; } @@ -6290,7 +6376,7 @@ static void __sched notrace __schedule(unsigned int sched_mode) migrate_disable_switch(rq, prev); psi_sched_switch(prev, next, !task_on_rq_queued(prev)); - trace_sched_switch(sched_mode & SM_MASK_PREEMPT, prev, next); + trace_sched_switch(sched_mode & SM_MASK_PREEMPT, prev_state, prev, next); /* Also unlocks the rq: */ rq = context_switch(rq, prev, next, &rf); @@ -6345,8 +6431,7 @@ static inline void sched_submit_work(struct task_struct *tsk) * If we are going to sleep and we have plugged IO queued, * make sure to submit it to avoid deadlocks. */ - if (blk_needs_flush_plug(tsk)) - blk_flush_plug(tsk->plug, true); + blk_flush_plug(tsk->plug, true); } static void sched_update_worker(struct task_struct *tsk) @@ -6483,17 +6568,31 @@ asmlinkage __visible void __sched notrace preempt_schedule(void) */ if (likely(!preemptible())) return; - preempt_schedule_common(); } NOKPROBE_SYMBOL(preempt_schedule); EXPORT_SYMBOL(preempt_schedule); #ifdef CONFIG_PREEMPT_DYNAMIC -DEFINE_STATIC_CALL(preempt_schedule, __preempt_schedule_func); +#if defined(CONFIG_HAVE_PREEMPT_DYNAMIC_CALL) +#ifndef preempt_schedule_dynamic_enabled +#define preempt_schedule_dynamic_enabled preempt_schedule +#define preempt_schedule_dynamic_disabled NULL +#endif +DEFINE_STATIC_CALL(preempt_schedule, preempt_schedule_dynamic_enabled); EXPORT_STATIC_CALL_TRAMP(preempt_schedule); +#elif defined(CONFIG_HAVE_PREEMPT_DYNAMIC_KEY) +static DEFINE_STATIC_KEY_TRUE(sk_dynamic_preempt_schedule); +void __sched notrace dynamic_preempt_schedule(void) +{ + if (!static_branch_unlikely(&sk_dynamic_preempt_schedule)) + return; + preempt_schedule(); +} +NOKPROBE_SYMBOL(dynamic_preempt_schedule); +EXPORT_SYMBOL(dynamic_preempt_schedule); +#endif #endif - /** * preempt_schedule_notrace - preempt_schedule called by tracing @@ -6548,147 +6647,27 @@ asmlinkage __visible void __sched notrace preempt_schedule_notrace(void) EXPORT_SYMBOL_GPL(preempt_schedule_notrace); #ifdef CONFIG_PREEMPT_DYNAMIC -DEFINE_STATIC_CALL(preempt_schedule_notrace, __preempt_schedule_notrace_func); -EXPORT_STATIC_CALL_TRAMP(preempt_schedule_notrace); +#if defined(CONFIG_HAVE_PREEMPT_DYNAMIC_CALL) +#ifndef preempt_schedule_notrace_dynamic_enabled +#define preempt_schedule_notrace_dynamic_enabled preempt_schedule_notrace +#define preempt_schedule_notrace_dynamic_disabled NULL #endif - -#endif /* CONFIG_PREEMPTION */ - -#ifdef CONFIG_PREEMPT_DYNAMIC - -#include <linux/entry-common.h> - -/* - * SC:cond_resched - * SC:might_resched - * SC:preempt_schedule - * SC:preempt_schedule_notrace - * SC:irqentry_exit_cond_resched - * - * - * NONE: - * cond_resched <- __cond_resched - * might_resched <- RET0 - * preempt_schedule <- NOP - * preempt_schedule_notrace <- NOP - * irqentry_exit_cond_resched <- NOP - * - * VOLUNTARY: - * cond_resched <- __cond_resched - * might_resched <- __cond_resched - * preempt_schedule <- NOP - * preempt_schedule_notrace <- NOP - * irqentry_exit_cond_resched <- NOP - * - * FULL: - * cond_resched <- RET0 - * might_resched <- RET0 - * preempt_schedule <- preempt_schedule - * preempt_schedule_notrace <- preempt_schedule_notrace - * irqentry_exit_cond_resched <- irqentry_exit_cond_resched - */ - -enum { - preempt_dynamic_undefined = -1, - preempt_dynamic_none, - preempt_dynamic_voluntary, - preempt_dynamic_full, -}; - -int preempt_dynamic_mode = preempt_dynamic_undefined; - -int sched_dynamic_mode(const char *str) -{ - if (!strcmp(str, "none")) - return preempt_dynamic_none; - - if (!strcmp(str, "voluntary")) - return preempt_dynamic_voluntary; - - if (!strcmp(str, "full")) - return preempt_dynamic_full; - - return -EINVAL; -} - -void sched_dynamic_update(int mode) -{ - /* - * Avoid {NONE,VOLUNTARY} -> FULL transitions from ever ending up in - * the ZERO state, which is invalid. - */ - static_call_update(cond_resched, __cond_resched); - static_call_update(might_resched, __cond_resched); - static_call_update(preempt_schedule, __preempt_schedule_func); - static_call_update(preempt_schedule_notrace, __preempt_schedule_notrace_func); - static_call_update(irqentry_exit_cond_resched, irqentry_exit_cond_resched); - - switch (mode) { - case preempt_dynamic_none: - static_call_update(cond_resched, __cond_resched); - static_call_update(might_resched, (void *)&__static_call_return0); - static_call_update(preempt_schedule, NULL); - static_call_update(preempt_schedule_notrace, NULL); - static_call_update(irqentry_exit_cond_resched, NULL); - pr_info("Dynamic Preempt: none\n"); - break; - - case preempt_dynamic_voluntary: - static_call_update(cond_resched, __cond_resched); - static_call_update(might_resched, __cond_resched); - static_call_update(preempt_schedule, NULL); - static_call_update(preempt_schedule_notrace, NULL); - static_call_update(irqentry_exit_cond_resched, NULL); - pr_info("Dynamic Preempt: voluntary\n"); - break; - - case preempt_dynamic_full: - static_call_update(cond_resched, (void *)&__static_call_return0); - static_call_update(might_resched, (void *)&__static_call_return0); - static_call_update(preempt_schedule, __preempt_schedule_func); - static_call_update(preempt_schedule_notrace, __preempt_schedule_notrace_func); - static_call_update(irqentry_exit_cond_resched, irqentry_exit_cond_resched); - pr_info("Dynamic Preempt: full\n"); - break; - } - - preempt_dynamic_mode = mode; -} - -static int __init setup_preempt_mode(char *str) -{ - int mode = sched_dynamic_mode(str); - if (mode < 0) { - pr_warn("Dynamic Preempt: unsupported mode: %s\n", str); - return 0; - } - - sched_dynamic_update(mode); - return 1; -} -__setup("preempt=", setup_preempt_mode); - -static void __init preempt_dynamic_init(void) +DEFINE_STATIC_CALL(preempt_schedule_notrace, preempt_schedule_notrace_dynamic_enabled); +EXPORT_STATIC_CALL_TRAMP(preempt_schedule_notrace); +#elif defined(CONFIG_HAVE_PREEMPT_DYNAMIC_KEY) +static DEFINE_STATIC_KEY_TRUE(sk_dynamic_preempt_schedule_notrace); +void __sched notrace dynamic_preempt_schedule_notrace(void) { - if (preempt_dynamic_mode == preempt_dynamic_undefined) { - if (IS_ENABLED(CONFIG_PREEMPT_NONE)) { - sched_dynamic_update(preempt_dynamic_none); - } else if (IS_ENABLED(CONFIG_PREEMPT_VOLUNTARY)) { - sched_dynamic_update(preempt_dynamic_voluntary); - } else { - /* Default static call setting, nothing to do */ - WARN_ON_ONCE(!IS_ENABLED(CONFIG_PREEMPT)); - preempt_dynamic_mode = preempt_dynamic_full; - pr_info("Dynamic Preempt: full\n"); - } - } + if (!static_branch_unlikely(&sk_dynamic_preempt_schedule_notrace)) + return; + preempt_schedule_notrace(); } +NOKPROBE_SYMBOL(dynamic_preempt_schedule_notrace); +EXPORT_SYMBOL(dynamic_preempt_schedule_notrace); +#endif +#endif -#else /* !CONFIG_PREEMPT_DYNAMIC */ - -static inline void preempt_dynamic_init(void) { } - -#endif /* #ifdef CONFIG_PREEMPT_DYNAMIC */ +#endif /* CONFIG_PREEMPTION */ /* * This is the entry point to schedule() from kernel preemption @@ -8195,11 +8174,35 @@ EXPORT_SYMBOL(__cond_resched); #endif #ifdef CONFIG_PREEMPT_DYNAMIC +#if defined(CONFIG_HAVE_PREEMPT_DYNAMIC_CALL) +#define cond_resched_dynamic_enabled __cond_resched +#define cond_resched_dynamic_disabled ((void *)&__static_call_return0) DEFINE_STATIC_CALL_RET0(cond_resched, __cond_resched); EXPORT_STATIC_CALL_TRAMP(cond_resched); +#define might_resched_dynamic_enabled __cond_resched +#define might_resched_dynamic_disabled ((void *)&__static_call_return0) DEFINE_STATIC_CALL_RET0(might_resched, __cond_resched); EXPORT_STATIC_CALL_TRAMP(might_resched); +#elif defined(CONFIG_HAVE_PREEMPT_DYNAMIC_KEY) +static DEFINE_STATIC_KEY_FALSE(sk_dynamic_cond_resched); +int __sched dynamic_cond_resched(void) +{ + if (!static_branch_unlikely(&sk_dynamic_cond_resched)) + return 0; + return __cond_resched(); +} +EXPORT_SYMBOL(dynamic_cond_resched); + +static DEFINE_STATIC_KEY_FALSE(sk_dynamic_might_resched); +int __sched dynamic_might_resched(void) +{ + if (!static_branch_unlikely(&sk_dynamic_might_resched)) + return 0; + return __cond_resched(); +} +EXPORT_SYMBOL(dynamic_might_resched); +#endif #endif /* @@ -8219,9 +8222,7 @@ int __cond_resched_lock(spinlock_t *lock) if (spin_needbreak(lock) || resched) { spin_unlock(lock); - if (resched) - preempt_schedule_common(); - else + if (!_cond_resched()) cpu_relax(); ret = 1; spin_lock(lock); @@ -8239,9 +8240,7 @@ int __cond_resched_rwlock_read(rwlock_t *lock) if (rwlock_needbreak(lock) || resched) { read_unlock(lock); - if (resched) - preempt_schedule_common(); - else + if (!_cond_resched()) cpu_relax(); ret = 1; read_lock(lock); @@ -8259,9 +8258,7 @@ int __cond_resched_rwlock_write(rwlock_t *lock) if (rwlock_needbreak(lock) || resched) { write_unlock(lock); - if (resched) - preempt_schedule_common(); - else + if (!_cond_resched()) cpu_relax(); ret = 1; write_lock(lock); @@ -8270,6 +8267,154 @@ int __cond_resched_rwlock_write(rwlock_t *lock) } EXPORT_SYMBOL(__cond_resched_rwlock_write); +#ifdef CONFIG_PREEMPT_DYNAMIC + +#ifdef CONFIG_GENERIC_ENTRY +#include <linux/entry-common.h> +#endif + +/* + * SC:cond_resched + * SC:might_resched + * SC:preempt_schedule + * SC:preempt_schedule_notrace + * SC:irqentry_exit_cond_resched + * + * + * NONE: + * cond_resched <- __cond_resched + * might_resched <- RET0 + * preempt_schedule <- NOP + * preempt_schedule_notrace <- NOP + * irqentry_exit_cond_resched <- NOP + * + * VOLUNTARY: + * cond_resched <- __cond_resched + * might_resched <- __cond_resched + * preempt_schedule <- NOP + * preempt_schedule_notrace <- NOP + * irqentry_exit_cond_resched <- NOP + * + * FULL: + * cond_resched <- RET0 + * might_resched <- RET0 + * preempt_schedule <- preempt_schedule + * preempt_schedule_notrace <- preempt_schedule_notrace + * irqentry_exit_cond_resched <- irqentry_exit_cond_resched + */ + +enum { + preempt_dynamic_undefined = -1, + preempt_dynamic_none, + preempt_dynamic_voluntary, + preempt_dynamic_full, +}; + +int preempt_dynamic_mode = preempt_dynamic_undefined; + +int sched_dynamic_mode(const char *str) +{ + if (!strcmp(str, "none")) + return preempt_dynamic_none; + + if (!strcmp(str, "voluntary")) + return preempt_dynamic_voluntary; + + if (!strcmp(str, "full")) + return preempt_dynamic_full; + + return -EINVAL; +} + +#if defined(CONFIG_HAVE_PREEMPT_DYNAMIC_CALL) +#define preempt_dynamic_enable(f) static_call_update(f, f##_dynamic_enabled) +#define preempt_dynamic_disable(f) static_call_update(f, f##_dynamic_disabled) +#elif defined(CONFIG_HAVE_PREEMPT_DYNAMIC_KEY) +#define preempt_dynamic_enable(f) static_key_enable(&sk_dynamic_##f.key) +#define preempt_dynamic_disable(f) static_key_disable(&sk_dynamic_##f.key) +#else +#error "Unsupported PREEMPT_DYNAMIC mechanism" +#endif + +void sched_dynamic_update(int mode) +{ + /* + * Avoid {NONE,VOLUNTARY} -> FULL transitions from ever ending up in + * the ZERO state, which is invalid. + */ + preempt_dynamic_enable(cond_resched); + preempt_dynamic_enable(might_resched); + preempt_dynamic_enable(preempt_schedule); + preempt_dynamic_enable(preempt_schedule_notrace); + preempt_dynamic_enable(irqentry_exit_cond_resched); + + switch (mode) { + case preempt_dynamic_none: + preempt_dynamic_enable(cond_resched); + preempt_dynamic_disable(might_resched); + preempt_dynamic_disable(preempt_schedule); + preempt_dynamic_disable(preempt_schedule_notrace); + preempt_dynamic_disable(irqentry_exit_cond_resched); + pr_info("Dynamic Preempt: none\n"); + break; + + case preempt_dynamic_voluntary: + preempt_dynamic_enable(cond_resched); + preempt_dynamic_enable(might_resched); + preempt_dynamic_disable(preempt_schedule); + preempt_dynamic_disable(preempt_schedule_notrace); + preempt_dynamic_disable(irqentry_exit_cond_resched); + pr_info("Dynamic Preempt: voluntary\n"); + break; + + case preempt_dynamic_full: + preempt_dynamic_disable(cond_resched); + preempt_dynamic_disable(might_resched); + preempt_dynamic_enable(preempt_schedule); + preempt_dynamic_enable(preempt_schedule_notrace); + preempt_dynamic_enable(irqentry_exit_cond_resched); + pr_info("Dynamic Preempt: full\n"); + break; + } + + preempt_dynamic_mode = mode; +} + +static int __init setup_preempt_mode(char *str) +{ + int mode = sched_dynamic_mode(str); + if (mode < 0) { + pr_warn("Dynamic Preempt: unsupported mode: %s\n", str); + return 0; + } + + sched_dynamic_update(mode); + return 1; +} +__setup("preempt=", setup_preempt_mode); + +static void __init preempt_dynamic_init(void) +{ + if (preempt_dynamic_mode == preempt_dynamic_undefined) { + if (IS_ENABLED(CONFIG_PREEMPT_NONE)) { + sched_dynamic_update(preempt_dynamic_none); + } else if (IS_ENABLED(CONFIG_PREEMPT_VOLUNTARY)) { + sched_dynamic_update(preempt_dynamic_voluntary); + } else { + /* Default static call setting, nothing to do */ + WARN_ON_ONCE(!IS_ENABLED(CONFIG_PREEMPT)); + preempt_dynamic_mode = preempt_dynamic_full; + pr_info("Dynamic Preempt: full\n"); + } + } +} + +#else /* !CONFIG_PREEMPT_DYNAMIC */ + +static inline void preempt_dynamic_init(void) { } + +#endif /* #ifdef CONFIG_PREEMPT_DYNAMIC */ + /** * yield - yield the current processor to other threads. * @@ -8378,9 +8523,7 @@ int io_schedule_prepare(void) int old_iowait = current->in_iowait; current->in_iowait = 1; - if (current->plug) - blk_flush_plug(current->plug, true); - + blk_flush_plug(current->plug, true); return old_iowait; } @@ -8642,14 +8785,6 @@ void __init init_idle(struct task_struct *idle, int cpu) __sched_fork(0, idle); - /* - * The idle task doesn't need the kthread struct to function, but it - * is dressed up as a per-CPU kthread and thus needs to play the part - * if we want to avoid special-casing it in code that deals with per-CPU - * kthreads. - */ - set_kthread_struct(idle); - raw_spin_lock_irqsave(&idle->pi_lock, flags); raw_spin_rq_lock(rq); @@ -8715,7 +8850,7 @@ int cpuset_cpumask_can_shrink(const struct cpumask *cur, { int ret = 1; - if (!cpumask_weight(cur)) + if (cpumask_empty(cur)) return ret; ret = dl_cpuset_cpumask_can_shrink(cur, trial); @@ -8743,8 +8878,11 @@ int task_can_attach(struct task_struct *p, } if (dl_task(p) && !cpumask_intersects(task_rq(p)->rd->span, - cs_cpus_allowed)) - ret = dl_task_can_attach(p, cs_cpus_allowed); + cs_cpus_allowed)) { + int cpu = cpumask_any_and(cpu_active_mask, cs_cpus_allowed); + + ret = dl_cpu_busy(cpu, p); + } out: return ret; @@ -9028,8 +9166,10 @@ static void cpuset_cpu_active(void) static int cpuset_cpu_inactive(unsigned int cpu) { if (!cpuhp_tasks_frozen) { - if (dl_cpu_busy(cpu)) - return -EBUSY; + int ret = dl_cpu_busy(cpu, NULL); + + if (ret) + return ret; cpuset_update_active_cpus(); } else { num_cpus_frozen++; @@ -9059,6 +9199,7 @@ int sched_cpu_activate(unsigned int cpu) set_cpu_active(cpu, true); if (sched_smp_initialized) { + sched_update_numa(cpu, true); sched_domains_numa_masks_set(cpu); cpuset_cpu_active(); } @@ -9137,10 +9278,12 @@ int sched_cpu_deactivate(unsigned int cpu) if (!sched_smp_initialized) return 0; + sched_update_numa(cpu, false); ret = cpuset_cpu_inactive(cpu); if (ret) { balance_push_set(cpu, false); set_cpu_active(cpu, true); + sched_update_numa(cpu, true); return ret; } sched_domains_numa_masks_clear(cpu); @@ -9243,7 +9386,7 @@ int sched_cpu_dying(unsigned int cpu) void __init sched_init_smp(void) { - sched_init_numa(); + sched_init_numa(NUMA_NO_NODE); /* * There's no userspace yet to cause hotplug operations; hence all the @@ -9255,7 +9398,7 @@ void __init sched_init_smp(void) mutex_unlock(&sched_domains_mutex); /* Move init over to a non-isolated CPU */ - if (set_cpus_allowed_ptr(current, housekeeping_cpumask(HK_FLAG_DOMAIN)) < 0) + if (set_cpus_allowed_ptr(current, housekeeping_cpumask(HK_TYPE_DOMAIN)) < 0) BUG(); current->flags &= ~PF_NO_SETAFFINITY; sched_init_granularity(); @@ -9355,7 +9498,6 @@ void __init sched_init(void) #endif /* CONFIG_CPUMASK_OFFSTACK */ init_rt_bandwidth(&def_rt_bandwidth, global_rt_period(), global_rt_runtime()); - init_dl_bandwidth(&def_dl_bandwidth, global_rt_period(), global_rt_runtime()); #ifdef CONFIG_SMP init_defrootdomain(); @@ -9469,6 +9611,14 @@ void __init sched_init(void) enter_lazy_tlb(&init_mm, current); /* + * The idle task doesn't need the kthread struct to function, but it + * is dressed up as a per-CPU kthread and thus needs to play the part + * if we want to avoid special-casing it in code that deals with per-CPU + * kthreads. + */ + WARN_ON(!set_kthread_struct(current)); + + /* * Make us the idle thread. Technically, schedule() should not be * called from this thread, however somewhere below it might be, * but because we are the idle thread, we just pick up running again diff --git a/kernel/sched/core_sched.c b/kernel/sched/core_sched.c index 1fb45672ec85..38a2cec21014 100644 --- a/kernel/sched/core_sched.c +++ b/kernel/sched/core_sched.c @@ -1,8 +1,5 @@ // SPDX-License-Identifier: GPL-2.0-only -#include <linux/prctl.h> -#include "sched.h" - /* * A simple wrapper around refcount. An allocated sched_core_cookie's * address is used to compute the cookie of the task. @@ -277,7 +274,7 @@ void __sched_core_account_forceidle(struct rq *rq) rq_i = cpu_rq(i); p = rq_i->core_pick ?: rq_i->curr; - if (!p->core_cookie) + if (p == rq_i->idle) continue; __schedstat_add(p->stats.core_forceidle_sum, delta); diff --git a/kernel/sched/cpuacct.c b/kernel/sched/cpuacct.c index 3d06c5e4220d..0de9dda09949 100644 --- a/kernel/sched/cpuacct.c +++ b/kernel/sched/cpuacct.c @@ -1,12 +1,11 @@ // SPDX-License-Identifier: GPL-2.0 + /* * CPU accounting code for task groups. * * Based on the work by Paul Menage (menage@google.com) and Balbir Singh * (balbir@in.ibm.com). */ -#include <asm/irq_regs.h> -#include "sched.h" /* Time spent by the tasks of the CPU accounting group executing in ... */ enum cpuacct_stat_index { @@ -334,14 +333,13 @@ static struct cftype files[] = { */ void cpuacct_charge(struct task_struct *tsk, u64 cputime) { + unsigned int cpu = task_cpu(tsk); struct cpuacct *ca; - rcu_read_lock(); + lockdep_assert_rq_held(cpu_rq(cpu)); for (ca = task_ca(tsk); ca; ca = parent_ca(ca)) - __this_cpu_add(*ca->cpuusage, cputime); - - rcu_read_unlock(); + *per_cpu_ptr(ca->cpuusage, cpu) += cputime; } /* @@ -353,10 +351,8 @@ void cpuacct_account_field(struct task_struct *tsk, int index, u64 val) { struct cpuacct *ca; - rcu_read_lock(); for (ca = task_ca(tsk); ca != &root_cpuacct; ca = parent_ca(ca)) __this_cpu_add(ca->cpustat->cpustat[index], val); - rcu_read_unlock(); } struct cgroup_subsys cpuacct_cgrp_subsys = { diff --git a/kernel/sched/cpudeadline.c b/kernel/sched/cpudeadline.c index ceb03d76c0cc..02d970a879ed 100644 --- a/kernel/sched/cpudeadline.c +++ b/kernel/sched/cpudeadline.c @@ -1,12 +1,11 @@ // SPDX-License-Identifier: GPL-2.0-only /* - * kernel/sched/cpudl.c + * kernel/sched/cpudeadline.c * * Global CPU deadline management * * Author: Juri Lelli <j.lelli@sssup.it> */ -#include "sched.h" static inline int parent(int i) { diff --git a/kernel/sched/cpufreq.c b/kernel/sched/cpufreq.c index 7c2fe50fd76d..5252fb191fae 100644 --- a/kernel/sched/cpufreq.c +++ b/kernel/sched/cpufreq.c @@ -5,9 +5,6 @@ * Copyright (C) 2016, Intel Corporation * Author: Rafael J. Wysocki <rafael.j.wysocki@intel.com> */ -#include <linux/cpufreq.h> - -#include "sched.h" DEFINE_PER_CPU(struct update_util_data __rcu *, cpufreq_update_util_data); diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index 26778884d9ab..3dbf351d12d5 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -6,13 +6,6 @@ * Author: Rafael J. Wysocki <rafael.j.wysocki@intel.com> */ -#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt - -#include "sched.h" - -#include <linux/sched/cpufreq.h> -#include <trace/events/power.h> - #define IOWAIT_BOOST_MIN (SCHED_CAPACITY_SCALE / 8) struct sugov_tunables { @@ -289,6 +282,7 @@ static void sugov_iowait_apply(struct sugov_cpu *sg_cpu, u64 time) * into the same scale so we can compare. */ boost = (sg_cpu->iowait_boost * sg_cpu->max) >> SCHED_CAPACITY_SHIFT; + boost = uclamp_rq_util_with(cpu_rq(sg_cpu->cpu), boost, NULL); if (sg_cpu->util < boost) sg_cpu->util = boost; } @@ -348,8 +342,11 @@ static void sugov_update_single_freq(struct update_util_data *hook, u64 time, /* * Do not reduce the frequency if the CPU has not been idle * recently, as the reduction is likely to be premature then. + * + * Except when the rq is capped by uclamp_max. */ - if (sugov_cpu_is_busy(sg_cpu) && next_f < sg_policy->next_freq) { + if (!uclamp_rq_is_capped(cpu_rq(sg_cpu->cpu)) && + sugov_cpu_is_busy(sg_cpu) && next_f < sg_policy->next_freq) { next_f = sg_policy->next_freq; /* Restore cached freq as next_freq has changed */ @@ -395,8 +392,11 @@ static void sugov_update_single_perf(struct update_util_data *hook, u64 time, /* * Do not reduce the target performance level if the CPU has not been * idle recently, as the reduction is likely to be premature then. + * + * Except when the rq is capped by uclamp_max. */ - if (sugov_cpu_is_busy(sg_cpu) && sg_cpu->util < prev_util) + if (!uclamp_rq_is_capped(cpu_rq(sg_cpu->cpu)) && + sugov_cpu_is_busy(sg_cpu) && sg_cpu->util < prev_util) sg_cpu->util = prev_util; cpufreq_driver_adjust_perf(sg_cpu->cpu, map_util_perf(sg_cpu->bw_dl), @@ -539,7 +539,7 @@ ATTRIBUTE_GROUPS(sugov); static void sugov_tunables_free(struct kobject *kobj) { - struct gov_attr_set *attr_set = container_of(kobj, struct gov_attr_set, kobj); + struct gov_attr_set *attr_set = to_gov_attr_set(kobj); kfree(to_sugov_tunables(attr_set)); } diff --git a/kernel/sched/cpupri.c b/kernel/sched/cpupri.c index d583f2aa744e..fa9ce9d83683 100644 --- a/kernel/sched/cpupri.c +++ b/kernel/sched/cpupri.c @@ -22,7 +22,6 @@ * worst case complexity of O(min(101, nr_domcpus)), though the scenario that * yields the worst case search is fairly contrived. */ -#include "sched.h" /* * p->rt_priority p->prio newpri cpupri diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index b7ec42732b28..78a233d43757 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -2,7 +2,6 @@ /* * Simple CPU accounting cgroup controller */ -#include "sched.h" #ifdef CONFIG_IRQ_TIME_ACCOUNTING diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index d2c072b0ef01..fb4255ae0b2c 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -15,10 +15,6 @@ * Michael Trimarchi <michael@amarulasolutions.com>, * Fabio Checconi <fchecconi@gmail.com> */ -#include "sched.h" -#include "pelt.h" - -struct dl_bandwidth def_dl_bandwidth; static inline struct task_struct *dl_task_of(struct sched_dl_entity *dl_se) { @@ -130,6 +126,21 @@ static inline bool dl_bw_visited(int cpu, u64 gen) rd->visit_gen = gen; return false; } + +static inline +void __dl_update(struct dl_bw *dl_b, s64 bw) +{ + struct root_domain *rd = container_of(dl_b, struct root_domain, dl_bw); + int i; + + RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held(), + "sched RCU must be held"); + for_each_cpu_and(i, rd->span, cpu_active_mask) { + struct rq *rq = cpu_rq(i); + + rq->dl.extra_bw += bw; + } +} #else static inline struct dl_bw *dl_bw_of(int i) { @@ -150,9 +161,38 @@ static inline bool dl_bw_visited(int cpu, u64 gen) { return false; } + +static inline +void __dl_update(struct dl_bw *dl_b, s64 bw) +{ + struct dl_rq *dl = container_of(dl_b, struct dl_rq, dl_bw); + + dl->extra_bw += bw; +} #endif static inline +void __dl_sub(struct dl_bw *dl_b, u64 tsk_bw, int cpus) +{ + dl_b->total_bw -= tsk_bw; + __dl_update(dl_b, (s32)tsk_bw / cpus); +} + +static inline +void __dl_add(struct dl_bw *dl_b, u64 tsk_bw, int cpus) +{ + dl_b->total_bw += tsk_bw; + __dl_update(dl_b, -((s32)tsk_bw / cpus)); +} + +static inline bool +__dl_overflow(struct dl_bw *dl_b, unsigned long cap, u64 old_bw, u64 new_bw) +{ + return dl_b->bw != -1 && + cap_scale(dl_b->bw, cap) < dl_b->total_bw - old_bw + new_bw; +} + +static inline void __add_running_bw(u64 dl_bw, struct dl_rq *dl_rq) { u64 old = dl_rq->running_bw; @@ -408,7 +448,7 @@ static inline int is_leftmost(struct task_struct *p, struct dl_rq *dl_rq) { struct sched_dl_entity *dl_se = &p->dl; - return dl_rq->root.rb_leftmost == &dl_se->rb_node; + return rb_first_cached(&dl_rq->root) == &dl_se->rb_node; } static void init_dl_rq_bw_ratio(struct dl_rq *dl_rq); @@ -423,12 +463,10 @@ void init_dl_bandwidth(struct dl_bandwidth *dl_b, u64 period, u64 runtime) void init_dl_bw(struct dl_bw *dl_b) { raw_spin_lock_init(&dl_b->lock); - raw_spin_lock(&def_dl_bandwidth.dl_runtime_lock); if (global_rt_runtime() == RUNTIME_INF) dl_b->bw = -1; else dl_b->bw = to_ratio(global_rt_period(), global_rt_runtime()); - raw_spin_unlock(&def_dl_bandwidth.dl_runtime_lock); dl_b->total_bw = 0; } @@ -683,15 +721,6 @@ void dec_dl_migration(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq) { } -static inline bool need_pull_dl_task(struct rq *rq, struct task_struct *prev) -{ - return false; -} - -static inline void pull_dl_task(struct rq *rq) -{ -} - static inline void deadline_queue_push_tasks(struct rq *rq) { } @@ -1393,6 +1422,9 @@ void init_dl_inactive_task_timer(struct sched_dl_entity *dl_se) timer->function = inactive_task_timer; } +#define __node_2_dle(node) \ + rb_entry((node), struct sched_dl_entity, rb_node) + #ifdef CONFIG_SMP static void inc_dl_deadline(struct dl_rq *dl_rq, u64 deadline) @@ -1422,10 +1454,9 @@ static void dec_dl_deadline(struct dl_rq *dl_rq, u64 deadline) cpudl_clear(&rq->rd->cpudl, rq->cpu); cpupri_set(&rq->rd->cpupri, rq->cpu, rq->rt.highest_prio.curr); } else { - struct rb_node *leftmost = dl_rq->root.rb_leftmost; - struct sched_dl_entity *entry; + struct rb_node *leftmost = rb_first_cached(&dl_rq->root); + struct sched_dl_entity *entry = __node_2_dle(leftmost); - entry = rb_entry(leftmost, struct sched_dl_entity, rb_node); dl_rq->earliest_dl.curr = entry->deadline; cpudl_set(&rq->rd->cpudl, rq->cpu, entry->deadline); } @@ -1466,9 +1497,6 @@ void dec_dl_tasks(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq) dec_dl_migration(dl_se, dl_rq); } -#define __node_2_dle(node) \ - rb_entry((node), struct sched_dl_entity, rb_node) - static inline bool __dl_less(struct rb_node *a, const struct rb_node *b) { return dl_time_before(__node_2_dle(a)->deadline, __node_2_dle(b)->deadline); @@ -1931,15 +1959,14 @@ static void set_next_task_dl(struct rq *rq, struct task_struct *p, bool first) deadline_queue_push_tasks(rq); } -static struct sched_dl_entity *pick_next_dl_entity(struct rq *rq, - struct dl_rq *dl_rq) +static struct sched_dl_entity *pick_next_dl_entity(struct dl_rq *dl_rq) { struct rb_node *left = rb_first_cached(&dl_rq->root); if (!left) return NULL; - return rb_entry(left, struct sched_dl_entity, rb_node); + return __node_2_dle(left); } static struct task_struct *pick_task_dl(struct rq *rq) @@ -1951,7 +1978,7 @@ static struct task_struct *pick_task_dl(struct rq *rq) if (!sched_dl_runnable(rq)) return NULL; - dl_se = pick_next_dl_entity(rq, dl_rq); + dl_se = pick_next_dl_entity(dl_rq); BUG_ON(!dl_se); p = dl_task_of(dl_se); @@ -2034,15 +2061,17 @@ static int pick_dl_task(struct rq *rq, struct task_struct *p, int cpu) */ static struct task_struct *pick_earliest_pushable_dl_task(struct rq *rq, int cpu) { - struct rb_node *next_node = rq->dl.pushable_dl_tasks_root.rb_leftmost; struct task_struct *p = NULL; + struct rb_node *next_node; if (!has_pushable_dl_tasks(rq)) return NULL; + next_node = rb_first_cached(&rq->dl.pushable_dl_tasks_root); + next_node: if (next_node) { - p = rb_entry(next_node, struct task_struct, pushable_dl_tasks); + p = __node_2_pdl(next_node); if (pick_dl_task(rq, p, cpu)) return p; @@ -2208,8 +2237,7 @@ static struct task_struct *pick_next_pushable_dl_task(struct rq *rq) if (!has_pushable_dl_tasks(rq)) return NULL; - p = rb_entry(rq->dl.pushable_dl_tasks_root.rb_leftmost, - struct task_struct, pushable_dl_tasks); + p = __node_2_pdl(rb_first_cached(&rq->dl.pushable_dl_tasks_root)); BUG_ON(rq->cpu != task_cpu(p)); BUG_ON(task_current(rq, p)); @@ -2240,12 +2268,6 @@ static int push_dl_task(struct rq *rq) return 0; retry: - if (is_migration_disabled(next_task)) - return 0; - - if (WARN_ON(next_task == rq->curr)) - return 0; - /* * If next_task preempts rq->curr, and rq->curr * can move away, it makes sense to just reschedule @@ -2258,6 +2280,12 @@ retry: return 0; } + if (is_migration_disabled(next_task)) + return 0; + + if (WARN_ON(next_task == rq->curr)) + return 0; + /* We might release rq lock */ get_task_struct(next_task); @@ -2731,9 +2759,6 @@ void sched_dl_do_global(void) int cpu; unsigned long flags; - def_dl_bandwidth.dl_period = global_rt_period(); - def_dl_bandwidth.dl_runtime = global_rt_runtime(); - if (global_rt_runtime() != RUNTIME_INF) new_bw = to_ratio(global_rt_period(), global_rt_runtime()); @@ -2955,41 +2980,6 @@ bool dl_param_changed(struct task_struct *p, const struct sched_attr *attr) } #ifdef CONFIG_SMP -int dl_task_can_attach(struct task_struct *p, const struct cpumask *cs_cpus_allowed) -{ - unsigned long flags, cap; - unsigned int dest_cpu; - struct dl_bw *dl_b; - bool overflow; - int ret; - - dest_cpu = cpumask_any_and(cpu_active_mask, cs_cpus_allowed); - - rcu_read_lock_sched(); - dl_b = dl_bw_of(dest_cpu); - raw_spin_lock_irqsave(&dl_b->lock, flags); - cap = dl_bw_capacity(dest_cpu); - overflow = __dl_overflow(dl_b, cap, 0, p->dl.dl_bw); - if (overflow) { - ret = -EBUSY; - } else { - /* - * We reserve space for this task in the destination - * root_domain, as we can't fail after this point. - * We will free resources in the source root_domain - * later on (see set_cpus_allowed_dl()). - */ - int cpus = dl_bw_cpus(dest_cpu); - - __dl_add(dl_b, p->dl.dl_bw, cpus); - ret = 0; - } - raw_spin_unlock_irqrestore(&dl_b->lock, flags); - rcu_read_unlock_sched(); - - return ret; -} - int dl_cpuset_cpumask_can_shrink(const struct cpumask *cur, const struct cpumask *trial) { @@ -3011,7 +3001,7 @@ int dl_cpuset_cpumask_can_shrink(const struct cpumask *cur, return ret; } -bool dl_cpu_busy(unsigned int cpu) +int dl_cpu_busy(int cpu, struct task_struct *p) { unsigned long flags, cap; struct dl_bw *dl_b; @@ -3021,11 +3011,22 @@ bool dl_cpu_busy(unsigned int cpu) dl_b = dl_bw_of(cpu); raw_spin_lock_irqsave(&dl_b->lock, flags); cap = dl_bw_capacity(cpu); - overflow = __dl_overflow(dl_b, cap, 0, 0); + overflow = __dl_overflow(dl_b, cap, 0, p ? p->dl.dl_bw : 0); + + if (!overflow && p) { + /* + * We reserve space for this task in the destination + * root_domain, as we can't fail after this point. + * We will free resources in the source root_domain + * later on (see set_cpus_allowed_dl()). + */ + __dl_add(dl_b, p->dl.dl_bw, dl_bw_cpus(cpu)); + } + raw_spin_unlock_irqrestore(&dl_b->lock, flags); rcu_read_unlock_sched(); - return overflow; + return overflow ? -EBUSY : 0; } #endif diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index aa29211de1bf..bb3d63bdf4ae 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -6,7 +6,6 @@ * * Copyright(C) 2007, Red Hat, Inc., Ingo Molnar */ -#include "sched.h" /* * This allows printing both to /proc/sched_debug and @@ -931,25 +930,15 @@ void print_numa_stats(struct seq_file *m, int node, unsigned long tsf, static void sched_show_numa(struct task_struct *p, struct seq_file *m) { #ifdef CONFIG_NUMA_BALANCING - struct mempolicy *pol; - if (p->mm) P(mm->numa_scan_seq); - task_lock(p); - pol = p->mempolicy; - if (pol && !(pol->flags & MPOL_F_MORON)) - pol = NULL; - mpol_get(pol); - task_unlock(p); - P(numa_pages_migrated); P(numa_preferred_nid); P(total_numa_faults); SEQ_printf(m, "current_node=%d, numa_group_id=%d\n", task_node(p), task_numa_group_id(p)); show_numa_stats(p, m); - mpol_put(pol); #endif } diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 095b0aa378df..ee0664c9d291 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -20,7 +20,38 @@ * Adaptive scheduling granularity, math enhancements by Peter Zijlstra * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra */ +#include <linux/energy_model.h> +#include <linux/mmap_lock.h> +#include <linux/hugetlb_inline.h> +#include <linux/jiffies.h> +#include <linux/mm_api.h> +#include <linux/highmem.h> +#include <linux/spinlock_api.h> +#include <linux/cpumask_api.h> +#include <linux/lockdep_api.h> +#include <linux/softirq.h> +#include <linux/refcount_api.h> +#include <linux/topology.h> +#include <linux/sched/clock.h> +#include <linux/sched/cond_resched.h> +#include <linux/sched/cputime.h> +#include <linux/sched/isolation.h> + +#include <linux/cpuidle.h> +#include <linux/interrupt.h> +#include <linux/mempolicy.h> +#include <linux/mutex_api.h> +#include <linux/profile.h> +#include <linux/psi.h> +#include <linux/ratelimit.h> + +#include <asm/switch_to.h> + +#include <linux/sched/cond_resched.h> + #include "sched.h" +#include "stats.h" +#include "autogroup.h" /* * Targeted preemption latency for CPU-bound tasks: @@ -1259,10 +1290,10 @@ static bool numa_is_active_node(int nid, struct numa_group *ng) /* Handle placement on systems where not all nodes are directly connected. */ static unsigned long score_nearby_nodes(struct task_struct *p, int nid, - int maxdist, bool task) + int lim_dist, bool task) { unsigned long score = 0; - int node; + int node, max_dist; /* * All nodes are directly connected, and the same distance @@ -1271,6 +1302,8 @@ static unsigned long score_nearby_nodes(struct task_struct *p, int nid, if (sched_numa_topology_type == NUMA_DIRECT) return 0; + /* sched_max_numa_distance may be changed in parallel. */ + max_dist = READ_ONCE(sched_max_numa_distance); /* * This code is called for each node, introducing N^2 complexity, * which should be ok given the number of nodes rarely exceeds 8. @@ -1283,7 +1316,7 @@ static unsigned long score_nearby_nodes(struct task_struct *p, int nid, * The furthest away nodes in the system are not interesting * for placement; nid was already counted. */ - if (dist == sched_max_numa_distance || node == nid) + if (dist >= max_dist || node == nid) continue; /* @@ -1293,8 +1326,7 @@ static unsigned long score_nearby_nodes(struct task_struct *p, int nid, * "hoplimit", only nodes closer by than "hoplimit" are part * of each group. Skip other nodes. */ - if (sched_numa_topology_type == NUMA_BACKPLANE && - dist >= maxdist) + if (sched_numa_topology_type == NUMA_BACKPLANE && dist >= lim_dist) continue; /* Add up the faults from nearby nodes. */ @@ -1312,8 +1344,8 @@ static unsigned long score_nearby_nodes(struct task_struct *p, int nid, * This seems to result in good task placement. */ if (sched_numa_topology_type == NUMA_GLUELESS_MESH) { - faults *= (sched_max_numa_distance - dist); - faults /= (sched_max_numa_distance - LOCAL_DISTANCE); + faults *= (max_dist - dist); + faults /= (max_dist - LOCAL_DISTANCE); } score += faults; @@ -1489,6 +1521,7 @@ struct task_numa_env { int src_cpu, src_nid; int dst_cpu, dst_nid; + int imb_numa_nr; struct numa_stats src_stats, dst_stats; @@ -1503,7 +1536,7 @@ struct task_numa_env { static unsigned long cpu_load(struct rq *rq); static unsigned long cpu_runnable(struct rq *rq); static inline long adjust_numa_imbalance(int imbalance, - int dst_running, int dst_weight); + int dst_running, int imb_numa_nr); static inline enum numa_type numa_classify(unsigned int imbalance_pct, @@ -1884,7 +1917,7 @@ static void task_numa_find_cpu(struct task_numa_env *env, dst_running = env->dst_stats.nr_running + 1; imbalance = max(0, dst_running - src_running); imbalance = adjust_numa_imbalance(imbalance, dst_running, - env->dst_stats.weight); + env->imb_numa_nr); /* Use idle CPU if there is no imbalance */ if (!imbalance) { @@ -1949,8 +1982,10 @@ static int task_numa_migrate(struct task_struct *p) */ rcu_read_lock(); sd = rcu_dereference(per_cpu(sd_numa, env.src_cpu)); - if (sd) + if (sd) { env.imbalance_pct = 100 + (sd->imbalance_pct - 100) / 2; + env.imb_numa_nr = sd->imb_numa_nr; + } rcu_read_unlock(); /* @@ -1985,7 +2020,7 @@ static int task_numa_migrate(struct task_struct *p) */ ng = deref_curr_numa_group(p); if (env.best_cpu == -1 || (ng && ng->active_nodes > 1)) { - for_each_online_node(nid) { + for_each_node_state(nid, N_CPU) { if (nid == env.src_nid || nid == p->numa_preferred_nid) continue; @@ -2083,13 +2118,13 @@ static void numa_group_count_active_nodes(struct numa_group *numa_group) unsigned long faults, max_faults = 0; int nid, active_nodes = 0; - for_each_online_node(nid) { + for_each_node_state(nid, N_CPU) { faults = group_faults_cpu(numa_group, nid); if (faults > max_faults) max_faults = faults; } - for_each_online_node(nid) { + for_each_node_state(nid, N_CPU) { faults = group_faults_cpu(numa_group, nid); if (faults * ACTIVE_NODE_FRACTION > max_faults) active_nodes++; @@ -2243,7 +2278,7 @@ static int preferred_group_nid(struct task_struct *p, int nid) dist = sched_max_numa_distance; - for_each_online_node(node) { + for_each_node_state(node, N_CPU) { score = group_weight(p, node, dist); if (score > max_score) { max_score = score; @@ -2262,7 +2297,7 @@ static int preferred_group_nid(struct task_struct *p, int nid) * inside the highest scoring group of nodes. The nodemask tricks * keep the complexity of the search down. */ - nodes = node_online_map; + nodes = node_states[N_CPU]; for (dist = sched_max_numa_distance; dist > LOCAL_DISTANCE; dist--) { unsigned long max_faults = 0; nodemask_t max_group = NODE_MASK_NONE; @@ -2401,6 +2436,21 @@ static void task_numa_placement(struct task_struct *p) } } + /* Cannot migrate task to CPU-less node */ + if (max_nid != NUMA_NO_NODE && !node_state(max_nid, N_CPU)) { + int near_nid = max_nid; + int distance, near_distance = INT_MAX; + + for_each_node_state(nid, N_CPU) { + distance = node_distance(max_nid, nid); + if (distance < near_distance) { + near_nid = nid; + near_distance = distance; + } + } + max_nid = near_nid; + } + if (ng) { numa_group_count_active_nodes(ng); spin_unlock_irq(group_lock); @@ -2825,6 +2875,8 @@ void init_numa_balancing(unsigned long clone_flags, struct task_struct *p) /* Protect against double add, see task_tick_numa and task_numa_work */ p->numa_work.next = &p->numa_work; p->numa_faults = NULL; + p->numa_pages_migrated = 0; + p->total_numa_faults = 0; RCU_INIT_POINTER(p->numa_group, NULL); p->last_task_numa_placement = 0; p->last_sum_exec_runtime = 0; @@ -3028,9 +3080,11 @@ enqueue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) static inline void dequeue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { - u32 divider = get_pelt_divider(&se->avg); sub_positive(&cfs_rq->avg.load_avg, se->avg.load_avg); - cfs_rq->avg.load_sum = cfs_rq->avg.load_avg * divider; + sub_positive(&cfs_rq->avg.load_sum, se_weight(se) * se->avg.load_sum); + /* See update_cfs_rq_load_avg() */ + cfs_rq->avg.load_sum = max_t(u32, cfs_rq->avg.load_sum, + cfs_rq->avg.load_avg * PELT_MIN_DIVIDER); } #else static inline void @@ -3381,7 +3435,6 @@ void set_task_rq_fair(struct sched_entity *se, se->avg.last_update_time = n_last_update_time; } - /* * When on migration a sched_entity joins/leaves the PELT hierarchy, we need to * propagate its contribution. The key to this propagation is the invariant @@ -3449,15 +3502,14 @@ void set_task_rq_fair(struct sched_entity *se, * XXX: only do this for the part of runnable > running ? * */ - static inline void update_tg_cfs_util(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq *gcfs_rq) { - long delta = gcfs_rq->avg.util_avg - se->avg.util_avg; - u32 divider; + long delta_sum, delta_avg = gcfs_rq->avg.util_avg - se->avg.util_avg; + u32 new_sum, divider; /* Nothing to update */ - if (!delta) + if (!delta_avg) return; /* @@ -3466,23 +3518,30 @@ update_tg_cfs_util(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq */ divider = get_pelt_divider(&cfs_rq->avg); + /* Set new sched_entity's utilization */ se->avg.util_avg = gcfs_rq->avg.util_avg; - se->avg.util_sum = se->avg.util_avg * divider; + new_sum = se->avg.util_avg * divider; + delta_sum = (long)new_sum - (long)se->avg.util_sum; + se->avg.util_sum = new_sum; /* Update parent cfs_rq utilization */ - add_positive(&cfs_rq->avg.util_avg, delta); - cfs_rq->avg.util_sum = cfs_rq->avg.util_avg * divider; + add_positive(&cfs_rq->avg.util_avg, delta_avg); + add_positive(&cfs_rq->avg.util_sum, delta_sum); + + /* See update_cfs_rq_load_avg() */ + cfs_rq->avg.util_sum = max_t(u32, cfs_rq->avg.util_sum, + cfs_rq->avg.util_avg * PELT_MIN_DIVIDER); } static inline void update_tg_cfs_runnable(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq *gcfs_rq) { - long delta = gcfs_rq->avg.runnable_avg - se->avg.runnable_avg; - u32 divider; + long delta_sum, delta_avg = gcfs_rq->avg.runnable_avg - se->avg.runnable_avg; + u32 new_sum, divider; /* Nothing to update */ - if (!delta) + if (!delta_avg) return; /* @@ -3493,19 +3552,25 @@ update_tg_cfs_runnable(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cf /* Set new sched_entity's runnable */ se->avg.runnable_avg = gcfs_rq->avg.runnable_avg; - se->avg.runnable_sum = se->avg.runnable_avg * divider; + new_sum = se->avg.runnable_avg * divider; + delta_sum = (long)new_sum - (long)se->avg.runnable_sum; + se->avg.runnable_sum = new_sum; /* Update parent cfs_rq runnable */ - add_positive(&cfs_rq->avg.runnable_avg, delta); - cfs_rq->avg.runnable_sum = cfs_rq->avg.runnable_avg * divider; + add_positive(&cfs_rq->avg.runnable_avg, delta_avg); + add_positive(&cfs_rq->avg.runnable_sum, delta_sum); + /* See update_cfs_rq_load_avg() */ + cfs_rq->avg.runnable_sum = max_t(u32, cfs_rq->avg.runnable_sum, + cfs_rq->avg.runnable_avg * PELT_MIN_DIVIDER); } static inline void update_tg_cfs_load(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq *gcfs_rq) { - long delta, running_sum, runnable_sum = gcfs_rq->prop_runnable_sum; + long delta_avg, running_sum, runnable_sum = gcfs_rq->prop_runnable_sum; unsigned long load_avg; u64 load_sum = 0; + s64 delta_sum; u32 divider; if (!runnable_sum) @@ -3532,7 +3597,7 @@ update_tg_cfs_load(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq * assuming all tasks are equally runnable. */ if (scale_load_down(gcfs_rq->load.weight)) { - load_sum = div_s64(gcfs_rq->avg.load_sum, + load_sum = div_u64(gcfs_rq->avg.load_sum, scale_load_down(gcfs_rq->load.weight)); } @@ -3549,19 +3614,22 @@ update_tg_cfs_load(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq running_sum = se->avg.util_sum >> SCHED_CAPACITY_SHIFT; runnable_sum = max(runnable_sum, running_sum); - load_sum = (s64)se_weight(se) * runnable_sum; - load_avg = div_s64(load_sum, divider); + load_sum = se_weight(se) * runnable_sum; + load_avg = div_u64(load_sum, divider); - se->avg.load_sum = runnable_sum; - - delta = load_avg - se->avg.load_avg; - if (!delta) + delta_avg = load_avg - se->avg.load_avg; + if (!delta_avg) return; - se->avg.load_avg = load_avg; + delta_sum = load_sum - (s64)se_weight(se) * se->avg.load_sum; - add_positive(&cfs_rq->avg.load_avg, delta); - cfs_rq->avg.load_sum = cfs_rq->avg.load_avg * divider; + se->avg.load_sum = runnable_sum; + se->avg.load_avg = load_avg; + add_positive(&cfs_rq->avg.load_avg, delta_avg); + add_positive(&cfs_rq->avg.load_sum, delta_sum); + /* See update_cfs_rq_load_avg() */ + cfs_rq->avg.load_sum = max_t(u32, cfs_rq->avg.load_sum, + cfs_rq->avg.load_avg * PELT_MIN_DIVIDER); } static inline void add_tg_cfs_propagate(struct cfs_rq *cfs_rq, long runnable_sum) @@ -3652,7 +3720,7 @@ static inline void add_tg_cfs_propagate(struct cfs_rq *cfs_rq, long runnable_sum * * cfs_rq->avg is used for task_h_load() and update_cfs_share() for example. * - * Returns true if the load decayed or we removed load. + * Return: true if the load decayed or we removed load. * * Since both these conditions indicate a changed cfs_rq->avg.load we should * call update_tg_load_avg() when this function returns true. @@ -3677,15 +3745,32 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq) r = removed_load; sub_positive(&sa->load_avg, r); - sa->load_sum = sa->load_avg * divider; + sub_positive(&sa->load_sum, r * divider); + /* See sa->util_sum below */ + sa->load_sum = max_t(u32, sa->load_sum, sa->load_avg * PELT_MIN_DIVIDER); r = removed_util; sub_positive(&sa->util_avg, r); - sa->util_sum = sa->util_avg * divider; + sub_positive(&sa->util_sum, r * divider); + /* + * Because of rounding, se->util_sum might ends up being +1 more than + * cfs->util_sum. Although this is not a problem by itself, detaching + * a lot of tasks with the rounding problem between 2 updates of + * util_avg (~1ms) can make cfs->util_sum becoming null whereas + * cfs_util_avg is not. + * Check that util_sum is still above its lower bound for the new + * util_avg. Given that period_contrib might have moved since the last + * sync, we are only sure that util_sum must be above or equal to + * util_avg * minimum possible divider + */ + sa->util_sum = max_t(u32, sa->util_sum, sa->util_avg * PELT_MIN_DIVIDER); r = removed_runnable; sub_positive(&sa->runnable_avg, r); - sa->runnable_sum = sa->runnable_avg * divider; + sub_positive(&sa->runnable_sum, r * divider); + /* See sa->util_sum above */ + sa->runnable_sum = max_t(u32, sa->runnable_sum, + sa->runnable_avg * PELT_MIN_DIVIDER); /* * removed_runnable is the unweighted version of removed_load so we @@ -3772,17 +3857,18 @@ static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s */ static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { - /* - * cfs_rq->avg.period_contrib can be used for both cfs_rq and se. - * See ___update_load_avg() for details. - */ - u32 divider = get_pelt_divider(&cfs_rq->avg); - dequeue_load_avg(cfs_rq, se); sub_positive(&cfs_rq->avg.util_avg, se->avg.util_avg); - cfs_rq->avg.util_sum = cfs_rq->avg.util_avg * divider; + sub_positive(&cfs_rq->avg.util_sum, se->avg.util_sum); + /* See update_cfs_rq_load_avg() */ + cfs_rq->avg.util_sum = max_t(u32, cfs_rq->avg.util_sum, + cfs_rq->avg.util_avg * PELT_MIN_DIVIDER); + sub_positive(&cfs_rq->avg.runnable_avg, se->avg.runnable_avg); - cfs_rq->avg.runnable_sum = cfs_rq->avg.runnable_avg * divider; + sub_positive(&cfs_rq->avg.runnable_sum, se->avg.runnable_sum); + /* See update_cfs_rq_load_avg() */ + cfs_rq->avg.runnable_sum = max_t(u32, cfs_rq->avg.runnable_sum, + cfs_rq->avg.runnable_avg * PELT_MIN_DIVIDER); add_tg_cfs_propagate(cfs_rq, -se->avg.load_sum); @@ -8539,6 +8625,8 @@ group_type group_classify(unsigned int imbalance_pct, * * If @sg does not have SMT siblings, only pull tasks if all of the SMT siblings * of @dst_cpu are idle and @sg has lower priority. + * + * Return: true if @dst_cpu can pull tasks, false otherwise. */ static bool asym_smt_can_pull_tasks(int dst_cpu, struct sd_lb_stats *sds, struct sg_lb_stats *sgs, @@ -8614,6 +8702,7 @@ sched_asym(struct lb_env *env, struct sd_lb_stats *sds, struct sg_lb_stats *sgs /** * update_sg_lb_stats - Update sched_group's statistics for load balancing. * @env: The load balancing environment. + * @sds: Load-balancing data with statistics of the local group. * @group: sched_group whose statistics are to be updated. * @sgs: variable to hold the statistics for this group. * @sg_status: Holds flag indicating the status of the sched_group @@ -9003,9 +9092,9 @@ static bool update_pick_idlest(struct sched_group *idlest, * This is an approximation as the number of running tasks may not be * related to the number of busy CPUs due to sched_setaffinity. */ -static inline bool allow_numa_imbalance(int dst_running, int dst_weight) +static inline bool allow_numa_imbalance(int running, int imb_numa_nr) { - return (dst_running < (dst_weight >> 2)); + return running <= imb_numa_nr; } /* @@ -9139,12 +9228,13 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu) return idlest; #endif /* - * Otherwise, keep the task on this node to stay close - * its wakeup source and improve locality. If there is - * a real need of migration, periodic load balance will - * take care of it. + * Otherwise, keep the task close to the wakeup source + * and improve locality if the number of running tasks + * would remain below threshold where an imbalance is + * allowed. If there is a real need of migration, + * periodic load balance will take care of it. */ - if (allow_numa_imbalance(local_sgs.sum_nr_running, sd->span_weight)) + if (allow_numa_imbalance(local_sgs.sum_nr_running + 1, sd->imb_numa_nr)) return NULL; } @@ -9236,9 +9326,9 @@ next_group: #define NUMA_IMBALANCE_MIN 2 static inline long adjust_numa_imbalance(int imbalance, - int dst_running, int dst_weight) + int dst_running, int imb_numa_nr) { - if (!allow_numa_imbalance(dst_running, dst_weight)) + if (!allow_numa_imbalance(dst_running, imb_numa_nr)) return imbalance; /* @@ -9350,7 +9440,7 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s /* Consider allowing a small imbalance between NUMA groups */ if (env->sd->flags & SD_NUMA) { env->imbalance = adjust_numa_imbalance(env->imbalance, - busiest->sum_nr_running, busiest->group_weight); + local->sum_nr_running + 1, env->sd->imb_numa_nr); } return; @@ -9421,12 +9511,11 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s /** * find_busiest_group - Returns the busiest group within the sched_domain * if there is an imbalance. + * @env: The load balancing environment. * * Also calculates the amount of runnable load which should be moved * to restore balance. * - * @env: The load balancing environment. - * * Return: - The busiest group if imbalance exists. */ static struct sched_group *find_busiest_group(struct lb_env *env) @@ -10315,7 +10404,7 @@ static inline int on_null_domain(struct rq *rq) * - When one of the busy CPUs notice that there may be an idle rebalancing * needed, they will kick the idle load balancer, which then does idle * load balancing for all the idle CPUs. - * - HK_FLAG_MISC CPUs are used for this task, because HK_FLAG_SCHED not set + * - HK_TYPE_MISC CPUs are used for this task, because HK_TYPE_SCHED not set * anywhere yet. */ @@ -10324,7 +10413,7 @@ static inline int find_new_ilb(void) int ilb; const struct cpumask *hk_mask; - hk_mask = housekeeping_cpumask(HK_FLAG_MISC); + hk_mask = housekeeping_cpumask(HK_TYPE_MISC); for_each_cpu_and(ilb, nohz.idle_cpus_mask, hk_mask) { @@ -10340,7 +10429,7 @@ static inline int find_new_ilb(void) /* * Kick a CPU to do the nohz balancing, if it is time for it. We pick any - * idle CPU in the HK_FLAG_MISC housekeeping set (if there is one). + * idle CPU in the HK_TYPE_MISC housekeeping set (if there is one). */ static void kick_ilb(unsigned int flags) { @@ -10553,7 +10642,7 @@ void nohz_balance_enter_idle(int cpu) return; /* Spare idle load balancing on CPUs that don't want to be disturbed: */ - if (!housekeeping_cpu(cpu, HK_FLAG_SCHED)) + if (!housekeeping_cpu(cpu, HK_TYPE_SCHED)) return; /* @@ -10769,7 +10858,7 @@ static void nohz_newidle_balance(struct rq *this_rq) * This CPU doesn't want to be disturbed by scheduler * housekeeping */ - if (!housekeeping_cpu(this_cpu, HK_FLAG_SCHED)) + if (!housekeeping_cpu(this_cpu, HK_TYPE_SCHED)) return; /* Will wake up very soon. No time for doing anything else*/ diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index d17b0a5ce6ac..8f8b5020e76a 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c @@ -6,9 +6,6 @@ * (NOTE: these are not related to SCHED_IDLE batch scheduled * tasks which are handled in sched/fair.c ) */ -#include "sched.h" - -#include <trace/events/power.h> /* Linker adds these: start and end of __cpuidle functions */ extern char __cpuidle_text_start[], __cpuidle_text_end[]; diff --git a/kernel/sched/isolation.c b/kernel/sched/isolation.c index 7f06eaf12818..373d42c707bc 100644 --- a/kernel/sched/isolation.c +++ b/kernel/sched/isolation.c @@ -7,136 +7,179 @@ * Copyright (C) 2017-2018 SUSE, Frederic Weisbecker * */ -#include "sched.h" + +enum hk_flags { + HK_FLAG_TIMER = BIT(HK_TYPE_TIMER), + HK_FLAG_RCU = BIT(HK_TYPE_RCU), + HK_FLAG_MISC = BIT(HK_TYPE_MISC), + HK_FLAG_SCHED = BIT(HK_TYPE_SCHED), + HK_FLAG_TICK = BIT(HK_TYPE_TICK), + HK_FLAG_DOMAIN = BIT(HK_TYPE_DOMAIN), + HK_FLAG_WQ = BIT(HK_TYPE_WQ), + HK_FLAG_MANAGED_IRQ = BIT(HK_TYPE_MANAGED_IRQ), + HK_FLAG_KTHREAD = BIT(HK_TYPE_KTHREAD), +}; DEFINE_STATIC_KEY_FALSE(housekeeping_overridden); EXPORT_SYMBOL_GPL(housekeeping_overridden); -static cpumask_var_t housekeeping_mask; -static unsigned int housekeeping_flags; -bool housekeeping_enabled(enum hk_flags flags) +struct housekeeping { + cpumask_var_t cpumasks[HK_TYPE_MAX]; + unsigned long flags; +}; + +static struct housekeeping housekeeping; + +bool housekeeping_enabled(enum hk_type type) { - return !!(housekeeping_flags & flags); + return !!(housekeeping.flags & BIT(type)); } EXPORT_SYMBOL_GPL(housekeeping_enabled); -int housekeeping_any_cpu(enum hk_flags flags) +int housekeeping_any_cpu(enum hk_type type) { int cpu; if (static_branch_unlikely(&housekeeping_overridden)) { - if (housekeeping_flags & flags) { - cpu = sched_numa_find_closest(housekeeping_mask, smp_processor_id()); + if (housekeeping.flags & BIT(type)) { + cpu = sched_numa_find_closest(housekeeping.cpumasks[type], smp_processor_id()); if (cpu < nr_cpu_ids) return cpu; - return cpumask_any_and(housekeeping_mask, cpu_online_mask); + return cpumask_any_and(housekeeping.cpumasks[type], cpu_online_mask); } } return smp_processor_id(); } EXPORT_SYMBOL_GPL(housekeeping_any_cpu); -const struct cpumask *housekeeping_cpumask(enum hk_flags flags) +const struct cpumask *housekeeping_cpumask(enum hk_type type) { if (static_branch_unlikely(&housekeeping_overridden)) - if (housekeeping_flags & flags) - return housekeeping_mask; + if (housekeeping.flags & BIT(type)) + return housekeeping.cpumasks[type]; return cpu_possible_mask; } EXPORT_SYMBOL_GPL(housekeeping_cpumask); -void housekeeping_affine(struct task_struct *t, enum hk_flags flags) +void housekeeping_affine(struct task_struct *t, enum hk_type type) { if (static_branch_unlikely(&housekeeping_overridden)) - if (housekeeping_flags & flags) - set_cpus_allowed_ptr(t, housekeeping_mask); + if (housekeeping.flags & BIT(type)) + set_cpus_allowed_ptr(t, housekeeping.cpumasks[type]); } EXPORT_SYMBOL_GPL(housekeeping_affine); -bool housekeeping_test_cpu(int cpu, enum hk_flags flags) +bool housekeeping_test_cpu(int cpu, enum hk_type type) { if (static_branch_unlikely(&housekeeping_overridden)) - if (housekeeping_flags & flags) - return cpumask_test_cpu(cpu, housekeeping_mask); + if (housekeeping.flags & BIT(type)) + return cpumask_test_cpu(cpu, housekeeping.cpumasks[type]); return true; } EXPORT_SYMBOL_GPL(housekeeping_test_cpu); void __init housekeeping_init(void) { - if (!housekeeping_flags) + enum hk_type type; + + if (!housekeeping.flags) return; static_branch_enable(&housekeeping_overridden); - if (housekeeping_flags & HK_FLAG_TICK) + if (housekeeping.flags & HK_FLAG_TICK) sched_tick_offload_init(); - /* We need at least one CPU to handle housekeeping work */ - WARN_ON_ONCE(cpumask_empty(housekeeping_mask)); + for_each_set_bit(type, &housekeeping.flags, HK_TYPE_MAX) { + /* We need at least one CPU to handle housekeeping work */ + WARN_ON_ONCE(cpumask_empty(housekeeping.cpumasks[type])); + } } -static int __init housekeeping_setup(char *str, enum hk_flags flags) +static void __init housekeeping_setup_type(enum hk_type type, + cpumask_var_t housekeeping_staging) { - cpumask_var_t non_housekeeping_mask; - cpumask_var_t tmp; + + alloc_bootmem_cpumask_var(&housekeeping.cpumasks[type]); + cpumask_copy(housekeeping.cpumasks[type], + housekeeping_staging); +} + +static int __init housekeeping_setup(char *str, unsigned long flags) +{ + cpumask_var_t non_housekeeping_mask, housekeeping_staging; + int err = 0; + + if ((flags & HK_FLAG_TICK) && !(housekeeping.flags & HK_FLAG_TICK)) { + if (!IS_ENABLED(CONFIG_NO_HZ_FULL)) { + pr_warn("Housekeeping: nohz unsupported." + " Build with CONFIG_NO_HZ_FULL\n"); + return 0; + } + } alloc_bootmem_cpumask_var(&non_housekeeping_mask); if (cpulist_parse(str, non_housekeeping_mask) < 0) { pr_warn("Housekeeping: nohz_full= or isolcpus= incorrect CPU range\n"); - free_bootmem_cpumask_var(non_housekeeping_mask); - return 0; + goto free_non_housekeeping_mask; } - alloc_bootmem_cpumask_var(&tmp); - if (!housekeeping_flags) { - alloc_bootmem_cpumask_var(&housekeeping_mask); - cpumask_andnot(housekeeping_mask, - cpu_possible_mask, non_housekeeping_mask); + alloc_bootmem_cpumask_var(&housekeeping_staging); + cpumask_andnot(housekeeping_staging, + cpu_possible_mask, non_housekeeping_mask); - cpumask_andnot(tmp, cpu_present_mask, non_housekeeping_mask); - if (cpumask_empty(tmp)) { + if (!cpumask_intersects(cpu_present_mask, housekeeping_staging)) { + __cpumask_set_cpu(smp_processor_id(), housekeeping_staging); + __cpumask_clear_cpu(smp_processor_id(), non_housekeeping_mask); + if (!housekeeping.flags) { pr_warn("Housekeeping: must include one present CPU, " "using boot CPU:%d\n", smp_processor_id()); - __cpumask_set_cpu(smp_processor_id(), housekeeping_mask); - __cpumask_clear_cpu(smp_processor_id(), non_housekeeping_mask); - } - } else { - cpumask_andnot(tmp, cpu_present_mask, non_housekeeping_mask); - if (cpumask_empty(tmp)) - __cpumask_clear_cpu(smp_processor_id(), non_housekeeping_mask); - cpumask_andnot(tmp, cpu_possible_mask, non_housekeeping_mask); - if (!cpumask_equal(tmp, housekeeping_mask)) { - pr_warn("Housekeeping: nohz_full= must match isolcpus=\n"); - free_bootmem_cpumask_var(tmp); - free_bootmem_cpumask_var(non_housekeeping_mask); - return 0; } } - free_bootmem_cpumask_var(tmp); - if ((flags & HK_FLAG_TICK) && !(housekeeping_flags & HK_FLAG_TICK)) { - if (IS_ENABLED(CONFIG_NO_HZ_FULL)) { - tick_nohz_full_setup(non_housekeeping_mask); - } else { - pr_warn("Housekeeping: nohz unsupported." - " Build with CONFIG_NO_HZ_FULL\n"); - free_bootmem_cpumask_var(non_housekeeping_mask); - return 0; + if (!housekeeping.flags) { + /* First setup call ("nohz_full=" or "isolcpus=") */ + enum hk_type type; + + for_each_set_bit(type, &flags, HK_TYPE_MAX) + housekeeping_setup_type(type, housekeeping_staging); + } else { + /* Second setup call ("nohz_full=" after "isolcpus=" or the reverse) */ + enum hk_type type; + unsigned long iter_flags = flags & housekeeping.flags; + + for_each_set_bit(type, &iter_flags, HK_TYPE_MAX) { + if (!cpumask_equal(housekeeping_staging, + housekeeping.cpumasks[type])) { + pr_warn("Housekeeping: nohz_full= must match isolcpus=\n"); + goto free_housekeeping_staging; + } } + + iter_flags = flags & ~housekeeping.flags; + + for_each_set_bit(type, &iter_flags, HK_TYPE_MAX) + housekeeping_setup_type(type, housekeeping_staging); } - housekeeping_flags |= flags; + if ((flags & HK_FLAG_TICK) && !(housekeeping.flags & HK_FLAG_TICK)) + tick_nohz_full_setup(non_housekeeping_mask); + + housekeeping.flags |= flags; + err = 1; +free_housekeeping_staging: + free_bootmem_cpumask_var(housekeeping_staging); +free_non_housekeeping_mask: free_bootmem_cpumask_var(non_housekeeping_mask); - return 1; + return err; } static int __init housekeeping_nohz_full_setup(char *str) { - unsigned int flags; + unsigned long flags; flags = HK_FLAG_TICK | HK_FLAG_WQ | HK_FLAG_TIMER | HK_FLAG_RCU | HK_FLAG_MISC | HK_FLAG_KTHREAD; @@ -147,7 +190,7 @@ __setup("nohz_full=", housekeeping_nohz_full_setup); static int __init housekeeping_isolcpus_setup(char *str) { - unsigned int flags = 0; + unsigned long flags = 0; bool illegal = false; char *par; int len; diff --git a/kernel/sched/loadavg.c b/kernel/sched/loadavg.c index 954b229868d9..52c8f8226b0d 100644 --- a/kernel/sched/loadavg.c +++ b/kernel/sched/loadavg.c @@ -6,7 +6,6 @@ * figure. Its a silly number but people think its important. We go through * great pains to make it work on big machines and tickless kernels. */ -#include "sched.h" /* * Global load-average calculations diff --git a/kernel/sched/membarrier.c b/kernel/sched/membarrier.c index b5add64d9698..0c5be7ebb1dc 100644 --- a/kernel/sched/membarrier.c +++ b/kernel/sched/membarrier.c @@ -4,7 +4,6 @@ * * membarrier system call */ -#include "sched.h" /* * For documentation purposes, here are some membarrier ordering @@ -147,11 +146,11 @@ #endif #ifdef CONFIG_RSEQ -#define MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ_BITMASK \ +#define MEMBARRIER_PRIVATE_EXPEDITED_RSEQ_BITMASK \ (MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ \ - | MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_RSEQ_BITMASK) + | MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_RSEQ) #else -#define MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ_BITMASK 0 +#define MEMBARRIER_PRIVATE_EXPEDITED_RSEQ_BITMASK 0 #endif #define MEMBARRIER_CMD_BITMASK \ @@ -159,7 +158,8 @@ | MEMBARRIER_CMD_REGISTER_GLOBAL_EXPEDITED \ | MEMBARRIER_CMD_PRIVATE_EXPEDITED \ | MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED \ - | MEMBARRIER_PRIVATE_EXPEDITED_SYNC_CORE_BITMASK) + | MEMBARRIER_PRIVATE_EXPEDITED_SYNC_CORE_BITMASK \ + | MEMBARRIER_PRIVATE_EXPEDITED_RSEQ_BITMASK) static void ipi_mb(void *info) { diff --git a/kernel/sched/pelt.c b/kernel/sched/pelt.c index a554e3bbab2b..0f310768260c 100644 --- a/kernel/sched/pelt.c +++ b/kernel/sched/pelt.c @@ -24,10 +24,6 @@ * Author: Vincent Guittot <vincent.guittot@linaro.org> */ -#include <linux/sched.h> -#include "sched.h" -#include "pelt.h" - /* * Approximate: * val * y^n, where y^32 ~= 0.5 (~1 scheduling period) diff --git a/kernel/sched/pelt.h b/kernel/sched/pelt.h index e06071bf3472..c336f5f481bc 100644 --- a/kernel/sched/pelt.h +++ b/kernel/sched/pelt.h @@ -37,9 +37,11 @@ update_irq_load_avg(struct rq *rq, u64 running) } #endif +#define PELT_MIN_DIVIDER (LOAD_AVG_MAX - 1024) + static inline u32 get_pelt_divider(struct sched_avg *avg) { - return LOAD_AVG_MAX - 1024 + avg->period_contrib; + return PELT_MIN_DIVIDER + avg->period_contrib; } static inline void cfs_se_util_change(struct sched_avg *avg) diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c index a679613a7cb7..a4fa3aadfcba 100644 --- a/kernel/sched/psi.c +++ b/kernel/sched/psi.c @@ -137,21 +137,6 @@ * sampling of the aggregate task states would be. */ -#include "../workqueue_internal.h" -#include <linux/sched/loadavg.h> -#include <linux/seq_file.h> -#include <linux/proc_fs.h> -#include <linux/seqlock.h> -#include <linux/uaccess.h> -#include <linux/cgroup.h> -#include <linux/module.h> -#include <linux/sched.h> -#include <linux/ctype.h> -#include <linux/file.h> -#include <linux/poll.h> -#include <linux/psi.h> -#include "sched.h" - static int psi_bug __read_mostly; DEFINE_STATIC_KEY_FALSE(psi_disabled); @@ -523,7 +508,7 @@ static void init_triggers(struct psi_group *group, u64 now) static u64 update_triggers(struct psi_group *group, u64 now) { struct psi_trigger *t; - bool new_stall = false; + bool update_total = false; u64 *total = group->total[PSI_POLL]; /* @@ -532,24 +517,35 @@ static u64 update_triggers(struct psi_group *group, u64 now) */ list_for_each_entry(t, &group->triggers, node) { u64 growth; + bool new_stall; - /* Check for stall activity */ - if (group->polling_total[t->state] == total[t->state]) - continue; + new_stall = group->polling_total[t->state] != total[t->state]; + /* Check for stall activity or a previous threshold breach */ + if (!new_stall && !t->pending_event) + continue; /* - * Multiple triggers might be looking at the same state, - * remember to update group->polling_total[] once we've - * been through all of them. Also remember to extend the - * polling time if we see new stall activity. + * Check for new stall activity, as well as deferred + * events that occurred in the last window after the + * trigger had already fired (we want to ratelimit + * events without dropping any). */ - new_stall = true; - - /* Calculate growth since last update */ - growth = window_update(&t->win, now, total[t->state]); - if (growth < t->threshold) - continue; - + if (new_stall) { + /* + * Multiple triggers might be looking at the same state, + * remember to update group->polling_total[] once we've + * been through all of them. Also remember to extend the + * polling time if we see new stall activity. + */ + update_total = true; + + /* Calculate growth since last update */ + growth = window_update(&t->win, now, total[t->state]); + if (growth < t->threshold) + continue; + + t->pending_event = true; + } /* Limit event signaling to once per window */ if (now < t->last_event_time + t->win.size) continue; @@ -558,9 +554,11 @@ static u64 update_triggers(struct psi_group *group, u64 now) if (cmpxchg(&t->event, 0, 1) == 0) wake_up_interruptible(&t->event_wait); t->last_event_time = now; + /* Reset threshold breach flag once event got generated */ + t->pending_event = false; } - if (new_stall) + if (update_total) memcpy(group->polling_total, total, sizeof(group->polling_total)); @@ -1082,44 +1080,6 @@ int psi_show(struct seq_file *m, struct psi_group *group, enum psi_res res) return 0; } -static int psi_io_show(struct seq_file *m, void *v) -{ - return psi_show(m, &psi_system, PSI_IO); -} - -static int psi_memory_show(struct seq_file *m, void *v) -{ - return psi_show(m, &psi_system, PSI_MEM); -} - -static int psi_cpu_show(struct seq_file *m, void *v) -{ - return psi_show(m, &psi_system, PSI_CPU); -} - -static int psi_open(struct file *file, int (*psi_show)(struct seq_file *, void *)) -{ - if (file->f_mode & FMODE_WRITE && !capable(CAP_SYS_RESOURCE)) - return -EPERM; - - return single_open(file, psi_show, NULL); -} - -static int psi_io_open(struct inode *inode, struct file *file) -{ - return psi_open(file, psi_io_show); -} - -static int psi_memory_open(struct inode *inode, struct file *file) -{ - return psi_open(file, psi_memory_show); -} - -static int psi_cpu_open(struct inode *inode, struct file *file) -{ - return psi_open(file, psi_cpu_show); -} - struct psi_trigger *psi_trigger_create(struct psi_group *group, char *buf, size_t nbytes, enum psi_res res) { @@ -1162,7 +1122,7 @@ struct psi_trigger *psi_trigger_create(struct psi_group *group, t->event = 0; t->last_event_time = 0; init_waitqueue_head(&t->event_wait); - kref_init(&t->refcount); + t->pending_event = false; mutex_lock(&group->trigger_lock); @@ -1191,15 +1151,19 @@ struct psi_trigger *psi_trigger_create(struct psi_group *group, return t; } -static void psi_trigger_destroy(struct kref *ref) +void psi_trigger_destroy(struct psi_trigger *t) { - struct psi_trigger *t = container_of(ref, struct psi_trigger, refcount); - struct psi_group *group = t->group; + struct psi_group *group; struct task_struct *task_to_destroy = NULL; - if (static_branch_likely(&psi_disabled)) + /* + * We do not check psi_disabled since it might have been disabled after + * the trigger got created. + */ + if (!t) return; + group = t->group; /* * Wakeup waiters to stop polling. Can happen if cgroup is deleted * from under a polling process. @@ -1235,9 +1199,9 @@ static void psi_trigger_destroy(struct kref *ref) mutex_unlock(&group->trigger_lock); /* - * Wait for both *trigger_ptr from psi_trigger_replace and - * poll_task RCUs to complete their read-side critical sections - * before destroying the trigger and optionally the poll_task + * Wait for psi_schedule_poll_work RCU to complete its read-side + * critical section before destroying the trigger and optionally the + * poll_task. */ synchronize_rcu(); /* @@ -1254,18 +1218,6 @@ static void psi_trigger_destroy(struct kref *ref) kfree(t); } -void psi_trigger_replace(void **trigger_ptr, struct psi_trigger *new) -{ - struct psi_trigger *old = *trigger_ptr; - - if (static_branch_likely(&psi_disabled)) - return; - - rcu_assign_pointer(*trigger_ptr, new); - if (old) - kref_put(&old->refcount, psi_trigger_destroy); -} - __poll_t psi_trigger_poll(void **trigger_ptr, struct file *file, poll_table *wait) { @@ -1275,27 +1227,57 @@ __poll_t psi_trigger_poll(void **trigger_ptr, if (static_branch_likely(&psi_disabled)) return DEFAULT_POLLMASK | EPOLLERR | EPOLLPRI; - rcu_read_lock(); - - t = rcu_dereference(*(void __rcu __force **)trigger_ptr); - if (!t) { - rcu_read_unlock(); + t = smp_load_acquire(trigger_ptr); + if (!t) return DEFAULT_POLLMASK | EPOLLERR | EPOLLPRI; - } - kref_get(&t->refcount); - - rcu_read_unlock(); poll_wait(file, &t->event_wait, wait); if (cmpxchg(&t->event, 1, 0) == 1) ret |= EPOLLPRI; - kref_put(&t->refcount, psi_trigger_destroy); - return ret; } +#ifdef CONFIG_PROC_FS +static int psi_io_show(struct seq_file *m, void *v) +{ + return psi_show(m, &psi_system, PSI_IO); +} + +static int psi_memory_show(struct seq_file *m, void *v) +{ + return psi_show(m, &psi_system, PSI_MEM); +} + +static int psi_cpu_show(struct seq_file *m, void *v) +{ + return psi_show(m, &psi_system, PSI_CPU); +} + +static int psi_open(struct file *file, int (*psi_show)(struct seq_file *, void *)) +{ + if (file->f_mode & FMODE_WRITE && !capable(CAP_SYS_RESOURCE)) + return -EPERM; + + return single_open(file, psi_show, NULL); +} + +static int psi_io_open(struct inode *inode, struct file *file) +{ + return psi_open(file, psi_io_show); +} + +static int psi_memory_open(struct inode *inode, struct file *file) +{ + return psi_open(file, psi_memory_show); +} + +static int psi_cpu_open(struct inode *inode, struct file *file) +{ + return psi_open(file, psi_cpu_show); +} + static ssize_t psi_write(struct file *file, const char __user *user_buf, size_t nbytes, enum psi_res res) { @@ -1316,14 +1298,24 @@ static ssize_t psi_write(struct file *file, const char __user *user_buf, buf[buf_size - 1] = '\0'; - new = psi_trigger_create(&psi_system, buf, nbytes, res); - if (IS_ERR(new)) - return PTR_ERR(new); - seq = file->private_data; + /* Take seq->lock to protect seq->private from concurrent writes */ mutex_lock(&seq->lock); - psi_trigger_replace(&seq->private, new); + + /* Allow only one trigger per file descriptor */ + if (seq->private) { + mutex_unlock(&seq->lock); + return -EBUSY; + } + + new = psi_trigger_create(&psi_system, buf, nbytes, res); + if (IS_ERR(new)) { + mutex_unlock(&seq->lock); + return PTR_ERR(new); + } + + smp_store_release(&seq->private, new); mutex_unlock(&seq->lock); return nbytes; @@ -1358,7 +1350,7 @@ static int psi_fop_release(struct inode *inode, struct file *file) { struct seq_file *seq = file->private_data; - psi_trigger_replace(&seq->private, NULL); + psi_trigger_destroy(seq->private); return single_release(inode, file); } @@ -1400,3 +1392,5 @@ static int __init psi_proc_init(void) return 0; } module_init(psi_proc_init); + +#endif /* CONFIG_PROC_FS */ diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 7b4f4fbbb404..a32c46889af8 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -3,9 +3,6 @@ * Real-Time Scheduling Class (mapped to the SCHED_FIFO and SCHED_RR * policies) */ -#include "sched.h" - -#include "pelt.h" int sched_rr_timeslice = RR_TIMESLICE; int sysctl_sched_rr_timeslice = (MSEC_PER_SEC / HZ) * RR_TIMESLICE; @@ -271,8 +268,6 @@ int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent) #ifdef CONFIG_SMP -static void pull_rt_task(struct rq *this_rq); - static inline bool need_pull_rt_task(struct rq *rq, struct task_struct *prev) { /* Try to pull RT tasks here if we lower this rq's prio */ @@ -429,15 +424,6 @@ void dec_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) { } -static inline bool need_pull_rt_task(struct rq *rq, struct task_struct *prev) -{ - return false; -} - -static inline void pull_rt_task(struct rq *this_rq) -{ -} - static inline void rt_queue_push_tasks(struct rq *rq) { } @@ -1730,8 +1716,7 @@ static inline void set_next_task_rt(struct rq *rq, struct task_struct *p, bool f rt_queue_push_tasks(rq); } -static struct sched_rt_entity *pick_next_rt_entity(struct rq *rq, - struct rt_rq *rt_rq) +static struct sched_rt_entity *pick_next_rt_entity(struct rt_rq *rt_rq) { struct rt_prio_array *array = &rt_rq->active; struct sched_rt_entity *next = NULL; @@ -1753,7 +1738,7 @@ static struct task_struct *_pick_next_task_rt(struct rq *rq) struct rt_rq *rt_rq = &rq->rt; do { - rt_se = pick_next_rt_entity(rq, rt_rq); + rt_se = pick_next_rt_entity(rt_rq); BUG_ON(!rt_se); rt_rq = group_rt_rq(rt_se); } while (rt_rq); @@ -2026,6 +2011,16 @@ static int push_rt_task(struct rq *rq, bool pull) return 0; retry: + /* + * It's possible that the next_task slipped in of + * higher priority than current. If that's the case + * just reschedule current. + */ + if (unlikely(next_task->prio < rq->curr->prio)) { + resched_curr(rq); + return 0; + } + if (is_migration_disabled(next_task)) { struct task_struct *push_task = NULL; int cpu; @@ -2033,6 +2028,18 @@ retry: if (!pull || rq->push_busy) return 0; + /* + * Invoking find_lowest_rq() on anything but an RT task doesn't + * make sense. Per the above priority check, curr has to + * be of higher priority than next_task, so no need to + * reschedule when bailing out. + * + * Note that the stoppers are masqueraded as SCHED_FIFO + * (cf. sched_set_stop_task()), so we can't rely on rt_task(). + */ + if (rq->curr->sched_class != &rt_sched_class) + return 0; + cpu = find_lowest_rq(rq->curr); if (cpu == -1 || cpu == rq->cpu) return 0; @@ -2057,16 +2064,6 @@ retry: if (WARN_ON(next_task == rq->curr)) return 0; - /* - * It's possible that the next_task slipped in of - * higher priority than current. If that's the case - * just reschedule current. - */ - if (unlikely(next_task->prio < rq->curr->prio)) { - resched_curr(rq); - return 0; - } - /* We might release rq lock */ get_task_struct(next_task); diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index de53be905739..58263f90c559 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -2,86 +2,98 @@ /* * Scheduler internal types and methods: */ -#include <linux/sched.h> +#ifndef _KERNEL_SCHED_SCHED_H +#define _KERNEL_SCHED_SCHED_H +#include <linux/sched/affinity.h> #include <linux/sched/autogroup.h> -#include <linux/sched/clock.h> -#include <linux/sched/coredump.h> #include <linux/sched/cpufreq.h> -#include <linux/sched/cputime.h> #include <linux/sched/deadline.h> -#include <linux/sched/debug.h> -#include <linux/sched/hotplug.h> -#include <linux/sched/idle.h> -#include <linux/sched/init.h> -#include <linux/sched/isolation.h> -#include <linux/sched/jobctl.h> +#include <linux/sched.h> #include <linux/sched/loadavg.h> #include <linux/sched/mm.h> -#include <linux/sched/nohz.h> -#include <linux/sched/numa_balancing.h> -#include <linux/sched/prio.h> -#include <linux/sched/rt.h> +#include <linux/sched/rseq_api.h> #include <linux/sched/signal.h> #include <linux/sched/smt.h> #include <linux/sched/stat.h> #include <linux/sched/sysctl.h> +#include <linux/sched/task_flags.h> #include <linux/sched/task.h> -#include <linux/sched/task_stack.h> #include <linux/sched/topology.h> -#include <linux/sched/user.h> -#include <linux/sched/wake_q.h> -#include <linux/sched/xacct.h> -#include <uapi/linux/sched/types.h> - -#include <linux/binfmts.h> -#include <linux/bitops.h> -#include <linux/compat.h> -#include <linux/context_tracking.h> +#include <linux/atomic.h> +#include <linux/bitmap.h> +#include <linux/bug.h> +#include <linux/capability.h> +#include <linux/cgroup_api.h> +#include <linux/cgroup.h> #include <linux/cpufreq.h> -#include <linux/cpuidle.h> -#include <linux/cpuset.h> +#include <linux/cpumask_api.h> #include <linux/ctype.h> -#include <linux/debugfs.h> -#include <linux/delayacct.h> -#include <linux/energy_model.h> -#include <linux/init_task.h> -#include <linux/kprobes.h> +#include <linux/file.h> +#include <linux/fs_api.h> +#include <linux/hrtimer_api.h> +#include <linux/interrupt.h> +#include <linux/irq_work.h> +#include <linux/jiffies.h> +#include <linux/kref_api.h> #include <linux/kthread.h> -#include <linux/membarrier.h> -#include <linux/migrate.h> -#include <linux/mmu_context.h> -#include <linux/nmi.h> +#include <linux/ktime_api.h> +#include <linux/lockdep_api.h> +#include <linux/lockdep.h> +#include <linux/minmax.h> +#include <linux/mm.h> +#include <linux/module.h> +#include <linux/mutex_api.h> +#include <linux/plist.h> +#include <linux/poll.h> #include <linux/proc_fs.h> -#include <linux/prefetch.h> #include <linux/profile.h> #include <linux/psi.h> -#include <linux/ratelimit.h> -#include <linux/rcupdate_wait.h> -#include <linux/security.h> +#include <linux/rcupdate.h> +#include <linux/seq_file.h> +#include <linux/seqlock.h> +#include <linux/softirq.h> +#include <linux/spinlock_api.h> +#include <linux/static_key.h> #include <linux/stop_machine.h> -#include <linux/suspend.h> -#include <linux/swait.h> +#include <linux/syscalls_api.h> #include <linux/syscalls.h> -#include <linux/task_work.h> -#include <linux/tsacct_kern.h> +#include <linux/tick.h> +#include <linux/topology.h> +#include <linux/types.h> +#include <linux/u64_stats_sync_api.h> +#include <linux/uaccess.h> +#include <linux/wait_api.h> +#include <linux/wait_bit.h> +#include <linux/workqueue_api.h> + +#include <trace/events/power.h> +#include <trace/events/sched.h> + +#include "../workqueue_internal.h" + +#ifdef CONFIG_CGROUP_SCHED +#include <linux/cgroup.h> +#include <linux/psi.h> +#endif -#include <asm/tlb.h> +#ifdef CONFIG_SCHED_DEBUG +# include <linux/static_key.h> +#endif #ifdef CONFIG_PARAVIRT # include <asm/paravirt.h> +# include <asm/paravirt_api_clock.h> #endif #include "cpupri.h" #include "cpudeadline.h" -#include <trace/events/sched.h> - #ifdef CONFIG_SCHED_DEBUG -# define SCHED_WARN_ON(x) WARN_ONCE(x, #x) +# define SCHED_WARN_ON(x) WARN_ONCE(x, #x) #else -# define SCHED_WARN_ON(x) ({ (void)(x), 0; }) +# define SCHED_WARN_ON(x) ({ (void)(x), 0; }) #endif struct rq; @@ -301,29 +313,6 @@ struct dl_bw { u64 total_bw; }; -static inline void __dl_update(struct dl_bw *dl_b, s64 bw); - -static inline -void __dl_sub(struct dl_bw *dl_b, u64 tsk_bw, int cpus) -{ - dl_b->total_bw -= tsk_bw; - __dl_update(dl_b, (s32)tsk_bw / cpus); -} - -static inline -void __dl_add(struct dl_bw *dl_b, u64 tsk_bw, int cpus) -{ - dl_b->total_bw += tsk_bw; - __dl_update(dl_b, -((s32)tsk_bw / cpus)); -} - -static inline bool __dl_overflow(struct dl_bw *dl_b, unsigned long cap, - u64 old_bw, u64 new_bw) -{ - return dl_b->bw != -1 && - cap_scale(dl_b->bw, cap) < dl_b->total_bw - old_bw + new_bw; -} - /* * Verify the fitness of task @p to run on @cpu taking into account the * CPU original capacity and the runtime/deadline ratio of the task. @@ -347,15 +336,11 @@ extern void __setparam_dl(struct task_struct *p, const struct sched_attr *attr); extern void __getparam_dl(struct task_struct *p, struct sched_attr *attr); extern bool __checkparam_dl(const struct sched_attr *attr); extern bool dl_param_changed(struct task_struct *p, const struct sched_attr *attr); -extern int dl_task_can_attach(struct task_struct *p, const struct cpumask *cs_cpus_allowed); extern int dl_cpuset_cpumask_can_shrink(const struct cpumask *cur, const struct cpumask *trial); -extern bool dl_cpu_busy(unsigned int cpu); +extern int dl_cpu_busy(int cpu, struct task_struct *p); #ifdef CONFIG_CGROUP_SCHED -#include <linux/cgroup.h> -#include <linux/psi.h> - struct cfs_rq; struct rt_rq; @@ -1662,12 +1647,14 @@ enum numa_topology_type { extern enum numa_topology_type sched_numa_topology_type; extern int sched_max_numa_distance; extern bool find_numa_distance(int distance); -extern void sched_init_numa(void); +extern void sched_init_numa(int offline_node); +extern void sched_update_numa(int cpu, bool online); extern void sched_domains_numa_masks_set(unsigned int cpu); extern void sched_domains_numa_masks_clear(unsigned int cpu); extern int sched_numa_find_closest(const struct cpumask *cpus, int cpu); #else -static inline void sched_init_numa(void) { } +static inline void sched_init_numa(int offline_node) { } +static inline void sched_update_numa(int cpu, bool online) { } static inline void sched_domains_numa_masks_set(unsigned int cpu) { } static inline void sched_domains_numa_masks_clear(unsigned int cpu) { } static inline int sched_numa_find_closest(const struct cpumask *cpus, int cpu) @@ -1854,7 +1841,6 @@ static inline void flush_smp_call_function_from_idle(void) { } #endif #include "stats.h" -#include "autogroup.h" #if defined(CONFIG_SCHED_CORE) && defined(CONFIG_SCHEDSTATS) @@ -1950,7 +1936,6 @@ static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu) * Tunables that become constants when CONFIG_SCHED_DEBUG is off: */ #ifdef CONFIG_SCHED_DEBUG -# include <linux/static_key.h> # define const_debug __read_mostly #else # define const_debug const @@ -2331,7 +2316,6 @@ extern void resched_cpu(int cpu); extern struct rt_bandwidth def_rt_bandwidth; extern void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime); -extern struct dl_bandwidth def_dl_bandwidth; extern void init_dl_bandwidth(struct dl_bandwidth *dl_b, u64 period, u64 runtime); extern void init_dl_task_timer(struct sched_dl_entity *dl_se); extern void init_dl_inactive_task_timer(struct sched_dl_entity *dl_se); @@ -2747,32 +2731,6 @@ extern void nohz_run_idle_balance(int cpu); static inline void nohz_run_idle_balance(int cpu) { } #endif -#ifdef CONFIG_SMP -static inline -void __dl_update(struct dl_bw *dl_b, s64 bw) -{ - struct root_domain *rd = container_of(dl_b, struct root_domain, dl_bw); - int i; - - RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held(), - "sched RCU must be held"); - for_each_cpu_and(i, rd->span, cpu_active_mask) { - struct rq *rq = cpu_rq(i); - - rq->dl.extra_bw += bw; - } -} -#else -static inline -void __dl_update(struct dl_bw *dl_b, s64 bw) -{ - struct dl_rq *dl = container_of(dl_b, struct dl_rq, dl_bw); - - dl->extra_bw += bw; -} -#endif - - #ifdef CONFIG_IRQ_TIME_ACCOUNTING struct irqtime { u64 total; @@ -2841,88 +2799,6 @@ static inline void cpufreq_update_util(struct rq *rq, unsigned int flags) static inline void cpufreq_update_util(struct rq *rq, unsigned int flags) {} #endif /* CONFIG_CPU_FREQ */ -#ifdef CONFIG_UCLAMP_TASK -unsigned long uclamp_eff_value(struct task_struct *p, enum uclamp_id clamp_id); - -/** - * uclamp_rq_util_with - clamp @util with @rq and @p effective uclamp values. - * @rq: The rq to clamp against. Must not be NULL. - * @util: The util value to clamp. - * @p: The task to clamp against. Can be NULL if you want to clamp - * against @rq only. - * - * Clamps the passed @util to the max(@rq, @p) effective uclamp values. - * - * If sched_uclamp_used static key is disabled, then just return the util - * without any clamping since uclamp aggregation at the rq level in the fast - * path is disabled, rendering this operation a NOP. - * - * Use uclamp_eff_value() if you don't care about uclamp values at rq level. It - * will return the correct effective uclamp value of the task even if the - * static key is disabled. - */ -static __always_inline -unsigned long uclamp_rq_util_with(struct rq *rq, unsigned long util, - struct task_struct *p) -{ - unsigned long min_util = 0; - unsigned long max_util = 0; - - if (!static_branch_likely(&sched_uclamp_used)) - return util; - - if (p) { - min_util = uclamp_eff_value(p, UCLAMP_MIN); - max_util = uclamp_eff_value(p, UCLAMP_MAX); - - /* - * Ignore last runnable task's max clamp, as this task will - * reset it. Similarly, no need to read the rq's min clamp. - */ - if (rq->uclamp_flags & UCLAMP_FLAG_IDLE) - goto out; - } - - min_util = max_t(unsigned long, min_util, READ_ONCE(rq->uclamp[UCLAMP_MIN].value)); - max_util = max_t(unsigned long, max_util, READ_ONCE(rq->uclamp[UCLAMP_MAX].value)); -out: - /* - * Since CPU's {min,max}_util clamps are MAX aggregated considering - * RUNNABLE tasks with _different_ clamps, we can end up with an - * inversion. Fix it now when the clamps are applied. - */ - if (unlikely(min_util >= max_util)) - return min_util; - - return clamp(util, min_util, max_util); -} - -/* - * When uclamp is compiled in, the aggregation at rq level is 'turned off' - * by default in the fast path and only gets turned on once userspace performs - * an operation that requires it. - * - * Returns true if userspace opted-in to use uclamp and aggregation at rq level - * hence is active. - */ -static inline bool uclamp_is_used(void) -{ - return static_branch_likely(&sched_uclamp_used); -} -#else /* CONFIG_UCLAMP_TASK */ -static inline -unsigned long uclamp_rq_util_with(struct rq *rq, unsigned long util, - struct task_struct *p) -{ - return util; -} - -static inline bool uclamp_is_used(void) -{ - return false; -} -#endif /* CONFIG_UCLAMP_TASK */ - #ifdef arch_scale_freq_capacity # ifndef arch_scale_freq_invariant # define arch_scale_freq_invariant() true @@ -3020,6 +2896,105 @@ static inline unsigned long cpu_util_rt(struct rq *rq) } #endif +#ifdef CONFIG_UCLAMP_TASK +unsigned long uclamp_eff_value(struct task_struct *p, enum uclamp_id clamp_id); + +/** + * uclamp_rq_util_with - clamp @util with @rq and @p effective uclamp values. + * @rq: The rq to clamp against. Must not be NULL. + * @util: The util value to clamp. + * @p: The task to clamp against. Can be NULL if you want to clamp + * against @rq only. + * + * Clamps the passed @util to the max(@rq, @p) effective uclamp values. + * + * If sched_uclamp_used static key is disabled, then just return the util + * without any clamping since uclamp aggregation at the rq level in the fast + * path is disabled, rendering this operation a NOP. + * + * Use uclamp_eff_value() if you don't care about uclamp values at rq level. It + * will return the correct effective uclamp value of the task even if the + * static key is disabled. + */ +static __always_inline +unsigned long uclamp_rq_util_with(struct rq *rq, unsigned long util, + struct task_struct *p) +{ + unsigned long min_util = 0; + unsigned long max_util = 0; + + if (!static_branch_likely(&sched_uclamp_used)) + return util; + + if (p) { + min_util = uclamp_eff_value(p, UCLAMP_MIN); + max_util = uclamp_eff_value(p, UCLAMP_MAX); + + /* + * Ignore last runnable task's max clamp, as this task will + * reset it. Similarly, no need to read the rq's min clamp. + */ + if (rq->uclamp_flags & UCLAMP_FLAG_IDLE) + goto out; + } + + min_util = max_t(unsigned long, min_util, READ_ONCE(rq->uclamp[UCLAMP_MIN].value)); + max_util = max_t(unsigned long, max_util, READ_ONCE(rq->uclamp[UCLAMP_MAX].value)); +out: + /* + * Since CPU's {min,max}_util clamps are MAX aggregated considering + * RUNNABLE tasks with _different_ clamps, we can end up with an + * inversion. Fix it now when the clamps are applied. + */ + if (unlikely(min_util >= max_util)) + return min_util; + + return clamp(util, min_util, max_util); +} + +/* Is the rq being capped/throttled by uclamp_max? */ +static inline bool uclamp_rq_is_capped(struct rq *rq) +{ + unsigned long rq_util; + unsigned long max_util; + + if (!static_branch_likely(&sched_uclamp_used)) + return false; + + rq_util = cpu_util_cfs(cpu_of(rq)) + cpu_util_rt(rq); + max_util = READ_ONCE(rq->uclamp[UCLAMP_MAX].value); + + return max_util != SCHED_CAPACITY_SCALE && rq_util >= max_util; +} + +/* + * When uclamp is compiled in, the aggregation at rq level is 'turned off' + * by default in the fast path and only gets turned on once userspace performs + * an operation that requires it. + * + * Returns true if userspace opted-in to use uclamp and aggregation at rq level + * hence is active. + */ +static inline bool uclamp_is_used(void) +{ + return static_branch_likely(&sched_uclamp_used); +} +#else /* CONFIG_UCLAMP_TASK */ +static inline +unsigned long uclamp_rq_util_with(struct rq *rq, unsigned long util, + struct task_struct *p) +{ + return util; +} + +static inline bool uclamp_rq_is_capped(struct rq *rq) { return false; } + +static inline bool uclamp_is_used(void) +{ + return false; +} +#endif /* CONFIG_UCLAMP_TASK */ + #ifdef CONFIG_HAVE_SCHED_AVG_IRQ static inline unsigned long cpu_util_irq(struct rq *rq) { @@ -3118,3 +3093,4 @@ extern int sched_dynamic_mode(const char *str); extern void sched_dynamic_update(int mode); #endif +#endif /* _KERNEL_SCHED_SCHED_H */ diff --git a/kernel/sched/stats.c b/kernel/sched/stats.c index 07dde2928c79..857f837f52cb 100644 --- a/kernel/sched/stats.c +++ b/kernel/sched/stats.c @@ -2,7 +2,6 @@ /* * /proc/schedstat implementation */ -#include "sched.h" void __update_stats_wait_start(struct rq *rq, struct task_struct *p, struct sched_statistics *stats) diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h index 3a3c826dd83a..baa839c1ba96 100644 --- a/kernel/sched/stats.h +++ b/kernel/sched/stats.h @@ -1,4 +1,6 @@ /* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _KERNEL_STATS_H +#define _KERNEL_STATS_H #ifdef CONFIG_SCHEDSTATS @@ -298,3 +300,5 @@ sched_info_switch(struct rq *rq, struct task_struct *prev, struct task_struct *n # define sched_info_dequeue(rq, t) do { } while (0) # define sched_info_switch(rq, t, next) do { } while (0) #endif /* CONFIG_SCHED_INFO */ + +#endif /* _KERNEL_STATS_H */ diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c index 0b165a25f22f..d04073a93eb4 100644 --- a/kernel/sched/stop_task.c +++ b/kernel/sched/stop_task.c @@ -7,7 +7,6 @@ * * See kernel/stop_machine.c */ -#include "sched.h" #ifdef CONFIG_SMP static int diff --git a/kernel/sched/swait.c b/kernel/sched/swait.c index e1c655f928c7..76b9b796e695 100644 --- a/kernel/sched/swait.c +++ b/kernel/sched/swait.c @@ -2,7 +2,6 @@ /* * <linux/swait.h> (simple wait queues ) implementation: */ -#include "sched.h" void __init_swait_queue_head(struct swait_queue_head *q, const char *name, struct lock_class_key *key) diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index d201a7052a29..810750e62118 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -2,7 +2,6 @@ /* * Scheduler topology setup/handling methods */ -#include "sched.h" DEFINE_MUTEX(sched_domains_mutex); @@ -74,7 +73,7 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, break; } - if (!cpumask_weight(sched_group_span(group))) { + if (cpumask_empty(sched_group_span(group))) { printk(KERN_CONT "\n"); printk(KERN_ERR "ERROR: empty group\n"); break; @@ -1366,7 +1365,7 @@ static void asym_cpu_capacity_scan(void) list_for_each_entry(entry, &asym_cap_list, link) cpumask_clear(cpu_capacity_span(entry)); - for_each_cpu_and(cpu, cpu_possible_mask, housekeeping_cpumask(HK_FLAG_DOMAIN)) + for_each_cpu_and(cpu, cpu_possible_mask, housekeeping_cpumask(HK_TYPE_DOMAIN)) asym_cpu_capacity_update_data(cpu); list_for_each_entry_safe(entry, next, &asym_cap_list, link) { @@ -1492,8 +1491,6 @@ static int sched_domains_curr_level; int sched_max_numa_distance; static int *sched_domains_numa_distance; static struct cpumask ***sched_domains_numa_masks; - -static unsigned long __read_mostly *sched_numa_onlined_nodes; #endif /* @@ -1651,6 +1648,7 @@ static struct sched_domain_topology_level default_topology[] = { static struct sched_domain_topology_level *sched_domain_topology = default_topology; +static struct sched_domain_topology_level *sched_domain_topology_saved; #define for_each_sd_topology(tl) \ for (tl = sched_domain_topology; tl->mask; tl++) @@ -1661,6 +1659,7 @@ void set_sched_topology(struct sched_domain_topology_level *tl) return; sched_domain_topology = tl; + sched_domain_topology_saved = NULL; } #ifdef CONFIG_NUMA @@ -1684,8 +1683,12 @@ static void sched_numa_warn(const char *str) for (i = 0; i < nr_node_ids; i++) { printk(KERN_WARNING " "); - for (j = 0; j < nr_node_ids; j++) - printk(KERN_CONT "%02d ", node_distance(i,j)); + for (j = 0; j < nr_node_ids; j++) { + if (!node_state(i, N_CPU) || !node_state(j, N_CPU)) + printk(KERN_CONT "(%02d) ", node_distance(i,j)); + else + printk(KERN_CONT " %02d ", node_distance(i,j)); + } printk(KERN_CONT "\n"); } printk(KERN_WARNING "\n"); @@ -1693,19 +1696,34 @@ static void sched_numa_warn(const char *str) bool find_numa_distance(int distance) { - int i; + bool found = false; + int i, *distances; if (distance == node_distance(0, 0)) return true; + rcu_read_lock(); + distances = rcu_dereference(sched_domains_numa_distance); + if (!distances) + goto unlock; for (i = 0; i < sched_domains_numa_levels; i++) { - if (sched_domains_numa_distance[i] == distance) - return true; + if (distances[i] == distance) { + found = true; + break; + } } +unlock: + rcu_read_unlock(); - return false; + return found; } +#define for_each_cpu_node_but(n, nbut) \ + for_each_node_state(n, N_CPU) \ + if (n == nbut) \ + continue; \ + else + /* * A system can have three types of NUMA topology: * NUMA_DIRECT: all nodes are directly connected, or not a NUMA system @@ -1725,7 +1743,7 @@ bool find_numa_distance(int distance) * there is an intermediary node C, which is < N hops away from both * nodes A and B, the system is a glueless mesh. */ -static void init_numa_topology_type(void) +static void init_numa_topology_type(int offline_node) { int a, b, c, n; @@ -1736,14 +1754,14 @@ static void init_numa_topology_type(void) return; } - for_each_online_node(a) { - for_each_online_node(b) { + for_each_cpu_node_but(a, offline_node) { + for_each_cpu_node_but(b, offline_node) { /* Find two nodes furthest removed from each other. */ if (node_distance(a, b) < n) continue; /* Is there an intermediary node between a and b? */ - for_each_online_node(c) { + for_each_cpu_node_but(c, offline_node) { if (node_distance(a, c) < n && node_distance(b, c) < n) { sched_numa_topology_type = @@ -1756,17 +1774,22 @@ static void init_numa_topology_type(void) return; } } + + pr_err("Failed to find a NUMA topology type, defaulting to DIRECT\n"); + sched_numa_topology_type = NUMA_DIRECT; } #define NR_DISTANCE_VALUES (1 << DISTANCE_BITS) -void sched_init_numa(void) +void sched_init_numa(int offline_node) { struct sched_domain_topology_level *tl; unsigned long *distance_map; int nr_levels = 0; int i, j; + int *distances; + struct cpumask ***masks; /* * O(nr_nodes^2) deduplicating selection sort -- in order to find the @@ -1777,12 +1800,13 @@ void sched_init_numa(void) return; bitmap_zero(distance_map, NR_DISTANCE_VALUES); - for (i = 0; i < nr_node_ids; i++) { - for (j = 0; j < nr_node_ids; j++) { + for_each_cpu_node_but(i, offline_node) { + for_each_cpu_node_but(j, offline_node) { int distance = node_distance(i, j); if (distance < LOCAL_DISTANCE || distance >= NR_DISTANCE_VALUES) { sched_numa_warn("Invalid distance value range"); + bitmap_free(distance_map); return; } @@ -1795,16 +1819,17 @@ void sched_init_numa(void) */ nr_levels = bitmap_weight(distance_map, NR_DISTANCE_VALUES); - sched_domains_numa_distance = kcalloc(nr_levels, sizeof(int), GFP_KERNEL); - if (!sched_domains_numa_distance) { + distances = kcalloc(nr_levels, sizeof(int), GFP_KERNEL); + if (!distances) { bitmap_free(distance_map); return; } for (i = 0, j = 0; i < nr_levels; i++, j++) { j = find_next_bit(distance_map, NR_DISTANCE_VALUES, j); - sched_domains_numa_distance[i] = j; + distances[i] = j; } + rcu_assign_pointer(sched_domains_numa_distance, distances); bitmap_free(distance_map); @@ -1826,8 +1851,8 @@ void sched_init_numa(void) */ sched_domains_numa_levels = 0; - sched_domains_numa_masks = kzalloc(sizeof(void *) * nr_levels, GFP_KERNEL); - if (!sched_domains_numa_masks) + masks = kzalloc(sizeof(void *) * nr_levels, GFP_KERNEL); + if (!masks) return; /* @@ -1835,31 +1860,20 @@ void sched_init_numa(void) * CPUs of nodes that are that many hops away from us. */ for (i = 0; i < nr_levels; i++) { - sched_domains_numa_masks[i] = - kzalloc(nr_node_ids * sizeof(void *), GFP_KERNEL); - if (!sched_domains_numa_masks[i]) + masks[i] = kzalloc(nr_node_ids * sizeof(void *), GFP_KERNEL); + if (!masks[i]) return; - for (j = 0; j < nr_node_ids; j++) { + for_each_cpu_node_but(j, offline_node) { struct cpumask *mask = kzalloc(cpumask_size(), GFP_KERNEL); int k; if (!mask) return; - sched_domains_numa_masks[i][j] = mask; - - for_each_node(k) { - /* - * Distance information can be unreliable for - * offline nodes, defer building the node - * masks to its bringup. - * This relies on all unique distance values - * still being visible at init time. - */ - if (!node_online(j)) - continue; + masks[i][j] = mask; + for_each_cpu_node_but(k, offline_node) { if (sched_debug() && (node_distance(j, k) != node_distance(k, j))) sched_numa_warn("Node-distance not symmetric"); @@ -1870,6 +1884,7 @@ void sched_init_numa(void) } } } + rcu_assign_pointer(sched_domains_numa_masks, masks); /* Compute default topology size */ for (i = 0; sched_domain_topology[i].mask; i++); @@ -1907,59 +1922,67 @@ void sched_init_numa(void) }; } + sched_domain_topology_saved = sched_domain_topology; sched_domain_topology = tl; sched_domains_numa_levels = nr_levels; - sched_max_numa_distance = sched_domains_numa_distance[nr_levels - 1]; + WRITE_ONCE(sched_max_numa_distance, sched_domains_numa_distance[nr_levels - 1]); - init_numa_topology_type(); - - sched_numa_onlined_nodes = bitmap_alloc(nr_node_ids, GFP_KERNEL); - if (!sched_numa_onlined_nodes) - return; - - bitmap_zero(sched_numa_onlined_nodes, nr_node_ids); - for_each_online_node(i) - bitmap_set(sched_numa_onlined_nodes, i, 1); + init_numa_topology_type(offline_node); } -static void __sched_domains_numa_masks_set(unsigned int node) -{ - int i, j; - - /* - * NUMA masks are not built for offline nodes in sched_init_numa(). - * Thus, when a CPU of a never-onlined-before node gets plugged in, - * adding that new CPU to the right NUMA masks is not sufficient: the - * masks of that CPU's node must also be updated. - */ - if (test_bit(node, sched_numa_onlined_nodes)) - return; - bitmap_set(sched_numa_onlined_nodes, node, 1); - - for (i = 0; i < sched_domains_numa_levels; i++) { - for (j = 0; j < nr_node_ids; j++) { - if (!node_online(j) || node == j) - continue; +static void sched_reset_numa(void) +{ + int nr_levels, *distances; + struct cpumask ***masks; - if (node_distance(j, node) > sched_domains_numa_distance[i]) + nr_levels = sched_domains_numa_levels; + sched_domains_numa_levels = 0; + sched_max_numa_distance = 0; + sched_numa_topology_type = NUMA_DIRECT; + distances = sched_domains_numa_distance; + rcu_assign_pointer(sched_domains_numa_distance, NULL); + masks = sched_domains_numa_masks; + rcu_assign_pointer(sched_domains_numa_masks, NULL); + if (distances || masks) { + int i, j; + + synchronize_rcu(); + kfree(distances); + for (i = 0; i < nr_levels && masks; i++) { + if (!masks[i]) continue; - - /* Add remote nodes in our masks */ - cpumask_or(sched_domains_numa_masks[i][node], - sched_domains_numa_masks[i][node], - sched_domains_numa_masks[0][j]); + for_each_node(j) + kfree(masks[i][j]); + kfree(masks[i]); } + kfree(masks); + } + if (sched_domain_topology_saved) { + kfree(sched_domain_topology); + sched_domain_topology = sched_domain_topology_saved; + sched_domain_topology_saved = NULL; } +} +/* + * Call with hotplug lock held + */ +void sched_update_numa(int cpu, bool online) +{ + int node; + + node = cpu_to_node(cpu); /* - * A new node has been brought up, potentially changing the topology - * classification. - * - * Note that this is racy vs any use of sched_numa_topology_type :/ + * Scheduler NUMA topology is updated when the first CPU of a + * node is onlined or the last CPU of a node is offlined. */ - init_numa_topology_type(); + if (cpumask_weight(cpumask_of_node(node)) != 1) + return; + + sched_reset_numa(); + sched_init_numa(online ? NUMA_NO_NODE : node); } void sched_domains_numa_masks_set(unsigned int cpu) @@ -1967,11 +1990,9 @@ void sched_domains_numa_masks_set(unsigned int cpu) int node = cpu_to_node(cpu); int i, j; - __sched_domains_numa_masks_set(node); - for (i = 0; i < sched_domains_numa_levels; i++) { for (j = 0; j < nr_node_ids; j++) { - if (!node_online(j)) + if (!node_state(j, N_CPU)) continue; /* Set ourselves in the remote node's masks */ @@ -1986,8 +2007,10 @@ void sched_domains_numa_masks_clear(unsigned int cpu) int i, j; for (i = 0; i < sched_domains_numa_levels; i++) { - for (j = 0; j < nr_node_ids; j++) - cpumask_clear_cpu(cpu, sched_domains_numa_masks[i][j]); + for (j = 0; j < nr_node_ids; j++) { + if (sched_domains_numa_masks[i][j]) + cpumask_clear_cpu(cpu, sched_domains_numa_masks[i][j]); + } } } @@ -2001,14 +2024,26 @@ void sched_domains_numa_masks_clear(unsigned int cpu) */ int sched_numa_find_closest(const struct cpumask *cpus, int cpu) { - int i, j = cpu_to_node(cpu); + int i, j = cpu_to_node(cpu), found = nr_cpu_ids; + struct cpumask ***masks; + rcu_read_lock(); + masks = rcu_dereference(sched_domains_numa_masks); + if (!masks) + goto unlock; for (i = 0; i < sched_domains_numa_levels; i++) { - cpu = cpumask_any_and(cpus, sched_domains_numa_masks[i][j]); - if (cpu < nr_cpu_ids) - return cpu; + if (!masks[i][j]) + break; + cpu = cpumask_any_and(cpus, masks[i][j]); + if (cpu < nr_cpu_ids) { + found = cpu; + break; + } } - return nr_cpu_ids; +unlock: + rcu_read_unlock(); + + return found; } #endif /* CONFIG_NUMA */ @@ -2242,6 +2277,57 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att } } + /* + * Calculate an allowed NUMA imbalance such that LLCs do not get + * imbalanced. + */ + for_each_cpu(i, cpu_map) { + unsigned int imb = 0; + unsigned int imb_span = 1; + + for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) { + struct sched_domain *child = sd->child; + + if (!(sd->flags & SD_SHARE_PKG_RESOURCES) && child && + (child->flags & SD_SHARE_PKG_RESOURCES)) { + struct sched_domain __rcu *top_p; + unsigned int nr_llcs; + + /* + * For a single LLC per node, allow an + * imbalance up to 25% of the node. This is an + * arbitrary cutoff based on SMT-2 to balance + * between memory bandwidth and avoiding + * premature sharing of HT resources and SMT-4 + * or SMT-8 *may* benefit from a different + * cutoff. + * + * For multiple LLCs, allow an imbalance + * until multiple tasks would share an LLC + * on one node while LLCs on another node + * remain idle. + */ + nr_llcs = sd->span_weight / child->span_weight; + if (nr_llcs == 1) + imb = sd->span_weight >> 2; + else + imb = nr_llcs; + sd->imb_numa_nr = imb; + + /* Set span based on the first NUMA domain. */ + top_p = sd->parent; + while (top_p && !(top_p->flags & SD_NUMA)) { + top_p = top_p->parent; + } + imb_span = top_p ? top_p->span_weight : sd->span_weight; + } else { + int factor = max(1U, (sd->span_weight / imb_span)); + + sd->imb_numa_nr = imb * factor; + } + } + } + /* Calculate CPU capacity for physical packages and nodes */ for (i = nr_cpumask_bits-1; i >= 0; i--) { if (!cpumask_test_cpu(i, cpu_map)) @@ -2351,7 +2437,7 @@ int sched_init_domains(const struct cpumask *cpu_map) doms_cur = alloc_sched_domains(ndoms_cur); if (!doms_cur) doms_cur = &fallback_doms; - cpumask_and(doms_cur[0], cpu_map, housekeeping_cpumask(HK_FLAG_DOMAIN)); + cpumask_and(doms_cur[0], cpu_map, housekeeping_cpumask(HK_TYPE_DOMAIN)); err = build_sched_domains(doms_cur[0], NULL); return err; @@ -2440,7 +2526,7 @@ void partition_sched_domains_locked(int ndoms_new, cpumask_var_t doms_new[], if (doms_new) { n = 1; cpumask_and(doms_new[0], cpu_active_mask, - housekeeping_cpumask(HK_FLAG_DOMAIN)); + housekeeping_cpumask(HK_TYPE_DOMAIN)); } } else { n = ndoms_new; @@ -2475,7 +2561,7 @@ match1: n = 0; doms_new = &fallback_doms; cpumask_and(doms_new[0], cpu_active_mask, - housekeeping_cpumask(HK_FLAG_DOMAIN)); + housekeeping_cpumask(HK_TYPE_DOMAIN)); } /* Build new domains: */ diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c index eca38107b32f..9860bb9a847c 100644 --- a/kernel/sched/wait.c +++ b/kernel/sched/wait.c @@ -4,7 +4,6 @@ * * (C) 2004 Nadia Yvette Chambers, Oracle */ -#include "sched.h" void __init_waitqueue_head(struct wait_queue_head *wq_head, const char *name, struct lock_class_key *key) { diff --git a/kernel/sched/wait_bit.c b/kernel/sched/wait_bit.c index 02ce292b9bc0..d4788f810b55 100644 --- a/kernel/sched/wait_bit.c +++ b/kernel/sched/wait_bit.c @@ -1,8 +1,8 @@ // SPDX-License-Identifier: GPL-2.0-only + /* * The implementation of the wait_bit*() and related waiting APIs: */ -#include "sched.h" #define WAIT_TABLE_BITS 8 #define WAIT_TABLE_SIZE (1 << WAIT_TABLE_BITS) diff --git a/kernel/scs.c b/kernel/scs.c index 579841be8864..b7e1b096d906 100644 --- a/kernel/scs.c +++ b/kernel/scs.c @@ -32,15 +32,19 @@ static void *__scs_alloc(int node) for (i = 0; i < NR_CACHED_SCS; i++) { s = this_cpu_xchg(scs_cache[i], NULL); if (s) { - kasan_unpoison_vmalloc(s, SCS_SIZE); + s = kasan_unpoison_vmalloc(s, SCS_SIZE, + KASAN_VMALLOC_PROT_NORMAL); memset(s, 0, SCS_SIZE); - return s; + goto out; } } - return __vmalloc_node_range(SCS_SIZE, 1, VMALLOC_START, VMALLOC_END, + s = __vmalloc_node_range(SCS_SIZE, 1, VMALLOC_START, VMALLOC_END, GFP_SCS, PAGE_KERNEL, 0, node, __builtin_return_address(0)); + +out: + return kasan_reset_tag(s); } void *scs_alloc(int node) @@ -78,7 +82,7 @@ void scs_free(void *s) if (this_cpu_cmpxchg(scs_cache[i], 0, s) == NULL) return; - kasan_unpoison_vmalloc(s, SCS_SIZE); + kasan_unpoison_vmalloc(s, SCS_SIZE, KASAN_VMALLOC_PROT_NORMAL); vfree_atomic(s); } diff --git a/kernel/seccomp.c b/kernel/seccomp.c index 4d8f44a17727..db10e73d06e0 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -29,6 +29,9 @@ #include <linux/syscalls.h> #include <linux/sysctl.h> +/* Not exposed in headers: strictly internal use only. */ +#define SECCOMP_MODE_DEAD (SECCOMP_MODE_FILTER + 1) + #ifdef CONFIG_HAVE_ARCH_SECCOMP_FILTER #include <asm/syscall.h> #endif @@ -1010,6 +1013,7 @@ static void __secure_computing_strict(int this_syscall) #ifdef SECCOMP_DEBUG dump_stack(); #endif + current->seccomp.mode = SECCOMP_MODE_DEAD; seccomp_log(this_syscall, SIGKILL, SECCOMP_RET_KILL_THREAD, true); do_exit(SIGKILL); } @@ -1261,6 +1265,7 @@ static int __seccomp_filter(int this_syscall, const struct seccomp_data *sd, case SECCOMP_RET_KILL_THREAD: case SECCOMP_RET_KILL_PROCESS: default: + current->seccomp.mode = SECCOMP_MODE_DEAD; seccomp_log(this_syscall, SIGSYS, action, true); /* Dump core only if this is the last remaining thread. */ if (action != SECCOMP_RET_KILL_THREAD || @@ -1309,6 +1314,11 @@ int __secure_computing(const struct seccomp_data *sd) return 0; case SECCOMP_MODE_FILTER: return __seccomp_filter(this_syscall, sd, false); + /* Surviving SECCOMP_RET_KILL_* must be proactively impossible. */ + case SECCOMP_MODE_DEAD: + WARN_ON_ONCE(1); + do_exit(SIGKILL); + return -1; default: BUG(); } diff --git a/kernel/signal.c b/kernel/signal.c index dfcee3888b00..e93de6daa188 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -626,7 +626,8 @@ static int __dequeue_signal(struct sigpending *pending, sigset_t *mask, * * All callers have to hold the siglock. */ -int dequeue_signal(struct task_struct *tsk, sigset_t *mask, kernel_siginfo_t *info) +int dequeue_signal(struct task_struct *tsk, sigset_t *mask, + kernel_siginfo_t *info, enum pid_type *type) { bool resched_timer = false; int signr; @@ -634,8 +635,10 @@ int dequeue_signal(struct task_struct *tsk, sigset_t *mask, kernel_siginfo_t *in /* We only dequeue private signals from ourselves, we don't let * signalfd steal them */ + *type = PIDTYPE_PID; signr = __dequeue_signal(&tsk->pending, mask, info, &resched_timer); if (!signr) { + *type = PIDTYPE_TGID; signr = __dequeue_signal(&tsk->signal->shared_pending, mask, info, &resched_timer); #ifdef CONFIG_POSIX_TIMERS @@ -903,8 +906,8 @@ static bool prepare_signal(int sig, struct task_struct *p, bool force) struct task_struct *t; sigset_t flush; - if (signal->flags & (SIGNAL_GROUP_EXIT | SIGNAL_GROUP_COREDUMP)) { - if (!(signal->flags & SIGNAL_GROUP_EXIT)) + if (signal->flags & SIGNAL_GROUP_EXIT) { + if (signal->core_state) return sig == SIGKILL; /* * The process is in the middle of dying, nothing to do. @@ -1029,7 +1032,7 @@ static void complete_signal(int sig, struct task_struct *p, enum pid_type type) * then start taking the whole group down immediately. */ if (sig_fatal(p, sig) && - !(signal->flags & SIGNAL_GROUP_EXIT) && + (signal->core_state || !(signal->flags & SIGNAL_GROUP_EXIT)) && !sigismember(&t->real_blocked, sig) && (sig == SIGKILL || !p->ptrace)) { /* @@ -1305,6 +1308,43 @@ enum sig_handler { }; /* + * On some archictectures, PREEMPT_RT has to delay sending a signal from a + * trap since it cannot enable preemption, and the signal code's + * spin_locks turn into mutexes. Instead, it must set TIF_NOTIFY_RESUME + * which will send the signal on exit of the trap. + */ +#ifdef CONFIG_RT_DELAYED_SIGNALS +static inline bool force_sig_delayed(struct kernel_siginfo *info, + struct task_struct *t) +{ + if (!in_atomic()) + return false; + + if (WARN_ON_ONCE(t->forced_info.si_signo)) + return true; + + if (is_si_special(info)) { + WARN_ON_ONCE(info != SEND_SIG_PRIV); + t->forced_info.si_signo = info->si_signo; + t->forced_info.si_errno = 0; + t->forced_info.si_code = SI_KERNEL; + t->forced_info.si_pid = 0; + t->forced_info.si_uid = 0; + } else { + t->forced_info = *info; + } + set_tsk_thread_flag(t, TIF_NOTIFY_RESUME); + return true; +} +#else +static inline bool force_sig_delayed(struct kernel_siginfo *info, + struct task_struct *t) +{ + return false; +} +#endif + +/* * Force a signal that the process can't ignore: if necessary * we unblock the signal and change any SIG_IGN to SIG_DFL. * @@ -1324,6 +1364,9 @@ force_sig_info_to_task(struct kernel_siginfo *info, struct task_struct *t, struct k_sigaction *action; int sig = info->si_signo; + if (force_sig_delayed(info, t)) + return 0; + spin_lock_irqsave(&t->sighand->siglock, flags); action = &t->sighand->action[sig-1]; ignored = action->sa.sa_handler == SIG_IGN; @@ -1339,9 +1382,10 @@ force_sig_info_to_task(struct kernel_siginfo *info, struct task_struct *t, } /* * Don't clear SIGNAL_UNKILLABLE for traced tasks, users won't expect - * debugging to leave init killable. + * debugging to leave init killable. But HANDLER_EXIT is always fatal. */ - if (action->sa.sa_handler == SIG_DFL && !t->ptrace) + if (action->sa.sa_handler == SIG_DFL && + (!t->ptrace || (handler == HANDLER_EXIT))) t->signal->flags &= ~SIGNAL_UNKILLABLE; ret = send_signal(sig, info, t, PIDTYPE_PID); spin_unlock_irqrestore(&t->sighand->siglock, flags); @@ -1820,6 +1864,7 @@ int force_sig_perf(void __user *addr, u32 type, u64 sig_data) * force_sig_seccomp - signals the task to allow in-process syscall emulation * @syscall: syscall number to send to userland * @reason: filter-supplied reason code to send to userland (via si_errno) + * @force_coredump: true to trigger a coredump * * Forces a SIGSYS with a code of SYS_SECCOMP and related sigsys info. */ @@ -2383,7 +2428,8 @@ static bool do_signal_stop(int signr) WARN_ON_ONCE(signr & ~JOBCTL_STOP_SIGMASK); if (!likely(current->jobctl & JOBCTL_STOP_DEQUEUED) || - unlikely(signal_group_exit(sig))) + unlikely(sig->flags & SIGNAL_GROUP_EXIT) || + unlikely(sig->group_exec_task)) return false; /* * There is no group stop already in progress. We must @@ -2544,7 +2590,7 @@ static void do_freezer_trap(void) freezable_schedule(); } -static int ptrace_signal(int signr, kernel_siginfo_t *info) +static int ptrace_signal(int signr, kernel_siginfo_t *info, enum pid_type type) { /* * We do not check sig_kernel_stop(signr) but set this marker @@ -2584,8 +2630,9 @@ static int ptrace_signal(int signr, kernel_siginfo_t *info) } /* If the (new) signal is now blocked, requeue it. */ - if (sigismember(¤t->blocked, signr)) { - send_signal(signr, info, current, PIDTYPE_PID); + if (sigismember(¤t->blocked, signr) || + fatal_signal_pending(current)) { + send_signal(signr, info, current, type); signr = 0; } @@ -2684,18 +2731,20 @@ relock: goto relock; } - /* Has this task already been marked for death? */ - if (signal_group_exit(signal)) { - ksig->info.si_signo = signr = SIGKILL; - sigdelset(¤t->pending.signal, SIGKILL); - trace_signal_deliver(SIGKILL, SEND_SIG_NOINFO, - &sighand->action[SIGKILL - 1]); - recalc_sigpending(); - goto fatal; - } - for (;;) { struct k_sigaction *ka; + enum pid_type type; + + /* Has this task already been marked for death? */ + if ((signal->flags & SIGNAL_GROUP_EXIT) || + signal->group_exec_task) { + ksig->info.si_signo = signr = SIGKILL; + sigdelset(¤t->pending.signal, SIGKILL); + trace_signal_deliver(SIGKILL, SEND_SIG_NOINFO, + &sighand->action[SIGKILL - 1]); + recalc_sigpending(); + goto fatal; + } if (unlikely(current->jobctl & JOBCTL_STOP_PENDING) && do_signal_stop(0)) @@ -2728,16 +2777,18 @@ relock: * so that the instruction pointer in the signal stack * frame points to the faulting instruction. */ + type = PIDTYPE_PID; signr = dequeue_synchronous_signal(&ksig->info); if (!signr) - signr = dequeue_signal(current, ¤t->blocked, &ksig->info); + signr = dequeue_signal(current, ¤t->blocked, + &ksig->info, &type); if (!signr) break; /* will return 0 */ if (unlikely(current->ptrace) && (signr != SIGKILL) && !(sighand->action[signr -1].sa.sa_flags & SA_IMMUTABLE)) { - signr = ptrace_signal(signr, &ksig->info); + signr = ptrace_signal(signr, &ksig->info, type); if (!signr) continue; } @@ -2863,13 +2914,13 @@ out: } /** - * signal_delivered - + * signal_delivered - called after signal delivery to update blocked signals * @ksig: kernel signal struct * @stepping: nonzero if debugger single-step or block-step in use * * This function should be called when a signal has successfully been * delivered. It updates the blocked signals accordingly (@ksig->ka.sa.sa_mask - * is always blocked, and the signal itself is blocked unless %SA_NODEFER + * is always blocked), and the signal itself is blocked unless %SA_NODEFER * is set in @ksig->ka.sa.sa_flags. Tracing is notified. */ static void signal_delivered(struct ksignal *ksig, int stepping) @@ -2942,7 +2993,7 @@ void exit_signals(struct task_struct *tsk) */ cgroup_threadgroup_change_begin(tsk); - if (thread_group_empty(tsk) || signal_group_exit(tsk->signal)) { + if (thread_group_empty(tsk) || (tsk->signal->flags & SIGNAL_GROUP_EXIT)) { tsk->flags |= PF_EXITING; cgroup_threadgroup_change_end(tsk); return; @@ -3562,6 +3613,7 @@ static int do_sigtimedwait(const sigset_t *which, kernel_siginfo_t *info, ktime_t *to = NULL, timeout = KTIME_MAX; struct task_struct *tsk = current; sigset_t mask = *which; + enum pid_type type; int sig, ret = 0; if (ts) { @@ -3578,7 +3630,7 @@ static int do_sigtimedwait(const sigset_t *which, kernel_siginfo_t *info, signotset(&mask); spin_lock_irq(&tsk->sighand->siglock); - sig = dequeue_signal(tsk, &mask, info); + sig = dequeue_signal(tsk, &mask, info, &type); if (!sig && timeout) { /* * None ready, temporarily unblock those we're interested @@ -3597,7 +3649,7 @@ static int do_sigtimedwait(const sigset_t *which, kernel_siginfo_t *info, spin_lock_irq(&tsk->sighand->siglock); __set_task_blocked(tsk, &tsk->real_blocked); sigemptyset(&tsk->real_blocked); - sig = dequeue_signal(tsk, &mask, info); + sig = dequeue_signal(tsk, &mask, info, &type); } spin_unlock_irq(&tsk->sighand->siglock); diff --git a/kernel/softirq.c b/kernel/softirq.c index 41f470929e99..fac801815554 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -222,7 +222,7 @@ void __local_bh_enable_ip(unsigned long ip, unsigned int cnt) u32 pending; int curcnt; - WARN_ON_ONCE(in_irq()); + WARN_ON_ONCE(in_hardirq()); lockdep_assert_irqs_enabled(); local_irq_save(flags); @@ -305,7 +305,7 @@ void __local_bh_disable_ip(unsigned long ip, unsigned int cnt) { unsigned long flags; - WARN_ON_ONCE(in_irq()); + WARN_ON_ONCE(in_hardirq()); raw_local_irq_save(flags); /* @@ -352,14 +352,14 @@ static void __local_bh_enable(unsigned int cnt) */ void _local_bh_enable(void) { - WARN_ON_ONCE(in_irq()); + WARN_ON_ONCE(in_hardirq()); __local_bh_enable(SOFTIRQ_DISABLE_OFFSET); } EXPORT_SYMBOL(_local_bh_enable); void __local_bh_enable_ip(unsigned long ip, unsigned int cnt) { - WARN_ON_ONCE(in_irq()); + WARN_ON_ONCE(in_hardirq()); lockdep_assert_irqs_enabled(); #ifdef CONFIG_TRACE_IRQFLAGS local_irq_disable(); @@ -618,7 +618,7 @@ static inline void tick_irq_exit(void) /* Make sure that timer wheel updates are propagated */ if ((idle_cpu(cpu) && !need_resched()) || tick_nohz_full_cpu(cpu)) { - if (!in_irq()) + if (!in_hardirq()) tick_nohz_irq_exit(); } #endif diff --git a/kernel/stackleak.c b/kernel/stackleak.c index ce161a8e8d97..ddb5a7f48d69 100644 --- a/kernel/stackleak.c +++ b/kernel/stackleak.c @@ -16,11 +16,13 @@ #ifdef CONFIG_STACKLEAK_RUNTIME_DISABLE #include <linux/jump_label.h> #include <linux/sysctl.h> +#include <linux/init.h> static DEFINE_STATIC_KEY_FALSE(stack_erasing_bypass); -int stack_erasing_sysctl(struct ctl_table *table, int write, - void *buffer, size_t *lenp, loff_t *ppos) +#ifdef CONFIG_SYSCTL +static int stack_erasing_sysctl(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) { int ret = 0; int state = !static_branch_unlikely(&stack_erasing_bypass); @@ -42,13 +44,33 @@ int stack_erasing_sysctl(struct ctl_table *table, int write, state ? "enabled" : "disabled"); return ret; } +static struct ctl_table stackleak_sysctls[] = { + { + .procname = "stack_erasing", + .data = NULL, + .maxlen = sizeof(int), + .mode = 0600, + .proc_handler = stack_erasing_sysctl, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, + {} +}; + +static int __init stackleak_sysctls_init(void) +{ + register_sysctl_init("kernel", stackleak_sysctls); + return 0; +} +late_initcall(stackleak_sysctls_init); +#endif /* CONFIG_SYSCTL */ #define skip_erasing() static_branch_unlikely(&stack_erasing_bypass) #else #define skip_erasing() false #endif /* CONFIG_STACKLEAK_RUNTIME_DISABLE */ -asmlinkage void notrace stackleak_erase(void) +asmlinkage void noinstr stackleak_erase(void) { /* It would be nice not to have 'kstack_ptr' and 'boundary' on stack */ unsigned long kstack_ptr = current->lowest_stack; @@ -102,9 +124,8 @@ asmlinkage void notrace stackleak_erase(void) /* Reset the 'lowest_stack' value for the next syscall */ current->lowest_stack = current_top_of_stack() - THREAD_SIZE/64; } -NOKPROBE_SYMBOL(stackleak_erase); -void __used __no_caller_saved_registers notrace stackleak_track_stack(void) +void __used __no_caller_saved_registers noinstr stackleak_track_stack(void) { unsigned long sp = current_stack_pointer; diff --git a/kernel/stacktrace.c b/kernel/stacktrace.c index 9c625257023d..9ed5ce989415 100644 --- a/kernel/stacktrace.c +++ b/kernel/stacktrace.c @@ -226,15 +226,12 @@ unsigned int stack_trace_save_user(unsigned long *store, unsigned int size) .store = store, .size = size, }; - mm_segment_t fs; /* Trace user stack if not a kernel thread */ if (current->flags & PF_KTHREAD) return 0; - fs = force_uaccess_begin(); arch_stack_walk_user(consume_entry, &c, task_pt_regs(current)); - force_uaccess_end(fs); return c.len; } diff --git a/kernel/static_call.c b/kernel/static_call.c index 43ba0b1e0edb..f2b8baea35d2 100644 --- a/kernel/static_call.c +++ b/kernel/static_call.c @@ -503,6 +503,7 @@ long __static_call_return0(void) { return 0; } +EXPORT_SYMBOL_GPL(__static_call_return0); #ifdef CONFIG_STATIC_CALL_SELFTEST diff --git a/kernel/sys.c b/kernel/sys.c index 2450a9f33cb0..374f83e95239 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -7,6 +7,7 @@ #include <linux/export.h> #include <linux/mm.h> +#include <linux/mm_inline.h> #include <linux/utsname.h> #include <linux/mman.h> #include <linux/reboot.h> @@ -220,7 +221,6 @@ SYSCALL_DEFINE3(setpriority, int, which, int, who, int, niceval) niceval = MAX_NICE; rcu_read_lock(); - read_lock(&tasklist_lock); switch (which) { case PRIO_PROCESS: if (who) @@ -235,9 +235,11 @@ SYSCALL_DEFINE3(setpriority, int, which, int, who, int, niceval) pgrp = find_vpid(who); else pgrp = task_pgrp(current); + read_lock(&tasklist_lock); do_each_pid_thread(pgrp, PIDTYPE_PGID, p) { error = set_one_prio(p, niceval, error); } while_each_pid_thread(pgrp, PIDTYPE_PGID, p); + read_unlock(&tasklist_lock); break; case PRIO_USER: uid = make_kuid(cred->user_ns, who); @@ -249,16 +251,15 @@ SYSCALL_DEFINE3(setpriority, int, which, int, who, int, niceval) if (!user) goto out_unlock; /* No processes for this user */ } - do_each_thread(g, p) { + for_each_process_thread(g, p) { if (uid_eq(task_uid(p), uid) && task_pid_vnr(p)) error = set_one_prio(p, niceval, error); - } while_each_thread(g, p); + } if (!uid_eq(uid, cred->uid)) free_uid(user); /* For find_user() */ break; } out_unlock: - read_unlock(&tasklist_lock); rcu_read_unlock(); out: return error; @@ -283,7 +284,6 @@ SYSCALL_DEFINE2(getpriority, int, which, int, who) return -EINVAL; rcu_read_lock(); - read_lock(&tasklist_lock); switch (which) { case PRIO_PROCESS: if (who) @@ -301,11 +301,13 @@ SYSCALL_DEFINE2(getpriority, int, which, int, who) pgrp = find_vpid(who); else pgrp = task_pgrp(current); + read_lock(&tasklist_lock); do_each_pid_thread(pgrp, PIDTYPE_PGID, p) { niceval = nice_to_rlimit(task_nice(p)); if (niceval > retval) retval = niceval; } while_each_pid_thread(pgrp, PIDTYPE_PGID, p); + read_unlock(&tasklist_lock); break; case PRIO_USER: uid = make_kuid(cred->user_ns, who); @@ -317,19 +319,18 @@ SYSCALL_DEFINE2(getpriority, int, which, int, who) if (!user) goto out_unlock; /* No processes for this user */ } - do_each_thread(g, p) { + for_each_process_thread(g, p) { if (uid_eq(task_uid(p), uid) && task_pid_vnr(p)) { niceval = nice_to_rlimit(task_nice(p)); if (niceval > retval) retval = niceval; } - } while_each_thread(g, p); + } if (!uid_eq(uid, cred->uid)) free_uid(user); /* for find_user() */ break; } out_unlock: - read_unlock(&tasklist_lock); rcu_read_unlock(); return retval; @@ -472,6 +473,16 @@ static int set_user(struct cred *new) if (!new_user) return -EAGAIN; + free_uid(new->user); + new->user = new_user; + return 0; +} + +static void flag_nproc_exceeded(struct cred *new) +{ + if (new->ucounts == current_ucounts()) + return; + /* * We don't fail in case of NPROC limit excess here because too many * poorly written programs don't check set*uid() return code, assuming @@ -480,15 +491,10 @@ static int set_user(struct cred *new) * failure to the execve() stage. */ if (is_ucounts_overlimit(new->ucounts, UCOUNT_RLIMIT_NPROC, rlimit(RLIMIT_NPROC)) && - new_user != INIT_USER && - !capable(CAP_SYS_RESOURCE) && !capable(CAP_SYS_ADMIN)) + new->user != INIT_USER) current->flags |= PF_NPROC_EXCEEDED; else current->flags &= ~PF_NPROC_EXCEEDED; - - free_uid(new->user); - new->user = new_user; - return 0; } /* @@ -563,6 +569,7 @@ long __sys_setreuid(uid_t ruid, uid_t euid) if (retval < 0) goto error; + flag_nproc_exceeded(new); return commit_creds(new); error: @@ -625,6 +632,7 @@ long __sys_setuid(uid_t uid) if (retval < 0) goto error; + flag_nproc_exceeded(new); return commit_creds(new); error: @@ -704,6 +712,7 @@ long __sys_setresuid(uid_t ruid, uid_t euid, uid_t suid) if (retval < 0) goto error; + flag_nproc_exceeded(new); return commit_creds(new); error: @@ -1415,6 +1424,68 @@ SYSCALL_DEFINE2(setdomainname, char __user *, name, int, len) return errno; } +/* make sure you are allowed to change @tsk limits before calling this */ +static int do_prlimit(struct task_struct *tsk, unsigned int resource, + struct rlimit *new_rlim, struct rlimit *old_rlim) +{ + struct rlimit *rlim; + int retval = 0; + + if (resource >= RLIM_NLIMITS) + return -EINVAL; + if (new_rlim) { + if (new_rlim->rlim_cur > new_rlim->rlim_max) + return -EINVAL; + if (resource == RLIMIT_NOFILE && + new_rlim->rlim_max > sysctl_nr_open) + return -EPERM; + } + + /* Holding a refcount on tsk protects tsk->signal from disappearing. */ + rlim = tsk->signal->rlim + resource; + task_lock(tsk->group_leader); + if (new_rlim) { + /* + * Keep the capable check against init_user_ns until cgroups can + * contain all limits. + */ + if (new_rlim->rlim_max > rlim->rlim_max && + !capable(CAP_SYS_RESOURCE)) + retval = -EPERM; + if (!retval) + retval = security_task_setrlimit(tsk, resource, new_rlim); + } + if (!retval) { + if (old_rlim) + *old_rlim = *rlim; + if (new_rlim) + *rlim = *new_rlim; + } + task_unlock(tsk->group_leader); + + /* + * RLIMIT_CPU handling. Arm the posix CPU timer if the limit is not + * infinite. In case of RLIM_INFINITY the posix CPU timer code + * ignores the rlimit. + */ + if (!retval && new_rlim && resource == RLIMIT_CPU && + new_rlim->rlim_cur != RLIM_INFINITY && + IS_ENABLED(CONFIG_POSIX_TIMERS)) { + /* + * update_rlimit_cpu can fail if the task is exiting, but there + * may be other tasks in the thread group that are not exiting, + * and they need their cpu timers adjusted. + * + * The group_leader is the last task to be released, so if we + * cannot update_rlimit_cpu on it, then the entire process is + * exiting and we do not need to update at all. + */ + update_rlimit_cpu(tsk->group_leader, new_rlim->rlim_cur); + } + + return retval; +} + SYSCALL_DEFINE2(getrlimit, unsigned int, resource, struct rlimit __user *, rlim) { struct rlimit value; @@ -1558,63 +1629,6 @@ static void rlim64_to_rlim(const struct rlimit64 *rlim64, struct rlimit *rlim) rlim->rlim_max = (unsigned long)rlim64->rlim_max; } -/* make sure you are allowed to change @tsk limits before calling this */ -int do_prlimit(struct task_struct *tsk, unsigned int resource, - struct rlimit *new_rlim, struct rlimit *old_rlim) -{ - struct rlimit *rlim; - int retval = 0; - - if (resource >= RLIM_NLIMITS) - return -EINVAL; - if (new_rlim) { - if (new_rlim->rlim_cur > new_rlim->rlim_max) - return -EINVAL; - if (resource == RLIMIT_NOFILE && - new_rlim->rlim_max > sysctl_nr_open) - return -EPERM; - } - - /* protect tsk->signal and tsk->sighand from disappearing */ - read_lock(&tasklist_lock); - if (!tsk->sighand) { - retval = -ESRCH; - goto out; - } - - rlim = tsk->signal->rlim + resource; - task_lock(tsk->group_leader); - if (new_rlim) { - /* Keep the capable check against init_user_ns until - cgroups can contain all limits */ - if (new_rlim->rlim_max > rlim->rlim_max && - !capable(CAP_SYS_RESOURCE)) - retval = -EPERM; - if (!retval) - retval = security_task_setrlimit(tsk, resource, new_rlim); - } - if (!retval) { - if (old_rlim) - *old_rlim = *rlim; - if (new_rlim) - *rlim = *new_rlim; - } - task_unlock(tsk->group_leader); - - /* - * RLIMIT_CPU handling. Arm the posix CPU timer if the limit is not - * infinite. In case of RLIM_INFINITY the posix CPU timer code - * ignores the rlimit. - */ - if (!retval && new_rlim && resource == RLIMIT_CPU && - new_rlim->rlim_cur != RLIM_INFINITY && - IS_ENABLED(CONFIG_POSIX_TIMERS)) - update_rlimit_cpu(tsk, new_rlim->rlim_cur); -out: - read_unlock(&tasklist_lock); - return retval; -} - /* rcu lock must be held */ static int check_prlimit_permission(struct task_struct *task, unsigned int flags) @@ -2278,15 +2292,16 @@ static int prctl_set_vma(unsigned long opt, unsigned long addr, { struct mm_struct *mm = current->mm; const char __user *uname; - char *name, *pch; + struct anon_vma_name *anon_name = NULL; int error; switch (opt) { case PR_SET_VMA_ANON_NAME: uname = (const char __user *)arg; if (uname) { - name = strndup_user(uname, ANON_VMA_NAME_MAX_LEN); + char *name, *pch; + name = strndup_user(uname, ANON_VMA_NAME_MAX_LEN); if (IS_ERR(name)) return PTR_ERR(name); @@ -2296,15 +2311,18 @@ static int prctl_set_vma(unsigned long opt, unsigned long addr, return -EINVAL; } } - } else { - /* Reset the name */ - name = NULL; + /* anon_vma has its own copy */ + anon_name = anon_vma_name_alloc(name); + kfree(name); + if (!anon_name) + return -ENOMEM; + } mmap_write_lock(mm); - error = madvise_set_anon_name(mm, addr, size, name); + error = madvise_set_anon_name(mm, addr, size, anon_name); mmap_write_unlock(mm); - kfree(name); + anon_vma_name_put(anon_name); break; default: error = -EINVAL; diff --git a/kernel/sysctl.c b/kernel/sysctl.c index ef77be575d87..830aaf8ca08e 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -20,7 +20,6 @@ */ #include <linux/module.h> -#include <linux/aio.h> #include <linux/mm.h> #include <linux/swap.h> #include <linux/slab.h> @@ -50,7 +49,6 @@ #include <linux/times.h> #include <linux/limits.h> #include <linux/dcache.h> -#include <linux/dnotify.h> #include <linux/syscalls.h> #include <linux/vmstat.h> #include <linux/nfs_fs.h> @@ -58,19 +56,15 @@ #include <linux/reboot.h> #include <linux/ftrace.h> #include <linux/perf_event.h> -#include <linux/kprobes.h> -#include <linux/pipe_fs_i.h> #include <linux/oom.h> #include <linux/kmod.h> #include <linux/capability.h> #include <linux/binfmts.h> #include <linux/sched/sysctl.h> -#include <linux/sched/coredump.h> #include <linux/kexec.h> #include <linux/bpf.h> #include <linux/mount.h> #include <linux/userfaultfd_k.h> -#include <linux/coredump.h> #include <linux/latencytop.h> #include <linux/pid.h> #include <linux/delayacct.h> @@ -97,65 +91,21 @@ #if defined(CONFIG_PROVE_LOCKING) || defined(CONFIG_LOCK_STAT) #include <linux/lockdep.h> #endif -#ifdef CONFIG_CHR_DEV_SG -#include <scsi/sg.h> -#endif -#ifdef CONFIG_STACKLEAK_RUNTIME_DISABLE -#include <linux/stackleak.h> -#endif -#ifdef CONFIG_LOCKUP_DETECTOR -#include <linux/nmi.h> -#endif #if defined(CONFIG_SYSCTL) /* Constants used for minimum and maximum */ -#ifdef CONFIG_LOCKUP_DETECTOR -static int sixty = 60; -#endif -static int __maybe_unused neg_one = -1; -static int __maybe_unused two = 2; -static int __maybe_unused four = 4; -static unsigned long zero_ul; -static unsigned long one_ul = 1; -static unsigned long long_max = LONG_MAX; -static int one_hundred = 100; -static int two_hundred = 200; -static int one_thousand = 1000; -static int three_thousand = 3000; -#ifdef CONFIG_PRINTK -static int ten_thousand = 10000; -#endif #ifdef CONFIG_PERF_EVENTS -static int six_hundred_forty_kb = 640 * 1024; +static const int six_hundred_forty_kb = 640 * 1024; #endif /* this is needed for the proc_doulongvec_minmax of vm_dirty_bytes */ -static unsigned long dirty_bytes_min = 2 * PAGE_SIZE; - -/* this is needed for the proc_dointvec_minmax for [fs_]overflow UID and GID */ -static int maxolduid = 65535; -static int minolduid; +static const unsigned long dirty_bytes_min = 2 * PAGE_SIZE; -static int ngroups_max = NGROUPS_MAX; +static const int ngroups_max = NGROUPS_MAX; static const int cap_last_cap = CAP_LAST_CAP; -/* - * This is needed for proc_doulongvec_minmax of sysctl_hung_task_timeout_secs - * and hung_task_check_interval_secs - */ -#ifdef CONFIG_DETECT_HUNG_TASK -static unsigned long hung_task_timeout_max = (LONG_MAX/HZ); -#endif - -#ifdef CONFIG_INOTIFY_USER -#include <linux/inotify.h> -#endif -#ifdef CONFIG_FANOTIFY -#include <linux/fanotify.h> -#endif - #ifdef CONFIG_PROC_SYSCTL /** @@ -192,8 +142,8 @@ int sysctl_legacy_va_layout; #endif #ifdef CONFIG_COMPACTION -static int min_extfrag_threshold; -static int max_extfrag_threshold = 1000; +/* min_extfrag_threshold is SYSCTL_ZERO */; +static const int max_extfrag_threshold = 1000; #endif #endif /* CONFIG_SYSCTL */ @@ -230,6 +180,10 @@ static int bpf_stats_handler(struct ctl_table *table, int write, return ret; } +void __weak unpriv_ebpf_notify(int new_state) +{ +} + static int bpf_unpriv_handler(struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { @@ -247,6 +201,9 @@ static int bpf_unpriv_handler(struct ctl_table *table, int write, return -EPERM; *(int *)table->data = unpriv_enable; } + + unpriv_ebpf_notify(unpriv_enable); + return ret; } #endif /* CONFIG_BPF_SYSCALL && CONFIG_SYSCTL */ @@ -804,12 +761,12 @@ static int __do_proc_douintvec(void *tbl_data, struct ctl_table *table, return do_proc_douintvec_r(i, buffer, lenp, ppos, conv, data); } -static int do_proc_douintvec(struct ctl_table *table, int write, - void *buffer, size_t *lenp, loff_t *ppos, - int (*conv)(unsigned long *lvalp, - unsigned int *valp, - int write, void *data), - void *data) +int do_proc_douintvec(struct ctl_table *table, int write, + void *buffer, size_t *lenp, loff_t *ppos, + int (*conv)(unsigned long *lvalp, + unsigned int *valp, + int write, void *data), + void *data) { return __do_proc_douintvec(table->data, table, write, buffer, lenp, ppos, conv, data); @@ -938,17 +895,6 @@ static int proc_taint(struct ctl_table *table, int write, return err; } -#ifdef CONFIG_PRINTK -static int proc_dointvec_minmax_sysadmin(struct ctl_table *table, int write, - void *buffer, size_t *lenp, loff_t *ppos) -{ - if (write && !capable(CAP_SYS_ADMIN)) - return -EPERM; - - return proc_dointvec_minmax(table, write, buffer, lenp, ppos); -} -#endif - /** * struct do_proc_dointvec_minmax_conv_param - proc_dointvec_minmax() range checking structure * @min: pointer to minimum allowable value @@ -1144,67 +1090,6 @@ int proc_dou8vec_minmax(struct ctl_table *table, int write, } EXPORT_SYMBOL_GPL(proc_dou8vec_minmax); -static int do_proc_dopipe_max_size_conv(unsigned long *lvalp, - unsigned int *valp, - int write, void *data) -{ - if (write) { - unsigned int val; - - val = round_pipe_size(*lvalp); - if (val == 0) - return -EINVAL; - - *valp = val; - } else { - unsigned int val = *valp; - *lvalp = (unsigned long) val; - } - - return 0; -} - -static int proc_dopipe_max_size(struct ctl_table *table, int write, - void *buffer, size_t *lenp, loff_t *ppos) -{ - return do_proc_douintvec(table, write, buffer, lenp, ppos, - do_proc_dopipe_max_size_conv, NULL); -} - -static void validate_coredump_safety(void) -{ -#ifdef CONFIG_COREDUMP - if (suid_dumpable == SUID_DUMP_ROOT && - core_pattern[0] != '/' && core_pattern[0] != '|') { - printk(KERN_WARNING -"Unsafe core_pattern used with fs.suid_dumpable=2.\n" -"Pipe handler or fully qualified core dump path required.\n" -"Set kernel.core_pattern before fs.suid_dumpable.\n" - ); - } -#endif -} - -static int proc_dointvec_minmax_coredump(struct ctl_table *table, int write, - void *buffer, size_t *lenp, loff_t *ppos) -{ - int error = proc_dointvec_minmax(table, write, buffer, lenp, ppos); - if (!error) - validate_coredump_safety(); - return error; -} - -#ifdef CONFIG_COREDUMP -static int proc_dostring_coredump(struct ctl_table *table, int write, - void *buffer, size_t *lenp, loff_t *ppos) -{ - int error = proc_dostring(table, write, buffer, lenp, ppos); - if (!error) - validate_coredump_safety(); - return error; -} -#endif - #ifdef CONFIG_MAGIC_SYSRQ static int sysrq_sysctl_handler(struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) @@ -1267,10 +1152,11 @@ static int __do_proc_doulongvec_minmax(void *data, struct ctl_table *table, err = proc_get_long(&p, &left, &val, &neg, proc_wspace_sep, sizeof(proc_wspace_sep), NULL); - if (err) + if (err || neg) { + err = -EINVAL; break; - if (neg) - continue; + } + val = convmul * val / convdiv; if ((min && val < *min) || (max && val > *max)) { err = -EINVAL; @@ -1810,7 +1696,7 @@ static struct ctl_table kern_table[] = { .mode = 0644, .proc_handler = sysctl_numa_balancing, .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_ONE, + .extra2 = SYSCTL_FOUR, }, #endif /* CONFIG_NUMA_BALANCING */ { @@ -1871,17 +1757,6 @@ static struct ctl_table kern_table[] = { .proc_handler = sysctl_sched_uclamp_handler, }, #endif -#ifdef CONFIG_SCHED_AUTOGROUP - { - .procname = "sched_autogroup_enabled", - .data = &sysctl_sched_autogroup_enabled, - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_ONE, - }, -#endif #ifdef CONFIG_CFS_BANDWIDTH { .procname = "sched_cfs_bandwidth_slice_us", @@ -1928,29 +1803,6 @@ static struct ctl_table kern_table[] = { .mode = 0644, .proc_handler = proc_dointvec, }, -#ifdef CONFIG_COREDUMP - { - .procname = "core_uses_pid", - .data = &core_uses_pid, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, - { - .procname = "core_pattern", - .data = core_pattern, - .maxlen = CORENAME_MAX_SIZE, - .mode = 0644, - .proc_handler = proc_dostring_coredump, - }, - { - .procname = "core_pipe_limit", - .data = &core_pipe_limit, - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, -#endif #ifdef CONFIG_PROC_SYSCTL { .procname = "tainted", @@ -1964,7 +1816,7 @@ static struct ctl_table kern_table[] = { .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, - .extra1 = &neg_one, + .extra1 = SYSCTL_NEG_ONE, .extra2 = SYSCTL_ONE, }, #endif @@ -2131,15 +1983,6 @@ static struct ctl_table kern_table[] = { .proc_handler = proc_dostring, }, #endif -#ifdef CONFIG_CHR_DEV_SG - { - .procname = "sg-big-buff", - .data = &sg_big_buff, - .maxlen = sizeof (int), - .mode = 0444, - .proc_handler = proc_dointvec, - }, -#endif #ifdef CONFIG_BSD_PROCESS_ACCT { .procname = "acct", @@ -2175,30 +2018,18 @@ static struct ctl_table kern_table[] = { .proc_handler = sysctl_max_threads, }, { - .procname = "random", - .mode = 0555, - .child = random_table, - }, - { .procname = "usermodehelper", .mode = 0555, .child = usermodehelper_table, }, -#ifdef CONFIG_FW_LOADER_USER_HELPER - { - .procname = "firmware_config", - .mode = 0555, - .child = firmware_config_table, - }, -#endif { .procname = "overflowuid", .data = &overflowuid, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, - .extra1 = &minolduid, - .extra2 = &maxolduid, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_MAXOLDUID, }, { .procname = "overflowgid", @@ -2206,8 +2037,8 @@ static struct ctl_table kern_table[] = { .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, - .extra1 = &minolduid, - .extra2 = &maxolduid, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_MAXOLDUID, }, #ifdef CONFIG_S390 { @@ -2252,66 +2083,9 @@ static struct ctl_table kern_table[] = { .mode = 0644, .proc_handler = proc_doulongvec_minmax, }, -#if defined CONFIG_PRINTK - { - .procname = "printk", - .data = &console_loglevel, - .maxlen = 4*sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, - { - .procname = "printk_ratelimit", - .data = &printk_ratelimit_state.interval, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec_jiffies, - }, - { - .procname = "printk_ratelimit_burst", - .data = &printk_ratelimit_state.burst, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, - { - .procname = "printk_delay", - .data = &printk_delay_msec, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ZERO, - .extra2 = &ten_thousand, - }, - { - .procname = "printk_devkmsg", - .data = devkmsg_log_str, - .maxlen = DEVKMSG_STR_MAX_SIZE, - .mode = 0644, - .proc_handler = devkmsg_sysctl_set_loglvl, - }, - { - .procname = "dmesg_restrict", - .data = &dmesg_restrict, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec_minmax_sysadmin, - .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_ONE, - }, - { - .procname = "kptr_restrict", - .data = &kptr_restrict, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec_minmax_sysadmin, - .extra1 = SYSCTL_ZERO, - .extra2 = &two, - }, -#endif { .procname = "ngroups_max", - .data = &ngroups_max, + .data = (void *)&ngroups_max, .maxlen = sizeof (int), .mode = 0444, .proc_handler = proc_dointvec, @@ -2323,96 +2097,6 @@ static struct ctl_table kern_table[] = { .mode = 0444, .proc_handler = proc_dointvec, }, -#if defined(CONFIG_LOCKUP_DETECTOR) - { - .procname = "watchdog", - .data = &watchdog_user_enabled, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_watchdog, - .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_ONE, - }, - { - .procname = "watchdog_thresh", - .data = &watchdog_thresh, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_watchdog_thresh, - .extra1 = SYSCTL_ZERO, - .extra2 = &sixty, - }, - { - .procname = "nmi_watchdog", - .data = &nmi_watchdog_user_enabled, - .maxlen = sizeof(int), - .mode = NMI_WATCHDOG_SYSCTL_PERM, - .proc_handler = proc_nmi_watchdog, - .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_ONE, - }, - { - .procname = "watchdog_cpumask", - .data = &watchdog_cpumask_bits, - .maxlen = NR_CPUS, - .mode = 0644, - .proc_handler = proc_watchdog_cpumask, - }, -#ifdef CONFIG_SOFTLOCKUP_DETECTOR - { - .procname = "soft_watchdog", - .data = &soft_watchdog_user_enabled, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_soft_watchdog, - .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_ONE, - }, - { - .procname = "softlockup_panic", - .data = &softlockup_panic, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_ONE, - }, -#ifdef CONFIG_SMP - { - .procname = "softlockup_all_cpu_backtrace", - .data = &sysctl_softlockup_all_cpu_backtrace, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_ONE, - }, -#endif /* CONFIG_SMP */ -#endif -#ifdef CONFIG_HARDLOCKUP_DETECTOR - { - .procname = "hardlockup_panic", - .data = &hardlockup_panic, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_ONE, - }, -#ifdef CONFIG_SMP - { - .procname = "hardlockup_all_cpu_backtrace", - .data = &sysctl_hardlockup_all_cpu_backtrace, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_ONE, - }, -#endif /* CONFIG_SMP */ -#endif -#endif - #if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86) { .procname = "unknown_nmi_panic", @@ -2515,60 +2199,6 @@ static struct ctl_table kern_table[] = { .proc_handler = proc_dointvec, }, #endif -#ifdef CONFIG_DETECT_HUNG_TASK -#ifdef CONFIG_SMP - { - .procname = "hung_task_all_cpu_backtrace", - .data = &sysctl_hung_task_all_cpu_backtrace, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_ONE, - }, -#endif /* CONFIG_SMP */ - { - .procname = "hung_task_panic", - .data = &sysctl_hung_task_panic, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_ONE, - }, - { - .procname = "hung_task_check_count", - .data = &sysctl_hung_task_check_count, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ZERO, - }, - { - .procname = "hung_task_timeout_secs", - .data = &sysctl_hung_task_timeout_secs, - .maxlen = sizeof(unsigned long), - .mode = 0644, - .proc_handler = proc_dohung_task_timeout_secs, - .extra2 = &hung_task_timeout_max, - }, - { - .procname = "hung_task_check_interval_secs", - .data = &sysctl_hung_task_check_interval_secs, - .maxlen = sizeof(unsigned long), - .mode = 0644, - .proc_handler = proc_dohung_task_timeout_secs, - .extra2 = &hung_task_timeout_max, - }, - { - .procname = "hung_task_warnings", - .data = &sysctl_hung_task_warnings, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = &neg_one, - }, -#endif #ifdef CONFIG_RT_MUTEXES { .procname = "max_lock_depth", @@ -2628,7 +2258,7 @@ static struct ctl_table kern_table[] = { .mode = 0644, .proc_handler = perf_cpu_time_max_percent_handler, .extra1 = SYSCTL_ZERO, - .extra2 = &one_hundred, + .extra2 = SYSCTL_ONE_HUNDRED, }, { .procname = "perf_event_max_stack", @@ -2637,7 +2267,7 @@ static struct ctl_table kern_table[] = { .mode = 0644, .proc_handler = perf_event_max_stack_handler, .extra1 = SYSCTL_ZERO, - .extra2 = &six_hundred_forty_kb, + .extra2 = (void *)&six_hundred_forty_kb, }, { .procname = "perf_event_max_contexts_per_stack", @@ -2646,7 +2276,7 @@ static struct ctl_table kern_table[] = { .mode = 0644, .proc_handler = perf_event_max_stack_handler, .extra1 = SYSCTL_ZERO, - .extra2 = &one_thousand, + .extra2 = SYSCTL_ONE_THOUSAND, }, #endif { @@ -2677,7 +2307,7 @@ static struct ctl_table kern_table[] = { .mode = 0644, .proc_handler = bpf_unpriv_handler, .extra1 = SYSCTL_ZERO, - .extra2 = &two, + .extra2 = SYSCTL_TWO, }, { .procname = "bpf_stats_enabled", @@ -2709,17 +2339,6 @@ static struct ctl_table kern_table[] = { .extra2 = SYSCTL_INT_MAX, }, #endif -#ifdef CONFIG_STACKLEAK_RUNTIME_DISABLE - { - .procname = "stack_erasing", - .data = NULL, - .maxlen = sizeof(int), - .mode = 0600, - .proc_handler = stack_erasing_sysctl, - .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_ONE, - }, -#endif { } }; @@ -2731,7 +2350,7 @@ static struct ctl_table vm_table[] = { .mode = 0644, .proc_handler = overcommit_policy_handler, .extra1 = SYSCTL_ZERO, - .extra2 = &two, + .extra2 = SYSCTL_TWO, }, { .procname = "panic_on_oom", @@ -2740,7 +2359,7 @@ static struct ctl_table vm_table[] = { .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ZERO, - .extra2 = &two, + .extra2 = SYSCTL_TWO, }, { .procname = "oom_kill_allocating_task", @@ -2785,7 +2404,7 @@ static struct ctl_table vm_table[] = { .mode = 0644, .proc_handler = dirty_background_ratio_handler, .extra1 = SYSCTL_ZERO, - .extra2 = &one_hundred, + .extra2 = SYSCTL_ONE_HUNDRED, }, { .procname = "dirty_background_bytes", @@ -2793,7 +2412,7 @@ static struct ctl_table vm_table[] = { .maxlen = sizeof(dirty_background_bytes), .mode = 0644, .proc_handler = dirty_background_bytes_handler, - .extra1 = &one_ul, + .extra1 = SYSCTL_LONG_ONE, }, { .procname = "dirty_ratio", @@ -2802,7 +2421,7 @@ static struct ctl_table vm_table[] = { .mode = 0644, .proc_handler = dirty_ratio_handler, .extra1 = SYSCTL_ZERO, - .extra2 = &one_hundred, + .extra2 = SYSCTL_ONE_HUNDRED, }, { .procname = "dirty_bytes", @@ -2810,7 +2429,7 @@ static struct ctl_table vm_table[] = { .maxlen = sizeof(vm_dirty_bytes), .mode = 0644, .proc_handler = dirty_bytes_handler, - .extra1 = &dirty_bytes_min, + .extra1 = (void *)&dirty_bytes_min, }, { .procname = "dirty_writeback_centisecs", @@ -2842,7 +2461,7 @@ static struct ctl_table vm_table[] = { .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ZERO, - .extra2 = &two_hundred, + .extra2 = SYSCTL_TWO_HUNDRED, }, #ifdef CONFIG_HUGETLB_PAGE { @@ -2899,7 +2518,7 @@ static struct ctl_table vm_table[] = { .mode = 0200, .proc_handler = drop_caches_sysctl_handler, .extra1 = SYSCTL_ONE, - .extra2 = &four, + .extra2 = SYSCTL_FOUR, }, #ifdef CONFIG_COMPACTION { @@ -2916,7 +2535,7 @@ static struct ctl_table vm_table[] = { .mode = 0644, .proc_handler = compaction_proactiveness_sysctl_handler, .extra1 = SYSCTL_ZERO, - .extra2 = &one_hundred, + .extra2 = SYSCTL_ONE_HUNDRED, }, { .procname = "extfrag_threshold", @@ -2924,8 +2543,8 @@ static struct ctl_table vm_table[] = { .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, - .extra1 = &min_extfrag_threshold, - .extra2 = &max_extfrag_threshold, + .extra1 = SYSCTL_ZERO, + .extra2 = (void *)&max_extfrag_threshold, }, { .procname = "compact_unevictable_allowed", @@ -2961,7 +2580,7 @@ static struct ctl_table vm_table[] = { .mode = 0644, .proc_handler = watermark_scale_factor_sysctl_handler, .extra1 = SYSCTL_ONE, - .extra2 = &three_thousand, + .extra2 = SYSCTL_THREE_THOUSAND, }, { .procname = "percpu_pagelist_high_fraction", @@ -3040,7 +2659,7 @@ static struct ctl_table vm_table[] = { .mode = 0644, .proc_handler = sysctl_min_unmapped_ratio_sysctl_handler, .extra1 = SYSCTL_ZERO, - .extra2 = &one_hundred, + .extra2 = SYSCTL_ONE_HUNDRED, }, { .procname = "min_slab_ratio", @@ -3049,7 +2668,7 @@ static struct ctl_table vm_table[] = { .mode = 0644, .proc_handler = sysctl_min_slab_ratio_sysctl_handler, .extra1 = SYSCTL_ZERO, - .extra2 = &one_hundred, + .extra2 = SYSCTL_ONE_HUNDRED, }, #endif #ifdef CONFIG_SMP @@ -3183,221 +2802,6 @@ static struct ctl_table vm_table[] = { { } }; -static struct ctl_table fs_table[] = { - { - .procname = "inode-nr", - .data = &inodes_stat, - .maxlen = 2*sizeof(long), - .mode = 0444, - .proc_handler = proc_nr_inodes, - }, - { - .procname = "inode-state", - .data = &inodes_stat, - .maxlen = 7*sizeof(long), - .mode = 0444, - .proc_handler = proc_nr_inodes, - }, - { - .procname = "file-nr", - .data = &files_stat, - .maxlen = sizeof(files_stat), - .mode = 0444, - .proc_handler = proc_nr_files, - }, - { - .procname = "file-max", - .data = &files_stat.max_files, - .maxlen = sizeof(files_stat.max_files), - .mode = 0644, - .proc_handler = proc_doulongvec_minmax, - .extra1 = &zero_ul, - .extra2 = &long_max, - }, - { - .procname = "nr_open", - .data = &sysctl_nr_open, - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = &sysctl_nr_open_min, - .extra2 = &sysctl_nr_open_max, - }, - { - .procname = "dentry-state", - .data = &dentry_stat, - .maxlen = 6*sizeof(long), - .mode = 0444, - .proc_handler = proc_nr_dentry, - }, - { - .procname = "overflowuid", - .data = &fs_overflowuid, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = &minolduid, - .extra2 = &maxolduid, - }, - { - .procname = "overflowgid", - .data = &fs_overflowgid, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = &minolduid, - .extra2 = &maxolduid, - }, -#ifdef CONFIG_FILE_LOCKING - { - .procname = "leases-enable", - .data = &leases_enable, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, -#endif -#ifdef CONFIG_DNOTIFY - { - .procname = "dir-notify-enable", - .data = &dir_notify_enable, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, -#endif -#ifdef CONFIG_MMU -#ifdef CONFIG_FILE_LOCKING - { - .procname = "lease-break-time", - .data = &lease_break_time, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, -#endif -#ifdef CONFIG_AIO - { - .procname = "aio-nr", - .data = &aio_nr, - .maxlen = sizeof(aio_nr), - .mode = 0444, - .proc_handler = proc_doulongvec_minmax, - }, - { - .procname = "aio-max-nr", - .data = &aio_max_nr, - .maxlen = sizeof(aio_max_nr), - .mode = 0644, - .proc_handler = proc_doulongvec_minmax, - }, -#endif /* CONFIG_AIO */ -#ifdef CONFIG_INOTIFY_USER - { - .procname = "inotify", - .mode = 0555, - .child = inotify_table, - }, -#endif -#ifdef CONFIG_FANOTIFY - { - .procname = "fanotify", - .mode = 0555, - .child = fanotify_table, - }, -#endif -#ifdef CONFIG_EPOLL - { - .procname = "epoll", - .mode = 0555, - .child = epoll_table, - }, -#endif -#endif - { - .procname = "protected_symlinks", - .data = &sysctl_protected_symlinks, - .maxlen = sizeof(int), - .mode = 0600, - .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_ONE, - }, - { - .procname = "protected_hardlinks", - .data = &sysctl_protected_hardlinks, - .maxlen = sizeof(int), - .mode = 0600, - .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_ONE, - }, - { - .procname = "protected_fifos", - .data = &sysctl_protected_fifos, - .maxlen = sizeof(int), - .mode = 0600, - .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ZERO, - .extra2 = &two, - }, - { - .procname = "protected_regular", - .data = &sysctl_protected_regular, - .maxlen = sizeof(int), - .mode = 0600, - .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ZERO, - .extra2 = &two, - }, - { - .procname = "suid_dumpable", - .data = &suid_dumpable, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec_minmax_coredump, - .extra1 = SYSCTL_ZERO, - .extra2 = &two, - }, -#if defined(CONFIG_BINFMT_MISC) || defined(CONFIG_BINFMT_MISC_MODULE) - { - .procname = "binfmt_misc", - .mode = 0555, - .child = sysctl_mount_point, - }, -#endif - { - .procname = "pipe-max-size", - .data = &pipe_max_size, - .maxlen = sizeof(pipe_max_size), - .mode = 0644, - .proc_handler = proc_dopipe_max_size, - }, - { - .procname = "pipe-user-pages-hard", - .data = &pipe_user_pages_hard, - .maxlen = sizeof(pipe_user_pages_hard), - .mode = 0644, - .proc_handler = proc_doulongvec_minmax, - }, - { - .procname = "pipe-user-pages-soft", - .data = &pipe_user_pages_soft, - .maxlen = sizeof(pipe_user_pages_soft), - .mode = 0644, - .proc_handler = proc_doulongvec_minmax, - }, - { - .procname = "mount-max", - .data = &sysctl_mount_max, - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ONE, - }, - { } -}; - static struct ctl_table debug_table[] = { #ifdef CONFIG_SYSCTL_EXCEPTION_TRACE { @@ -3408,17 +2812,6 @@ static struct ctl_table debug_table[] = { .proc_handler = proc_dointvec }, #endif -#if defined(CONFIG_OPTPROBES) - { - .procname = "kprobes-optimization", - .data = &sysctl_kprobes_optimization, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_kprobes_optimization_handler, - .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_ONE, - }, -#endif { } }; @@ -3426,41 +2819,18 @@ static struct ctl_table dev_table[] = { { } }; -static struct ctl_table sysctl_base_table[] = { - { - .procname = "kernel", - .mode = 0555, - .child = kern_table, - }, - { - .procname = "vm", - .mode = 0555, - .child = vm_table, - }, - { - .procname = "fs", - .mode = 0555, - .child = fs_table, - }, - { - .procname = "debug", - .mode = 0555, - .child = debug_table, - }, - { - .procname = "dev", - .mode = 0555, - .child = dev_table, - }, - { } -}; +DECLARE_SYSCTL_BASE(kernel, kern_table); +DECLARE_SYSCTL_BASE(vm, vm_table); +DECLARE_SYSCTL_BASE(debug, debug_table); +DECLARE_SYSCTL_BASE(dev, dev_table); -int __init sysctl_init(void) +int __init sysctl_init_bases(void) { - struct ctl_table_header *hdr; + register_sysctl_base(kernel); + register_sysctl_base(vm); + register_sysctl_base(debug); + register_sysctl_base(dev); - hdr = register_sysctl_table(sysctl_base_table); - kmemleak_not_leak(hdr); return 0; } #endif /* CONFIG_SYSCTL */ diff --git a/kernel/taskstats.c b/kernel/taskstats.c index 2b4898b4752e..bcac5a9043aa 100644 --- a/kernel/taskstats.c +++ b/kernel/taskstats.c @@ -113,13 +113,14 @@ static void send_cpu_listeners(struct sk_buff *skb, struct listener *s, *tmp; struct sk_buff *skb_next, *skb_cur = skb; void *reply = genlmsg_data(genlhdr); - int rc, delcount = 0; + int delcount = 0; genlmsg_end(skb, reply); - rc = 0; down_read(&listeners->sem); list_for_each_entry(s, &listeners->list, list) { + int rc; + skb_next = NULL; if (!list_is_last(&s->list, &listeners->list)) { skb_next = skb_clone(skb_cur, GFP_KERNEL); diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig index 04bfd62f5e5c..27b7868b5c30 100644 --- a/kernel/time/Kconfig +++ b/kernel/time/Kconfig @@ -181,5 +181,14 @@ config HIGH_RES_TIMERS hardware is not capable then this option only increases the size of the kernel image. +config CLOCKSOURCE_WATCHDOG_MAX_SKEW_US + int "Clocksource watchdog maximum allowable skew (in μs)" + depends on CLOCKSOURCE_WATCHDOG + range 50 1000 + default 100 + help + Specify the maximum amount of allowable watchdog skew in + microseconds before reporting the clocksource to be unstable. + endmenu endif diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index b7e52a642948..95d7ca35bdf2 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -107,7 +107,13 @@ static u64 suspend_start; * This delay could be due to SMIs, NMIs, or to VCPU preemptions. Used as * a lower bound for cs->uncertainty_margin values when registering clocks. */ -#define WATCHDOG_MAX_SKEW (100 * NSEC_PER_USEC) +#ifdef CONFIG_CLOCKSOURCE_WATCHDOG_MAX_SKEW_US +#define MAX_SKEW_USEC CONFIG_CLOCKSOURCE_WATCHDOG_MAX_SKEW_US +#else +#define MAX_SKEW_USEC 100 +#endif + +#define WATCHDOG_MAX_SKEW (MAX_SKEW_USEC * NSEC_PER_USEC) #ifdef CONFIG_CLOCKSOURCE_WATCHDOG static void clocksource_watchdog_work(struct work_struct *work); @@ -285,7 +291,7 @@ static void clocksource_verify_choose_cpus(void) return; /* Make sure to select at least one CPU other than the current CPU. */ - cpu = cpumask_next(-1, cpu_online_mask); + cpu = cpumask_first(cpu_online_mask); if (cpu == smp_processor_id()) cpu = cpumask_next(cpu, cpu_online_mask); if (WARN_ON_ONCE(cpu >= nr_cpu_ids)) @@ -307,7 +313,7 @@ static void clocksource_verify_choose_cpus(void) cpu = prandom_u32() % nr_cpu_ids; cpu = cpumask_next(cpu - 1, cpu_online_mask); if (cpu >= nr_cpu_ids) - cpu = cpumask_next(-1, cpu_online_mask); + cpu = cpumask_first(cpu_online_mask); if (!WARN_ON_ONCE(cpu >= nr_cpu_ids)) cpumask_set_cpu(cpu, &cpus_chosen); } diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c index 96b4e7810426..e13e628509fb 100644 --- a/kernel/time/posix-cpu-timers.c +++ b/kernel/time/posix-cpu-timers.c @@ -34,14 +34,20 @@ void posix_cputimers_group_init(struct posix_cputimers *pct, u64 cpu_limit) * tsk->signal->posix_cputimers.bases[clock].nextevt expiration cache if * necessary. Needs siglock protection since other code may update the * expiration cache as well. + * + * Returns 0 on success, -ESRCH on failure. Can fail if the task is exiting and + * we cannot lock_task_sighand. Cannot fail if task is current. */ -void update_rlimit_cpu(struct task_struct *task, unsigned long rlim_new) +int update_rlimit_cpu(struct task_struct *task, unsigned long rlim_new) { u64 nsecs = rlim_new * NSEC_PER_SEC; + unsigned long irq_fl; - spin_lock_irq(&task->sighand->siglock); + if (!lock_task_sighand(task, &irq_fl)) + return -ESRCH; set_process_cpu_timer(task, CPUCLOCK_PROF, &nsecs, NULL); - spin_unlock_irq(&task->sighand->siglock); + unlock_task_sighand(task, &irq_fl); + return 0; } /* diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 17a283ce2b20..2d76c91b85de 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -169,6 +169,8 @@ static ktime_t tick_init_jiffy_update(void) return period; } +#define MAX_STALLED_JIFFIES 5 + static void tick_sched_do_timer(struct tick_sched *ts, ktime_t now) { int cpu = smp_processor_id(); @@ -196,6 +198,21 @@ static void tick_sched_do_timer(struct tick_sched *ts, ktime_t now) if (tick_do_timer_cpu == cpu) tick_do_update_jiffies64(now); + /* + * If jiffies update stalled for too long (timekeeper in stop_machine() + * or VMEXIT'ed for several msecs), force an update. + */ + if (ts->last_tick_jiffies != jiffies) { + ts->stalled_jiffies = 0; + ts->last_tick_jiffies = READ_ONCE(jiffies); + } else { + if (++ts->stalled_jiffies == MAX_STALLED_JIFFIES) { + tick_do_update_jiffies64(now); + ts->stalled_jiffies = 0; + ts->last_tick_jiffies = READ_ONCE(jiffies); + } + } + if (ts->inidle) ts->got_idle_tick = 1; } @@ -768,7 +785,7 @@ static inline bool local_timer_softirq_pending(void) static ktime_t tick_nohz_next_event(struct tick_sched *ts, int cpu) { - u64 basemono, next_tick, next_tmr, next_rcu, delta, expires; + u64 basemono, next_tick, delta, expires; unsigned long basejiff; unsigned int seq; @@ -791,7 +808,7 @@ static ktime_t tick_nohz_next_event(struct tick_sched *ts, int cpu) * minimal delta which brings us back to this place * immediately. Lather, rinse and repeat... */ - if (rcu_needs_cpu(basemono, &next_rcu) || arch_needs_cpu() || + if (rcu_needs_cpu() || arch_needs_cpu() || irq_work_needs_cpu() || local_timer_softirq_pending()) { next_tick = basemono + TICK_NSEC; } else { @@ -802,10 +819,8 @@ static ktime_t tick_nohz_next_event(struct tick_sched *ts, int cpu) * disabled this also looks at the next expiring * hrtimer. */ - next_tmr = get_next_timer_interrupt(basejiff, basemono); - ts->next_timer = next_tmr; - /* Take the next rcu event into account */ - next_tick = next_rcu < next_tmr ? next_rcu : next_tmr; + next_tick = get_next_timer_interrupt(basejiff, basemono); + ts->next_timer = next_tick; } /* @@ -984,6 +999,45 @@ static void tick_nohz_full_update_tick(struct tick_sched *ts) __tick_nohz_full_update_tick(ts, ktime_get()); } +/* + * A pending softirq outside an IRQ (or softirq disabled section) context + * should be waiting for ksoftirqd to handle it. Therefore we shouldn't + * reach here due to the need_resched() early check in can_stop_idle_tick(). + * + * However if we are between CPUHP_AP_SMPBOOT_THREADS and CPU_TEARDOWN_CPU on the + * cpu_down() process, softirqs can still be raised while ksoftirqd is parked, + * triggering the below since wakep_softirqd() is ignored. + * + */ +static bool report_idle_softirq(void) +{ + static int ratelimit; + unsigned int pending = local_softirq_pending(); + + if (likely(!pending)) + return false; + + /* Some softirqs claim to be safe against hotplug and ksoftirqd parking */ + if (!cpu_active(smp_processor_id())) { + pending &= ~SOFTIRQ_HOTPLUG_SAFE_MASK; + if (!pending) + return false; + } + + if (ratelimit < 10) + return false; + + /* On RT, softirqs handling may be waiting on some lock */ + if (!local_bh_blocked()) + return false; + + pr_warn("NOHZ tick-stop error: local softirq work is pending, handler #%02x!!!\n", + pending); + ratelimit++; + + return true; +} + static bool can_stop_idle_tick(int cpu, struct tick_sched *ts) { /* @@ -1010,17 +1064,8 @@ static bool can_stop_idle_tick(int cpu, struct tick_sched *ts) if (need_resched()) return false; - if (unlikely(local_softirq_pending())) { - static int ratelimit; - - if (ratelimit < 10 && !local_bh_blocked() && - (local_softirq_pending() & SOFTIRQ_STOP_IDLE_MASK)) { - pr_warn("NOHZ tick-stop error: Non-RCU local softirq work is pending, handler #%02x!!!\n", - (unsigned int) local_softirq_pending()); - ratelimit++; - } + if (unlikely(report_idle_softirq())) return false; - } if (tick_nohz_full_enabled()) { /* diff --git a/kernel/time/tick-sched.h b/kernel/time/tick-sched.h index d952ae393423..504649513399 100644 --- a/kernel/time/tick-sched.h +++ b/kernel/time/tick-sched.h @@ -49,6 +49,8 @@ enum tick_nohz_mode { * @timer_expires_base: Base time clock monotonic for @timer_expires * @next_timer: Expiry time of next expiring timer for debugging purpose only * @tick_dep_mask: Tick dependency mask - is set, if someone needs the tick + * @last_tick_jiffies: Value of jiffies seen on last tick + * @stalled_jiffies: Number of stalled jiffies detected across ticks */ struct tick_sched { struct hrtimer sched_timer; @@ -77,6 +79,8 @@ struct tick_sched { u64 next_timer; ktime_t idle_expires; atomic_t tick_dep_mask; + unsigned long last_tick_jiffies; + unsigned int stalled_jiffies; }; extern struct tick_sched *tick_get_tick_sched(int cpu); diff --git a/kernel/torture.c b/kernel/torture.c index ef27a6c82451..789aeb0e1159 100644 --- a/kernel/torture.c +++ b/kernel/torture.c @@ -911,7 +911,7 @@ void torture_kthread_stopping(char *title) { char buf[128]; - snprintf(buf, sizeof(buf), "Stopping %s", title); + snprintf(buf, sizeof(buf), "%s is stopping", title); VERBOSE_TOROUT_STRING(buf); while (!kthread_should_stop()) { torture_shutdown_absorb(title); @@ -931,12 +931,14 @@ int _torture_create_kthread(int (*fn)(void *arg), void *arg, char *s, char *m, int ret = 0; VERBOSE_TOROUT_STRING(m); - *tp = kthread_run(fn, arg, "%s", s); + *tp = kthread_create(fn, arg, "%s", s); if (IS_ERR(*tp)) { ret = PTR_ERR(*tp); TOROUT_ERRSTRING(f); *tp = NULL; + return ret; } + wake_up_process(*tp); // Process is sleeping, so ordering provided. torture_shuffle_task_register(*tp); return ret; } diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 420ff4bc67fd..9bb54c0b3b2d 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -10,6 +10,17 @@ config USER_STACKTRACE_SUPPORT config NOP_TRACER bool +config HAVE_RETHOOK + bool + +config RETHOOK + bool + depends on HAVE_RETHOOK + help + Enable generic return hooking feature. This is an internal + API, which will be used by other function-entry hooking + features like fprobe and kprobes. + config HAVE_FUNCTION_TRACER bool help @@ -70,6 +81,19 @@ config HAVE_C_RECORDMCOUNT help C version of recordmcount available? +config HAVE_BUILDTIME_MCOUNT_SORT + bool + help + An architecture selects this if it sorts the mcount_loc section + at build time. + +config BUILDTIME_MCOUNT_SORT + bool + default y + depends on HAVE_BUILDTIME_MCOUNT_SORT && DYNAMIC_FTRACE + help + Sort the mcount_loc section at build time. + config TRACER_MAX_TRACE bool @@ -223,6 +247,21 @@ config DYNAMIC_FTRACE_WITH_ARGS depends on DYNAMIC_FTRACE depends on HAVE_DYNAMIC_FTRACE_WITH_ARGS +config FPROBE + bool "Kernel Function Probe (fprobe)" + depends on FUNCTION_TRACER + depends on DYNAMIC_FTRACE_WITH_REGS + depends on HAVE_RETHOOK + select RETHOOK + default n + help + This option enables kernel function probe (fprobe) based on ftrace. + The fprobe is similar to kprobes, but probes only for kernel function + entries and exits. This also can probe multiple functions by one + fprobe. + + If unsure, say N. + config FUNCTION_PROFILER bool "Kernel function profiler" depends on FUNCTION_TRACER @@ -724,6 +763,20 @@ config SYNTH_EVENTS If in doubt, say N. +config USER_EVENTS + bool "User trace events" + select TRACING + select DYNAMIC_EVENTS + help + User trace events are user-defined trace events that + can be used like an existing kernel trace event. User trace + events are generated by writing to a tracefs file. User + processes can determine if their tracing events should be + generated by memory mapping a tracefs file and checking for + an associated byte being non-zero. + + If in doubt, say N. + config HIST_TRIGGERS bool "Histogram triggers" depends on ARCH_HAVE_NMI_SAFE_CMPXCHG @@ -915,6 +968,20 @@ config EVENT_TRACE_TEST_SYSCALLS TBD - enable a way to actually call the syscalls as we test their events +config FTRACE_SORT_STARTUP_TEST + bool "Verify compile time sorting of ftrace functions" + depends on DYNAMIC_FTRACE + depends on BUILDTIME_MCOUNT_SORT + help + Sorting of the mcount_loc sections that is used to find the + where the ftrace knows where to patch functions for tracing + and other callbacks is done at compile time. But if the sort + is not done correctly, it will cause non-deterministic failures. + When this is set, the sorted sections will be verified that they + are in deed sorted and will warn if they are not. + + If unsure, say N + config RING_BUFFER_STARTUP_TEST bool "Ring buffer startup self test" depends on RING_BUFFER diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index bedc5caceec7..d77cd8032213 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile @@ -82,6 +82,7 @@ obj-$(CONFIG_PROBE_EVENTS) += trace_eprobe.o obj-$(CONFIG_TRACE_EVENT_INJECT) += trace_events_inject.o obj-$(CONFIG_SYNTH_EVENTS) += trace_events_synth.o obj-$(CONFIG_HIST_TRIGGERS) += trace_events_hist.o +obj-$(CONFIG_USER_EVENTS) += trace_events_user.o obj-$(CONFIG_BPF_EVENTS) += bpf_trace.o obj-$(CONFIG_KPROBE_EVENTS) += trace_kprobe.o obj-$(CONFIG_TRACEPOINTS) += error_report-traces.o @@ -97,6 +98,8 @@ obj-$(CONFIG_PROBE_EVENTS) += trace_probe.o obj-$(CONFIG_UPROBE_EVENTS) += trace_uprobe.o obj-$(CONFIG_BOOTTIME_TRACING) += trace_boot.o obj-$(CONFIG_FTRACE_RECORD_RECURSION) += trace_recursion_record.o +obj-$(CONFIG_FPROBE) += fprobe.o +obj-$(CONFIG_RETHOOK) += rethook.o obj-$(CONFIG_TRACEPOINT_BENCHMARK) += trace_benchmark.o diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index af68a67179b4..4d5629196d01 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -310,10 +310,20 @@ record_it: local_irq_restore(flags); } -static void blk_trace_free(struct blk_trace *bt) +static void blk_trace_free(struct request_queue *q, struct blk_trace *bt) { relay_close(bt->rchan); - debugfs_remove(bt->dir); + + /* + * If 'bt->dir' is not set, then both 'dropped' and 'msg' are created + * under 'q->debugfs_dir', thus lookup and remove them. + */ + if (!bt->dir) { + debugfs_remove(debugfs_lookup("dropped", q->debugfs_dir)); + debugfs_remove(debugfs_lookup("msg", q->debugfs_dir)); + } else { + debugfs_remove(bt->dir); + } free_percpu(bt->sequence); free_percpu(bt->msg_data); kfree(bt); @@ -335,10 +345,10 @@ static void put_probe_ref(void) mutex_unlock(&blk_probe_mutex); } -static void blk_trace_cleanup(struct blk_trace *bt) +static void blk_trace_cleanup(struct request_queue *q, struct blk_trace *bt) { synchronize_rcu(); - blk_trace_free(bt); + blk_trace_free(q, bt); put_probe_ref(); } @@ -352,7 +362,7 @@ static int __blk_trace_remove(struct request_queue *q) return -EINVAL; if (bt->trace_state != Blktrace_running) - blk_trace_cleanup(bt); + blk_trace_cleanup(q, bt); return 0; } @@ -572,7 +582,7 @@ static int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev, ret = 0; err: if (ret) - blk_trace_free(bt); + blk_trace_free(q, bt); return ret; } @@ -1616,7 +1626,7 @@ static int blk_trace_remove_queue(struct request_queue *q) put_probe_ref(); synchronize_rcu(); - blk_trace_free(bt); + blk_trace_free(q, bt); return 0; } @@ -1647,7 +1657,7 @@ static int blk_trace_setup_queue(struct request_queue *q, return 0; free_bt: - blk_trace_free(bt); + blk_trace_free(q, bt); return ret; } @@ -1892,7 +1902,6 @@ void blk_fill_rwbs(char *rwbs, unsigned int op) switch (op & REQ_OP_MASK) { case REQ_OP_WRITE: - case REQ_OP_WRITE_SAME: rwbs[i++] = 'W'; break; case REQ_OP_DISCARD: diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 21aa30644219..7fa2ebc07f60 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -17,6 +17,9 @@ #include <linux/error-injection.h> #include <linux/btf_ids.h> #include <linux/bpf_lsm.h> +#include <linux/fprobe.h> +#include <linux/bsearch.h> +#include <linux/sort.h> #include <net/bpf_sk_storage.h> @@ -77,6 +80,8 @@ u64 bpf_get_stack(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5); static int bpf_btf_printf_prepare(struct btf_ptr *ptr, u32 btf_ptr_size, u64 flags, const struct btf **btf, s32 *btf_id); +static u64 bpf_kprobe_multi_cookie(struct bpf_run_ctx *ctx); +static u64 bpf_kprobe_multi_entry_ip(struct bpf_run_ctx *ctx); /** * trace_call_bpf - invoke BPF program @@ -332,8 +337,6 @@ BPF_CALL_3(bpf_probe_write_user, void __user *, unsafe_ptr, const void *, src, if (unlikely(in_interrupt() || current->flags & (PF_KTHREAD | PF_EXITING))) return -EPERM; - if (unlikely(uaccess_kernel())) - return -EPERM; if (unlikely(!nmi_uaccess_okay())) return -EPERM; @@ -835,8 +838,6 @@ static int bpf_send_signal_common(u32 sig, enum pid_type type) */ if (unlikely(current->flags & (PF_KTHREAD | PF_EXITING))) return -EPERM; - if (unlikely(uaccess_kernel())) - return -EPERM; if (unlikely(!nmi_uaccess_okay())) return -EPERM; @@ -1036,6 +1037,30 @@ static const struct bpf_func_proto bpf_get_func_ip_proto_kprobe = { .arg1_type = ARG_PTR_TO_CTX, }; +BPF_CALL_1(bpf_get_func_ip_kprobe_multi, struct pt_regs *, regs) +{ + return bpf_kprobe_multi_entry_ip(current->bpf_ctx); +} + +static const struct bpf_func_proto bpf_get_func_ip_proto_kprobe_multi = { + .func = bpf_get_func_ip_kprobe_multi, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, +}; + +BPF_CALL_1(bpf_get_attach_cookie_kprobe_multi, struct pt_regs *, regs) +{ + return bpf_kprobe_multi_cookie(current->bpf_ctx); +} + +static const struct bpf_func_proto bpf_get_attach_cookie_proto_kmulti = { + .func = bpf_get_attach_cookie_kprobe_multi, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, +}; + BPF_CALL_1(bpf_get_attach_cookie_trace, void *, ctx) { struct bpf_trace_run_ctx *run_ctx; @@ -1235,6 +1260,8 @@ bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_get_task_stack_proto; case BPF_FUNC_copy_from_user: return prog->aux->sleepable ? &bpf_copy_from_user_proto : NULL; + case BPF_FUNC_copy_from_user_task: + return prog->aux->sleepable ? &bpf_copy_from_user_task_proto : NULL; case BPF_FUNC_snprintf_btf: return &bpf_snprintf_btf_proto; case BPF_FUNC_per_cpu_ptr: @@ -1277,9 +1304,13 @@ kprobe_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_override_return_proto; #endif case BPF_FUNC_get_func_ip: - return &bpf_get_func_ip_proto_kprobe; + return prog->expected_attach_type == BPF_TRACE_KPROBE_MULTI ? + &bpf_get_func_ip_proto_kprobe_multi : + &bpf_get_func_ip_proto_kprobe; case BPF_FUNC_get_attach_cookie: - return &bpf_get_attach_cookie_proto_trace; + return prog->expected_attach_type == BPF_TRACE_KPROBE_MULTI ? + &bpf_get_attach_cookie_proto_kmulti : + &bpf_get_attach_cookie_proto_trace; default: return bpf_tracing_func_proto(func_id, prog); } @@ -1562,6 +1593,7 @@ static const struct bpf_func_proto bpf_perf_event_output_proto_raw_tp = { extern const struct bpf_func_proto bpf_skb_output_proto; extern const struct bpf_func_proto bpf_xdp_output_proto; +extern const struct bpf_func_proto bpf_xdp_get_buff_len_trace_proto; BPF_CALL_3(bpf_get_stackid_raw_tp, struct bpf_raw_tracepoint_args *, args, struct bpf_map *, map, u64, flags) @@ -1661,6 +1693,8 @@ tracing_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_sock_from_file_proto; case BPF_FUNC_get_socket_cookie: return &bpf_get_socket_ptr_cookie_proto; + case BPF_FUNC_xdp_get_buff_len: + return &bpf_xdp_get_buff_len_trace_proto; #endif case BPF_FUNC_seq_printf: return prog->expected_attach_type == BPF_TRACE_ITER ? @@ -2176,3 +2210,314 @@ static int __init bpf_event_init(void) fs_initcall(bpf_event_init); #endif /* CONFIG_MODULES */ + +#ifdef CONFIG_FPROBE +struct bpf_kprobe_multi_link { + struct bpf_link link; + struct fprobe fp; + unsigned long *addrs; + u64 *cookies; + u32 cnt; +}; + +struct bpf_kprobe_multi_run_ctx { + struct bpf_run_ctx run_ctx; + struct bpf_kprobe_multi_link *link; + unsigned long entry_ip; +}; + +static void bpf_kprobe_multi_link_release(struct bpf_link *link) +{ + struct bpf_kprobe_multi_link *kmulti_link; + + kmulti_link = container_of(link, struct bpf_kprobe_multi_link, link); + unregister_fprobe(&kmulti_link->fp); +} + +static void bpf_kprobe_multi_link_dealloc(struct bpf_link *link) +{ + struct bpf_kprobe_multi_link *kmulti_link; + + kmulti_link = container_of(link, struct bpf_kprobe_multi_link, link); + kvfree(kmulti_link->addrs); + kvfree(kmulti_link->cookies); + kfree(kmulti_link); +} + +static const struct bpf_link_ops bpf_kprobe_multi_link_lops = { + .release = bpf_kprobe_multi_link_release, + .dealloc = bpf_kprobe_multi_link_dealloc, +}; + +static void bpf_kprobe_multi_cookie_swap(void *a, void *b, int size, const void *priv) +{ + const struct bpf_kprobe_multi_link *link = priv; + unsigned long *addr_a = a, *addr_b = b; + u64 *cookie_a, *cookie_b; + unsigned long tmp1; + u64 tmp2; + + cookie_a = link->cookies + (addr_a - link->addrs); + cookie_b = link->cookies + (addr_b - link->addrs); + + /* swap addr_a/addr_b and cookie_a/cookie_b values */ + tmp1 = *addr_a; *addr_a = *addr_b; *addr_b = tmp1; + tmp2 = *cookie_a; *cookie_a = *cookie_b; *cookie_b = tmp2; +} + +static int __bpf_kprobe_multi_cookie_cmp(const void *a, const void *b) +{ + const unsigned long *addr_a = a, *addr_b = b; + + if (*addr_a == *addr_b) + return 0; + return *addr_a < *addr_b ? -1 : 1; +} + +static int bpf_kprobe_multi_cookie_cmp(const void *a, const void *b, const void *priv) +{ + return __bpf_kprobe_multi_cookie_cmp(a, b); +} + +static u64 bpf_kprobe_multi_cookie(struct bpf_run_ctx *ctx) +{ + struct bpf_kprobe_multi_run_ctx *run_ctx; + struct bpf_kprobe_multi_link *link; + u64 *cookie, entry_ip; + unsigned long *addr; + + if (WARN_ON_ONCE(!ctx)) + return 0; + run_ctx = container_of(current->bpf_ctx, struct bpf_kprobe_multi_run_ctx, run_ctx); + link = run_ctx->link; + if (!link->cookies) + return 0; + entry_ip = run_ctx->entry_ip; + addr = bsearch(&entry_ip, link->addrs, link->cnt, sizeof(entry_ip), + __bpf_kprobe_multi_cookie_cmp); + if (!addr) + return 0; + cookie = link->cookies + (addr - link->addrs); + return *cookie; +} + +static u64 bpf_kprobe_multi_entry_ip(struct bpf_run_ctx *ctx) +{ + struct bpf_kprobe_multi_run_ctx *run_ctx; + + run_ctx = container_of(current->bpf_ctx, struct bpf_kprobe_multi_run_ctx, run_ctx); + return run_ctx->entry_ip; +} + +static int +kprobe_multi_link_prog_run(struct bpf_kprobe_multi_link *link, + unsigned long entry_ip, struct pt_regs *regs) +{ + struct bpf_kprobe_multi_run_ctx run_ctx = { + .link = link, + .entry_ip = entry_ip, + }; + struct bpf_run_ctx *old_run_ctx; + int err; + + if (unlikely(__this_cpu_inc_return(bpf_prog_active) != 1)) { + err = 0; + goto out; + } + + migrate_disable(); + rcu_read_lock(); + old_run_ctx = bpf_set_run_ctx(&run_ctx.run_ctx); + err = bpf_prog_run(link->link.prog, regs); + bpf_reset_run_ctx(old_run_ctx); + rcu_read_unlock(); + migrate_enable(); + + out: + __this_cpu_dec(bpf_prog_active); + return err; +} + +static void +kprobe_multi_link_handler(struct fprobe *fp, unsigned long entry_ip, + struct pt_regs *regs) +{ + struct bpf_kprobe_multi_link *link; + + link = container_of(fp, struct bpf_kprobe_multi_link, fp); + kprobe_multi_link_prog_run(link, entry_ip, regs); +} + +static int +kprobe_multi_resolve_syms(const void *usyms, u32 cnt, + unsigned long *addrs) +{ + unsigned long addr, size; + const char **syms; + int err = -ENOMEM; + unsigned int i; + char *func; + + size = cnt * sizeof(*syms); + syms = kvzalloc(size, GFP_KERNEL); + if (!syms) + return -ENOMEM; + + func = kmalloc(KSYM_NAME_LEN, GFP_KERNEL); + if (!func) + goto error; + + if (copy_from_user(syms, usyms, size)) { + err = -EFAULT; + goto error; + } + + for (i = 0; i < cnt; i++) { + err = strncpy_from_user(func, syms[i], KSYM_NAME_LEN); + if (err == KSYM_NAME_LEN) + err = -E2BIG; + if (err < 0) + goto error; + err = -EINVAL; + addr = kallsyms_lookup_name(func); + if (!addr) + goto error; + if (!kallsyms_lookup_size_offset(addr, &size, NULL)) + goto error; + addr = ftrace_location_range(addr, addr + size - 1); + if (!addr) + goto error; + addrs[i] = addr; + } + + err = 0; +error: + kvfree(syms); + kfree(func); + return err; +} + +int bpf_kprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *prog) +{ + struct bpf_kprobe_multi_link *link = NULL; + struct bpf_link_primer link_primer; + void __user *ucookies; + unsigned long *addrs; + u32 flags, cnt, size; + void __user *uaddrs; + u64 *cookies = NULL; + void __user *usyms; + int err; + + /* no support for 32bit archs yet */ + if (sizeof(u64) != sizeof(void *)) + return -EOPNOTSUPP; + + if (prog->expected_attach_type != BPF_TRACE_KPROBE_MULTI) + return -EINVAL; + + flags = attr->link_create.kprobe_multi.flags; + if (flags & ~BPF_F_KPROBE_MULTI_RETURN) + return -EINVAL; + + uaddrs = u64_to_user_ptr(attr->link_create.kprobe_multi.addrs); + usyms = u64_to_user_ptr(attr->link_create.kprobe_multi.syms); + if (!!uaddrs == !!usyms) + return -EINVAL; + + cnt = attr->link_create.kprobe_multi.cnt; + if (!cnt) + return -EINVAL; + + size = cnt * sizeof(*addrs); + addrs = kvmalloc(size, GFP_KERNEL); + if (!addrs) + return -ENOMEM; + + if (uaddrs) { + if (copy_from_user(addrs, uaddrs, size)) { + err = -EFAULT; + goto error; + } + } else { + err = kprobe_multi_resolve_syms(usyms, cnt, addrs); + if (err) + goto error; + } + + ucookies = u64_to_user_ptr(attr->link_create.kprobe_multi.cookies); + if (ucookies) { + cookies = kvmalloc(size, GFP_KERNEL); + if (!cookies) { + err = -ENOMEM; + goto error; + } + if (copy_from_user(cookies, ucookies, size)) { + err = -EFAULT; + goto error; + } + } + + link = kzalloc(sizeof(*link), GFP_KERNEL); + if (!link) { + err = -ENOMEM; + goto error; + } + + bpf_link_init(&link->link, BPF_LINK_TYPE_KPROBE_MULTI, + &bpf_kprobe_multi_link_lops, prog); + + err = bpf_link_prime(&link->link, &link_primer); + if (err) + goto error; + + if (flags & BPF_F_KPROBE_MULTI_RETURN) + link->fp.exit_handler = kprobe_multi_link_handler; + else + link->fp.entry_handler = kprobe_multi_link_handler; + + link->addrs = addrs; + link->cookies = cookies; + link->cnt = cnt; + + if (cookies) { + /* + * Sorting addresses will trigger sorting cookies as well + * (check bpf_kprobe_multi_cookie_swap). This way we can + * find cookie based on the address in bpf_get_attach_cookie + * helper. + */ + sort_r(addrs, cnt, sizeof(*addrs), + bpf_kprobe_multi_cookie_cmp, + bpf_kprobe_multi_cookie_swap, + link); + } + + err = register_fprobe_ips(&link->fp, addrs, cnt); + if (err) { + bpf_link_cleanup(&link_primer); + return err; + } + + return bpf_link_settle(&link_primer); + +error: + kfree(link); + kvfree(addrs); + kvfree(cookies); + return err; +} +#else /* !CONFIG_FPROBE */ +int bpf_kprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *prog) +{ + return -EOPNOTSUPP; +} +static u64 bpf_kprobe_multi_cookie(struct bpf_run_ctx *ctx) +{ + return 0; +} +static u64 bpf_kprobe_multi_entry_ip(struct bpf_run_ctx *ctx) +{ + return 0; +} +#endif diff --git a/kernel/trace/fgraph.c b/kernel/trace/fgraph.c index 22061d38fc00..19028e072cdb 100644 --- a/kernel/trace/fgraph.c +++ b/kernel/trace/fgraph.c @@ -415,7 +415,9 @@ free: static void ftrace_graph_probe_sched_switch(void *ignore, bool preempt, - struct task_struct *prev, struct task_struct *next) + unsigned int prev_state, + struct task_struct *prev, + struct task_struct *next) { unsigned long long timestamp; int index; diff --git a/kernel/trace/fprobe.c b/kernel/trace/fprobe.c new file mode 100644 index 000000000000..8b2dd5b9dcd1 --- /dev/null +++ b/kernel/trace/fprobe.c @@ -0,0 +1,332 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * fprobe - Simple ftrace probe wrapper for function entry. + */ +#define pr_fmt(fmt) "fprobe: " fmt + +#include <linux/err.h> +#include <linux/fprobe.h> +#include <linux/kallsyms.h> +#include <linux/kprobes.h> +#include <linux/rethook.h> +#include <linux/slab.h> +#include <linux/sort.h> + +#include "trace.h" + +struct fprobe_rethook_node { + struct rethook_node node; + unsigned long entry_ip; +}; + +static void fprobe_handler(unsigned long ip, unsigned long parent_ip, + struct ftrace_ops *ops, struct ftrace_regs *fregs) +{ + struct fprobe_rethook_node *fpr; + struct rethook_node *rh; + struct fprobe *fp; + int bit; + + fp = container_of(ops, struct fprobe, ops); + if (fprobe_disabled(fp)) + return; + + bit = ftrace_test_recursion_trylock(ip, parent_ip); + if (bit < 0) { + fp->nmissed++; + return; + } + + if (fp->entry_handler) + fp->entry_handler(fp, ip, ftrace_get_regs(fregs)); + + if (fp->exit_handler) { + rh = rethook_try_get(fp->rethook); + if (!rh) { + fp->nmissed++; + goto out; + } + fpr = container_of(rh, struct fprobe_rethook_node, node); + fpr->entry_ip = ip; + rethook_hook(rh, ftrace_get_regs(fregs), true); + } + +out: + ftrace_test_recursion_unlock(bit); +} +NOKPROBE_SYMBOL(fprobe_handler); + +static void fprobe_kprobe_handler(unsigned long ip, unsigned long parent_ip, + struct ftrace_ops *ops, struct ftrace_regs *fregs) +{ + struct fprobe *fp = container_of(ops, struct fprobe, ops); + + if (unlikely(kprobe_running())) { + fp->nmissed++; + return; + } + kprobe_busy_begin(); + fprobe_handler(ip, parent_ip, ops, fregs); + kprobe_busy_end(); +} + +static void fprobe_exit_handler(struct rethook_node *rh, void *data, + struct pt_regs *regs) +{ + struct fprobe *fp = (struct fprobe *)data; + struct fprobe_rethook_node *fpr; + + if (!fp || fprobe_disabled(fp)) + return; + + fpr = container_of(rh, struct fprobe_rethook_node, node); + + fp->exit_handler(fp, fpr->entry_ip, regs); +} +NOKPROBE_SYMBOL(fprobe_exit_handler); + +/* Convert ftrace location address from symbols */ +static unsigned long *get_ftrace_locations(const char **syms, int num) +{ + unsigned long addr, size; + unsigned long *addrs; + int i; + + /* Convert symbols to symbol address */ + addrs = kcalloc(num, sizeof(*addrs), GFP_KERNEL); + if (!addrs) + return ERR_PTR(-ENOMEM); + + for (i = 0; i < num; i++) { + addr = kallsyms_lookup_name(syms[i]); + if (!addr) /* Maybe wrong symbol */ + goto error; + + /* Convert symbol address to ftrace location. */ + if (!kallsyms_lookup_size_offset(addr, &size, NULL) || !size) + goto error; + + addr = ftrace_location_range(addr, addr + size - 1); + if (!addr) /* No dynamic ftrace there. */ + goto error; + + addrs[i] = addr; + } + + return addrs; + +error: + kfree(addrs); + + return ERR_PTR(-ENOENT); +} + +static void fprobe_init(struct fprobe *fp) +{ + fp->nmissed = 0; + if (fprobe_shared_with_kprobes(fp)) + fp->ops.func = fprobe_kprobe_handler; + else + fp->ops.func = fprobe_handler; + fp->ops.flags |= FTRACE_OPS_FL_SAVE_REGS; +} + +static int fprobe_init_rethook(struct fprobe *fp, int num) +{ + int i, size; + + if (num < 0) + return -EINVAL; + + if (!fp->exit_handler) { + fp->rethook = NULL; + return 0; + } + + /* Initialize rethook if needed */ + size = num * num_possible_cpus() * 2; + if (size < 0) + return -E2BIG; + + fp->rethook = rethook_alloc((void *)fp, fprobe_exit_handler); + for (i = 0; i < size; i++) { + struct rethook_node *node; + + node = kzalloc(sizeof(struct fprobe_rethook_node), GFP_KERNEL); + if (!node) { + rethook_free(fp->rethook); + fp->rethook = NULL; + return -ENOMEM; + } + rethook_add_node(fp->rethook, node); + } + return 0; +} + +static void fprobe_fail_cleanup(struct fprobe *fp) +{ + if (fp->rethook) { + /* Don't need to cleanup rethook->handler because this is not used. */ + rethook_free(fp->rethook); + fp->rethook = NULL; + } + ftrace_free_filter(&fp->ops); +} + +/** + * register_fprobe() - Register fprobe to ftrace by pattern. + * @fp: A fprobe data structure to be registered. + * @filter: A wildcard pattern of probed symbols. + * @notfilter: A wildcard pattern of NOT probed symbols. + * + * Register @fp to ftrace for enabling the probe on the symbols matched to @filter. + * If @notfilter is not NULL, the symbols matched the @notfilter are not probed. + * + * Return 0 if @fp is registered successfully, -errno if not. + */ +int register_fprobe(struct fprobe *fp, const char *filter, const char *notfilter) +{ + struct ftrace_hash *hash; + unsigned char *str; + int ret, len; + + if (!fp || !filter) + return -EINVAL; + + fprobe_init(fp); + + len = strlen(filter); + str = kstrdup(filter, GFP_KERNEL); + ret = ftrace_set_filter(&fp->ops, str, len, 0); + kfree(str); + if (ret) + return ret; + + if (notfilter) { + len = strlen(notfilter); + str = kstrdup(notfilter, GFP_KERNEL); + ret = ftrace_set_notrace(&fp->ops, str, len, 0); + kfree(str); + if (ret) + goto out; + } + + /* TODO: + * correctly calculate the total number of filtered symbols + * from both filter and notfilter. + */ + hash = fp->ops.local_hash.filter_hash; + if (WARN_ON_ONCE(!hash)) + goto out; + + ret = fprobe_init_rethook(fp, (int)hash->count); + if (!ret) + ret = register_ftrace_function(&fp->ops); + +out: + if (ret) + fprobe_fail_cleanup(fp); + return ret; +} +EXPORT_SYMBOL_GPL(register_fprobe); + +/** + * register_fprobe_ips() - Register fprobe to ftrace by address. + * @fp: A fprobe data structure to be registered. + * @addrs: An array of target ftrace location addresses. + * @num: The number of entries of @addrs. + * + * Register @fp to ftrace for enabling the probe on the address given by @addrs. + * The @addrs must be the addresses of ftrace location address, which may be + * the symbol address + arch-dependent offset. + * If you unsure what this mean, please use other registration functions. + * + * Return 0 if @fp is registered successfully, -errno if not. + */ +int register_fprobe_ips(struct fprobe *fp, unsigned long *addrs, int num) +{ + int ret; + + if (!fp || !addrs || num <= 0) + return -EINVAL; + + fprobe_init(fp); + + ret = ftrace_set_filter_ips(&fp->ops, addrs, num, 0, 0); + if (ret) + return ret; + + ret = fprobe_init_rethook(fp, num); + if (!ret) + ret = register_ftrace_function(&fp->ops); + + if (ret) + fprobe_fail_cleanup(fp); + return ret; +} +EXPORT_SYMBOL_GPL(register_fprobe_ips); + +/** + * register_fprobe_syms() - Register fprobe to ftrace by symbols. + * @fp: A fprobe data structure to be registered. + * @syms: An array of target symbols. + * @num: The number of entries of @syms. + * + * Register @fp to the symbols given by @syms array. This will be useful if + * you are sure the symbols exist in the kernel. + * + * Return 0 if @fp is registered successfully, -errno if not. + */ +int register_fprobe_syms(struct fprobe *fp, const char **syms, int num) +{ + unsigned long *addrs; + int ret; + + if (!fp || !syms || num <= 0) + return -EINVAL; + + addrs = get_ftrace_locations(syms, num); + if (IS_ERR(addrs)) + return PTR_ERR(addrs); + + ret = register_fprobe_ips(fp, addrs, num); + + kfree(addrs); + + return ret; +} +EXPORT_SYMBOL_GPL(register_fprobe_syms); + +/** + * unregister_fprobe() - Unregister fprobe from ftrace + * @fp: A fprobe data structure to be unregistered. + * + * Unregister fprobe (and remove ftrace hooks from the function entries). + * + * Return 0 if @fp is unregistered successfully, -errno if not. + */ +int unregister_fprobe(struct fprobe *fp) +{ + int ret; + + if (!fp || fp->ops.func != fprobe_handler) + return -EINVAL; + + /* + * rethook_free() starts disabling the rethook, but the rethook handlers + * may be running on other processors at this point. To make sure that all + * current running handlers are finished, call unregister_ftrace_function() + * after this. + */ + if (fp->rethook) + rethook_free(fp->rethook); + + ret = unregister_ftrace_function(&fp->ops); + if (ret < 0) + return ret; + + ftrace_free_filter(&fp->ops); + + return ret; +} +EXPORT_SYMBOL_GPL(unregister_fprobe); diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index be5f6b32a012..4f1d2f5e7263 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -1568,17 +1568,34 @@ unsigned long ftrace_location_range(unsigned long start, unsigned long end) } /** - * ftrace_location - return true if the ip giving is a traced location + * ftrace_location - return the ftrace location * @ip: the instruction pointer to check * - * Returns rec->ip if @ip given is a pointer to a ftrace location. - * That is, the instruction that is either a NOP or call to - * the function tracer. It checks the ftrace internal tables to - * determine if the address belongs or not. + * If @ip matches the ftrace location, return @ip. + * If @ip matches sym+0, return sym's ftrace location. + * Otherwise, return 0. */ unsigned long ftrace_location(unsigned long ip) { - return ftrace_location_range(ip, ip); + struct dyn_ftrace *rec; + unsigned long offset; + unsigned long size; + + rec = lookup_rec(ip, ip); + if (!rec) { + if (!kallsyms_lookup_size_offset(ip, &size, &offset)) + goto out; + + /* map sym+0 to __fentry__ */ + if (!offset) + rec = lookup_rec(ip, ip + size - 1); + } + + if (rec) + return rec->ip; + +out: + return 0; } /** @@ -4958,11 +4975,12 @@ ftrace_notrace_write(struct file *file, const char __user *ubuf, } static int -ftrace_match_addr(struct ftrace_hash *hash, unsigned long ip, int remove) +__ftrace_match_addr(struct ftrace_hash *hash, unsigned long ip, int remove) { struct ftrace_func_entry *entry; - if (!ftrace_location(ip)) + ip = ftrace_location(ip); + if (!ip) return -EINVAL; if (remove) { @@ -4977,8 +4995,29 @@ ftrace_match_addr(struct ftrace_hash *hash, unsigned long ip, int remove) } static int +ftrace_match_addr(struct ftrace_hash *hash, unsigned long *ips, + unsigned int cnt, int remove) +{ + unsigned int i; + int err; + + for (i = 0; i < cnt; i++) { + err = __ftrace_match_addr(hash, ips[i], remove); + if (err) { + /* + * This expects the @hash is a temporary hash and if this + * fails the caller must free the @hash. + */ + return err; + } + } + return 0; +} + +static int ftrace_set_hash(struct ftrace_ops *ops, unsigned char *buf, int len, - unsigned long ip, int remove, int reset, int enable) + unsigned long *ips, unsigned int cnt, + int remove, int reset, int enable) { struct ftrace_hash **orig_hash; struct ftrace_hash *hash; @@ -5008,8 +5047,8 @@ ftrace_set_hash(struct ftrace_ops *ops, unsigned char *buf, int len, ret = -EINVAL; goto out_regex_unlock; } - if (ip) { - ret = ftrace_match_addr(hash, ip, remove); + if (ips) { + ret = ftrace_match_addr(hash, ips, cnt, remove); if (ret < 0) goto out_regex_unlock; } @@ -5026,10 +5065,10 @@ ftrace_set_hash(struct ftrace_ops *ops, unsigned char *buf, int len, } static int -ftrace_set_addr(struct ftrace_ops *ops, unsigned long ip, int remove, - int reset, int enable) +ftrace_set_addr(struct ftrace_ops *ops, unsigned long *ips, unsigned int cnt, + int remove, int reset, int enable) { - return ftrace_set_hash(ops, NULL, 0, ip, remove, reset, enable); + return ftrace_set_hash(ops, NULL, 0, ips, cnt, remove, reset, enable); } #ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS @@ -5110,11 +5149,16 @@ int register_ftrace_direct(unsigned long ip, unsigned long addr) struct ftrace_func_entry *entry; struct ftrace_hash *free_hash = NULL; struct dyn_ftrace *rec; - int ret = -EBUSY; + int ret = -ENODEV; mutex_lock(&direct_mutex); + ip = ftrace_location(ip); + if (!ip) + goto out_unlock; + /* See if there's a direct function at @ip already */ + ret = -EBUSY; if (ftrace_find_rec_direct(ip)) goto out_unlock; @@ -5222,6 +5266,10 @@ int unregister_ftrace_direct(unsigned long ip, unsigned long addr) mutex_lock(&direct_mutex); + ip = ftrace_location(ip); + if (!ip) + goto out_unlock; + entry = find_direct_entry(&ip, NULL); if (!entry) goto out_unlock; @@ -5354,6 +5402,11 @@ int modify_ftrace_direct(unsigned long ip, mutex_lock(&direct_mutex); mutex_lock(&ftrace_lock); + + ip = ftrace_location(ip); + if (!ip) + goto out_unlock; + entry = find_direct_entry(&ip, &rec); if (!entry) goto out_unlock; @@ -5634,11 +5687,30 @@ int ftrace_set_filter_ip(struct ftrace_ops *ops, unsigned long ip, int remove, int reset) { ftrace_ops_init(ops); - return ftrace_set_addr(ops, ip, remove, reset, 1); + return ftrace_set_addr(ops, &ip, 1, remove, reset, 1); } EXPORT_SYMBOL_GPL(ftrace_set_filter_ip); /** + * ftrace_set_filter_ips - set functions to filter on in ftrace by addresses + * @ops - the ops to set the filter with + * @ips - the array of addresses to add to or remove from the filter. + * @cnt - the number of addresses in @ips + * @remove - non zero to remove ips from the filter + * @reset - non zero to reset all filters before applying this filter. + * + * Filters denote which functions should be enabled when tracing is enabled + * If @ips array or any ip specified within is NULL , it fails to update filter. + */ +int ftrace_set_filter_ips(struct ftrace_ops *ops, unsigned long *ips, + unsigned int cnt, int remove, int reset) +{ + ftrace_ops_init(ops); + return ftrace_set_addr(ops, ips, cnt, remove, reset, 1); +} +EXPORT_SYMBOL_GPL(ftrace_set_filter_ips); + +/** * ftrace_ops_set_global_filter - setup ops to use global filters * @ops - the ops which will use the global filters * @@ -5659,7 +5731,7 @@ static int ftrace_set_regex(struct ftrace_ops *ops, unsigned char *buf, int len, int reset, int enable) { - return ftrace_set_hash(ops, buf, len, 0, 0, reset, enable); + return ftrace_set_hash(ops, buf, len, NULL, 0, 0, reset, enable); } /** @@ -6394,6 +6466,27 @@ static int ftrace_cmp_ips(const void *a, const void *b) return 0; } +#ifdef CONFIG_FTRACE_SORT_STARTUP_TEST +static void test_is_sorted(unsigned long *start, unsigned long count) +{ + int i; + + for (i = 1; i < count; i++) { + if (WARN(start[i - 1] > start[i], + "[%d] %pS at %lx is not sorted with %pS at %lx\n", i, + (void *)start[i - 1], start[i - 1], + (void *)start[i], start[i])) + break; + } + if (i == count) + pr_info("ftrace section at %px sorted properly\n", start); +} +#else +static void test_is_sorted(unsigned long *start, unsigned long count) +{ +} +#endif + static int ftrace_process_locs(struct module *mod, unsigned long *start, unsigned long *end) @@ -6412,8 +6505,17 @@ static int ftrace_process_locs(struct module *mod, if (!count) return 0; - sort(start, count, sizeof(*start), - ftrace_cmp_ips, NULL); + /* + * Sorting mcount in vmlinux at build time depend on + * CONFIG_BUILDTIME_MCOUNT_SORT, while mcount loc in + * modules can not be sorted at build time. + */ + if (!IS_ENABLED(CONFIG_BUILDTIME_MCOUNT_SORT) || mod) { + sort(start, count, sizeof(*start), + ftrace_cmp_ips, NULL); + } else { + test_is_sorted(start, count); + } start_pg = ftrace_allocate_pages(count); if (!start_pg) @@ -7066,6 +7168,8 @@ void __init ftrace_free_init_mem(void) void *start = (void *)(&__init_begin); void *end = (void *)(&__init_end); + ftrace_boot_snapshot(); + ftrace_free_mem(NULL, start, end); } @@ -7161,7 +7265,6 @@ static int __init ftrace_nodyn_init(void) core_initcall(ftrace_nodyn_init); static inline int ftrace_init_dyn_tracefs(struct dentry *d_tracer) { return 0; } -static inline void ftrace_startup_enable(int command) { } static inline void ftrace_startup_all(int command) { } # define ftrace_startup_sysctl() do { } while (0) @@ -7317,7 +7420,9 @@ ftrace_func_t ftrace_ops_get_func(struct ftrace_ops *ops) static void ftrace_filter_pid_sched_switch_probe(void *data, bool preempt, - struct task_struct *prev, struct task_struct *next) + unsigned int prev_state, + struct task_struct *prev, + struct task_struct *next) { struct trace_array *tr = data; struct trace_pid_list *pid_list; @@ -7761,7 +7866,7 @@ int ftrace_is_dead(void) /** * register_ftrace_function - register a function for profiling - * @ops - ops structure that holds the function for profiling. + * @ops: ops structure that holds the function for profiling. * * Register a function to be called by all functions in the * kernel. @@ -7788,7 +7893,7 @@ EXPORT_SYMBOL_GPL(register_ftrace_function); /** * unregister_ftrace_function - unregister a function for profiling. - * @ops - ops structure that holds the function to unregister + * @ops: ops structure that holds the function to unregister * * Unregister a function that was added to be called by ftrace profiling. */ diff --git a/kernel/trace/rethook.c b/kernel/trace/rethook.c new file mode 100644 index 000000000000..ab463a4d2b23 --- /dev/null +++ b/kernel/trace/rethook.c @@ -0,0 +1,317 @@ +// SPDX-License-Identifier: GPL-2.0 + +#define pr_fmt(fmt) "rethook: " fmt + +#include <linux/bug.h> +#include <linux/kallsyms.h> +#include <linux/kprobes.h> +#include <linux/preempt.h> +#include <linux/rethook.h> +#include <linux/slab.h> +#include <linux/sort.h> + +/* Return hook list (shadow stack by list) */ + +/* + * This function is called from delayed_put_task_struct() when a task is + * dead and cleaned up to recycle any kretprobe instances associated with + * this task. These left over instances represent probed functions that + * have been called but will never return. + */ +void rethook_flush_task(struct task_struct *tk) +{ + struct rethook_node *rhn; + struct llist_node *node; + + node = __llist_del_all(&tk->rethooks); + while (node) { + rhn = container_of(node, struct rethook_node, llist); + node = node->next; + preempt_disable(); + rethook_recycle(rhn); + preempt_enable(); + } +} + +static void rethook_free_rcu(struct rcu_head *head) +{ + struct rethook *rh = container_of(head, struct rethook, rcu); + struct rethook_node *rhn; + struct freelist_node *node; + int count = 1; + + node = rh->pool.head; + while (node) { + rhn = container_of(node, struct rethook_node, freelist); + node = node->next; + kfree(rhn); + count++; + } + + /* The rh->ref is the number of pooled node + 1 */ + if (refcount_sub_and_test(count, &rh->ref)) + kfree(rh); +} + +/** + * rethook_free() - Free struct rethook. + * @rh: the struct rethook to be freed. + * + * Free the rethook. Before calling this function, user must ensure the + * @rh::data is cleaned if needed (or, the handler can access it after + * calling this function.) This function will set the @rh to be freed + * after all rethook_node are freed (not soon). And the caller must + * not touch @rh after calling this. + */ +void rethook_free(struct rethook *rh) +{ + rcu_assign_pointer(rh->handler, NULL); + + call_rcu(&rh->rcu, rethook_free_rcu); +} + +/** + * rethook_alloc() - Allocate struct rethook. + * @data: a data to pass the @handler when hooking the return. + * @handler: the return hook callback function. + * + * Allocate and initialize a new rethook with @data and @handler. + * Return NULL if memory allocation fails or @handler is NULL. + * Note that @handler == NULL means this rethook is going to be freed. + */ +struct rethook *rethook_alloc(void *data, rethook_handler_t handler) +{ + struct rethook *rh = kzalloc(sizeof(struct rethook), GFP_KERNEL); + + if (!rh || !handler) + return NULL; + + rh->data = data; + rh->handler = handler; + rh->pool.head = NULL; + refcount_set(&rh->ref, 1); + + return rh; +} + +/** + * rethook_add_node() - Add a new node to the rethook. + * @rh: the struct rethook. + * @node: the struct rethook_node to be added. + * + * Add @node to @rh. User must allocate @node (as a part of user's + * data structure.) The @node fields are initialized in this function. + */ +void rethook_add_node(struct rethook *rh, struct rethook_node *node) +{ + node->rethook = rh; + freelist_add(&node->freelist, &rh->pool); + refcount_inc(&rh->ref); +} + +static void free_rethook_node_rcu(struct rcu_head *head) +{ + struct rethook_node *node = container_of(head, struct rethook_node, rcu); + + if (refcount_dec_and_test(&node->rethook->ref)) + kfree(node->rethook); + kfree(node); +} + +/** + * rethook_recycle() - return the node to rethook. + * @node: The struct rethook_node to be returned. + * + * Return back the @node to @node::rethook. If the @node::rethook is already + * marked as freed, this will free the @node. + */ +void rethook_recycle(struct rethook_node *node) +{ + lockdep_assert_preemption_disabled(); + + if (likely(READ_ONCE(node->rethook->handler))) + freelist_add(&node->freelist, &node->rethook->pool); + else + call_rcu(&node->rcu, free_rethook_node_rcu); +} +NOKPROBE_SYMBOL(rethook_recycle); + +/** + * rethook_try_get() - get an unused rethook node. + * @rh: The struct rethook which pools the nodes. + * + * Get an unused rethook node from @rh. If the node pool is empty, this + * will return NULL. Caller must disable preemption. + */ +struct rethook_node *rethook_try_get(struct rethook *rh) +{ + rethook_handler_t handler = READ_ONCE(rh->handler); + struct freelist_node *fn; + + lockdep_assert_preemption_disabled(); + + /* Check whether @rh is going to be freed. */ + if (unlikely(!handler)) + return NULL; + + fn = freelist_try_get(&rh->pool); + if (!fn) + return NULL; + + return container_of(fn, struct rethook_node, freelist); +} +NOKPROBE_SYMBOL(rethook_try_get); + +/** + * rethook_hook() - Hook the current function return. + * @node: The struct rethook node to hook the function return. + * @regs: The struct pt_regs for the function entry. + * @mcount: True if this is called from mcount(ftrace) context. + * + * Hook the current running function return. This must be called when the + * function entry (or at least @regs must be the registers of the function + * entry.) @mcount is used for identifying the context. If this is called + * from ftrace (mcount) callback, @mcount must be set true. If this is called + * from the real function entry (e.g. kprobes) @mcount must be set false. + * This is because the way to hook the function return depends on the context. + */ +void rethook_hook(struct rethook_node *node, struct pt_regs *regs, bool mcount) +{ + arch_rethook_prepare(node, regs, mcount); + __llist_add(&node->llist, ¤t->rethooks); +} +NOKPROBE_SYMBOL(rethook_hook); + +/* This assumes the 'tsk' is the current task or is not running. */ +static unsigned long __rethook_find_ret_addr(struct task_struct *tsk, + struct llist_node **cur) +{ + struct rethook_node *rh = NULL; + struct llist_node *node = *cur; + + if (!node) + node = tsk->rethooks.first; + else + node = node->next; + + while (node) { + rh = container_of(node, struct rethook_node, llist); + if (rh->ret_addr != (unsigned long)arch_rethook_trampoline) { + *cur = node; + return rh->ret_addr; + } + node = node->next; + } + return 0; +} +NOKPROBE_SYMBOL(__rethook_find_ret_addr); + +/** + * rethook_find_ret_addr -- Find correct return address modified by rethook + * @tsk: Target task + * @frame: A frame pointer + * @cur: a storage of the loop cursor llist_node pointer for next call + * + * Find the correct return address modified by a rethook on @tsk in unsigned + * long type. + * The @tsk must be 'current' or a task which is not running. @frame is a hint + * to get the currect return address - which is compared with the + * rethook::frame field. The @cur is a loop cursor for searching the + * kretprobe return addresses on the @tsk. The '*@cur' should be NULL at the + * first call, but '@cur' itself must NOT NULL. + * + * Returns found address value or zero if not found. + */ +unsigned long rethook_find_ret_addr(struct task_struct *tsk, unsigned long frame, + struct llist_node **cur) +{ + struct rethook_node *rhn = NULL; + unsigned long ret; + + if (WARN_ON_ONCE(!cur)) + return 0; + + if (WARN_ON_ONCE(tsk != current && task_is_running(tsk))) + return 0; + + do { + ret = __rethook_find_ret_addr(tsk, cur); + if (!ret) + break; + rhn = container_of(*cur, struct rethook_node, llist); + } while (rhn->frame != frame); + + return ret; +} +NOKPROBE_SYMBOL(rethook_find_ret_addr); + +void __weak arch_rethook_fixup_return(struct pt_regs *regs, + unsigned long correct_ret_addr) +{ + /* + * Do nothing by default. If the architecture which uses a + * frame pointer to record real return address on the stack, + * it should fill this function to fixup the return address + * so that stacktrace works from the rethook handler. + */ +} + +/* This function will be called from each arch-defined trampoline. */ +unsigned long rethook_trampoline_handler(struct pt_regs *regs, + unsigned long frame) +{ + struct llist_node *first, *node = NULL; + unsigned long correct_ret_addr; + rethook_handler_t handler; + struct rethook_node *rhn; + + correct_ret_addr = __rethook_find_ret_addr(current, &node); + if (!correct_ret_addr) { + pr_err("rethook: Return address not found! Maybe there is a bug in the kernel\n"); + BUG_ON(1); + } + + instruction_pointer_set(regs, correct_ret_addr); + + /* + * These loops must be protected from rethook_free_rcu() because those + * are accessing 'rhn->rethook'. + */ + preempt_disable(); + + /* + * Run the handler on the shadow stack. Do not unlink the list here because + * stackdump inside the handlers needs to decode it. + */ + first = current->rethooks.first; + while (first) { + rhn = container_of(first, struct rethook_node, llist); + if (WARN_ON_ONCE(rhn->frame != frame)) + break; + handler = READ_ONCE(rhn->rethook->handler); + if (handler) + handler(rhn, rhn->rethook->data, regs); + + if (first == node) + break; + first = first->next; + } + + /* Fixup registers for returning to correct address. */ + arch_rethook_fixup_return(regs, correct_ret_addr); + + /* Unlink used shadow stack */ + first = current->rethooks.first; + current->rethooks.first = node->next; + node->next = NULL; + + while (first) { + rhn = container_of(first, struct rethook_node, llist); + first = first->next; + rethook_recycle(rhn); + } + preempt_enable(); + + return correct_ret_addr; +} +NOKPROBE_SYMBOL(rethook_trampoline_handler); diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 78ea542ce3bc..f4de111fa18f 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -185,6 +185,7 @@ static char bootup_tracer_buf[MAX_TRACER_SIZE] __initdata; static char *default_bootup_tracer; static bool allocate_snapshot; +static bool snapshot_at_boot; static int __init set_cmdline_ftrace(char *str) { @@ -230,12 +231,21 @@ static int __init boot_alloc_snapshot(char *str) __setup("alloc_snapshot", boot_alloc_snapshot); +static int __init boot_snapshot(char *str) +{ + snapshot_at_boot = true; + boot_alloc_snapshot(str); + return 1; +} +__setup("ftrace_boot_snapshot", boot_snapshot); + + static char trace_boot_options_buf[MAX_TRACER_SIZE] __initdata; static int __init set_trace_boot_options(char *str) { strlcpy(trace_boot_options_buf, str, MAX_TRACER_SIZE); - return 0; + return 1; } __setup("trace_options=", set_trace_boot_options); @@ -246,12 +256,16 @@ static int __init set_trace_boot_clock(char *str) { strlcpy(trace_boot_clock_buf, str, MAX_TRACER_SIZE); trace_boot_clock = trace_boot_clock_buf; - return 0; + return 1; } __setup("trace_clock=", set_trace_boot_clock); static int __init set_tracepoint_printk(char *str) { + /* Ignore the "tp_printk_stop_on_boot" param */ + if (*str == '_') + return 0; + if ((strcmp(str, "=0") != 0 && strcmp(str, "=off") != 0)) tracepoint_printk = 1; return 1; @@ -980,6 +994,8 @@ __buffer_unlock_commit(struct trace_buffer *buffer, struct ring_buffer_event *ev ring_buffer_write(buffer, event->array[0], &event->array[1]); /* Release the temp buffer */ this_cpu_dec(trace_buffered_event_cnt); + /* ring_buffer_unlock_commit() enables preemption */ + preempt_enable_notrace(); } else ring_buffer_unlock_commit(buffer, event); } @@ -1468,10 +1484,12 @@ static int __init set_buf_size(char *str) if (!str) return 0; buf_size = memparse(str, &str); - /* nr_entries can not be zero */ - if (buf_size == 0) - return 0; - trace_buf_size = buf_size; + /* + * nr_entries can not be zero and the startup + * tests require some buffer space. Therefore + * ensure we have at least 4096 bytes of buffer. + */ + trace_buf_size = max(4096UL, buf_size); return 1; } __setup("trace_buf_size=", set_buf_size); @@ -2601,6 +2619,8 @@ unsigned int tracing_gen_ctx_irq_test(unsigned int irqs_status) trace_flags |= TRACE_FLAG_HARDIRQ; if (in_serving_softirq()) trace_flags |= TRACE_FLAG_SOFTIRQ; + if (softirq_count() >> (SOFTIRQ_SHIFT + 1)) + trace_flags |= TRACE_FLAG_BH_OFF; if (tif_need_resched()) trace_flags |= TRACE_FLAG_NEED_RESCHED; @@ -2745,8 +2765,8 @@ trace_event_buffer_lock_reserve(struct trace_buffer **current_rb, *current_rb = tr->array_buffer.buffer; if (!tr->no_filter_buffering_ref && - (trace_file->flags & (EVENT_FILE_FL_SOFT_DISABLED | EVENT_FILE_FL_FILTERED)) && - (entry = this_cpu_read(trace_buffered_event))) { + (trace_file->flags & (EVENT_FILE_FL_SOFT_DISABLED | EVENT_FILE_FL_FILTERED))) { + preempt_disable_notrace(); /* * Filtering is on, so try to use the per cpu buffer first. * This buffer will simulate a ring_buffer_event, @@ -2764,33 +2784,38 @@ trace_event_buffer_lock_reserve(struct trace_buffer **current_rb, * is still quicker than no copy on match, but having * to discard out of the ring buffer on a failed match. */ - int max_len = PAGE_SIZE - struct_size(entry, array, 1); + if ((entry = __this_cpu_read(trace_buffered_event))) { + int max_len = PAGE_SIZE - struct_size(entry, array, 1); - val = this_cpu_inc_return(trace_buffered_event_cnt); + val = this_cpu_inc_return(trace_buffered_event_cnt); - /* - * Preemption is disabled, but interrupts and NMIs - * can still come in now. If that happens after - * the above increment, then it will have to go - * back to the old method of allocating the event - * on the ring buffer, and if the filter fails, it - * will have to call ring_buffer_discard_commit() - * to remove it. - * - * Need to also check the unlikely case that the - * length is bigger than the temp buffer size. - * If that happens, then the reserve is pretty much - * guaranteed to fail, as the ring buffer currently - * only allows events less than a page. But that may - * change in the future, so let the ring buffer reserve - * handle the failure in that case. - */ - if (val == 1 && likely(len <= max_len)) { - trace_event_setup(entry, type, trace_ctx); - entry->array[0] = len; - return entry; + /* + * Preemption is disabled, but interrupts and NMIs + * can still come in now. If that happens after + * the above increment, then it will have to go + * back to the old method of allocating the event + * on the ring buffer, and if the filter fails, it + * will have to call ring_buffer_discard_commit() + * to remove it. + * + * Need to also check the unlikely case that the + * length is bigger than the temp buffer size. + * If that happens, then the reserve is pretty much + * guaranteed to fail, as the ring buffer currently + * only allows events less than a page. But that may + * change in the future, so let the ring buffer reserve + * handle the failure in that case. + */ + if (val == 1 && likely(len <= max_len)) { + trace_event_setup(entry, type, trace_ctx); + entry->array[0] = len; + /* Return with preemption disabled */ + return entry; + } + this_cpu_dec(trace_buffered_event_cnt); } - this_cpu_dec(trace_buffered_event_cnt); + /* __trace_buffer_lock_reserve() disables preemption */ + preempt_enable_notrace(); } entry = __trace_buffer_lock_reserve(*current_rb, type, len, @@ -3648,12 +3673,17 @@ static char *trace_iter_expand_format(struct trace_iterator *iter) } /* Returns true if the string is safe to dereference from an event */ -static bool trace_safe_str(struct trace_iterator *iter, const char *str) +static bool trace_safe_str(struct trace_iterator *iter, const char *str, + bool star, int len) { unsigned long addr = (unsigned long)str; struct trace_event *trace_event; struct trace_event_call *event; + /* Ignore strings with no length */ + if (star && !len) + return true; + /* OK if part of the event data */ if ((addr >= (unsigned long)iter->ent) && (addr < (unsigned long)iter->ent + iter->ent_size)) @@ -3839,7 +3869,7 @@ void trace_check_vprintf(struct trace_iterator *iter, const char *fmt, * instead. See samples/trace_events/trace-events-sample.h * for reference. */ - if (WARN_ONCE(!trace_safe_str(iter, str), + if (WARN_ONCE(!trace_safe_str(iter, str, star, len), "fmt: '%s' current_buffer: '%s'", fmt, show_buffer(&iter->seq))) { int ret; @@ -4183,7 +4213,7 @@ unsigned long trace_total_entries(struct trace_array *tr) static void print_lat_help_header(struct seq_file *m) { seq_puts(m, "# _------=> CPU# \n" - "# / _-----=> irqs-off \n" + "# / _-----=> irqs-off/BH-disabled\n" "# | / _----=> need-resched \n" "# || / _---=> hardirq/softirq \n" "# ||| / _--=> preempt-depth \n" @@ -4224,7 +4254,7 @@ static void print_func_help_header_irq(struct array_buffer *buf, struct seq_file print_event_info(buf, m); - seq_printf(m, "# %.*s _-----=> irqs-off\n", prec, space); + seq_printf(m, "# %.*s _-----=> irqs-off/BH-disabled\n", prec, space); seq_printf(m, "# %.*s / _----=> need-resched\n", prec, space); seq_printf(m, "# %.*s| / _---=> hardirq/softirq\n", prec, space); seq_printf(m, "# %.*s|| / _--=> preempt-depth\n", prec, space); @@ -4834,6 +4864,12 @@ int tracing_open_generic_tr(struct inode *inode, struct file *filp) return 0; } +static int tracing_mark_open(struct inode *inode, struct file *filp) +{ + stream_open(inode, filp); + return tracing_open_generic_tr(inode, filp); +} + static int tracing_release(struct inode *inode, struct file *file) { struct trace_array *tr = inode->i_private; @@ -5635,7 +5671,7 @@ static const char readme_msg[] = "\t - a numeric literal: e.g. ms_per_sec=1000,\n" "\t - an arithmetic expression: e.g. time_secs=current_timestamp/1000\n" "\n" - "\t hist trigger aritmethic expressions support addition(+), subtraction(-),\n" + "\t hist trigger arithmetic expressions support addition(+), subtraction(-),\n" "\t multiplication(*) and division(/) operators. An operand can be either a\n" "\t variable reference, field or numeric literal.\n" "\n" @@ -6718,10 +6754,9 @@ waitagain: cnt = PAGE_SIZE - 1; /* reset all but tr, trace, and overruns */ - memset_startat(iter, 0, seq); + trace_iterator_reset(iter); cpumask_clear(iter->started); trace_seq_init(&iter->seq); - iter->pos = -1; trace_event_read_lock(); trace_access_lock(iter->cpu_file); @@ -7110,9 +7145,6 @@ tracing_mark_write(struct file *filp, const char __user *ubuf, if (tt) event_triggers_post_call(tr->trace_marker_file, tt); - if (written > 0) - *fpos += written; - return written; } @@ -7171,9 +7203,6 @@ tracing_mark_raw_write(struct file *filp, const char __user *ubuf, __buffer_unlock_commit(buffer, event); - if (written > 0) - *fpos += written; - return written; } @@ -7573,16 +7602,14 @@ static const struct file_operations tracing_free_buffer_fops = { }; static const struct file_operations tracing_mark_fops = { - .open = tracing_open_generic_tr, + .open = tracing_mark_open, .write = tracing_mark_write, - .llseek = generic_file_llseek, .release = tracing_release_generic_tr, }; static const struct file_operations tracing_mark_raw_fops = { - .open = tracing_open_generic_tr, + .open = tracing_mark_open, .write = tracing_mark_raw_write, - .llseek = generic_file_llseek, .release = tracing_release_generic_tr, }; @@ -7713,7 +7740,7 @@ const struct file_operations trace_min_max_fops = { struct err_info { const char **errs; /* ptr to loc-specific array of err strings */ u8 type; /* index into errs -> specific err string */ - u8 pos; /* MAX_FILTER_STR_VAL = 256 */ + u16 pos; /* caret position */ u64 ts; }; @@ -7721,25 +7748,52 @@ struct tracing_log_err { struct list_head list; struct err_info info; char loc[TRACING_LOG_LOC_MAX]; /* err location */ - char cmd[MAX_FILTER_STR_VAL]; /* what caused err */ + char *cmd; /* what caused err */ }; static DEFINE_MUTEX(tracing_err_log_lock); -static struct tracing_log_err *get_tracing_log_err(struct trace_array *tr) +static struct tracing_log_err *alloc_tracing_log_err(int len) +{ + struct tracing_log_err *err; + + err = kzalloc(sizeof(*err), GFP_KERNEL); + if (!err) + return ERR_PTR(-ENOMEM); + + err->cmd = kzalloc(len, GFP_KERNEL); + if (!err->cmd) { + kfree(err); + return ERR_PTR(-ENOMEM); + } + + return err; +} + +static void free_tracing_log_err(struct tracing_log_err *err) +{ + kfree(err->cmd); + kfree(err); +} + +static struct tracing_log_err *get_tracing_log_err(struct trace_array *tr, + int len) { struct tracing_log_err *err; if (tr->n_err_log_entries < TRACING_LOG_ERRS_MAX) { - err = kzalloc(sizeof(*err), GFP_KERNEL); - if (!err) - err = ERR_PTR(-ENOMEM); - tr->n_err_log_entries++; + err = alloc_tracing_log_err(len); + if (PTR_ERR(err) != -ENOMEM) + tr->n_err_log_entries++; return err; } err = list_first_entry(&tr->err_log, struct tracing_log_err, list); + kfree(err->cmd); + err->cmd = kzalloc(len, GFP_KERNEL); + if (!err->cmd) + return ERR_PTR(-ENOMEM); list_del(&err->list); return err; @@ -7800,22 +7854,25 @@ unsigned int err_pos(char *cmd, const char *str) */ void tracing_log_err(struct trace_array *tr, const char *loc, const char *cmd, - const char **errs, u8 type, u8 pos) + const char **errs, u8 type, u16 pos) { struct tracing_log_err *err; + int len = 0; if (!tr) tr = &global_trace; + len += sizeof(CMD_PREFIX) + 2 * sizeof("\n") + strlen(cmd) + 1; + mutex_lock(&tracing_err_log_lock); - err = get_tracing_log_err(tr); + err = get_tracing_log_err(tr, len); if (PTR_ERR(err) == -ENOMEM) { mutex_unlock(&tracing_err_log_lock); return; } snprintf(err->loc, TRACING_LOG_LOC_MAX, "%s: error: ", loc); - snprintf(err->cmd, MAX_FILTER_STR_VAL,"\n" CMD_PREFIX "%s\n", cmd); + snprintf(err->cmd, len, "\n" CMD_PREFIX "%s\n", cmd); err->info.errs = errs; err->info.type = type; @@ -7833,7 +7890,7 @@ static void clear_tracing_err_log(struct trace_array *tr) mutex_lock(&tracing_err_log_lock); list_for_each_entry_safe(err, next, &tr->err_log, list) { list_del(&err->list); - kfree(err); + free_tracing_log_err(err); } tr->n_err_log_entries = 0; @@ -7861,9 +7918,9 @@ static void tracing_err_log_seq_stop(struct seq_file *m, void *v) mutex_unlock(&tracing_err_log_lock); } -static void tracing_err_log_show_pos(struct seq_file *m, u8 pos) +static void tracing_err_log_show_pos(struct seq_file *m, u16 pos) { - u8 i; + u16 i; for (i = 0; i < sizeof(CMD_PREFIX) - 1; i++) seq_putc(m, ' '); @@ -10109,6 +10166,14 @@ out: return ret; } +void __init ftrace_boot_snapshot(void) +{ + if (snapshot_at_boot) { + tracing_snapshot(); + internal_trace_puts("** Boot snapshot taken **\n"); + } +} + void __init early_trace_init(void) { if (tracepoint_printk) { diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 38715aa6cfdf..07d990270e2a 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -83,6 +83,9 @@ enum trace_type { #undef __dynamic_array #define __dynamic_array(type, item) type item[]; +#undef __rel_dynamic_array +#define __rel_dynamic_array(type, item) type item[]; + #undef F_STRUCT #define F_STRUCT(args...) args @@ -133,7 +136,6 @@ struct kprobe_trace_entry_head { struct eprobe_trace_entry_head { struct trace_entry ent; - unsigned int type; }; struct kretprobe_trace_entry_head { @@ -1334,10 +1336,12 @@ __trace_event_discard_commit(struct trace_buffer *buffer, struct ring_buffer_event *event) { if (this_cpu_read(trace_buffered_event) == event) { - /* Simply release the temp buffer */ + /* Simply release the temp buffer and enable preemption */ this_cpu_dec(trace_buffered_event_cnt); + preempt_enable_notrace(); return; } + /* ring_buffer_discard_commit() enables preemption */ ring_buffer_discard_commit(buffer, event); } @@ -1465,6 +1469,7 @@ struct filter_pred { static inline bool is_string_field(struct ftrace_event_field *field) { return field->filter_type == FILTER_DYN_STRING || + field->filter_type == FILTER_RDYN_STRING || field->filter_type == FILTER_STATIC_STRING || field->filter_type == FILTER_PTR_STRING || field->filter_type == FILTER_COMM; @@ -1572,15 +1577,13 @@ extern int event_enable_trigger_print(struct seq_file *m, struct event_trigger_data *data); extern void event_enable_trigger_free(struct event_trigger_ops *ops, struct event_trigger_data *data); -extern int event_enable_trigger_func(struct event_command *cmd_ops, - struct trace_event_file *file, - char *glob, char *cmd, char *param); +extern int event_enable_trigger_parse(struct event_command *cmd_ops, + struct trace_event_file *file, + char *glob, char *cmd, char *param); extern int event_enable_register_trigger(char *glob, - struct event_trigger_ops *ops, struct event_trigger_data *data, struct trace_event_file *file); extern void event_enable_unregister_trigger(char *glob, - struct event_trigger_ops *ops, struct event_trigger_data *test, struct trace_event_file *file); extern void trigger_data_free(struct event_trigger_data *data); @@ -1606,6 +1609,30 @@ get_named_trigger_data(struct event_trigger_data *data); extern int register_event_command(struct event_command *cmd); extern int unregister_event_command(struct event_command *cmd); extern int register_trigger_hist_enable_disable_cmds(void); +extern bool event_trigger_check_remove(const char *glob); +extern bool event_trigger_empty_param(const char *param); +extern int event_trigger_separate_filter(char *param_and_filter, char **param, + char **filter, bool param_required); +extern struct event_trigger_data * +event_trigger_alloc(struct event_command *cmd_ops, + char *cmd, + char *param, + void *private_data); +extern int event_trigger_parse_num(char *trigger, + struct event_trigger_data *trigger_data); +extern int event_trigger_set_filter(struct event_command *cmd_ops, + struct trace_event_file *file, + char *param, + struct event_trigger_data *trigger_data); +extern void event_trigger_reset_filter(struct event_command *cmd_ops, + struct event_trigger_data *trigger_data); +extern int event_trigger_register(struct event_command *cmd_ops, + struct trace_event_file *file, + char *glob, + char *cmd, + char *trigger, + struct event_trigger_data *trigger_data, + int *n_registered); /** * struct event_trigger_ops - callbacks for trace event triggers @@ -1613,10 +1640,20 @@ extern int register_trigger_hist_enable_disable_cmds(void); * The methods in this structure provide per-event trigger hooks for * various trigger operations. * + * The @init and @free methods are used during trigger setup and + * teardown, typically called from an event_command's @parse() + * function implementation. + * + * The @print method is used to print the trigger spec. + * + * The @trigger method is the function that actually implements the + * trigger and is called in the context of the triggering event + * whenever that event occurs. + * * All the methods below, except for @init() and @free(), must be * implemented. * - * @func: The trigger 'probe' function called when the triggering + * @trigger: The trigger 'probe' function called when the triggering * event occurs. The data passed into this callback is the data * that was supplied to the event_command @reg() function that * registered the trigger (see struct event_command) along with @@ -1645,9 +1682,10 @@ extern int register_trigger_hist_enable_disable_cmds(void); * (see trace_event_triggers.c). */ struct event_trigger_ops { - void (*func)(struct event_trigger_data *data, - struct trace_buffer *buffer, void *rec, - struct ring_buffer_event *rbe); + void (*trigger)(struct event_trigger_data *data, + struct trace_buffer *buffer, + void *rec, + struct ring_buffer_event *rbe); int (*init)(struct event_trigger_ops *ops, struct event_trigger_data *data); void (*free)(struct event_trigger_ops *ops, @@ -1696,7 +1734,7 @@ struct event_trigger_ops { * All the methods below, except for @set_filter() and @unreg_all(), * must be implemented. * - * @func: The callback function responsible for parsing and + * @parse: The callback function responsible for parsing and * registering the trigger written to the 'trigger' file by the * user. It allocates the trigger instance and registers it with * the appropriate trace event. It makes use of the other @@ -1731,21 +1769,24 @@ struct event_trigger_ops { * * @get_trigger_ops: The callback function invoked to retrieve the * event_trigger_ops implementation associated with the command. + * This callback function allows a single event_command to + * support multiple trigger implementations via different sets of + * event_trigger_ops, depending on the value of the @param + * string. */ struct event_command { struct list_head list; char *name; enum event_trigger_type trigger_type; int flags; - int (*func)(struct event_command *cmd_ops, - struct trace_event_file *file, - char *glob, char *cmd, char *params); + int (*parse)(struct event_command *cmd_ops, + struct trace_event_file *file, + char *glob, char *cmd, + char *param_and_filter); int (*reg)(char *glob, - struct event_trigger_ops *ops, struct event_trigger_data *data, struct trace_event_file *file); void (*unreg)(char *glob, - struct event_trigger_ops *ops, struct event_trigger_data *data, struct trace_event_file *file); void (*unreg_all)(struct trace_event_file *file); @@ -1836,7 +1877,7 @@ extern ssize_t trace_parse_run_command(struct file *file, extern unsigned int err_pos(char *cmd, const char *str); extern void tracing_log_err(struct trace_array *tr, const char *loc, const char *cmd, - const char **errs, u8 type, u8 pos); + const char **errs, u8 type, u16 pos); /* * Normal trace_printk() and friends allocates special buffers @@ -1926,14 +1967,7 @@ extern struct trace_iterator *tracepoint_print_iter; */ static __always_inline void trace_iterator_reset(struct trace_iterator *iter) { - const size_t offset = offsetof(struct trace_iterator, seq); - - /* - * Keep gcc from complaining about overwriting more than just one - * member in the structure. - */ - memset((char *)iter + offset, 0, sizeof(struct trace_iterator) - offset); - + memset_startat(iter, 0, seq); iter->pos = -1; } diff --git a/kernel/trace/trace_eprobe.c b/kernel/trace/trace_eprobe.c index 928867f527e7..541aa13581b9 100644 --- a/kernel/trace/trace_eprobe.c +++ b/kernel/trace/trace_eprobe.c @@ -242,7 +242,6 @@ static int trace_eprobe_tp_arg_update(struct trace_eprobe *ep, int i) static int eprobe_event_define_fields(struct trace_event_call *event_call) { - int ret; struct eprobe_trace_entry_head field; struct trace_probe *tp; @@ -250,8 +249,6 @@ static int eprobe_event_define_fields(struct trace_event_call *event_call) if (WARN_ON_ONCE(!tp)) return -ENOENT; - DEFINE_FIELD(unsigned int, type, FIELD_STRING_TYPE, 0); - return traceprobe_define_arg_fields(event_call, sizeof(field), tp); } @@ -270,7 +267,9 @@ print_eprobe_event(struct trace_iterator *iter, int flags, struct trace_event_call *pevent; struct trace_event *probed_event; struct trace_seq *s = &iter->seq; + struct trace_eprobe *ep; struct trace_probe *tp; + unsigned int type; field = (struct eprobe_trace_entry_head *)iter->ent; tp = trace_probe_primary_from_call( @@ -278,15 +277,18 @@ print_eprobe_event(struct trace_iterator *iter, int flags, if (WARN_ON_ONCE(!tp)) goto out; + ep = container_of(tp, struct trace_eprobe, tp); + type = ep->event->event.type; + trace_seq_printf(s, "%s: (", trace_probe_name(tp)); - probed_event = ftrace_find_event(field->type); + probed_event = ftrace_find_event(type); if (probed_event) { pevent = container_of(probed_event, struct trace_event_call, event); trace_seq_printf(s, "%s.%s", pevent->class->system, trace_event_name(pevent)); } else { - trace_seq_printf(s, "%u", field->type); + trace_seq_printf(s, "%u", type); } trace_seq_putc(s, ')'); @@ -489,25 +491,15 @@ __eprobe_trace_func(struct eprobe_data *edata, void *rec) if (trace_trigger_soft_disabled(edata->file)) return; - fbuffer.trace_ctx = tracing_gen_ctx(); - fbuffer.trace_file = edata->file; - dsize = get_eprobe_size(&edata->ep->tp, rec); - fbuffer.regs = NULL; - - fbuffer.event = - trace_event_buffer_lock_reserve(&fbuffer.buffer, edata->file, - call->event.type, - sizeof(*entry) + edata->ep->tp.size + dsize, - fbuffer.trace_ctx); - if (!fbuffer.event) + + entry = trace_event_buffer_reserve(&fbuffer, edata->file, + sizeof(*entry) + edata->ep->tp.size + dsize); + + if (!entry) return; entry = fbuffer.entry = ring_buffer_event_data(fbuffer.event); - if (edata->ep->event) - entry->type = edata->ep->event->event.type; - else - entry->type = 0; store_trace_args(&entry[1], &edata->ep->tp, rec, sizeof(*entry), dsize); trace_event_buffer_commit(&fbuffer); @@ -549,29 +541,29 @@ static void eprobe_trigger_func(struct event_trigger_data *data, } static struct event_trigger_ops eprobe_trigger_ops = { - .func = eprobe_trigger_func, + .trigger = eprobe_trigger_func, .print = eprobe_trigger_print, .init = eprobe_trigger_init, .free = eprobe_trigger_free, }; -static int eprobe_trigger_cmd_func(struct event_command *cmd_ops, - struct trace_event_file *file, - char *glob, char *cmd, char *param) +static int eprobe_trigger_cmd_parse(struct event_command *cmd_ops, + struct trace_event_file *file, + char *glob, char *cmd, char *param) { return -1; } -static int eprobe_trigger_reg_func(char *glob, struct event_trigger_ops *ops, - struct event_trigger_data *data, - struct trace_event_file *file) +static int eprobe_trigger_reg_func(char *glob, + struct event_trigger_data *data, + struct trace_event_file *file) { return -1; } -static void eprobe_trigger_unreg_func(char *glob, struct event_trigger_ops *ops, - struct event_trigger_data *data, - struct trace_event_file *file) +static void eprobe_trigger_unreg_func(char *glob, + struct event_trigger_data *data, + struct trace_event_file *file) { } @@ -586,7 +578,7 @@ static struct event_command event_trigger_cmd = { .name = "eprobe", .trigger_type = ETT_EVENT_EPROBE, .flags = EVENT_CMD_FL_NEEDS_REC, - .func = eprobe_trigger_cmd_func, + .parse = eprobe_trigger_cmd_parse, .reg = eprobe_trigger_reg_func, .unreg = eprobe_trigger_unreg_func, .unreg_all = NULL, diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 92be9cb1d7d4..e11e167b7809 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -40,6 +40,14 @@ static LIST_HEAD(ftrace_generic_fields); static LIST_HEAD(ftrace_common_fields); static bool eventdir_initialized; +static LIST_HEAD(module_strings); + +struct module_string { + struct list_head next; + struct module *module; + char *str; +}; + #define GFP_TRACE (GFP_KERNEL | __GFP_ZERO) static struct kmem_cache *field_cachep; @@ -384,6 +392,12 @@ static void test_event_printk(struct trace_event_call *call) if (!(dereference_flags & (1ULL << arg))) goto next_arg; + /* Check for __get_sockaddr */; + if (str_has_prefix(fmt + i, "__get_sockaddr(")) { + dereference_flags &= ~(1ULL << arg); + goto next_arg; + } + /* Find the REC-> in the argument */ c = strchr(fmt + i, ','); r = strstr(fmt + i, "REC->"); @@ -759,7 +773,9 @@ void trace_event_follow_fork(struct trace_array *tr, bool enable) static void event_filter_pid_sched_switch_probe_pre(void *data, bool preempt, - struct task_struct *prev, struct task_struct *next) + unsigned int prev_state, + struct task_struct *prev, + struct task_struct *next) { struct trace_array *tr = data; struct trace_pid_list *no_pid_list; @@ -783,7 +799,9 @@ event_filter_pid_sched_switch_probe_pre(void *data, bool preempt, static void event_filter_pid_sched_switch_probe_post(void *data, bool preempt, - struct task_struct *prev, struct task_struct *next) + unsigned int prev_state, + struct task_struct *prev, + struct task_struct *next) { struct trace_array *tr = data; struct trace_pid_list *no_pid_list; @@ -2633,6 +2651,76 @@ static void update_event_printk(struct trace_event_call *call, } } +static void add_str_to_module(struct module *module, char *str) +{ + struct module_string *modstr; + + modstr = kmalloc(sizeof(*modstr), GFP_KERNEL); + + /* + * If we failed to allocate memory here, then we'll just + * let the str memory leak when the module is removed. + * If this fails to allocate, there's worse problems than + * a leaked string on module removal. + */ + if (WARN_ON_ONCE(!modstr)) + return; + + modstr->module = module; + modstr->str = str; + + list_add(&modstr->next, &module_strings); +} + +static void update_event_fields(struct trace_event_call *call, + struct trace_eval_map *map) +{ + struct ftrace_event_field *field; + struct list_head *head; + char *ptr; + char *str; + int len = strlen(map->eval_string); + + /* Dynamic events should never have field maps */ + if (WARN_ON_ONCE(call->flags & TRACE_EVENT_FL_DYNAMIC)) + return; + + head = trace_get_fields(call); + list_for_each_entry(field, head, link) { + ptr = strchr(field->type, '['); + if (!ptr) + continue; + ptr++; + + if (!isalpha(*ptr) && *ptr != '_') + continue; + + if (strncmp(map->eval_string, ptr, len) != 0) + continue; + + str = kstrdup(field->type, GFP_KERNEL); + if (WARN_ON_ONCE(!str)) + return; + ptr = str + (ptr - field->type); + ptr = eval_replace(ptr, map, len); + /* enum/sizeof string smaller than value */ + if (WARN_ON_ONCE(!ptr)) { + kfree(str); + continue; + } + + /* + * If the event is part of a module, then we need to free the string + * when the module is removed. Otherwise, it will stay allocated + * until a reboot. + */ + if (call->module) + add_str_to_module(call->module, str); + + field->type = str; + } +} + void trace_event_eval_update(struct trace_eval_map **map, int len) { struct trace_event_call *call, *p; @@ -2668,6 +2756,7 @@ void trace_event_eval_update(struct trace_eval_map **map, int len) first = false; } update_event_printk(call, map[i]); + update_event_fields(call, map[i]); } } } @@ -2758,6 +2847,7 @@ int trace_add_event_call(struct trace_event_call *call) mutex_unlock(&trace_types_lock); return ret; } +EXPORT_SYMBOL_GPL(trace_add_event_call); /* * Must be called under locking of trace_types_lock, event_mutex and @@ -2819,6 +2909,7 @@ int trace_remove_event_call(struct trace_event_call *call) return ret; } +EXPORT_SYMBOL_GPL(trace_remove_event_call); #define for_each_event(event, start, end) \ for (event = start; \ @@ -2853,6 +2944,7 @@ static void trace_module_add_events(struct module *mod) static void trace_module_remove_events(struct module *mod) { struct trace_event_call *call, *p; + struct module_string *modstr, *m; down_write(&trace_event_sem); list_for_each_entry_safe(call, p, &ftrace_events, list) { @@ -2861,6 +2953,14 @@ static void trace_module_remove_events(struct module *mod) if (call->module == mod) __trace_remove_event_call(call); } + /* Check for any strings allocade for this module */ + list_for_each_entry_safe(modstr, m, &module_strings, next) { + if (modstr->module != mod) + continue; + list_del(&modstr->next); + kfree(modstr->str); + kfree(modstr); + } up_write(&trace_event_sem); /* @@ -3461,10 +3561,8 @@ create_event_toplevel_files(struct dentry *parent, struct trace_array *tr) entry = trace_create_file("enable", TRACE_MODE_WRITE, d_events, tr, &ftrace_tr_enable_fops); - if (!entry) { - pr_warn("Could not create tracefs 'enable' entry\n"); + if (!entry) return -ENOMEM; - } /* There are not as crucial, just warn if they are not created */ @@ -3480,17 +3578,13 @@ create_event_toplevel_files(struct dentry *parent, struct trace_array *tr) pr_warn("Could not create tracefs 'set_event_notrace_pid' entry\n"); /* ring buffer internal formats */ - entry = trace_create_file("header_page", TRACE_MODE_READ, d_events, + trace_create_file("header_page", TRACE_MODE_READ, d_events, ring_buffer_print_page_header, &ftrace_show_header_fops); - if (!entry) - pr_warn("Could not create tracefs 'header_page' entry\n"); - entry = trace_create_file("header_event", TRACE_MODE_READ, d_events, + trace_create_file("header_event", TRACE_MODE_READ, d_events, ring_buffer_print_entry_header, &ftrace_show_header_fops); - if (!entry) - pr_warn("Could not create tracefs 'header_event' entry\n"); tr->event_dir = d_events; diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index c9124038b140..b458a9afa2c0 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -5,6 +5,7 @@ * Copyright (C) 2009 Tom Zanussi <tzanussi@gmail.com> */ +#include <linux/uaccess.h> #include <linux/module.h> #include <linux/ctype.h> #include <linux/mutex.h> @@ -654,6 +655,52 @@ DEFINE_EQUALITY_PRED(32); DEFINE_EQUALITY_PRED(16); DEFINE_EQUALITY_PRED(8); +/* user space strings temp buffer */ +#define USTRING_BUF_SIZE 1024 + +struct ustring_buffer { + char buffer[USTRING_BUF_SIZE]; +}; + +static __percpu struct ustring_buffer *ustring_per_cpu; + +static __always_inline char *test_string(char *str) +{ + struct ustring_buffer *ubuf; + char *kstr; + + if (!ustring_per_cpu) + return NULL; + + ubuf = this_cpu_ptr(ustring_per_cpu); + kstr = ubuf->buffer; + + /* For safety, do not trust the string pointer */ + if (!strncpy_from_kernel_nofault(kstr, str, USTRING_BUF_SIZE)) + return NULL; + return kstr; +} + +static __always_inline char *test_ustring(char *str) +{ + struct ustring_buffer *ubuf; + char __user *ustr; + char *kstr; + + if (!ustring_per_cpu) + return NULL; + + ubuf = this_cpu_ptr(ustring_per_cpu); + kstr = ubuf->buffer; + + /* user space address? */ + ustr = (char __user *)str; + if (!strncpy_from_user_nofault(kstr, ustr, USTRING_BUF_SIZE)) + return NULL; + + return kstr; +} + /* Filter predicate for fixed sized arrays of characters */ static int filter_pred_string(struct filter_pred *pred, void *event) { @@ -667,19 +714,43 @@ static int filter_pred_string(struct filter_pred *pred, void *event) return match; } -/* Filter predicate for char * pointers */ -static int filter_pred_pchar(struct filter_pred *pred, void *event) +static __always_inline int filter_pchar(struct filter_pred *pred, char *str) { - char **addr = (char **)(event + pred->offset); int cmp, match; - int len = strlen(*addr) + 1; /* including tailing '\0' */ + int len; - cmp = pred->regex.match(*addr, &pred->regex, len); + len = strlen(str) + 1; /* including tailing '\0' */ + cmp = pred->regex.match(str, &pred->regex, len); match = cmp ^ pred->not; return match; } +/* Filter predicate for char * pointers */ +static int filter_pred_pchar(struct filter_pred *pred, void *event) +{ + char **addr = (char **)(event + pred->offset); + char *str; + + str = test_string(*addr); + if (!str) + return 0; + + return filter_pchar(pred, str); +} + +/* Filter predicate for char * pointers in user space*/ +static int filter_pred_pchar_user(struct filter_pred *pred, void *event) +{ + char **addr = (char **)(event + pred->offset); + char *str; + + str = test_ustring(*addr); + if (!str) + return 0; + + return filter_pchar(pred, str); +} /* * Filter predicate for dynamic sized arrays of characters. @@ -706,6 +777,29 @@ static int filter_pred_strloc(struct filter_pred *pred, void *event) return match; } +/* + * Filter predicate for relative dynamic sized arrays of characters. + * These are implemented through a list of strings at the end + * of the entry as same as dynamic string. + * The difference is that the relative one records the location offset + * from the field itself, not the event entry. + */ +static int filter_pred_strrelloc(struct filter_pred *pred, void *event) +{ + u32 *item = (u32 *)(event + pred->offset); + u32 str_item = *item; + int str_loc = str_item & 0xffff; + int str_len = str_item >> 16; + char *addr = (char *)(&item[1]) + str_loc; + int cmp, match; + + cmp = pred->regex.match(addr, &pred->regex, str_len); + + match = cmp ^ pred->not; + + return match; +} + /* Filter predicate for CPUs. */ static int filter_pred_cpu(struct filter_pred *pred, void *event) { @@ -756,7 +850,7 @@ static int filter_pred_none(struct filter_pred *pred, void *event) * * Note: * - @str might not be NULL-terminated if it's of type DYN_STRING - * or STATIC_STRING, unless @len is zero. + * RDYN_STRING, or STATIC_STRING, unless @len is zero. */ static int regex_match_full(char *str, struct regex *r, int len) @@ -1083,6 +1177,9 @@ int filter_assign_type(const char *type) if (strstr(type, "__data_loc") && strstr(type, "char")) return FILTER_DYN_STRING; + if (strstr(type, "__rel_loc") && strstr(type, "char")) + return FILTER_RDYN_STRING; + if (strchr(type, '[') && strstr(type, "char")) return FILTER_STATIC_STRING; @@ -1158,6 +1255,7 @@ static int parse_pred(const char *str, void *data, struct filter_pred *pred = NULL; char num_buf[24]; /* Big enough to hold an address */ char *field_name; + bool ustring = false; char q; u64 val; int len; @@ -1192,6 +1290,12 @@ static int parse_pred(const char *str, void *data, return -EINVAL; } + /* See if the field is a user space string */ + if ((len = str_has_prefix(str + i, ".ustring"))) { + ustring = true; + i += len; + } + while (isspace(str[i])) i++; @@ -1318,10 +1422,24 @@ static int parse_pred(const char *str, void *data, pred->fn = filter_pred_string; pred->regex.field_len = field->size; - } else if (field->filter_type == FILTER_DYN_STRING) + } else if (field->filter_type == FILTER_DYN_STRING) { pred->fn = filter_pred_strloc; - else - pred->fn = filter_pred_pchar; + } else if (field->filter_type == FILTER_RDYN_STRING) + pred->fn = filter_pred_strrelloc; + else { + + if (!ustring_per_cpu) { + /* Once allocated, keep it around for good */ + ustring_per_cpu = alloc_percpu(struct ustring_buffer); + if (!ustring_per_cpu) + goto err_mem; + } + + if (ustring) + pred->fn = filter_pred_pchar_user; + else + pred->fn = filter_pred_pchar; + } /* go past the last quote */ i++; @@ -1387,6 +1505,9 @@ static int parse_pred(const char *str, void *data, err_free: kfree(pred); return -EINVAL; +err_mem: + kfree(pred); + return -ENOMEM; } enum { diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index 319f9c8ca7e7..44db5ba9cabb 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -217,6 +217,20 @@ static u64 hist_field_dynstring(struct hist_field *hist_field, return (u64)(unsigned long)addr; } +static u64 hist_field_reldynstring(struct hist_field *hist_field, + struct tracing_map_elt *elt, + struct trace_buffer *buffer, + struct ring_buffer_event *rbe, + void *event) +{ + u32 *item = event + hist_field->field->offset; + u32 str_item = *item; + int str_loc = str_item & 0xffff; + char *addr = (char *)&item[1] + str_loc; + + return (u64)(unsigned long)addr; +} + static u64 hist_field_pstring(struct hist_field *hist_field, struct tracing_map_elt *elt, struct trace_buffer *buffer, @@ -713,11 +727,16 @@ static struct track_data *track_data_alloc(unsigned int key_len, return data; } -static char last_cmd[MAX_FILTER_STR_VAL]; +#define HIST_PREFIX "hist:" + +static char *last_cmd; static char last_cmd_loc[MAX_FILTER_STR_VAL]; static int errpos(char *str) { + if (!str || !last_cmd) + return 0; + return err_pos(last_cmd, str); } @@ -725,12 +744,22 @@ static void last_cmd_set(struct trace_event_file *file, char *str) { const char *system = NULL, *name = NULL; struct trace_event_call *call; + int len; if (!str) return; - strcpy(last_cmd, "hist:"); - strncat(last_cmd, str, MAX_FILTER_STR_VAL - 1 - sizeof("hist:")); + /* sizeof() contains the nul byte */ + len = sizeof(HIST_PREFIX) + strlen(str); + kfree(last_cmd); + last_cmd = kzalloc(len, GFP_KERNEL); + if (!last_cmd) + return; + + strcpy(last_cmd, HIST_PREFIX); + /* Again, sizeof() contains the nul byte */ + len -= sizeof(HIST_PREFIX); + strncat(last_cmd, str, len); if (file) { call = file->event_call; @@ -743,18 +772,22 @@ static void last_cmd_set(struct trace_event_file *file, char *str) } if (system) - snprintf(last_cmd_loc, MAX_FILTER_STR_VAL, "hist:%s:%s", system, name); + snprintf(last_cmd_loc, MAX_FILTER_STR_VAL, HIST_PREFIX "%s:%s", system, name); } -static void hist_err(struct trace_array *tr, u8 err_type, u8 err_pos) +static void hist_err(struct trace_array *tr, u8 err_type, u16 err_pos) { + if (!last_cmd) + return; + tracing_log_err(tr, last_cmd_loc, last_cmd, err_text, err_type, err_pos); } static void hist_err_clear(void) { - last_cmd[0] = '\0'; + if (last_cmd) + last_cmd[0] = '\0'; last_cmd_loc[0] = '\0'; } @@ -1956,8 +1989,10 @@ static struct hist_field *create_hist_field(struct hist_trigger_data *hist_data, if (field->filter_type == FILTER_STATIC_STRING) { hist_field->fn = hist_field_string; hist_field->size = field->size; - } else if (field->filter_type == FILTER_DYN_STRING) + } else if (field->filter_type == FILTER_DYN_STRING) { hist_field->fn = hist_field_dynstring; + } else if (field->filter_type == FILTER_RDYN_STRING) + hist_field->fn = hist_field_reldynstring; else hist_field->fn = hist_field_pstring; } else { @@ -2273,9 +2308,9 @@ parse_field(struct hist_trigger_data *hist_data, struct trace_event_file *file, /* * For backward compatibility, if field_name * was "cpu", then we treat this the same as - * common_cpu. + * common_cpu. This also works for "CPU". */ - if (strcmp(field_name, "cpu") == 0) { + if (field && field->filter_type == FILTER_CPU) { *flags |= HIST_FIELD_FL_CPU; } else { hist_err(tr, HIST_ERR_FIELD_NOT_FOUND, @@ -2487,6 +2522,8 @@ static struct hist_field *parse_unary(struct hist_trigger_data *hist_data, (HIST_FIELD_FL_TIMESTAMP | HIST_FIELD_FL_TIMESTAMP_USECS); expr->fn = hist_field_unary_minus; expr->operands[0] = operand1; + expr->size = operand1->size; + expr->is_signed = operand1->is_signed; expr->operator = FIELD_OP_UNARY_MINUS; expr->name = expr_str(expr, 0); expr->type = kstrdup_const(operand1->type, GFP_KERNEL); @@ -2703,6 +2740,7 @@ static struct hist_field *parse_expr(struct hist_trigger_data *hist_data, /* The operand sizes should be the same, so just pick one */ expr->size = operand1->size; + expr->is_signed = operand1->is_signed; expr->operator = field_op; expr->type = kstrdup_const(operand1->type, GFP_KERNEL); @@ -2745,9 +2783,9 @@ static char *find_trigger_filter(struct hist_trigger_data *hist_data, } static struct event_command trigger_hist_cmd; -static int event_hist_trigger_func(struct event_command *cmd_ops, - struct trace_event_file *file, - char *glob, char *cmd, char *param); +static int event_hist_trigger_parse(struct event_command *cmd_ops, + struct trace_event_file *file, + char *glob, char *cmd, char *param); static bool compatible_keys(struct hist_trigger_data *target_hist_data, struct hist_trigger_data *hist_data, @@ -2950,8 +2988,8 @@ create_field_var_hist(struct hist_trigger_data *target_hist_data, var_hist->hist_data = hist_data; /* Create the new histogram with our variable */ - ret = event_hist_trigger_func(&trigger_hist_cmd, file, - "", "hist", cmd); + ret = event_hist_trigger_parse(&trigger_hist_cmd, file, + "", "hist", cmd); if (ret) { kfree(cmd); kfree(var_hist->cmd); @@ -3919,6 +3957,7 @@ static int trace_action_create(struct hist_trigger_data *hist_data, var_ref_idx = find_var_ref_idx(hist_data, var_ref); if (WARN_ON(var_ref_idx < 0)) { + kfree(p); ret = var_ref_idx; goto err; } @@ -4812,7 +4851,7 @@ static int create_tracing_map_fields(struct hist_trigger_data *hist_data) if (hist_field->flags & HIST_FIELD_FL_STACKTRACE) cmp_fn = tracing_map_cmp_none; - else if (!field) + else if (!field || hist_field->flags & HIST_FIELD_FL_CPU) cmp_fn = tracing_map_cmp_num(hist_field->size, hist_field->is_signed); else if (is_string_field(field)) @@ -4961,7 +5000,8 @@ static inline void add_to_key(char *compound_key, void *key, struct ftrace_event_field *field; field = key_field->field; - if (field->filter_type == FILTER_DYN_STRING) + if (field->filter_type == FILTER_DYN_STRING || + field->filter_type == FILTER_RDYN_STRING) size = *(u32 *)(rec + field->offset) >> 16; else if (field->filter_type == FILTER_STATIC_STRING) size = field->size; @@ -5589,7 +5629,7 @@ static int event_hist_trigger_print(struct seq_file *m, bool have_var = false; unsigned int i; - seq_puts(m, "hist:"); + seq_puts(m, HIST_PREFIX); if (data->name) seq_printf(m, "%s:", data->name); @@ -5712,8 +5752,8 @@ static void unregister_field_var_hists(struct hist_trigger_data *hist_data) for (i = 0; i < hist_data->n_field_var_hists; i++) { file = hist_data->field_var_hists[i]->hist_data->event_file; cmd = hist_data->field_var_hists[i]->cmd; - ret = event_hist_trigger_func(&trigger_hist_cmd, file, - "!hist", "hist", cmd); + ret = event_hist_trigger_parse(&trigger_hist_cmd, file, + "!hist", "hist", cmd); WARN_ON_ONCE(ret < 0); } } @@ -5742,7 +5782,7 @@ static void event_hist_trigger_free(struct event_trigger_ops *ops, } static struct event_trigger_ops event_hist_trigger_ops = { - .func = event_hist_trigger, + .trigger = event_hist_trigger, .print = event_hist_trigger_print, .init = event_hist_trigger_init, .free = event_hist_trigger_free, @@ -5776,7 +5816,7 @@ static void event_hist_trigger_named_free(struct event_trigger_ops *ops, } static struct event_trigger_ops event_hist_trigger_named_ops = { - .func = event_hist_trigger, + .trigger = event_hist_trigger, .print = event_hist_trigger_print, .init = event_hist_trigger_named_init, .free = event_hist_trigger_named_free, @@ -5893,7 +5933,7 @@ static bool hist_trigger_match(struct event_trigger_data *data, return true; } -static int hist_register_trigger(char *glob, struct event_trigger_ops *ops, +static int hist_register_trigger(char *glob, struct event_trigger_data *data, struct trace_event_file *file) { @@ -6045,7 +6085,7 @@ static bool hist_trigger_check_refs(struct event_trigger_data *data, return false; } -static void hist_unregister_trigger(char *glob, struct event_trigger_ops *ops, +static void hist_unregister_trigger(char *glob, struct event_trigger_data *data, struct trace_event_file *file) { @@ -6129,9 +6169,9 @@ static void hist_unreg_all(struct trace_event_file *file) } } -static int event_hist_trigger_func(struct event_command *cmd_ops, - struct trace_event_file *file, - char *glob, char *cmd, char *param) +static int event_hist_trigger_parse(struct event_command *cmd_ops, + struct trace_event_file *file, + char *glob, char *cmd, char *param) { unsigned int hist_trigger_bits = TRACING_MAP_BITS_DEFAULT; struct event_trigger_data *trigger_data; @@ -6146,7 +6186,9 @@ static int event_hist_trigger_func(struct event_command *cmd_ops, lockdep_assert_held(&event_mutex); - if (glob && strlen(glob)) { + WARN_ON(!glob); + + if (strlen(glob)) { hist_err_clear(); last_cmd_set(file, param); } @@ -6179,7 +6221,7 @@ static int event_hist_trigger_func(struct event_command *cmd_ops, continue; } break; - } while (p); + } while (1); if (!p) param = NULL; @@ -6245,7 +6287,7 @@ static int event_hist_trigger_func(struct event_command *cmd_ops, goto out_free; } - cmd_ops->unreg(glob+1, trigger_ops, trigger_data, file); + cmd_ops->unreg(glob+1, trigger_data, file); se_name = trace_event_name(file->event_call); se = find_synth_event(se_name); if (se) @@ -6254,7 +6296,7 @@ static int event_hist_trigger_func(struct event_command *cmd_ops, goto out_free; } - ret = cmd_ops->reg(glob, trigger_ops, trigger_data, file); + ret = cmd_ops->reg(glob, trigger_data, file); /* * The above returns on success the # of triggers registered, * but if it didn't register any it returns zero. Consider no @@ -6297,7 +6339,7 @@ enable: return ret; out_unreg: - cmd_ops->unreg(glob+1, trigger_ops, trigger_data, file); + cmd_ops->unreg(glob+1, trigger_data, file); out_free: if (cmd_ops->set_filter) cmd_ops->set_filter(NULL, trigger_data, NULL); @@ -6314,7 +6356,7 @@ static struct event_command trigger_hist_cmd = { .name = "hist", .trigger_type = ETT_EVENT_HIST, .flags = EVENT_CMD_FL_NEEDS_REC, - .func = event_hist_trigger_func, + .parse = event_hist_trigger_parse, .reg = hist_register_trigger, .unreg = hist_unregister_trigger, .unreg_all = hist_unreg_all, @@ -6366,28 +6408,28 @@ hist_enable_count_trigger(struct event_trigger_data *data, } static struct event_trigger_ops hist_enable_trigger_ops = { - .func = hist_enable_trigger, + .trigger = hist_enable_trigger, .print = event_enable_trigger_print, .init = event_trigger_init, .free = event_enable_trigger_free, }; static struct event_trigger_ops hist_enable_count_trigger_ops = { - .func = hist_enable_count_trigger, + .trigger = hist_enable_count_trigger, .print = event_enable_trigger_print, .init = event_trigger_init, .free = event_enable_trigger_free, }; static struct event_trigger_ops hist_disable_trigger_ops = { - .func = hist_enable_trigger, + .trigger = hist_enable_trigger, .print = event_enable_trigger_print, .init = event_trigger_init, .free = event_enable_trigger_free, }; static struct event_trigger_ops hist_disable_count_trigger_ops = { - .func = hist_enable_count_trigger, + .trigger = hist_enable_count_trigger, .print = event_enable_trigger_print, .init = event_trigger_init, .free = event_enable_trigger_free, @@ -6429,7 +6471,7 @@ static void hist_enable_unreg_all(struct trace_event_file *file) static struct event_command trigger_hist_enable_cmd = { .name = ENABLE_HIST_STR, .trigger_type = ETT_HIST_ENABLE, - .func = event_enable_trigger_func, + .parse = event_enable_trigger_parse, .reg = event_enable_register_trigger, .unreg = event_enable_unregister_trigger, .unreg_all = hist_enable_unreg_all, @@ -6440,7 +6482,7 @@ static struct event_command trigger_hist_enable_cmd = { static struct event_command trigger_hist_disable_cmd = { .name = DISABLE_HIST_STR, .trigger_type = ETT_HIST_ENABLE, - .func = event_enable_trigger_func, + .parse = event_enable_trigger_parse, .reg = event_enable_register_trigger, .unreg = event_enable_unregister_trigger, .unreg_all = hist_enable_unreg_all, diff --git a/kernel/trace/trace_events_inject.c b/kernel/trace/trace_events_inject.c index c188045c5f97..d6b4935a78c0 100644 --- a/kernel/trace/trace_events_inject.c +++ b/kernel/trace/trace_events_inject.c @@ -168,10 +168,14 @@ static void *trace_alloc_entry(struct trace_event_call *call, int *size) continue; if (field->filter_type == FILTER_STATIC_STRING) continue; - if (field->filter_type == FILTER_DYN_STRING) { + if (field->filter_type == FILTER_DYN_STRING || + field->filter_type == FILTER_RDYN_STRING) { u32 *str_item; int str_loc = entry_size & 0xffff; + if (field->filter_type == FILTER_RDYN_STRING) + str_loc -= field->offset + field->size; + str_item = (u32 *)(entry + field->offset); *str_item = str_loc; /* string length is 0. */ } else { @@ -214,7 +218,8 @@ static int parse_entry(char *str, struct trace_event_call *call, void **pentry) if (field->filter_type == FILTER_STATIC_STRING) { strlcpy(entry + field->offset, addr, field->size); - } else if (field->filter_type == FILTER_DYN_STRING) { + } else if (field->filter_type == FILTER_DYN_STRING || + field->filter_type == FILTER_RDYN_STRING) { int str_len = strlen(addr) + 1; int str_loc = entry_size & 0xffff; u32 *str_item; @@ -229,6 +234,8 @@ static int parse_entry(char *str, struct trace_event_call *call, void **pentry) strlcpy(entry + (entry_size - str_len), addr, str_len); str_item = (u32 *)(entry + field->offset); + if (field->filter_type == FILTER_RDYN_STRING) + str_loc -= field->offset + field->size; *str_item = (str_len << 16) | str_loc; } else { char **paddr; diff --git a/kernel/trace/trace_events_synth.c b/kernel/trace/trace_events_synth.c index ca9c13b2ecf4..5e8c07aef071 100644 --- a/kernel/trace/trace_events_synth.c +++ b/kernel/trace/trace_events_synth.c @@ -42,10 +42,13 @@ enum { ERRORS }; static const char *err_text[] = { ERRORS }; -static char last_cmd[MAX_FILTER_STR_VAL]; +static char *last_cmd; static int errpos(const char *str) { + if (!str || !last_cmd) + return 0; + return err_pos(last_cmd, str); } @@ -54,11 +57,16 @@ static void last_cmd_set(const char *str) if (!str) return; - strncpy(last_cmd, str, MAX_FILTER_STR_VAL - 1); + kfree(last_cmd); + + last_cmd = kstrdup(str, GFP_KERNEL); } -static void synth_err(u8 err_type, u8 err_pos) +static void synth_err(u8 err_type, u16 err_pos) { + if (!last_cmd) + return; + tracing_log_err(NULL, "synthetic_events", last_cmd, err_text, err_type, err_pos); } @@ -1979,7 +1987,7 @@ EXPORT_SYMBOL_GPL(synth_event_add_next_val); /** * synth_event_add_val - Add a named field's value to an open synth trace * @field_name: The name of the synthetic event field value to set - * @val: The value to set the next field to + * @val: The value to set the named field to * @trace_state: A pointer to object tracking the piecewise trace state * * Set the value of the named field in an event that's been opened by @@ -2054,6 +2062,13 @@ static int create_synth_event(const char *raw_command) last_cmd_set(raw_command); + name = raw_command; + + /* Don't try to process if not our system */ + if (name[0] != 's' || name[1] != ':') + return -ECANCELED; + name += 2; + p = strpbrk(raw_command, " \t"); if (!p) { synth_err(SYNTH_ERR_INVALID_CMD, 0); @@ -2062,12 +2077,6 @@ static int create_synth_event(const char *raw_command) fields = skip_spaces(p); - name = raw_command; - - if (name[0] != 's' || name[1] != ':') - return -ECANCELED; - name += 2; - /* This interface accepts group name prefix */ if (strchr(name, '/')) { len = str_has_prefix(name, SYNTH_SYSTEM "/"); diff --git a/kernel/trace/trace_events_trigger.c b/kernel/trace/trace_events_trigger.c index 3d5c07239a2a..7eb9d04f1c2e 100644 --- a/kernel/trace/trace_events_trigger.c +++ b/kernel/trace/trace_events_trigger.c @@ -68,7 +68,7 @@ event_triggers_call(struct trace_event_file *file, if (data->paused) continue; if (!rec) { - data->ops->func(data, buffer, rec, event); + data->ops->trigger(data, buffer, rec, event); continue; } filter = rcu_dereference_sched(data->filter); @@ -78,12 +78,26 @@ event_triggers_call(struct trace_event_file *file, tt |= data->cmd_ops->trigger_type; continue; } - data->ops->func(data, buffer, rec, event); + data->ops->trigger(data, buffer, rec, event); } return tt; } EXPORT_SYMBOL_GPL(event_triggers_call); +bool __trace_trigger_soft_disabled(struct trace_event_file *file) +{ + unsigned long eflags = file->flags; + + if (eflags & EVENT_FILE_FL_TRIGGER_MODE) + event_triggers_call(file, NULL, NULL, NULL); + if (eflags & EVENT_FILE_FL_SOFT_DISABLED) + return true; + if (eflags & EVENT_FILE_FL_PID_FILTER) + return trace_event_ignore_this_pid(file); + return false; +} +EXPORT_SYMBOL_GPL(__trace_trigger_soft_disabled); + /** * event_triggers_post_call - Call 'post_triggers' for a trace event * @file: The trace_event_file associated with the event @@ -106,7 +120,7 @@ event_triggers_post_call(struct trace_event_file *file, if (data->paused) continue; if (data->cmd_ops->trigger_type & tt) - data->ops->func(data, NULL, NULL, NULL); + data->ops->trigger(data, NULL, NULL, NULL); } } EXPORT_SYMBOL_GPL(event_triggers_post_call); @@ -245,7 +259,7 @@ int trigger_process_regex(struct trace_event_file *file, char *buff) mutex_lock(&trigger_cmd_mutex); list_for_each_entry(p, &trigger_commands, list) { if (strcmp(p->name, command) == 0) { - ret = p->func(p, file, buff, command, next); + ret = p->parse(p, file, buff, command, next); goto out_unlock; } } @@ -540,7 +554,6 @@ void update_cond_flag(struct trace_event_file *file) /** * register_trigger - Generic event_command @reg implementation * @glob: The raw string used to register the trigger - * @ops: The trigger ops associated with the trigger * @data: Trigger-specific data to associate with the trigger * @file: The trace_event_file associated with the event * @@ -551,7 +564,7 @@ void update_cond_flag(struct trace_event_file *file) * * Return: 0 on success, errno otherwise */ -static int register_trigger(char *glob, struct event_trigger_ops *ops, +static int register_trigger(char *glob, struct event_trigger_data *data, struct trace_event_file *file) { @@ -589,7 +602,6 @@ out: /** * unregister_trigger - Generic event_command @unreg implementation * @glob: The raw string used to register the trigger - * @ops: The trigger ops associated with the trigger * @test: Trigger-specific data used to find the trigger to remove * @file: The trace_event_file associated with the event * @@ -598,7 +610,7 @@ out: * Usually used directly as the @unreg method in event command * implementations. */ -static void unregister_trigger(char *glob, struct event_trigger_ops *ops, +static void unregister_trigger(char *glob, struct event_trigger_data *test, struct trace_event_file *file) { @@ -621,8 +633,350 @@ static void unregister_trigger(char *glob, struct event_trigger_ops *ops, data->ops->free(data->ops, data); } +/* + * Event trigger parsing helper functions. + * + * These functions help make it easier to write an event trigger + * parsing function i.e. the struct event_command.parse() callback + * function responsible for parsing and registering a trigger command + * written to the 'trigger' file. + * + * A trigger command (or just 'trigger' for short) takes the form: + * [trigger] [if filter] + * + * The struct event_command.parse() callback (and other struct + * event_command functions) refer to several components of a trigger + * command. Those same components are referenced by the event trigger + * parsing helper functions defined below. These components are: + * + * cmd - the trigger command name + * glob - the trigger command name optionally prefaced with '!' + * param_and_filter - text following cmd and ':' + * param - text following cmd and ':' and stripped of filter + * filter - the optional filter text following (and including) 'if' + * + * To illustrate the use of these componenents, here are some concrete + * examples. For the following triggers: + * + * echo 'traceon:5 if pid == 0' > trigger + * - 'traceon' is both cmd and glob + * - '5 if pid == 0' is the param_and_filter + * - '5' is the param + * - 'if pid == 0' is the filter + * + * echo 'enable_event:sys:event:n' > trigger + * - 'enable_event' is both cmd and glob + * - 'sys:event:n' is the param_and_filter + * - 'sys:event:n' is the param + * - there is no filter + * + * echo 'hist:keys=pid if prio > 50' > trigger + * - 'hist' is both cmd and glob + * - 'keys=pid if prio > 50' is the param_and_filter + * - 'keys=pid' is the param + * - 'if prio > 50' is the filter + * + * echo '!enable_event:sys:event:n' > trigger + * - 'enable_event' the cmd + * - '!enable_event' is the glob + * - 'sys:event:n' is the param_and_filter + * - 'sys:event:n' is the param + * - there is no filter + * + * echo 'traceoff' > trigger + * - 'traceoff' is both cmd and glob + * - there is no param_and_filter + * - there is no param + * - there is no filter + * + * There are a few different categories of event trigger covered by + * these helpers: + * + * - triggers that don't require a parameter e.g. traceon + * - triggers that do require a parameter e.g. enable_event and hist + * - triggers that though they may not require a param may support an + * optional 'n' param (n = number of times the trigger should fire) + * e.g.: traceon:5 or enable_event:sys:event:n + * - triggers that do not support an 'n' param e.g. hist + * + * These functions can be used or ignored as necessary - it all + * depends on the complexity of the trigger, and the granularity of + * the functions supported reflects the fact that some implementations + * may need to customize certain aspects of their implementations and + * won't need certain functions. For instance, the hist trigger + * implementation doesn't use event_trigger_separate_filter() because + * it has special requirements for handling the filter. + */ + +/** + * event_trigger_check_remove - check whether an event trigger specifies remove + * @glob: The trigger command string, with optional remove(!) operator + * + * The event trigger callback implementations pass in 'glob' as a + * parameter. This is the command name either with or without a + * remove(!) operator. This function simply parses the glob and + * determines whether the command corresponds to a trigger removal or + * a trigger addition. + * + * Return: true if this is a remove command, false otherwise + */ +bool event_trigger_check_remove(const char *glob) +{ + return (glob && glob[0] == '!') ? true : false; +} + +/** + * event_trigger_empty_param - check whether the param is empty + * @param: The trigger param string + * + * The event trigger callback implementations pass in 'param' as a + * parameter. This corresponds to the string following the command + * name minus the command name. This function can be called by a + * callback implementation for any command that requires a param; a + * callback that doesn't require a param can ignore it. + * + * Return: true if this is an empty param, false otherwise + */ +bool event_trigger_empty_param(const char *param) +{ + return !param; +} + +/** + * event_trigger_separate_filter - separate an event trigger from a filter + * @param: The param string containing trigger and possibly filter + * @trigger: outparam, will be filled with a pointer to the trigger + * @filter: outparam, will be filled with a pointer to the filter + * @param_required: Specifies whether or not the param string is required + * + * Given a param string of the form '[trigger] [if filter]', this + * function separates the filter from the trigger and returns the + * trigger in *trigger and the filter in *filter. Either the *trigger + * or the *filter may be set to NULL by this function - if not set to + * NULL, they will contain strings corresponding to the trigger and + * filter. + * + * There are two cases that need to be handled with respect to the + * passed-in param: either the param is required, or it is not + * required. If @param_required is set, and there's no param, it will + * return -EINVAL. If @param_required is not set and there's a param + * that starts with a number, that corresponds to the case of a + * trigger with :n (n = number of times the trigger should fire) and + * the parsing continues normally; otherwise the function just returns + * and assumes param just contains a filter and there's nothing else + * to do. + * + * Return: 0 on success, errno otherwise + */ +int event_trigger_separate_filter(char *param_and_filter, char **param, + char **filter, bool param_required) +{ + int ret = 0; + + *param = *filter = NULL; + + if (!param_and_filter) { + if (param_required) + ret = -EINVAL; + goto out; + } + + /* + * Here we check for an optional param. The only legal + * optional param is :n, and if that's the case, continue + * below. Otherwise we assume what's left is a filter and + * return it as the filter string for the caller to deal with. + */ + if (!param_required && param_and_filter && !isdigit(param_and_filter[0])) { + *filter = param_and_filter; + goto out; + } + + /* + * Separate the param from the filter (param [if filter]). + * Here we have either an optional :n param or a required + * param and an optional filter. + */ + *param = strsep(¶m_and_filter, " \t"); + + /* + * Here we have a filter, though it may be empty. + */ + if (param_and_filter) { + *filter = skip_spaces(param_and_filter); + if (!**filter) + *filter = NULL; + } +out: + return ret; +} + +/** + * event_trigger_alloc - allocate and init event_trigger_data for a trigger + * @cmd_ops: The event_command operations for the trigger + * @cmd: The cmd string + * @param: The param string + * @private_data: User data to associate with the event trigger + * + * Allocate an event_trigger_data instance and initialize it. The + * @cmd_ops are used along with the @cmd and @param to get the + * trigger_ops to assign to the event_trigger_data. @private_data can + * also be passed in and associated with the event_trigger_data. + * + * Use event_trigger_free() to free an event_trigger_data object. + * + * Return: The trigger_data object success, NULL otherwise + */ +struct event_trigger_data *event_trigger_alloc(struct event_command *cmd_ops, + char *cmd, + char *param, + void *private_data) +{ + struct event_trigger_data *trigger_data; + struct event_trigger_ops *trigger_ops; + + trigger_ops = cmd_ops->get_trigger_ops(cmd, param); + + trigger_data = kzalloc(sizeof(*trigger_data), GFP_KERNEL); + if (!trigger_data) + return NULL; + + trigger_data->count = -1; + trigger_data->ops = trigger_ops; + trigger_data->cmd_ops = cmd_ops; + trigger_data->private_data = private_data; + + INIT_LIST_HEAD(&trigger_data->list); + INIT_LIST_HEAD(&trigger_data->named_list); + RCU_INIT_POINTER(trigger_data->filter, NULL); + + return trigger_data; +} + +/** + * event_trigger_parse_num - parse and return the number param for a trigger + * @param: The param string + * @trigger_data: The trigger_data for the trigger + * + * Parse the :n (n = number of times the trigger should fire) param + * and set the count variable in the trigger_data to the parsed count. + * + * Return: 0 on success, errno otherwise + */ +int event_trigger_parse_num(char *param, + struct event_trigger_data *trigger_data) +{ + char *number; + int ret = 0; + + if (param) { + number = strsep(¶m, ":"); + + if (!strlen(number)) + return -EINVAL; + + /* + * We use the callback data field (which is a pointer) + * as our counter. + */ + ret = kstrtoul(number, 0, &trigger_data->count); + } + + return ret; +} + +/** + * event_trigger_set_filter - set an event trigger's filter + * @cmd_ops: The event_command operations for the trigger + * @file: The event file for the trigger's event + * @param: The string containing the filter + * @trigger_data: The trigger_data for the trigger + * + * Set the filter for the trigger. If the filter is NULL, just return + * without error. + * + * Return: 0 on success, errno otherwise + */ +int event_trigger_set_filter(struct event_command *cmd_ops, + struct trace_event_file *file, + char *param, + struct event_trigger_data *trigger_data) +{ + if (param && cmd_ops->set_filter) + return cmd_ops->set_filter(param, trigger_data, file); + + return 0; +} + +/** + * event_trigger_reset_filter - reset an event trigger's filter + * @cmd_ops: The event_command operations for the trigger + * @trigger_data: The trigger_data for the trigger + * + * Reset the filter for the trigger to no filter. + */ +void event_trigger_reset_filter(struct event_command *cmd_ops, + struct event_trigger_data *trigger_data) +{ + if (cmd_ops->set_filter) + cmd_ops->set_filter(NULL, trigger_data, NULL); +} + /** - * event_trigger_callback - Generic event_command @func implementation + * event_trigger_register - register an event trigger + * @cmd_ops: The event_command operations for the trigger + * @file: The event file for the trigger's event + * @glob: The trigger command string, with optional remove(!) operator + * @cmd: The cmd string + * @param: The param string + * @trigger_data: The trigger_data for the trigger + * @n_registered: optional outparam, the number of triggers registered + * + * Register an event trigger. The @cmd_ops are used to call the + * cmd_ops->reg() function which actually does the registration. The + * cmd_ops->reg() function returns the number of triggers registered, + * which is assigned to n_registered, if n_registered is non-NULL. + * + * Return: 0 on success, errno otherwise + */ +int event_trigger_register(struct event_command *cmd_ops, + struct trace_event_file *file, + char *glob, + char *cmd, + char *param, + struct event_trigger_data *trigger_data, + int *n_registered) +{ + int ret; + + if (n_registered) + *n_registered = 0; + + ret = cmd_ops->reg(glob, trigger_data, file); + /* + * The above returns on success the # of functions enabled, + * but if it didn't find any functions it returns zero. + * Consider no functions a failure too. + */ + if (!ret) { + cmd_ops->unreg(glob, trigger_data, file); + ret = -ENOENT; + } else if (ret > 0) { + if (n_registered) + *n_registered = ret; + /* Just return zero, not the number of enabled functions */ + ret = 0; + } + + return ret; +} + +/* + * End event trigger parsing helper functions. + */ + +/** + * event_trigger_parse - Generic event_command @parse implementation * @cmd_ops: The command ops, used for trigger registration * @file: The trace_event_file associated with the event * @glob: The raw string used to register the trigger @@ -632,15 +986,15 @@ static void unregister_trigger(char *glob, struct event_trigger_ops *ops, * Common implementation for event command parsing and trigger * instantiation. * - * Usually used directly as the @func method in event command + * Usually used directly as the @parse method in event command * implementations. * * Return: 0 on success, errno otherwise */ static int -event_trigger_callback(struct event_command *cmd_ops, - struct trace_event_file *file, - char *glob, char *cmd, char *param) +event_trigger_parse(struct event_command *cmd_ops, + struct trace_event_file *file, + char *glob, char *cmd, char *param) { struct event_trigger_data *trigger_data; struct event_trigger_ops *trigger_ops; @@ -673,7 +1027,7 @@ event_trigger_callback(struct event_command *cmd_ops, INIT_LIST_HEAD(&trigger_data->named_list); if (glob[0] == '!') { - cmd_ops->unreg(glob+1, trigger_ops, trigger_data, file); + cmd_ops->unreg(glob+1, trigger_data, file); kfree(trigger_data); ret = 0; goto out; @@ -708,14 +1062,14 @@ event_trigger_callback(struct event_command *cmd_ops, out_reg: /* Up the trigger_data count to make sure reg doesn't free it on failure */ event_trigger_init(trigger_ops, trigger_data); - ret = cmd_ops->reg(glob, trigger_ops, trigger_data, file); + ret = cmd_ops->reg(glob, trigger_data, file); /* * The above returns on success the # of functions enabled, * but if it didn't find any functions it returns zero. * Consider no functions a failure too. */ if (!ret) { - cmd_ops->unreg(glob, trigger_ops, trigger_data, file); + cmd_ops->unreg(glob, trigger_data, file); ret = -ENOENT; } else if (ret > 0) ret = 0; @@ -955,6 +1309,16 @@ traceon_trigger(struct event_trigger_data *data, struct trace_buffer *buffer, void *rec, struct ring_buffer_event *event) { + struct trace_event_file *file = data->private_data; + + if (file) { + if (tracer_tracing_is_on(file->tr)) + return; + + tracer_tracing_on(file->tr); + return; + } + if (tracing_is_on()) return; @@ -966,8 +1330,15 @@ traceon_count_trigger(struct event_trigger_data *data, struct trace_buffer *buffer, void *rec, struct ring_buffer_event *event) { - if (tracing_is_on()) - return; + struct trace_event_file *file = data->private_data; + + if (file) { + if (tracer_tracing_is_on(file->tr)) + return; + } else { + if (tracing_is_on()) + return; + } if (!data->count) return; @@ -975,7 +1346,10 @@ traceon_count_trigger(struct event_trigger_data *data, if (data->count != -1) (data->count)--; - tracing_on(); + if (file) + tracer_tracing_on(file->tr); + else + tracing_on(); } static void @@ -983,6 +1357,16 @@ traceoff_trigger(struct event_trigger_data *data, struct trace_buffer *buffer, void *rec, struct ring_buffer_event *event) { + struct trace_event_file *file = data->private_data; + + if (file) { + if (!tracer_tracing_is_on(file->tr)) + return; + + tracer_tracing_off(file->tr); + return; + } + if (!tracing_is_on()) return; @@ -994,8 +1378,15 @@ traceoff_count_trigger(struct event_trigger_data *data, struct trace_buffer *buffer, void *rec, struct ring_buffer_event *event) { - if (!tracing_is_on()) - return; + struct trace_event_file *file = data->private_data; + + if (file) { + if (!tracer_tracing_is_on(file->tr)) + return; + } else { + if (!tracing_is_on()) + return; + } if (!data->count) return; @@ -1003,7 +1394,10 @@ traceoff_count_trigger(struct event_trigger_data *data, if (data->count != -1) (data->count)--; - tracing_off(); + if (file) + tracer_tracing_off(file->tr); + else + tracing_off(); } static int @@ -1023,28 +1417,28 @@ traceoff_trigger_print(struct seq_file *m, struct event_trigger_ops *ops, } static struct event_trigger_ops traceon_trigger_ops = { - .func = traceon_trigger, + .trigger = traceon_trigger, .print = traceon_trigger_print, .init = event_trigger_init, .free = event_trigger_free, }; static struct event_trigger_ops traceon_count_trigger_ops = { - .func = traceon_count_trigger, + .trigger = traceon_count_trigger, .print = traceon_trigger_print, .init = event_trigger_init, .free = event_trigger_free, }; static struct event_trigger_ops traceoff_trigger_ops = { - .func = traceoff_trigger, + .trigger = traceoff_trigger, .print = traceoff_trigger_print, .init = event_trigger_init, .free = event_trigger_free, }; static struct event_trigger_ops traceoff_count_trigger_ops = { - .func = traceoff_count_trigger, + .trigger = traceoff_count_trigger, .print = traceoff_trigger_print, .init = event_trigger_init, .free = event_trigger_free, @@ -1069,7 +1463,7 @@ onoff_get_trigger_ops(char *cmd, char *param) static struct event_command trigger_traceon_cmd = { .name = "traceon", .trigger_type = ETT_TRACE_ONOFF, - .func = event_trigger_callback, + .parse = event_trigger_parse, .reg = register_trigger, .unreg = unregister_trigger, .get_trigger_ops = onoff_get_trigger_ops, @@ -1080,7 +1474,7 @@ static struct event_command trigger_traceoff_cmd = { .name = "traceoff", .trigger_type = ETT_TRACE_ONOFF, .flags = EVENT_CMD_FL_POST_TRIGGER, - .func = event_trigger_callback, + .parse = event_trigger_parse, .reg = register_trigger, .unreg = unregister_trigger, .get_trigger_ops = onoff_get_trigger_ops, @@ -1116,14 +1510,14 @@ snapshot_count_trigger(struct event_trigger_data *data, } static int -register_snapshot_trigger(char *glob, struct event_trigger_ops *ops, +register_snapshot_trigger(char *glob, struct event_trigger_data *data, struct trace_event_file *file) { if (tracing_alloc_snapshot_instance(file->tr) != 0) return 0; - return register_trigger(glob, ops, data, file); + return register_trigger(glob, data, file); } static int @@ -1135,14 +1529,14 @@ snapshot_trigger_print(struct seq_file *m, struct event_trigger_ops *ops, } static struct event_trigger_ops snapshot_trigger_ops = { - .func = snapshot_trigger, + .trigger = snapshot_trigger, .print = snapshot_trigger_print, .init = event_trigger_init, .free = event_trigger_free, }; static struct event_trigger_ops snapshot_count_trigger_ops = { - .func = snapshot_count_trigger, + .trigger = snapshot_count_trigger, .print = snapshot_trigger_print, .init = event_trigger_init, .free = event_trigger_free, @@ -1157,7 +1551,7 @@ snapshot_get_trigger_ops(char *cmd, char *param) static struct event_command trigger_snapshot_cmd = { .name = "snapshot", .trigger_type = ETT_SNAPSHOT, - .func = event_trigger_callback, + .parse = event_trigger_parse, .reg = register_snapshot_trigger, .unreg = unregister_trigger, .get_trigger_ops = snapshot_get_trigger_ops, @@ -1200,7 +1594,12 @@ stacktrace_trigger(struct event_trigger_data *data, struct trace_buffer *buffer, void *rec, struct ring_buffer_event *event) { - trace_dump_stack(STACK_SKIP); + struct trace_event_file *file = data->private_data; + + if (file) + __trace_stack(file->tr, tracing_gen_ctx(), STACK_SKIP); + else + trace_dump_stack(STACK_SKIP); } static void @@ -1226,14 +1625,14 @@ stacktrace_trigger_print(struct seq_file *m, struct event_trigger_ops *ops, } static struct event_trigger_ops stacktrace_trigger_ops = { - .func = stacktrace_trigger, + .trigger = stacktrace_trigger, .print = stacktrace_trigger_print, .init = event_trigger_init, .free = event_trigger_free, }; static struct event_trigger_ops stacktrace_count_trigger_ops = { - .func = stacktrace_count_trigger, + .trigger = stacktrace_count_trigger, .print = stacktrace_trigger_print, .init = event_trigger_init, .free = event_trigger_free, @@ -1249,7 +1648,7 @@ static struct event_command trigger_stacktrace_cmd = { .name = "stacktrace", .trigger_type = ETT_STACKTRACE, .flags = EVENT_CMD_FL_POST_TRIGGER, - .func = event_trigger_callback, + .parse = event_trigger_parse, .reg = register_trigger, .unreg = unregister_trigger, .get_trigger_ops = stacktrace_get_trigger_ops, @@ -1353,36 +1752,36 @@ void event_enable_trigger_free(struct event_trigger_ops *ops, } static struct event_trigger_ops event_enable_trigger_ops = { - .func = event_enable_trigger, + .trigger = event_enable_trigger, .print = event_enable_trigger_print, .init = event_trigger_init, .free = event_enable_trigger_free, }; static struct event_trigger_ops event_enable_count_trigger_ops = { - .func = event_enable_count_trigger, + .trigger = event_enable_count_trigger, .print = event_enable_trigger_print, .init = event_trigger_init, .free = event_enable_trigger_free, }; static struct event_trigger_ops event_disable_trigger_ops = { - .func = event_enable_trigger, + .trigger = event_enable_trigger, .print = event_enable_trigger_print, .init = event_trigger_init, .free = event_enable_trigger_free, }; static struct event_trigger_ops event_disable_count_trigger_ops = { - .func = event_enable_count_trigger, + .trigger = event_enable_count_trigger, .print = event_enable_trigger_print, .init = event_trigger_init, .free = event_enable_trigger_free, }; -int event_enable_trigger_func(struct event_command *cmd_ops, - struct trace_event_file *file, - char *glob, char *cmd, char *param) +int event_enable_trigger_parse(struct event_command *cmd_ops, + struct trace_event_file *file, + char *glob, char *cmd, char *param) { struct trace_event_file *event_enable_file; struct enable_trigger_data *enable_data; @@ -1455,7 +1854,7 @@ int event_enable_trigger_func(struct event_command *cmd_ops, trigger_data->private_data = enable_data; if (glob[0] == '!') { - cmd_ops->unreg(glob+1, trigger_ops, trigger_data, file); + cmd_ops->unreg(glob+1, trigger_data, file); kfree(trigger_data); kfree(enable_data); ret = 0; @@ -1502,7 +1901,7 @@ int event_enable_trigger_func(struct event_command *cmd_ops, ret = trace_event_enable_disable(event_enable_file, 1, 1); if (ret < 0) goto out_put; - ret = cmd_ops->reg(glob, trigger_ops, trigger_data, file); + ret = cmd_ops->reg(glob, trigger_data, file); /* * The above returns on success the # of functions enabled, * but if it didn't find any functions it returns zero. @@ -1532,7 +1931,6 @@ int event_enable_trigger_func(struct event_command *cmd_ops, } int event_enable_register_trigger(char *glob, - struct event_trigger_ops *ops, struct event_trigger_data *data, struct trace_event_file *file) { @@ -1574,7 +1972,6 @@ out: } void event_enable_unregister_trigger(char *glob, - struct event_trigger_ops *ops, struct event_trigger_data *test, struct trace_event_file *file) { @@ -1628,7 +2025,7 @@ event_enable_get_trigger_ops(char *cmd, char *param) static struct event_command trigger_enable_cmd = { .name = ENABLE_EVENT_STR, .trigger_type = ETT_EVENT_ENABLE, - .func = event_enable_trigger_func, + .parse = event_enable_trigger_parse, .reg = event_enable_register_trigger, .unreg = event_enable_unregister_trigger, .get_trigger_ops = event_enable_get_trigger_ops, @@ -1638,7 +2035,7 @@ static struct event_command trigger_enable_cmd = { static struct event_command trigger_disable_cmd = { .name = DISABLE_EVENT_STR, .trigger_type = ETT_EVENT_ENABLE, - .func = event_enable_trigger_func, + .parse = event_enable_trigger_parse, .reg = event_enable_register_trigger, .unreg = event_enable_unregister_trigger, .get_trigger_ops = event_enable_get_trigger_ops, diff --git a/kernel/trace/trace_events_user.c b/kernel/trace/trace_events_user.c new file mode 100644 index 000000000000..8b3d241a31c2 --- /dev/null +++ b/kernel/trace/trace_events_user.c @@ -0,0 +1,1690 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (c) 2021, Microsoft Corporation. + * + * Authors: + * Beau Belgrave <beaub@linux.microsoft.com> + */ + +#include <linux/bitmap.h> +#include <linux/cdev.h> +#include <linux/hashtable.h> +#include <linux/list.h> +#include <linux/io.h> +#include <linux/uio.h> +#include <linux/ioctl.h> +#include <linux/jhash.h> +#include <linux/trace_events.h> +#include <linux/tracefs.h> +#include <linux/types.h> +#include <linux/uaccess.h> +#include <uapi/linux/user_events.h> +#include "trace.h" +#include "trace_dynevent.h" + +#define USER_EVENTS_PREFIX_LEN (sizeof(USER_EVENTS_PREFIX)-1) + +#define FIELD_DEPTH_TYPE 0 +#define FIELD_DEPTH_NAME 1 +#define FIELD_DEPTH_SIZE 2 + +/* + * Limits how many trace_event calls user processes can create: + * Must be a power of two of PAGE_SIZE. + */ +#define MAX_PAGE_ORDER 0 +#define MAX_PAGES (1 << MAX_PAGE_ORDER) +#define MAX_EVENTS (MAX_PAGES * PAGE_SIZE) + +/* Limit how long of an event name plus args within the subsystem. */ +#define MAX_EVENT_DESC 512 +#define EVENT_NAME(user_event) ((user_event)->tracepoint.name) +#define MAX_FIELD_ARRAY_SIZE 1024 +#define MAX_FIELD_ARG_NAME 256 + +#define MAX_BPF_COPY_SIZE PAGE_SIZE +#define MAX_STACK_BPF_DATA 512 + +static char *register_page_data; + +static DEFINE_MUTEX(reg_mutex); +static DEFINE_HASHTABLE(register_table, 4); +static DECLARE_BITMAP(page_bitmap, MAX_EVENTS); + +/* + * Stores per-event properties, as users register events + * within a file a user_event might be created if it does not + * already exist. These are globally used and their lifetime + * is tied to the refcnt member. These cannot go away until the + * refcnt reaches zero. + */ +struct user_event { + struct tracepoint tracepoint; + struct trace_event_call call; + struct trace_event_class class; + struct dyn_event devent; + struct hlist_node node; + struct list_head fields; + struct list_head validators; + atomic_t refcnt; + int index; + int flags; + int min_size; +}; + +/* + * Stores per-file events references, as users register events + * within a file this structure is modified and freed via RCU. + * The lifetime of this struct is tied to the lifetime of the file. + * These are not shared and only accessible by the file that created it. + */ +struct user_event_refs { + struct rcu_head rcu; + int count; + struct user_event *events[]; +}; + +#define VALIDATOR_ENSURE_NULL (1 << 0) +#define VALIDATOR_REL (1 << 1) + +struct user_event_validator { + struct list_head link; + int offset; + int flags; +}; + +typedef void (*user_event_func_t) (struct user_event *user, struct iov_iter *i, + void *tpdata, bool *faulted); + +static int user_event_parse(char *name, char *args, char *flags, + struct user_event **newuser); + +static u32 user_event_key(char *name) +{ + return jhash(name, strlen(name), 0); +} + +static __always_inline __must_check +size_t copy_nofault(void *addr, size_t bytes, struct iov_iter *i) +{ + size_t ret; + + pagefault_disable(); + + ret = copy_from_iter_nocache(addr, bytes, i); + + pagefault_enable(); + + return ret; +} + +static struct list_head *user_event_get_fields(struct trace_event_call *call) +{ + struct user_event *user = (struct user_event *)call->data; + + return &user->fields; +} + +/* + * Parses a register command for user_events + * Format: event_name[:FLAG1[,FLAG2...]] [field1[;field2...]] + * + * Example event named 'test' with a 20 char 'msg' field with an unsigned int + * 'id' field after: + * test char[20] msg;unsigned int id + * + * NOTE: Offsets are from the user data perspective, they are not from the + * trace_entry/buffer perspective. We automatically add the common properties + * sizes to the offset for the user. + * + * Upon success user_event has its ref count increased by 1. + */ +static int user_event_parse_cmd(char *raw_command, struct user_event **newuser) +{ + char *name = raw_command; + char *args = strpbrk(name, " "); + char *flags; + + if (args) + *args++ = '\0'; + + flags = strpbrk(name, ":"); + + if (flags) + *flags++ = '\0'; + + return user_event_parse(name, args, flags, newuser); +} + +static int user_field_array_size(const char *type) +{ + const char *start = strchr(type, '['); + char val[8]; + char *bracket; + int size = 0; + + if (start == NULL) + return -EINVAL; + + if (strscpy(val, start + 1, sizeof(val)) <= 0) + return -EINVAL; + + bracket = strchr(val, ']'); + + if (!bracket) + return -EINVAL; + + *bracket = '\0'; + + if (kstrtouint(val, 0, &size)) + return -EINVAL; + + if (size > MAX_FIELD_ARRAY_SIZE) + return -EINVAL; + + return size; +} + +static int user_field_size(const char *type) +{ + /* long is not allowed from a user, since it's ambigious in size */ + if (strcmp(type, "s64") == 0) + return sizeof(s64); + if (strcmp(type, "u64") == 0) + return sizeof(u64); + if (strcmp(type, "s32") == 0) + return sizeof(s32); + if (strcmp(type, "u32") == 0) + return sizeof(u32); + if (strcmp(type, "int") == 0) + return sizeof(int); + if (strcmp(type, "unsigned int") == 0) + return sizeof(unsigned int); + if (strcmp(type, "s16") == 0) + return sizeof(s16); + if (strcmp(type, "u16") == 0) + return sizeof(u16); + if (strcmp(type, "short") == 0) + return sizeof(short); + if (strcmp(type, "unsigned short") == 0) + return sizeof(unsigned short); + if (strcmp(type, "s8") == 0) + return sizeof(s8); + if (strcmp(type, "u8") == 0) + return sizeof(u8); + if (strcmp(type, "char") == 0) + return sizeof(char); + if (strcmp(type, "unsigned char") == 0) + return sizeof(unsigned char); + if (str_has_prefix(type, "char[")) + return user_field_array_size(type); + if (str_has_prefix(type, "unsigned char[")) + return user_field_array_size(type); + if (str_has_prefix(type, "__data_loc ")) + return sizeof(u32); + if (str_has_prefix(type, "__rel_loc ")) + return sizeof(u32); + + /* Uknown basic type, error */ + return -EINVAL; +} + +static void user_event_destroy_validators(struct user_event *user) +{ + struct user_event_validator *validator, *next; + struct list_head *head = &user->validators; + + list_for_each_entry_safe(validator, next, head, link) { + list_del(&validator->link); + kfree(validator); + } +} + +static void user_event_destroy_fields(struct user_event *user) +{ + struct ftrace_event_field *field, *next; + struct list_head *head = &user->fields; + + list_for_each_entry_safe(field, next, head, link) { + list_del(&field->link); + kfree(field); + } +} + +static int user_event_add_field(struct user_event *user, const char *type, + const char *name, int offset, int size, + int is_signed, int filter_type) +{ + struct user_event_validator *validator; + struct ftrace_event_field *field; + int validator_flags = 0; + + field = kmalloc(sizeof(*field), GFP_KERNEL); + + if (!field) + return -ENOMEM; + + if (str_has_prefix(type, "__data_loc ")) + goto add_validator; + + if (str_has_prefix(type, "__rel_loc ")) { + validator_flags |= VALIDATOR_REL; + goto add_validator; + } + + goto add_field; + +add_validator: + if (strstr(type, "char") != 0) + validator_flags |= VALIDATOR_ENSURE_NULL; + + validator = kmalloc(sizeof(*validator), GFP_KERNEL); + + if (!validator) { + kfree(field); + return -ENOMEM; + } + + validator->flags = validator_flags; + validator->offset = offset; + + /* Want sequential access when validating */ + list_add_tail(&validator->link, &user->validators); + +add_field: + field->type = type; + field->name = name; + field->offset = offset; + field->size = size; + field->is_signed = is_signed; + field->filter_type = filter_type; + + list_add(&field->link, &user->fields); + + /* + * Min size from user writes that are required, this does not include + * the size of trace_entry (common fields). + */ + user->min_size = (offset + size) - sizeof(struct trace_entry); + + return 0; +} + +/* + * Parses the values of a field within the description + * Format: type name [size] + */ +static int user_event_parse_field(char *field, struct user_event *user, + u32 *offset) +{ + char *part, *type, *name; + u32 depth = 0, saved_offset = *offset; + int len, size = -EINVAL; + bool is_struct = false; + + field = skip_spaces(field); + + if (*field == '\0') + return 0; + + /* Handle types that have a space within */ + len = str_has_prefix(field, "unsigned "); + if (len) + goto skip_next; + + len = str_has_prefix(field, "struct "); + if (len) { + is_struct = true; + goto skip_next; + } + + len = str_has_prefix(field, "__data_loc unsigned "); + if (len) + goto skip_next; + + len = str_has_prefix(field, "__data_loc "); + if (len) + goto skip_next; + + len = str_has_prefix(field, "__rel_loc unsigned "); + if (len) + goto skip_next; + + len = str_has_prefix(field, "__rel_loc "); + if (len) + goto skip_next; + + goto parse; +skip_next: + type = field; + field = strpbrk(field + len, " "); + + if (field == NULL) + return -EINVAL; + + *field++ = '\0'; + depth++; +parse: + name = NULL; + + while ((part = strsep(&field, " ")) != NULL) { + switch (depth++) { + case FIELD_DEPTH_TYPE: + type = part; + break; + case FIELD_DEPTH_NAME: + name = part; + break; + case FIELD_DEPTH_SIZE: + if (!is_struct) + return -EINVAL; + + if (kstrtou32(part, 10, &size)) + return -EINVAL; + break; + default: + return -EINVAL; + } + } + + if (depth < FIELD_DEPTH_SIZE || !name) + return -EINVAL; + + if (depth == FIELD_DEPTH_SIZE) + size = user_field_size(type); + + if (size == 0) + return -EINVAL; + + if (size < 0) + return size; + + *offset = saved_offset + size; + + return user_event_add_field(user, type, name, saved_offset, size, + type[0] != 'u', FILTER_OTHER); +} + +static void user_event_parse_flags(struct user_event *user, char *flags) +{ + char *flag; + + if (flags == NULL) + return; + + while ((flag = strsep(&flags, ",")) != NULL) { + if (strcmp(flag, "BPF_ITER") == 0) + user->flags |= FLAG_BPF_ITER; + } +} + +static int user_event_parse_fields(struct user_event *user, char *args) +{ + char *field; + u32 offset = sizeof(struct trace_entry); + int ret = -EINVAL; + + if (args == NULL) + return 0; + + while ((field = strsep(&args, ";")) != NULL) { + ret = user_event_parse_field(field, user, &offset); + + if (ret) + break; + } + + return ret; +} + +static struct trace_event_fields user_event_fields_array[1]; + +static const char *user_field_format(const char *type) +{ + if (strcmp(type, "s64") == 0) + return "%lld"; + if (strcmp(type, "u64") == 0) + return "%llu"; + if (strcmp(type, "s32") == 0) + return "%d"; + if (strcmp(type, "u32") == 0) + return "%u"; + if (strcmp(type, "int") == 0) + return "%d"; + if (strcmp(type, "unsigned int") == 0) + return "%u"; + if (strcmp(type, "s16") == 0) + return "%d"; + if (strcmp(type, "u16") == 0) + return "%u"; + if (strcmp(type, "short") == 0) + return "%d"; + if (strcmp(type, "unsigned short") == 0) + return "%u"; + if (strcmp(type, "s8") == 0) + return "%d"; + if (strcmp(type, "u8") == 0) + return "%u"; + if (strcmp(type, "char") == 0) + return "%d"; + if (strcmp(type, "unsigned char") == 0) + return "%u"; + if (strstr(type, "char[") != 0) + return "%s"; + + /* Unknown, likely struct, allowed treat as 64-bit */ + return "%llu"; +} + +static bool user_field_is_dyn_string(const char *type, const char **str_func) +{ + if (str_has_prefix(type, "__data_loc ")) { + *str_func = "__get_str"; + goto check; + } + + if (str_has_prefix(type, "__rel_loc ")) { + *str_func = "__get_rel_str"; + goto check; + } + + return false; +check: + return strstr(type, "char") != 0; +} + +#define LEN_OR_ZERO (len ? len - pos : 0) +static int user_event_set_print_fmt(struct user_event *user, char *buf, int len) +{ + struct ftrace_event_field *field, *next; + struct list_head *head = &user->fields; + int pos = 0, depth = 0; + const char *str_func; + + pos += snprintf(buf + pos, LEN_OR_ZERO, "\""); + + list_for_each_entry_safe_reverse(field, next, head, link) { + if (depth != 0) + pos += snprintf(buf + pos, LEN_OR_ZERO, " "); + + pos += snprintf(buf + pos, LEN_OR_ZERO, "%s=%s", + field->name, user_field_format(field->type)); + + depth++; + } + + pos += snprintf(buf + pos, LEN_OR_ZERO, "\""); + + list_for_each_entry_safe_reverse(field, next, head, link) { + if (user_field_is_dyn_string(field->type, &str_func)) + pos += snprintf(buf + pos, LEN_OR_ZERO, + ", %s(%s)", str_func, field->name); + else + pos += snprintf(buf + pos, LEN_OR_ZERO, + ", REC->%s", field->name); + } + + return pos + 1; +} +#undef LEN_OR_ZERO + +static int user_event_create_print_fmt(struct user_event *user) +{ + char *print_fmt; + int len; + + len = user_event_set_print_fmt(user, NULL, 0); + + print_fmt = kmalloc(len, GFP_KERNEL); + + if (!print_fmt) + return -ENOMEM; + + user_event_set_print_fmt(user, print_fmt, len); + + user->call.print_fmt = print_fmt; + + return 0; +} + +static enum print_line_t user_event_print_trace(struct trace_iterator *iter, + int flags, + struct trace_event *event) +{ + /* Unsafe to try to decode user provided print_fmt, use hex */ + trace_print_hex_dump_seq(&iter->seq, "", DUMP_PREFIX_OFFSET, 16, + 1, iter->ent, iter->ent_size, true); + + return trace_handle_return(&iter->seq); +} + +static struct trace_event_functions user_event_funcs = { + .trace = user_event_print_trace, +}; + +static int user_event_set_call_visible(struct user_event *user, bool visible) +{ + int ret; + const struct cred *old_cred; + struct cred *cred; + + cred = prepare_creds(); + + if (!cred) + return -ENOMEM; + + /* + * While by default tracefs is locked down, systems can be configured + * to allow user_event files to be less locked down. The extreme case + * being "other" has read/write access to user_events_data/status. + * + * When not locked down, processes may not have have permissions to + * add/remove calls themselves to tracefs. We need to temporarily + * switch to root file permission to allow for this scenario. + */ + cred->fsuid = GLOBAL_ROOT_UID; + + old_cred = override_creds(cred); + + if (visible) + ret = trace_add_event_call(&user->call); + else + ret = trace_remove_event_call(&user->call); + + revert_creds(old_cred); + put_cred(cred); + + return ret; +} + +static int destroy_user_event(struct user_event *user) +{ + int ret = 0; + + /* Must destroy fields before call removal */ + user_event_destroy_fields(user); + + ret = user_event_set_call_visible(user, false); + + if (ret) + return ret; + + dyn_event_remove(&user->devent); + + register_page_data[user->index] = 0; + clear_bit(user->index, page_bitmap); + hash_del(&user->node); + + user_event_destroy_validators(user); + kfree(user->call.print_fmt); + kfree(EVENT_NAME(user)); + kfree(user); + + return ret; +} + +static struct user_event *find_user_event(char *name, u32 *outkey) +{ + struct user_event *user; + u32 key = user_event_key(name); + + *outkey = key; + + hash_for_each_possible(register_table, user, node, key) + if (!strcmp(EVENT_NAME(user), name)) { + atomic_inc(&user->refcnt); + return user; + } + + return NULL; +} + +static int user_event_validate(struct user_event *user, void *data, int len) +{ + struct list_head *head = &user->validators; + struct user_event_validator *validator; + void *pos, *end = data + len; + u32 loc, offset, size; + + list_for_each_entry(validator, head, link) { + pos = data + validator->offset; + + /* Already done min_size check, no bounds check here */ + loc = *(u32 *)pos; + offset = loc & 0xffff; + size = loc >> 16; + + if (likely(validator->flags & VALIDATOR_REL)) + pos += offset + sizeof(loc); + else + pos = data + offset; + + pos += size; + + if (unlikely(pos > end)) + return -EFAULT; + + if (likely(validator->flags & VALIDATOR_ENSURE_NULL)) + if (unlikely(*(char *)(pos - 1) != '\0')) + return -EFAULT; + } + + return 0; +} + +/* + * Writes the user supplied payload out to a trace file. + */ +static void user_event_ftrace(struct user_event *user, struct iov_iter *i, + void *tpdata, bool *faulted) +{ + struct trace_event_file *file; + struct trace_entry *entry; + struct trace_event_buffer event_buffer; + size_t size = sizeof(*entry) + i->count; + + file = (struct trace_event_file *)tpdata; + + if (!file || + !(file->flags & EVENT_FILE_FL_ENABLED) || + trace_trigger_soft_disabled(file)) + return; + + /* Allocates and fills trace_entry, + 1 of this is data payload */ + entry = trace_event_buffer_reserve(&event_buffer, file, size); + + if (unlikely(!entry)) + return; + + if (unlikely(!copy_nofault(entry + 1, i->count, i))) + goto discard; + + if (!list_empty(&user->validators) && + unlikely(user_event_validate(user, entry, size))) + goto discard; + + trace_event_buffer_commit(&event_buffer); + + return; +discard: + *faulted = true; + __trace_event_discard_commit(event_buffer.buffer, + event_buffer.event); +} + +#ifdef CONFIG_PERF_EVENTS +static void user_event_bpf(struct user_event *user, struct iov_iter *i) +{ + struct user_bpf_context context; + struct user_bpf_iter bpf_i; + char fast_data[MAX_STACK_BPF_DATA]; + void *temp = NULL; + + if ((user->flags & FLAG_BPF_ITER) && iter_is_iovec(i)) { + /* Raw iterator */ + context.data_type = USER_BPF_DATA_ITER; + context.data_len = i->count; + context.iter = &bpf_i; + + bpf_i.iov_offset = i->iov_offset; + bpf_i.iov = i->iov; + bpf_i.nr_segs = i->nr_segs; + } else if (i->nr_segs == 1 && iter_is_iovec(i)) { + /* Single buffer from user */ + context.data_type = USER_BPF_DATA_USER; + context.data_len = i->count; + context.udata = i->iov->iov_base + i->iov_offset; + } else { + /* Multi buffer from user */ + struct iov_iter copy = *i; + size_t copy_size = min_t(size_t, i->count, MAX_BPF_COPY_SIZE); + + context.data_type = USER_BPF_DATA_KERNEL; + context.kdata = fast_data; + + if (unlikely(copy_size > sizeof(fast_data))) { + temp = kmalloc(copy_size, GFP_NOWAIT); + + if (temp) + context.kdata = temp; + else + copy_size = sizeof(fast_data); + } + + context.data_len = copy_nofault(context.kdata, + copy_size, ©); + } + + trace_call_bpf(&user->call, &context); + + kfree(temp); +} + +/* + * Writes the user supplied payload out to perf ring buffer or eBPF program. + */ +static void user_event_perf(struct user_event *user, struct iov_iter *i, + void *tpdata, bool *faulted) +{ + struct hlist_head *perf_head; + + if (bpf_prog_array_valid(&user->call)) + user_event_bpf(user, i); + + perf_head = this_cpu_ptr(user->call.perf_events); + + if (perf_head && !hlist_empty(perf_head)) { + struct trace_entry *perf_entry; + struct pt_regs *regs; + size_t size = sizeof(*perf_entry) + i->count; + int context; + + perf_entry = perf_trace_buf_alloc(ALIGN(size, 8), + ®s, &context); + + if (unlikely(!perf_entry)) + return; + + perf_fetch_caller_regs(regs); + + if (unlikely(!copy_nofault(perf_entry + 1, i->count, i))) + goto discard; + + if (!list_empty(&user->validators) && + unlikely(user_event_validate(user, perf_entry, size))) + goto discard; + + perf_trace_buf_submit(perf_entry, size, context, + user->call.event.type, 1, regs, + perf_head, NULL); + + return; +discard: + *faulted = true; + perf_swevent_put_recursion_context(context); + } +} +#endif + +/* + * Update the register page that is shared between user processes. + */ +static void update_reg_page_for(struct user_event *user) +{ + struct tracepoint *tp = &user->tracepoint; + char status = 0; + + if (atomic_read(&tp->key.enabled) > 0) { + struct tracepoint_func *probe_func_ptr; + user_event_func_t probe_func; + + rcu_read_lock_sched(); + + probe_func_ptr = rcu_dereference_sched(tp->funcs); + + if (probe_func_ptr) { + do { + probe_func = probe_func_ptr->func; + + if (probe_func == user_event_ftrace) + status |= EVENT_STATUS_FTRACE; +#ifdef CONFIG_PERF_EVENTS + else if (probe_func == user_event_perf) + status |= EVENT_STATUS_PERF; +#endif + else + status |= EVENT_STATUS_OTHER; + } while ((++probe_func_ptr)->func); + } + + rcu_read_unlock_sched(); + } + + register_page_data[user->index] = status; +} + +/* + * Register callback for our events from tracing sub-systems. + */ +static int user_event_reg(struct trace_event_call *call, + enum trace_reg type, + void *data) +{ + struct user_event *user = (struct user_event *)call->data; + int ret = 0; + + if (!user) + return -ENOENT; + + switch (type) { + case TRACE_REG_REGISTER: + ret = tracepoint_probe_register(call->tp, + call->class->probe, + data); + if (!ret) + goto inc; + break; + + case TRACE_REG_UNREGISTER: + tracepoint_probe_unregister(call->tp, + call->class->probe, + data); + goto dec; + +#ifdef CONFIG_PERF_EVENTS + case TRACE_REG_PERF_REGISTER: + ret = tracepoint_probe_register(call->tp, + call->class->perf_probe, + data); + if (!ret) + goto inc; + break; + + case TRACE_REG_PERF_UNREGISTER: + tracepoint_probe_unregister(call->tp, + call->class->perf_probe, + data); + goto dec; + + case TRACE_REG_PERF_OPEN: + case TRACE_REG_PERF_CLOSE: + case TRACE_REG_PERF_ADD: + case TRACE_REG_PERF_DEL: + break; +#endif + } + + return ret; +inc: + atomic_inc(&user->refcnt); + update_reg_page_for(user); + return 0; +dec: + update_reg_page_for(user); + atomic_dec(&user->refcnt); + return 0; +} + +static int user_event_create(const char *raw_command) +{ + struct user_event *user; + char *name; + int ret; + + if (!str_has_prefix(raw_command, USER_EVENTS_PREFIX)) + return -ECANCELED; + + raw_command += USER_EVENTS_PREFIX_LEN; + raw_command = skip_spaces(raw_command); + + name = kstrdup(raw_command, GFP_KERNEL); + + if (!name) + return -ENOMEM; + + mutex_lock(®_mutex); + + ret = user_event_parse_cmd(name, &user); + + if (!ret) + atomic_dec(&user->refcnt); + + mutex_unlock(®_mutex); + + if (ret) + kfree(name); + + return ret; +} + +static int user_event_show(struct seq_file *m, struct dyn_event *ev) +{ + struct user_event *user = container_of(ev, struct user_event, devent); + struct ftrace_event_field *field, *next; + struct list_head *head; + int depth = 0; + + seq_printf(m, "%s%s", USER_EVENTS_PREFIX, EVENT_NAME(user)); + + head = trace_get_fields(&user->call); + + list_for_each_entry_safe_reverse(field, next, head, link) { + if (depth == 0) + seq_puts(m, " "); + else + seq_puts(m, "; "); + + seq_printf(m, "%s %s", field->type, field->name); + + if (str_has_prefix(field->type, "struct ")) + seq_printf(m, " %d", field->size); + + depth++; + } + + seq_puts(m, "\n"); + + return 0; +} + +static bool user_event_is_busy(struct dyn_event *ev) +{ + struct user_event *user = container_of(ev, struct user_event, devent); + + return atomic_read(&user->refcnt) != 0; +} + +static int user_event_free(struct dyn_event *ev) +{ + struct user_event *user = container_of(ev, struct user_event, devent); + + if (atomic_read(&user->refcnt) != 0) + return -EBUSY; + + return destroy_user_event(user); +} + +static bool user_field_match(struct ftrace_event_field *field, int argc, + const char **argv, int *iout) +{ + char *field_name, *arg_name; + int len, pos, i = *iout; + bool colon = false, match = false; + + if (i >= argc) + return false; + + len = MAX_FIELD_ARG_NAME; + field_name = kmalloc(len, GFP_KERNEL); + arg_name = kmalloc(len, GFP_KERNEL); + + if (!arg_name || !field_name) + goto out; + + pos = 0; + + for (; i < argc; ++i) { + if (i != *iout) + pos += snprintf(arg_name + pos, len - pos, " "); + + pos += snprintf(arg_name + pos, len - pos, argv[i]); + + if (strchr(argv[i], ';')) { + ++i; + colon = true; + break; + } + } + + pos = 0; + + pos += snprintf(field_name + pos, len - pos, field->type); + pos += snprintf(field_name + pos, len - pos, " "); + pos += snprintf(field_name + pos, len - pos, field->name); + + if (colon) + pos += snprintf(field_name + pos, len - pos, ";"); + + *iout = i; + + match = strcmp(arg_name, field_name) == 0; +out: + kfree(arg_name); + kfree(field_name); + + return match; +} + +static bool user_fields_match(struct user_event *user, int argc, + const char **argv) +{ + struct ftrace_event_field *field, *next; + struct list_head *head = &user->fields; + int i = 0; + + list_for_each_entry_safe_reverse(field, next, head, link) + if (!user_field_match(field, argc, argv, &i)) + return false; + + if (i != argc) + return false; + + return true; +} + +static bool user_event_match(const char *system, const char *event, + int argc, const char **argv, struct dyn_event *ev) +{ + struct user_event *user = container_of(ev, struct user_event, devent); + bool match; + + match = strcmp(EVENT_NAME(user), event) == 0 && + (!system || strcmp(system, USER_EVENTS_SYSTEM) == 0); + + if (match && argc > 0) + match = user_fields_match(user, argc, argv); + + return match; +} + +static struct dyn_event_operations user_event_dops = { + .create = user_event_create, + .show = user_event_show, + .is_busy = user_event_is_busy, + .free = user_event_free, + .match = user_event_match, +}; + +static int user_event_trace_register(struct user_event *user) +{ + int ret; + + ret = register_trace_event(&user->call.event); + + if (!ret) + return -ENODEV; + + ret = user_event_set_call_visible(user, true); + + if (ret) + unregister_trace_event(&user->call.event); + + return ret; +} + +/* + * Parses the event name, arguments and flags then registers if successful. + * The name buffer lifetime is owned by this method for success cases only. + * Upon success the returned user_event has its ref count increased by 1. + */ +static int user_event_parse(char *name, char *args, char *flags, + struct user_event **newuser) +{ + int ret; + int index; + u32 key; + struct user_event *user; + + /* Prevent dyn_event from racing */ + mutex_lock(&event_mutex); + user = find_user_event(name, &key); + mutex_unlock(&event_mutex); + + if (user) { + *newuser = user; + /* + * Name is allocated by caller, free it since it already exists. + * Caller only worries about failure cases for freeing. + */ + kfree(name); + return 0; + } + + index = find_first_zero_bit(page_bitmap, MAX_EVENTS); + + if (index == MAX_EVENTS) + return -EMFILE; + + user = kzalloc(sizeof(*user), GFP_KERNEL); + + if (!user) + return -ENOMEM; + + INIT_LIST_HEAD(&user->class.fields); + INIT_LIST_HEAD(&user->fields); + INIT_LIST_HEAD(&user->validators); + + user->tracepoint.name = name; + + user_event_parse_flags(user, flags); + + ret = user_event_parse_fields(user, args); + + if (ret) + goto put_user; + + ret = user_event_create_print_fmt(user); + + if (ret) + goto put_user; + + user->call.data = user; + user->call.class = &user->class; + user->call.name = name; + user->call.flags = TRACE_EVENT_FL_TRACEPOINT; + user->call.tp = &user->tracepoint; + user->call.event.funcs = &user_event_funcs; + + user->class.system = USER_EVENTS_SYSTEM; + user->class.fields_array = user_event_fields_array; + user->class.get_fields = user_event_get_fields; + user->class.reg = user_event_reg; + user->class.probe = user_event_ftrace; +#ifdef CONFIG_PERF_EVENTS + user->class.perf_probe = user_event_perf; +#endif + + mutex_lock(&event_mutex); + ret = user_event_trace_register(user); + mutex_unlock(&event_mutex); + + if (ret) + goto put_user; + + user->index = index; + + /* Ensure we track ref */ + atomic_inc(&user->refcnt); + + dyn_event_init(&user->devent, &user_event_dops); + dyn_event_add(&user->devent, &user->call); + set_bit(user->index, page_bitmap); + hash_add(register_table, &user->node, key); + + *newuser = user; + return 0; +put_user: + user_event_destroy_fields(user); + user_event_destroy_validators(user); + kfree(user); + return ret; +} + +/* + * Deletes a previously created event if it is no longer being used. + */ +static int delete_user_event(char *name) +{ + u32 key; + int ret; + struct user_event *user = find_user_event(name, &key); + + if (!user) + return -ENOENT; + + /* Ensure we are the last ref */ + if (atomic_read(&user->refcnt) != 1) { + ret = -EBUSY; + goto put_ref; + } + + ret = destroy_user_event(user); + + if (ret) + goto put_ref; + + return ret; +put_ref: + /* No longer have this ref */ + atomic_dec(&user->refcnt); + + return ret; +} + +/* + * Validates the user payload and writes via iterator. + */ +static ssize_t user_events_write_core(struct file *file, struct iov_iter *i) +{ + struct user_event_refs *refs; + struct user_event *user = NULL; + struct tracepoint *tp; + ssize_t ret = i->count; + int idx; + + if (unlikely(copy_from_iter(&idx, sizeof(idx), i) != sizeof(idx))) + return -EFAULT; + + rcu_read_lock_sched(); + + refs = rcu_dereference_sched(file->private_data); + + /* + * The refs->events array is protected by RCU, and new items may be + * added. But the user retrieved from indexing into the events array + * shall be immutable while the file is opened. + */ + if (likely(refs && idx < refs->count)) + user = refs->events[idx]; + + rcu_read_unlock_sched(); + + if (unlikely(user == NULL)) + return -ENOENT; + + if (unlikely(i->count < user->min_size)) + return -EINVAL; + + tp = &user->tracepoint; + + /* + * It's possible key.enabled disables after this check, however + * we don't mind if a few events are included in this condition. + */ + if (likely(atomic_read(&tp->key.enabled) > 0)) { + struct tracepoint_func *probe_func_ptr; + user_event_func_t probe_func; + struct iov_iter copy; + void *tpdata; + bool faulted; + + if (unlikely(fault_in_iov_iter_readable(i, i->count))) + return -EFAULT; + + faulted = false; + + rcu_read_lock_sched(); + + probe_func_ptr = rcu_dereference_sched(tp->funcs); + + if (probe_func_ptr) { + do { + copy = *i; + probe_func = probe_func_ptr->func; + tpdata = probe_func_ptr->data; + probe_func(user, ©, tpdata, &faulted); + } while ((++probe_func_ptr)->func); + } + + rcu_read_unlock_sched(); + + if (unlikely(faulted)) + return -EFAULT; + } + + return ret; +} + +static ssize_t user_events_write(struct file *file, const char __user *ubuf, + size_t count, loff_t *ppos) +{ + struct iovec iov; + struct iov_iter i; + + if (unlikely(*ppos != 0)) + return -EFAULT; + + if (unlikely(import_single_range(READ, (char *)ubuf, count, &iov, &i))) + return -EFAULT; + + return user_events_write_core(file, &i); +} + +static ssize_t user_events_write_iter(struct kiocb *kp, struct iov_iter *i) +{ + return user_events_write_core(kp->ki_filp, i); +} + +static int user_events_ref_add(struct file *file, struct user_event *user) +{ + struct user_event_refs *refs, *new_refs; + int i, size, count = 0; + + refs = rcu_dereference_protected(file->private_data, + lockdep_is_held(®_mutex)); + + if (refs) { + count = refs->count; + + for (i = 0; i < count; ++i) + if (refs->events[i] == user) + return i; + } + + size = struct_size(refs, events, count + 1); + + new_refs = kzalloc(size, GFP_KERNEL); + + if (!new_refs) + return -ENOMEM; + + new_refs->count = count + 1; + + for (i = 0; i < count; ++i) + new_refs->events[i] = refs->events[i]; + + new_refs->events[i] = user; + + atomic_inc(&user->refcnt); + + rcu_assign_pointer(file->private_data, new_refs); + + if (refs) + kfree_rcu(refs, rcu); + + return i; +} + +static long user_reg_get(struct user_reg __user *ureg, struct user_reg *kreg) +{ + u32 size; + long ret; + + ret = get_user(size, &ureg->size); + + if (ret) + return ret; + + if (size > PAGE_SIZE) + return -E2BIG; + + return copy_struct_from_user(kreg, sizeof(*kreg), ureg, size); +} + +/* + * Registers a user_event on behalf of a user process. + */ +static long user_events_ioctl_reg(struct file *file, unsigned long uarg) +{ + struct user_reg __user *ureg = (struct user_reg __user *)uarg; + struct user_reg reg; + struct user_event *user; + char *name; + long ret; + + ret = user_reg_get(ureg, ®); + + if (ret) + return ret; + + name = strndup_user((const char __user *)(uintptr_t)reg.name_args, + MAX_EVENT_DESC); + + if (IS_ERR(name)) { + ret = PTR_ERR(name); + return ret; + } + + ret = user_event_parse_cmd(name, &user); + + if (ret) { + kfree(name); + return ret; + } + + ret = user_events_ref_add(file, user); + + /* No longer need parse ref, ref_add either worked or not */ + atomic_dec(&user->refcnt); + + /* Positive number is index and valid */ + if (ret < 0) + return ret; + + put_user((u32)ret, &ureg->write_index); + put_user(user->index, &ureg->status_index); + + return 0; +} + +/* + * Deletes a user_event on behalf of a user process. + */ +static long user_events_ioctl_del(struct file *file, unsigned long uarg) +{ + void __user *ubuf = (void __user *)uarg; + char *name; + long ret; + + name = strndup_user(ubuf, MAX_EVENT_DESC); + + if (IS_ERR(name)) + return PTR_ERR(name); + + /* event_mutex prevents dyn_event from racing */ + mutex_lock(&event_mutex); + ret = delete_user_event(name); + mutex_unlock(&event_mutex); + + kfree(name); + + return ret; +} + +/* + * Handles the ioctl from user mode to register or alter operations. + */ +static long user_events_ioctl(struct file *file, unsigned int cmd, + unsigned long uarg) +{ + long ret = -ENOTTY; + + switch (cmd) { + case DIAG_IOCSREG: + mutex_lock(®_mutex); + ret = user_events_ioctl_reg(file, uarg); + mutex_unlock(®_mutex); + break; + + case DIAG_IOCSDEL: + mutex_lock(®_mutex); + ret = user_events_ioctl_del(file, uarg); + mutex_unlock(®_mutex); + break; + } + + return ret; +} + +/* + * Handles the final close of the file from user mode. + */ +static int user_events_release(struct inode *node, struct file *file) +{ + struct user_event_refs *refs; + struct user_event *user; + int i; + + /* + * Ensure refs cannot change under any situation by taking the + * register mutex during the final freeing of the references. + */ + mutex_lock(®_mutex); + + refs = file->private_data; + + if (!refs) + goto out; + + /* + * The lifetime of refs has reached an end, it's tied to this file. + * The underlying user_events are ref counted, and cannot be freed. + * After this decrement, the user_events may be freed elsewhere. + */ + for (i = 0; i < refs->count; ++i) { + user = refs->events[i]; + + if (user) + atomic_dec(&user->refcnt); + } +out: + file->private_data = NULL; + + mutex_unlock(®_mutex); + + kfree(refs); + + return 0; +} + +static const struct file_operations user_data_fops = { + .write = user_events_write, + .write_iter = user_events_write_iter, + .unlocked_ioctl = user_events_ioctl, + .release = user_events_release, +}; + +/* + * Maps the shared page into the user process for checking if event is enabled. + */ +static int user_status_mmap(struct file *file, struct vm_area_struct *vma) +{ + unsigned long size = vma->vm_end - vma->vm_start; + + if (size != MAX_EVENTS) + return -EINVAL; + + return remap_pfn_range(vma, vma->vm_start, + virt_to_phys(register_page_data) >> PAGE_SHIFT, + size, vm_get_page_prot(VM_READ)); +} + +static void *user_seq_start(struct seq_file *m, loff_t *pos) +{ + if (*pos) + return NULL; + + return (void *)1; +} + +static void *user_seq_next(struct seq_file *m, void *p, loff_t *pos) +{ + ++*pos; + return NULL; +} + +static void user_seq_stop(struct seq_file *m, void *p) +{ +} + +static int user_seq_show(struct seq_file *m, void *p) +{ + struct user_event *user; + char status; + int i, active = 0, busy = 0, flags; + + mutex_lock(®_mutex); + + hash_for_each(register_table, i, user, node) { + status = register_page_data[user->index]; + flags = user->flags; + + seq_printf(m, "%d:%s", user->index, EVENT_NAME(user)); + + if (flags != 0 || status != 0) + seq_puts(m, " #"); + + if (status != 0) { + seq_puts(m, " Used by"); + if (status & EVENT_STATUS_FTRACE) + seq_puts(m, " ftrace"); + if (status & EVENT_STATUS_PERF) + seq_puts(m, " perf"); + if (status & EVENT_STATUS_OTHER) + seq_puts(m, " other"); + busy++; + } + + if (flags & FLAG_BPF_ITER) + seq_puts(m, " FLAG:BPF_ITER"); + + seq_puts(m, "\n"); + active++; + } + + mutex_unlock(®_mutex); + + seq_puts(m, "\n"); + seq_printf(m, "Active: %d\n", active); + seq_printf(m, "Busy: %d\n", busy); + seq_printf(m, "Max: %ld\n", MAX_EVENTS); + + return 0; +} + +static const struct seq_operations user_seq_ops = { + .start = user_seq_start, + .next = user_seq_next, + .stop = user_seq_stop, + .show = user_seq_show, +}; + +static int user_status_open(struct inode *node, struct file *file) +{ + return seq_open(file, &user_seq_ops); +} + +static const struct file_operations user_status_fops = { + .open = user_status_open, + .mmap = user_status_mmap, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + +/* + * Creates a set of tracefs files to allow user mode interactions. + */ +static int create_user_tracefs(void) +{ + struct dentry *edata, *emmap; + + edata = tracefs_create_file("user_events_data", TRACE_MODE_WRITE, + NULL, NULL, &user_data_fops); + + if (!edata) { + pr_warn("Could not create tracefs 'user_events_data' entry\n"); + goto err; + } + + /* mmap with MAP_SHARED requires writable fd */ + emmap = tracefs_create_file("user_events_status", TRACE_MODE_WRITE, + NULL, NULL, &user_status_fops); + + if (!emmap) { + tracefs_remove(edata); + pr_warn("Could not create tracefs 'user_events_mmap' entry\n"); + goto err; + } + + return 0; +err: + return -ENODEV; +} + +static void set_page_reservations(bool set) +{ + int page; + + for (page = 0; page < MAX_PAGES; ++page) { + void *addr = register_page_data + (PAGE_SIZE * page); + + if (set) + SetPageReserved(virt_to_page(addr)); + else + ClearPageReserved(virt_to_page(addr)); + } +} + +static int __init trace_events_user_init(void) +{ + struct page *pages; + int ret; + + /* Zero all bits beside 0 (which is reserved for failures) */ + bitmap_zero(page_bitmap, MAX_EVENTS); + set_bit(0, page_bitmap); + + pages = alloc_pages(GFP_KERNEL | __GFP_ZERO, MAX_PAGE_ORDER); + if (!pages) + return -ENOMEM; + register_page_data = page_address(pages); + + set_page_reservations(true); + + ret = create_user_tracefs(); + + if (ret) { + pr_warn("user_events could not register with tracefs\n"); + set_page_reservations(false); + __free_pages(pages, MAX_PAGE_ORDER); + return ret; + } + + if (dyn_event_register(&user_event_dops)) + pr_warn("user_events could not register with dyn_events\n"); + + return 0; +} + +fs_initcall(trace_events_user_init); diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 4e1257f50aa3..b62fd785b599 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -32,7 +32,7 @@ static int __init set_kprobe_boot_events(char *str) strlcpy(kprobe_boot_events_buf, str, COMMAND_LINE_SIZE); disable_tracing_selftest("running kprobe events"); - return 0; + return 1; } __setup("kprobe_event=", set_kprobe_boot_events); @@ -328,11 +328,9 @@ static inline int __enable_trace_kprobe(struct trace_kprobe *tk) static void __disable_trace_kprobe(struct trace_probe *tp) { - struct trace_probe *pos; struct trace_kprobe *tk; - list_for_each_entry(pos, trace_probe_probe_list(tp), list) { - tk = container_of(pos, struct trace_kprobe, tp); + list_for_each_entry(tk, trace_probe_probe_list(tp), tp.list) { if (!trace_kprobe_is_registered(tk)) continue; if (trace_kprobe_is_return(tk)) @@ -349,7 +347,7 @@ static void __disable_trace_kprobe(struct trace_probe *tp) static int enable_trace_kprobe(struct trace_event_call *call, struct trace_event_file *file) { - struct trace_probe *pos, *tp; + struct trace_probe *tp; struct trace_kprobe *tk; bool enabled; int ret = 0; @@ -370,8 +368,7 @@ static int enable_trace_kprobe(struct trace_event_call *call, if (enabled) return 0; - list_for_each_entry(pos, trace_probe_probe_list(tp), list) { - tk = container_of(pos, struct trace_kprobe, tp); + list_for_each_entry(tk, trace_probe_probe_list(tp), tp.list) { if (trace_kprobe_has_gone(tk)) continue; ret = __enable_trace_kprobe(tk); @@ -560,11 +557,9 @@ static bool trace_kprobe_has_same_kprobe(struct trace_kprobe *orig, struct trace_kprobe *comp) { struct trace_probe_event *tpe = orig->tp.event; - struct trace_probe *pos; int i; - list_for_each_entry(pos, &tpe->probes, list) { - orig = container_of(pos, struct trace_kprobe, tp); + list_for_each_entry(orig, &tpe->probes, tp.list) { if (strcmp(trace_kprobe_symbol(orig), trace_kprobe_symbol(comp)) || trace_kprobe_offset(orig) != trace_kprobe_offset(comp)) @@ -1176,15 +1171,18 @@ static int probes_profile_seq_show(struct seq_file *m, void *v) { struct dyn_event *ev = v; struct trace_kprobe *tk; + unsigned long nmissed; if (!is_trace_kprobe(ev)) return 0; tk = to_trace_kprobe(ev); + nmissed = trace_kprobe_is_return(tk) ? + tk->rp.kp.nmissed + tk->rp.nmissed : tk->rp.kp.nmissed; seq_printf(m, " %-44s %15lu %15lu\n", trace_probe_name(&tk->tp), trace_kprobe_nhit(tk), - tk->rp.kp.nmissed); + nmissed); return 0; } @@ -1384,17 +1382,11 @@ __kprobe_trace_func(struct trace_kprobe *tk, struct pt_regs *regs, if (trace_trigger_soft_disabled(trace_file)) return; - fbuffer.trace_ctx = tracing_gen_ctx(); - fbuffer.trace_file = trace_file; - dsize = __get_data_size(&tk->tp, regs); - fbuffer.event = - trace_event_buffer_lock_reserve(&fbuffer.buffer, trace_file, - call->event.type, - sizeof(*entry) + tk->tp.size + dsize, - fbuffer.trace_ctx); - if (!fbuffer.event) + entry = trace_event_buffer_reserve(&fbuffer, trace_file, + sizeof(*entry) + tk->tp.size + dsize); + if (!entry) return; fbuffer.regs = regs; @@ -1431,16 +1423,11 @@ __kretprobe_trace_func(struct trace_kprobe *tk, struct kretprobe_instance *ri, if (trace_trigger_soft_disabled(trace_file)) return; - fbuffer.trace_ctx = tracing_gen_ctx(); - fbuffer.trace_file = trace_file; - dsize = __get_data_size(&tk->tp, regs); - fbuffer.event = - trace_event_buffer_lock_reserve(&fbuffer.buffer, trace_file, - call->event.type, - sizeof(*entry) + tk->tp.size + dsize, - fbuffer.trace_ctx); - if (!fbuffer.event) + + entry = trace_event_buffer_reserve(&fbuffer, trace_file, + sizeof(*entry) + tk->tp.size + dsize); + if (!entry) return; fbuffer.regs = regs; diff --git a/kernel/trace/trace_osnoise.c b/kernel/trace/trace_osnoise.c index 89d6cbac6f10..e9ae1f33a7f0 100644 --- a/kernel/trace/trace_osnoise.c +++ b/kernel/trace/trace_osnoise.c @@ -138,8 +138,7 @@ static void osnoise_unregister_instance(struct trace_array *tr) if (!found) return; - synchronize_rcu(); - kfree(inst); + kvfree_rcu(inst); } /* @@ -1168,7 +1167,9 @@ thread_exit(struct osnoise_variables *osn_var, struct task_struct *t) * used to record the beginning and to report the end of a thread noise window. */ static void -trace_sched_switch_callback(void *data, bool preempt, struct task_struct *p, +trace_sched_switch_callback(void *data, bool preempt, + unsigned int prev_state, + struct task_struct *p, struct task_struct *n) { struct osnoise_variables *osn_var = this_cpu_osn_var(); @@ -1388,6 +1389,26 @@ static int run_osnoise(void) } /* + * In some cases, notably when running on a nohz_full CPU with + * a stopped tick PREEMPT_RCU has no way to account for QSs. + * This will eventually cause unwarranted noise as PREEMPT_RCU + * will force preemption as the means of ending the current + * grace period. We avoid this problem by calling + * rcu_momentary_dyntick_idle(), which performs a zero duration + * EQS allowing PREEMPT_RCU to end the current grace period. + * This call shouldn't be wrapped inside an RCU critical + * section. + * + * Note that in non PREEMPT_RCU kernels QSs are handled through + * cond_resched() + */ + if (IS_ENABLED(CONFIG_PREEMPT_RCU)) { + local_irq_disable(); + rcu_momentary_dyntick_idle(); + local_irq_enable(); + } + + /* * For the non-preemptive kernel config: let threads runs, if * they so wish. */ @@ -1438,6 +1459,37 @@ static struct cpumask osnoise_cpumask; static struct cpumask save_cpumask; /* + * osnoise_sleep - sleep until the next period + */ +static void osnoise_sleep(void) +{ + u64 interval; + ktime_t wake_time; + + mutex_lock(&interface_lock); + interval = osnoise_data.sample_period - osnoise_data.sample_runtime; + mutex_unlock(&interface_lock); + + /* + * differently from hwlat_detector, the osnoise tracer can run + * without a pause because preemption is on. + */ + if (!interval) { + /* Let synchronize_rcu_tasks() make progress */ + cond_resched_tasks_rcu_qs(); + return; + } + + wake_time = ktime_add_us(ktime_get(), interval); + __set_current_state(TASK_INTERRUPTIBLE); + + while (schedule_hrtimeout_range(&wake_time, 0, HRTIMER_MODE_ABS)) { + if (kthread_should_stop()) + break; + } +} + +/* * osnoise_main - The osnoise detection kernel thread * * Calls run_osnoise() function to measure the osnoise for the configured runtime, @@ -1445,30 +1497,10 @@ static struct cpumask save_cpumask; */ static int osnoise_main(void *data) { - u64 interval; while (!kthread_should_stop()) { - run_osnoise(); - - mutex_lock(&interface_lock); - interval = osnoise_data.sample_period - osnoise_data.sample_runtime; - mutex_unlock(&interface_lock); - - do_div(interval, USEC_PER_MSEC); - - /* - * differently from hwlat_detector, the osnoise tracer can run - * without a pause because preemption is on. - */ - if (interval < 1) { - /* Let synchronize_rcu_tasks() make progress */ - cond_resched_tasks_rcu_qs(); - continue; - } - - if (msleep_interruptible(interval)) - break; + osnoise_sleep(); } return 0; @@ -2122,6 +2154,13 @@ out_unhook_irq: return -EINVAL; } +static void osnoise_unhook_events(void) +{ + unhook_thread_events(); + unhook_softirq_events(); + unhook_irq_events(); +} + /* * osnoise_workload_start - start the workload and hook to events */ @@ -2154,7 +2193,14 @@ static int osnoise_workload_start(void) retval = start_per_cpu_kthreads(); if (retval) { - unhook_irq_events(); + trace_osnoise_callback_enabled = false; + /* + * Make sure that ftrace_nmi_enter/exit() see + * trace_osnoise_callback_enabled as false before continuing. + */ + barrier(); + + osnoise_unhook_events(); return retval; } @@ -2176,6 +2222,17 @@ static void osnoise_workload_stop(void) if (osnoise_has_registered_instances()) return; + /* + * If callbacks were already disabled in a previous stop + * call, there is no need to disable then again. + * + * For instance, this happens when tracing is stopped via: + * echo 0 > tracing_on + * echo nop > current_tracer. + */ + if (!trace_osnoise_callback_enabled) + return; + trace_osnoise_callback_enabled = false; /* * Make sure that ftrace_nmi_enter/exit() see @@ -2185,9 +2242,7 @@ static void osnoise_workload_stop(void) stop_per_cpu_kthreads(); - unhook_irq_events(); - unhook_softirq_events(); - unhook_thread_events(); + osnoise_unhook_events(); } static void osnoise_tracer_start(struct trace_array *tr) diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index 3547e7176ff7..8aa493d25c73 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -445,14 +445,18 @@ int trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry) char irqs_off; int hardirq; int softirq; + int bh_off; int nmi; nmi = entry->flags & TRACE_FLAG_NMI; hardirq = entry->flags & TRACE_FLAG_HARDIRQ; softirq = entry->flags & TRACE_FLAG_SOFTIRQ; + bh_off = entry->flags & TRACE_FLAG_BH_OFF; irqs_off = + (entry->flags & TRACE_FLAG_IRQS_OFF && bh_off) ? 'D' : (entry->flags & TRACE_FLAG_IRQS_OFF) ? 'd' : + bh_off ? 'b' : (entry->flags & TRACE_FLAG_IRQS_NOSUPPORT) ? 'X' : '.'; diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c index 3ed2a3f37297..80863c6508e5 100644 --- a/kernel/trace/trace_probe.c +++ b/kernel/trace/trace_probe.c @@ -356,6 +356,8 @@ static int __parse_imm_string(char *str, char **pbuf, int offs) return -EINVAL; } *pbuf = kstrndup(str, len - 1, GFP_KERNEL); + if (!*pbuf) + return -ENOMEM; return 0; } @@ -869,15 +871,15 @@ static int __set_print_fmt(struct trace_probe *tp, char *buf, int len, switch (ptype) { case PROBE_PRINT_NORMAL: fmt = "(%lx)"; - arg = "REC->" FIELD_STRING_IP; + arg = ", REC->" FIELD_STRING_IP; break; case PROBE_PRINT_RETURN: fmt = "(%lx <- %lx)"; - arg = "REC->" FIELD_STRING_FUNC ", REC->" FIELD_STRING_RETIP; + arg = ", REC->" FIELD_STRING_FUNC ", REC->" FIELD_STRING_RETIP; break; case PROBE_PRINT_EVENT: - fmt = "(%u)"; - arg = "REC->" FIELD_STRING_TYPE; + fmt = ""; + arg = ""; break; default: WARN_ON_ONCE(1); @@ -901,7 +903,7 @@ static int __set_print_fmt(struct trace_probe *tp, char *buf, int len, parg->type->fmt); } - pos += snprintf(buf + pos, LEN_OR_ZERO, "\", %s", arg); + pos += snprintf(buf + pos, LEN_OR_ZERO, "\"%s", arg); for (i = 0; i < tp->nr_args; i++) { parg = tp->args + i; @@ -1138,8 +1140,7 @@ int trace_probe_remove_file(struct trace_probe *tp, return -ENOENT; list_del_rcu(&link->list); - synchronize_rcu(); - kfree(link); + kvfree_rcu(link); if (list_empty(&tp->event->files)) trace_probe_clear_flag(tp, TP_FLAG_TRACE); diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h index 99e7a5df025e..92cc149af0fd 100644 --- a/kernel/trace/trace_probe.h +++ b/kernel/trace/trace_probe.h @@ -38,7 +38,6 @@ #define FIELD_STRING_IP "__probe_ip" #define FIELD_STRING_RETIP "__probe_ret_ip" #define FIELD_STRING_FUNC "__probe_func" -#define FIELD_STRING_TYPE "__probe_type" #undef DEFINE_FIELD #define DEFINE_FIELD(type, item, name, is_signed) \ diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c index e304196d7c28..45796d8bd4b2 100644 --- a/kernel/trace/trace_sched_switch.c +++ b/kernel/trace/trace_sched_switch.c @@ -22,6 +22,7 @@ static DEFINE_MUTEX(sched_register_mutex); static void probe_sched_switch(void *ignore, bool preempt, + unsigned int prev_state, struct task_struct *prev, struct task_struct *next) { int flags; @@ -44,7 +45,7 @@ probe_sched_wakeup(void *ignore, struct task_struct *wakee) if (!flags) return; - tracing_record_taskinfo(current, flags); + tracing_record_taskinfo_sched_switch(current, wakee, flags); } static int tracing_sched_register(void) diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c index 2402de520eca..46429f9a96fa 100644 --- a/kernel/trace/trace_sched_wakeup.c +++ b/kernel/trace/trace_sched_wakeup.c @@ -426,6 +426,7 @@ tracing_sched_wakeup_trace(struct trace_array *tr, static void notrace probe_wakeup_sched_switch(void *ignore, bool preempt, + unsigned int prev_state, struct task_struct *prev, struct task_struct *next) { struct trace_array_cpu *data; diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c index afd937a46496..abcadbe933bb 100644 --- a/kernel/trace/trace_selftest.c +++ b/kernel/trace/trace_selftest.c @@ -784,9 +784,7 @@ static struct fgraph_ops fgraph_ops __initdata = { .retfunc = &trace_graph_return, }; -#if defined(CONFIG_DYNAMIC_FTRACE) && \ - defined(CONFIG_HAVE_DYNAMIC_FTRACE_WITH_ARGS) -#define TEST_DIRECT_TRAMP +#ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS noinline __noclone static void trace_direct_tramp(void) { } #endif @@ -849,7 +847,7 @@ trace_selftest_startup_function_graph(struct tracer *trace, goto out; } -#ifdef TEST_DIRECT_TRAMP +#ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS tracing_reset_online_cpus(&tr->array_buffer); set_graph_array(tr); diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c index 8bfcd3b09422..f755bde42fd0 100644 --- a/kernel/trace/trace_syscalls.c +++ b/kernel/trace/trace_syscalls.c @@ -323,8 +323,7 @@ static void ftrace_syscall_enter(void *data, struct pt_regs *regs, long id) trace_ctx = tracing_gen_ctx(); - buffer = tr->array_buffer.buffer; - event = trace_buffer_lock_reserve(buffer, + event = trace_event_buffer_lock_reserve(&buffer, trace_file, sys_data->enter_event->event.type, size, trace_ctx); if (!event) return; @@ -367,8 +366,7 @@ static void ftrace_syscall_exit(void *data, struct pt_regs *regs, long ret) trace_ctx = tracing_gen_ctx(); - buffer = tr->array_buffer.buffer; - event = trace_buffer_lock_reserve(buffer, + event = trace_event_buffer_lock_reserve(&buffer, trace_file, sys_data->exit_event->event.type, sizeof(*entry), trace_ctx); if (!event) diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index 4f35514a48f3..9711589273cd 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -410,12 +410,10 @@ static bool trace_uprobe_has_same_uprobe(struct trace_uprobe *orig, struct trace_uprobe *comp) { struct trace_probe_event *tpe = orig->tp.event; - struct trace_probe *pos; struct inode *comp_inode = d_real_inode(comp->path.dentry); int i; - list_for_each_entry(pos, &tpe->probes, list) { - orig = container_of(pos, struct trace_uprobe, tp); + list_for_each_entry(orig, &tpe->probes, tp.list) { if (comp_inode != d_real_inode(orig->path.dentry) || comp->offset != orig->offset) continue; @@ -950,8 +948,7 @@ static void __uprobe_trace_func(struct trace_uprobe *tu, struct trace_event_file *trace_file) { struct uprobe_trace_entry_head *entry; - struct trace_buffer *buffer; - struct ring_buffer_event *event; + struct trace_event_buffer fbuffer; void *data; int size, esize; struct trace_event_call *call = trace_probe_event_call(&tu->tp); @@ -966,12 +963,10 @@ static void __uprobe_trace_func(struct trace_uprobe *tu, esize = SIZEOF_TRACE_ENTRY(is_ret_probe(tu)); size = esize + tu->tp.size + dsize; - event = trace_event_buffer_lock_reserve(&buffer, trace_file, - call->event.type, size, 0); - if (!event) + entry = trace_event_buffer_reserve(&fbuffer, trace_file, size); + if (!entry) return; - entry = ring_buffer_event_data(event); if (is_ret_probe(tu)) { entry->vaddr[0] = func; entry->vaddr[1] = instruction_pointer(regs); @@ -983,7 +978,7 @@ static void __uprobe_trace_func(struct trace_uprobe *tu, memcpy(data, ucb->buf, tu->tp.size + dsize); - event_trigger_unlock_commit(trace_file, buffer, event, entry, 0); + trace_event_buffer_commit(&fbuffer); } /* uprobe handler */ @@ -1076,14 +1071,12 @@ static int trace_uprobe_enable(struct trace_uprobe *tu, filter_func_t filter) static void __probe_event_disable(struct trace_probe *tp) { - struct trace_probe *pos; struct trace_uprobe *tu; tu = container_of(tp, struct trace_uprobe, tp); WARN_ON(!uprobe_filter_is_empty(tu->tp.event->filter)); - list_for_each_entry(pos, trace_probe_probe_list(tp), list) { - tu = container_of(pos, struct trace_uprobe, tp); + list_for_each_entry(tu, trace_probe_probe_list(tp), tp.list) { if (!tu->inode) continue; @@ -1095,7 +1088,7 @@ static void __probe_event_disable(struct trace_probe *tp) static int probe_event_enable(struct trace_event_call *call, struct trace_event_file *file, filter_func_t filter) { - struct trace_probe *pos, *tp; + struct trace_probe *tp; struct trace_uprobe *tu; bool enabled; int ret; @@ -1130,8 +1123,7 @@ static int probe_event_enable(struct trace_event_call *call, if (ret) goto err_flags; - list_for_each_entry(pos, trace_probe_probe_list(tp), list) { - tu = container_of(pos, struct trace_uprobe, tp); + list_for_each_entry(tu, trace_probe_probe_list(tp), tp.list) { ret = trace_uprobe_enable(tu, filter); if (ret) { __probe_event_disable(tp); @@ -1276,7 +1268,7 @@ static bool trace_uprobe_filter_add(struct trace_uprobe_filter *filter, static int uprobe_perf_close(struct trace_event_call *call, struct perf_event *event) { - struct trace_probe *pos, *tp; + struct trace_probe *tp; struct trace_uprobe *tu; int ret = 0; @@ -1288,8 +1280,7 @@ static int uprobe_perf_close(struct trace_event_call *call, if (trace_uprobe_filter_remove(tu->tp.event->filter, event)) return 0; - list_for_each_entry(pos, trace_probe_probe_list(tp), list) { - tu = container_of(pos, struct trace_uprobe, tp); + list_for_each_entry(tu, trace_probe_probe_list(tp), tp.list) { ret = uprobe_apply(tu->inode, tu->offset, &tu->consumer, false); if (ret) break; @@ -1301,7 +1292,7 @@ static int uprobe_perf_close(struct trace_event_call *call, static int uprobe_perf_open(struct trace_event_call *call, struct perf_event *event) { - struct trace_probe *pos, *tp; + struct trace_probe *tp; struct trace_uprobe *tu; int err = 0; @@ -1313,8 +1304,7 @@ static int uprobe_perf_open(struct trace_event_call *call, if (trace_uprobe_filter_add(tu->tp.event->filter, event)) return 0; - list_for_each_entry(pos, trace_probe_probe_list(tp), list) { - tu = container_of(pos, struct trace_uprobe, tp); + list_for_each_entry(tu, trace_probe_probe_list(tp), tp.list) { err = uprobe_apply(tu->inode, tu->offset, &tu->consumer, true); if (err) { uprobe_perf_close(call, event); @@ -1620,6 +1610,11 @@ create_local_trace_uprobe(char *name, unsigned long offs, tu->path = path; tu->ref_ctr_offset = ref_ctr_offset; tu->filename = kstrdup(name, GFP_KERNEL); + if (!tu->filename) { + ret = -ENOMEM; + goto error; + } + init_trace_event_call(tu); ptype = is_ret_probe(tu) ? PROBE_PRINT_RETURN : PROBE_PRINT_NORMAL; diff --git a/kernel/tsacct.c b/kernel/tsacct.c index f00de83d0246..1d261fbe367b 100644 --- a/kernel/tsacct.c +++ b/kernel/tsacct.c @@ -38,11 +38,10 @@ void bacct_add_tsk(struct user_namespace *user_ns, stats->ac_btime = clamp_t(time64_t, btime, 0, U32_MAX); stats->ac_btime64 = btime; - if (thread_group_leader(tsk)) { + if (tsk->flags & PF_EXITING) stats->ac_exitcode = tsk->exit_code; - if (tsk->flags & PF_FORKNOEXEC) - stats->ac_flag |= AFORK; - } + if (thread_group_leader(tsk) && (tsk->flags & PF_FORKNOEXEC)) + stats->ac_flag |= AFORK; if (tsk->flags & PF_SUPERPRIV) stats->ac_flag |= ASU; if (tsk->flags & PF_DUMPCORE) diff --git a/kernel/ucount.c b/kernel/ucount.c index 7b32c356ebc5..06ea04d44685 100644 --- a/kernel/ucount.c +++ b/kernel/ucount.c @@ -190,6 +190,7 @@ struct ucounts *alloc_ucounts(struct user_namespace *ns, kuid_t uid) kfree(new); } else { hlist_add_head(&new->node, hashent); + get_user_ns(new->ns); spin_unlock_irq(&ucounts_lock); return new; } @@ -210,6 +211,7 @@ void put_ucounts(struct ucounts *ucounts) if (atomic_dec_and_lock_irqsave(&ucounts->count, &ucounts_lock, flags)) { hlist_del_init(&ucounts->node); spin_unlock_irqrestore(&ucounts_lock, flags); + put_user_ns(ucounts->ns); kfree(ucounts); } } @@ -348,7 +350,8 @@ bool is_ucounts_overlimit(struct ucounts *ucounts, enum ucount_type type, unsign if (rlimit > LONG_MAX) max = LONG_MAX; for (iter = ucounts; iter; iter = iter->ns->ucounts) { - if (get_ucounts_value(iter, type) > max) + long val = get_ucounts_value(iter, type); + if (val < 0 || val > max) return true; max = READ_ONCE(iter->ns->ucount_max[type]); } diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index 6b2e3ca7ee99..5481ba44a8d6 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c @@ -58,6 +58,18 @@ static void set_cred_user_ns(struct cred *cred, struct user_namespace *user_ns) cred->user_ns = user_ns; } +static unsigned long enforced_nproc_rlimit(void) +{ + unsigned long limit = RLIM_INFINITY; + + /* Is RLIMIT_NPROC currently enforced? */ + if (!uid_eq(current_uid(), GLOBAL_ROOT_UID) || + (current_user_ns() != &init_user_ns)) + limit = rlimit(RLIMIT_NPROC); + + return limit; +} + /* * Create a new user namespace, deriving the creator from the user in the * passed credentials, and replacing that user with the new root user for the @@ -122,7 +134,7 @@ int create_user_ns(struct cred *new) for (i = 0; i < MAX_PER_NAMESPACE_UCOUNTS; i++) { ns->ucount_max[i] = INT_MAX; } - set_rlimit_ucount_max(ns, UCOUNT_RLIMIT_NPROC, rlimit(RLIMIT_NPROC)); + set_rlimit_ucount_max(ns, UCOUNT_RLIMIT_NPROC, enforced_nproc_rlimit()); set_rlimit_ucount_max(ns, UCOUNT_RLIMIT_MSGQUEUE, rlimit(RLIMIT_MSGQUEUE)); set_rlimit_ucount_max(ns, UCOUNT_RLIMIT_SIGPENDING, rlimit(RLIMIT_SIGPENDING)); set_rlimit_ucount_max(ns, UCOUNT_RLIMIT_MEMLOCK, rlimit(RLIMIT_MEMLOCK)); diff --git a/kernel/watch_queue.c b/kernel/watch_queue.c index 9c9eb20dd2c5..3990e4df3d7b 100644 --- a/kernel/watch_queue.c +++ b/kernel/watch_queue.c @@ -54,6 +54,7 @@ static void watch_queue_pipe_buf_release(struct pipe_inode_info *pipe, bit += page->index; set_bit(bit, wqueue->notes_bitmap); + generic_pipe_buf_release(pipe, buf); } // No try_steal function => no stealing @@ -112,7 +113,7 @@ static bool post_one_notification(struct watch_queue *wqueue, buf->offset = offset; buf->len = len; buf->flags = PIPE_BUF_FLAG_WHOLE; - pipe->head = head + 1; + smp_store_release(&pipe->head, head + 1); /* vs pipe_read() */ if (!test_and_clear_bit(note, wqueue->notes_bitmap)) { spin_unlock_irq(&pipe->rd_wait.lock); @@ -219,7 +220,6 @@ long watch_queue_set_size(struct pipe_inode_info *pipe, unsigned int nr_notes) struct page **pages; unsigned long *bitmap; unsigned long user_bufs; - unsigned int bmsize; int ret, i, nr_pages; if (!wqueue) @@ -243,7 +243,8 @@ long watch_queue_set_size(struct pipe_inode_info *pipe, unsigned int nr_notes) goto error; } - ret = pipe_resize_ring(pipe, nr_notes); + nr_notes = nr_pages * WATCH_QUEUE_NOTES_PER_PAGE; + ret = pipe_resize_ring(pipe, roundup_pow_of_two(nr_notes)); if (ret < 0) goto error; @@ -258,21 +259,19 @@ long watch_queue_set_size(struct pipe_inode_info *pipe, unsigned int nr_notes) pages[i]->index = i * WATCH_QUEUE_NOTES_PER_PAGE; } - bmsize = (nr_notes + BITS_PER_LONG - 1) / BITS_PER_LONG; - bmsize *= sizeof(unsigned long); - bitmap = kmalloc(bmsize, GFP_KERNEL); + bitmap = bitmap_alloc(nr_notes, GFP_KERNEL); if (!bitmap) goto error_p; - memset(bitmap, 0xff, bmsize); + bitmap_fill(bitmap, nr_notes); wqueue->notes = pages; wqueue->notes_bitmap = bitmap; wqueue->nr_pages = nr_pages; - wqueue->nr_notes = nr_pages * WATCH_QUEUE_NOTES_PER_PAGE; + wqueue->nr_notes = nr_notes; return 0; error_p: - for (i = 0; i < nr_pages; i++) + while (--i >= 0) __free_page(pages[i]); kfree(pages); error: @@ -320,7 +319,7 @@ long watch_queue_set_filter(struct pipe_inode_info *pipe, tf[i].info_mask & WATCH_INFO_LENGTH) goto err_filter; /* Ignore any unknown types */ - if (tf[i].type >= sizeof(wfilter->type_filter) * 8) + if (tf[i].type >= WATCH_TYPE__NR) continue; nr_filter++; } @@ -336,7 +335,7 @@ long watch_queue_set_filter(struct pipe_inode_info *pipe, q = wfilter->filters; for (i = 0; i < filter.nr_filters; i++) { - if (tf[i].type >= sizeof(wfilter->type_filter) * BITS_PER_LONG) + if (tf[i].type >= WATCH_TYPE__NR) continue; q->type = tf[i].type; @@ -371,6 +370,7 @@ static void __put_watch_queue(struct kref *kref) for (i = 0; i < wqueue->nr_pages; i++) __free_page(wqueue->notes[i]); + bitmap_free(wqueue->notes_bitmap); wfilter = rcu_access_pointer(wqueue->filter); if (wfilter) @@ -395,6 +395,7 @@ static void free_watch(struct rcu_head *rcu) put_watch_queue(rcu_access_pointer(watch->queue)); atomic_dec(&watch->cred->user->nr_watches); put_cred(watch->cred); + kfree(watch); } static void __put_watch(struct kref *kref) @@ -566,7 +567,7 @@ void watch_queue_clear(struct watch_queue *wqueue) rcu_read_lock(); spin_lock_bh(&wqueue->lock); - /* Prevent new additions and prevent notifications from happening */ + /* Prevent new notifications from being stored. */ wqueue->defunct = true; while (!hlist_empty(&wqueue->watches)) { diff --git a/kernel/watchdog.c b/kernel/watchdog.c index ad912511a0c0..9166220457bc 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -740,6 +740,106 @@ int proc_watchdog_cpumask(struct ctl_table *table, int write, mutex_unlock(&watchdog_mutex); return err; } + +static const int sixty = 60; + +static struct ctl_table watchdog_sysctls[] = { + { + .procname = "watchdog", + .data = &watchdog_user_enabled, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_watchdog, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, + { + .procname = "watchdog_thresh", + .data = &watchdog_thresh, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_watchdog_thresh, + .extra1 = SYSCTL_ZERO, + .extra2 = (void *)&sixty, + }, + { + .procname = "nmi_watchdog", + .data = &nmi_watchdog_user_enabled, + .maxlen = sizeof(int), + .mode = NMI_WATCHDOG_SYSCTL_PERM, + .proc_handler = proc_nmi_watchdog, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, + { + .procname = "watchdog_cpumask", + .data = &watchdog_cpumask_bits, + .maxlen = NR_CPUS, + .mode = 0644, + .proc_handler = proc_watchdog_cpumask, + }, +#ifdef CONFIG_SOFTLOCKUP_DETECTOR + { + .procname = "soft_watchdog", + .data = &soft_watchdog_user_enabled, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_soft_watchdog, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, + { + .procname = "softlockup_panic", + .data = &softlockup_panic, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, +#ifdef CONFIG_SMP + { + .procname = "softlockup_all_cpu_backtrace", + .data = &sysctl_softlockup_all_cpu_backtrace, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, +#endif /* CONFIG_SMP */ +#endif +#ifdef CONFIG_HARDLOCKUP_DETECTOR + { + .procname = "hardlockup_panic", + .data = &hardlockup_panic, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, +#ifdef CONFIG_SMP + { + .procname = "hardlockup_all_cpu_backtrace", + .data = &sysctl_hardlockup_all_cpu_backtrace, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, +#endif /* CONFIG_SMP */ +#endif + {} +}; + +static void __init watchdog_sysctl_init(void) +{ + register_sysctl_init("kernel", watchdog_sysctls); +} +#else +#define watchdog_sysctl_init() do { } while (0) #endif /* CONFIG_SYSCTL */ void __init lockup_detector_init(void) @@ -748,9 +848,10 @@ void __init lockup_detector_init(void) pr_info("Disabling watchdog on nohz_full cores by default\n"); cpumask_copy(&watchdog_cpumask, - housekeeping_cpumask(HK_FLAG_TIMER)); + housekeeping_cpumask(HK_TYPE_TIMER)); if (!watchdog_nmi_probe()) nmi_watchdog_available = true; lockup_detector_setup(); + watchdog_sysctl_init(); } diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 33f1106b4f99..0d2514b4ff0d 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -154,15 +154,20 @@ struct worker_pool { unsigned long watchdog_ts; /* L: watchdog timestamp */ - /* The current concurrency level. */ - atomic_t nr_running; + /* + * The counter is incremented in a process context on the associated CPU + * w/ preemption disabled, and decremented or reset in the same context + * but w/ pool->lock held. The readers grab pool->lock and are + * guaranteed to see if the counter reached zero. + */ + int nr_running; struct list_head worklist; /* L: list of pending works */ int nr_workers; /* L: total number of workers */ int nr_idle; /* L: currently idle workers */ - struct list_head idle_list; /* X: list of idle workers */ + struct list_head idle_list; /* L: list of idle workers */ struct timer_list idle_timer; /* L: worker idle timeout */ struct timer_list mayday_timer; /* L: SOS timer for workers */ @@ -777,7 +782,7 @@ static bool work_is_canceling(struct work_struct *work) static bool __need_more_worker(struct worker_pool *pool) { - return !atomic_read(&pool->nr_running); + return !pool->nr_running; } /* @@ -802,8 +807,7 @@ static bool may_start_working(struct worker_pool *pool) /* Do I need to keep working? Called from currently running workers. */ static bool keep_working(struct worker_pool *pool) { - return !list_empty(&pool->worklist) && - atomic_read(&pool->nr_running) <= 1; + return !list_empty(&pool->worklist) && (pool->nr_running <= 1); } /* Do we need a new worker? Called from manager. */ @@ -826,7 +830,7 @@ static bool too_many_workers(struct worker_pool *pool) * Wake up functions. */ -/* Return the first idle worker. Safe with preemption disabled */ +/* Return the first idle worker. Called with pool->lock held. */ static struct worker *first_idle_worker(struct worker_pool *pool) { if (unlikely(list_empty(&pool->idle_list))) @@ -873,7 +877,7 @@ void wq_worker_running(struct task_struct *task) */ preempt_disable(); if (!(worker->flags & WORKER_NOT_RUNNING)) - atomic_inc(&worker->pool->nr_running); + worker->pool->nr_running++; preempt_enable(); worker->sleeping = 0; } @@ -887,7 +891,7 @@ void wq_worker_running(struct task_struct *task) */ void wq_worker_sleeping(struct task_struct *task) { - struct worker *next, *worker = kthread_data(task); + struct worker *worker = kthread_data(task); struct worker_pool *pool; /* @@ -917,23 +921,9 @@ void wq_worker_sleeping(struct task_struct *task) return; } - /* - * The counterpart of the following dec_and_test, implied mb, - * worklist not empty test sequence is in insert_work(). - * Please read comment there. - * - * NOT_RUNNING is clear. This means that we're bound to and - * running on the local cpu w/ rq lock held and preemption - * disabled, which in turn means that none else could be - * manipulating idle_list, so dereferencing idle_list without pool - * lock is safe. - */ - if (atomic_dec_and_test(&pool->nr_running) && - !list_empty(&pool->worklist)) { - next = first_idle_worker(pool); - if (next) - wake_up_process(next->task); - } + pool->nr_running--; + if (need_more_worker(pool)) + wake_up_worker(pool); raw_spin_unlock_irq(&pool->lock); } @@ -987,7 +977,7 @@ static inline void worker_set_flags(struct worker *worker, unsigned int flags) /* If transitioning into NOT_RUNNING, adjust nr_running. */ if ((flags & WORKER_NOT_RUNNING) && !(worker->flags & WORKER_NOT_RUNNING)) { - atomic_dec(&pool->nr_running); + pool->nr_running--; } worker->flags |= flags; @@ -1019,7 +1009,7 @@ static inline void worker_clr_flags(struct worker *worker, unsigned int flags) */ if ((flags & WORKER_NOT_RUNNING) && (oflags & WORKER_NOT_RUNNING)) if (!(worker->flags & WORKER_NOT_RUNNING)) - atomic_inc(&pool->nr_running); + pool->nr_running++; } /** @@ -1372,13 +1362,6 @@ static void insert_work(struct pool_workqueue *pwq, struct work_struct *work, list_add_tail(&work->entry, head); get_pwq(pwq); - /* - * Ensure either wq_worker_sleeping() sees the above - * list_add_tail() or we see zero nr_running to avoid workers lying - * around lazily while there are works to be processed. - */ - smp_mb(); - if (__need_more_worker(pool)) wake_up_worker(pool); } @@ -1827,8 +1810,7 @@ static void worker_enter_idle(struct worker *worker) mod_timer(&pool->idle_timer, jiffies + IDLE_WORKER_TIMEOUT); /* Sanity check nr_running. */ - WARN_ON_ONCE(pool->nr_workers == pool->nr_idle && - atomic_read(&pool->nr_running)); + WARN_ON_ONCE(pool->nr_workers == pool->nr_idle && pool->nr_running); } /** @@ -5006,7 +4988,7 @@ static void unbind_workers(int cpu) * an unbound (in terms of concurrency management) pool which * are served by workers tied to the pool. */ - atomic_set(&pool->nr_running, 0); + pool->nr_running = 0; /* * With concurrency management just turned off, a busy @@ -6006,13 +5988,13 @@ static void __init wq_numa_init(void) void __init workqueue_init_early(void) { int std_nice[NR_STD_WORKER_POOLS] = { 0, HIGHPRI_NICE_LEVEL }; - int hk_flags = HK_FLAG_DOMAIN | HK_FLAG_WQ; int i, cpu; BUILD_BUG_ON(__alignof__(struct pool_workqueue) < __alignof__(long long)); BUG_ON(!alloc_cpumask_var(&wq_unbound_cpumask, GFP_KERNEL)); - cpumask_copy(wq_unbound_cpumask, housekeeping_cpumask(hk_flags)); + cpumask_copy(wq_unbound_cpumask, housekeeping_cpumask(HK_TYPE_WQ)); + cpumask_and(wq_unbound_cpumask, wq_unbound_cpumask, housekeeping_cpumask(HK_TYPE_DOMAIN)); pwq_cache = KMEM_CACHE(pool_workqueue, SLAB_PANIC); |